In [1]:
import sys
import os
import pandas as pd
from data.load_credit_data import load_credit_data
from data.auto_feature_grouping import detect_and_adjust_data_schema, group_columns, manually_adjust_input_cols
from config import TARGET_COL, DEFAULT_FEATURES, ID_COLS, CAT_COLS
from feature_engineering.imputations import knn_impute
from classification_methods.classification_methods import lgbm_classification
from sklearn.metrics import classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 1. Load the train/test CSVs and run the project's schema helper on each.
train = load_credit_data('train.csv')
test = load_credit_data('test.csv')
# detect_and_adjust_data_schema returns a pair; only the adjusted frame is kept.
# NOTE(review): each split is processed independently, so the two frames could
# end up with different dtypes for the same column -- confirm this is intended.
train, _ = detect_and_adjust_data_schema(train)
test, _ = detect_and_adjust_data_schema(test)
print(f'Train shape: {train.shape}, Test shape: {test.shape}')

# Build the column-group mapping for each split.
train_groups = group_columns(train)
test_groups = group_columns(test)

# Manual override: 'Monthly_Balance' is moved from 'id_cols' to
# 'continuous_cols'. The same override dict is applied to both splits.
change_cols = {'current_col': "id_cols", "new_col": "continuous_cols", "val": "Monthly_Balance"}
# Pass `change_cols` (the name actually defined above). Passing an undefined
# `change` raises NameError on a fresh kernel.
train_groups = manually_adjust_input_cols(train_groups, change_cols)
test_groups = manually_adjust_input_cols(test_groups, change_cols)
print("Train column groups:", train_groups)
print("Test column groups:", test_groups)

  return pd.read_csv(file_path)


Train shape: (100000, 28), Test shape: (50000, 27)
Moved 'Monthly_Balance' from 'id_cols' to 'continuous_cols'.
Moved 'Monthly_Balance' from 'id_cols' to 'continuous_cols'.
Train column groups: {'id_cols': ['ID', 'Customer_ID', 'Name'], 'date_cols': [], 'continuous_cols': ['Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Delay_from_due_date', 'Num_Credit_Inquiries', 'Credit_Utilization_Ratio', 'Total_EMI_per_month', 'Monthly_Balance'], 'binary_cols': [], 'categorical_cols': ['Month', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Credit_Score'], 'other_cols': []}
Test column groups: {'id_cols': ['ID', 'Customer_ID', 'Name'], 'date_cols': [], 'continuous_cols': ['Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate', 'Dela

In [5]:
# 2. Define target column and imputation features
target_col = TARGET_COL
print(f'Target column: {target_col}')
imputation_features = ["Monthly_Balance"]

# Determine LGBM features: start from every grouped column when groups are
# available, otherwise fall back to the configured defaults.
if train_groups is None:
    feature_pool = DEFAULT_FEATURES
else:
    # Flatten the per-group column lists into one list, preserving group order.
    all_grouped_features = []
    for group_members in train_groups.values():
        all_grouped_features.extend(group_members)
    feature_pool = all_grouped_features

# Exclude the imputation features. Nothing here filters out target_col, so the
# resulting list may still contain it.
lgbm_features = [name for name in feature_pool if name not in imputation_features]

Target column: Credit_Mix


In [8]:
# 3. KNN-impute the numeric features (all features except the target).
# Rebind to filtered copies instead of aliasing `lgbm_features` and calling
# `.remove()` on the alias: in-place removal silently edits the shared list and
# only drops the first occurrence. After this step neither list contains the
# target, so the model inputs selected below cannot include it.
lgbm_features = [f for f in lgbm_features if f != target_col]
knn_features = list(lgbm_features)

# Separate numeric and non-numeric columns for KNN imputation.
# The numeric/non-numeric split is decided from the *train* dtypes only.
numeric_features = train[knn_features].select_dtypes(include='number').columns.tolist()
non_numeric_features = [col for col in knn_features if col not in numeric_features]

# Impute only numeric columns.
# NOTE(review): knn_impute is called separately on train and test, so the test
# split is imputed from its own rows rather than from the train fit -- confirm
# that this is intended.
train_numeric_imputed = knn_impute(train[numeric_features])
test_numeric_imputed = knn_impute(test[numeric_features])

# Concatenate imputed numeric columns with non-numeric columns.
# The non-numeric side is re-indexed from 0; this assumes knn_impute returns a
# frame with a matching 0..n-1 index -- TODO confirm.
train_imputed = pd.concat([
    train_numeric_imputed,
    train[non_numeric_features].reset_index(drop=True)
], axis=1)

# Keep only the non-numeric columns that actually exist in the test split
# (and never the target).
non_numeric_features_test = [col for col in non_numeric_features if col in test.columns and col != target_col]
test_imputed = pd.concat([
    test_numeric_imputed,
    test[non_numeric_features_test].reset_index(drop=True)
], axis=1)

# Re-attach the target positionally (via .values) so index differences between
# the frames cannot misalign it.
train_imputed[target_col] = train[target_col].values
if target_col in test.columns:
    test_imputed[target_col] = test[target_col].values

# 4. Train LGBM on imputed train data.
# Filter lgbm_features to only those present in train_imputed and test_imputed.
lgbm_features_train = [f for f in lgbm_features if f in train_imputed.columns]
lgbm_features_test = [f for f in lgbm_features if f in test_imputed.columns]
X_train_imp = train_imputed[lgbm_features_train]
y_train_imp = train_imputed[target_col]
# X_test_imp is prepared here but is not passed to lgbm_classification in this cell.
X_test_imp = test_imputed[lgbm_features_test]
# lgbm_classification receives only the training frame; the accuracy,
# predictions and y_test it returns presumably come from a hold-out split it
# makes internally -- verify against its implementation.
model_imp, acc_imp, preds_imp, y_test_imp = lgbm_classification(X_train_imp, y_train_imp)
print(f'Imputed Data - Test Accuracy: {acc_imp:.4f}')

# 5. Train LGBM on non-imputed train data.
# Baseline: drop every row with a missing numeric feature or target instead of
# imputing. The evaluation rows therefore differ from the imputed run.
train_dropna = train.dropna(subset=numeric_features + [target_col])
X_train_noimp = train_dropna[lgbm_features_train]
y_train_noimp = train_dropna[target_col]
model_noimp, acc_noimp, preds_noimp, y_test_noimp = lgbm_classification(X_train_noimp, y_train_noimp)
print(f'Non-Imputed Data - Test Accuracy: {acc_noimp:.4f}')

[LGBM] Dropping non-numeric columns: ['ID', 'Customer_ID', 'Name', 'Month', 'Age', 'SSN', 'Occupation', 'Annual_Income', 'Num_of_Loan', 'Type_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Outstanding_Debt', 'Credit_History_Age', 'Payment_of_Min_Amount', 'Amount_invested_monthly', 'Payment_Behaviour', 'Credit_Score']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1854
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 8
[LightGBM] [Info] Start training from score -1.666074
[LightGBM] [Info] Start training from score -1.408387
[LightGBM] [Info] Start training from score -1.007926
[LightGBM] [Info] Start training from score -1.601966
Imputed Data - Test Accuracy: 0.7194
[LGBM] Dropping non-numeric columns: ['ID', 'Customer_ID', 'Na

In [9]:
# 6. Compare basic statistical criteria
# Each entry: (heading, metric function, true labels, predicted labels).
# Order matches the printed layout: both reports first, then both matrices.
metric_sections = [
    ('Imputed Data Classification Report:', classification_report, y_test_imp, preds_imp),
    ('Non-Imputed Data Classification Report:', classification_report, y_test_noimp, preds_noimp),
    ('Imputed Data Confusion Matrix:', confusion_matrix, y_test_imp, preds_imp),
    ('Non-Imputed Data Confusion Matrix:', confusion_matrix, y_test_noimp, preds_noimp),
]
for heading, metric_fn, y_true, y_pred in metric_sections:
    print(heading)
    print(metric_fn(y_true, y_pred))

Imputed Data Classification Report:
              precision    recall  f1-score   support

         Bad       0.69      0.97      0.81      3870
        Good       0.74      0.90      0.81      4774
    Standard       0.73      0.87      0.79      7281
           _       0.19      0.00      0.00      4075

    accuracy                           0.72     20000
   macro avg       0.59      0.69      0.60     20000
weighted avg       0.61      0.72      0.64     20000

Non-Imputed Data Classification Report:
              precision    recall  f1-score   support

         Bad       0.69      0.97      0.81      3142
        Good       0.75      0.90      0.82      4114
    Standard       0.73      0.87      0.79      6097
           _       0.15      0.00      0.00      3311

    accuracy                           0.72     16664
   macro avg       0.58      0.69      0.61     16664
weighted avg       0.61      0.72      0.65     16664

Imputed Data Confusion Matrix:
[[3769    0   94    7]
