In [1]:
import sys
import os
import pandas as pd
from data.load_credit_data import load_credit_data
from config import TARGET_COL, DEFAULT_FEATURES, ID_COLS, CAT_COLS
from feature_engineering.imputations import knn_impute
from classification_methods.classification_methods import lgbm_classification
from sklearn.metrics import classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Load both CSV splits through the project loader and report their shapes ---
train = load_credit_data('train.csv')
test = load_credit_data('test.csv')
print(f'Train shape: {train.shape}, Test shape: {test.shape}')

# --- Column roles ---
target_col = TARGET_COL
print(f'Target column: {target_col}')

# Columns handed to the classifier; copied so later edits in this notebook
# cannot touch the list object that lives in config.
lgbm_features = DEFAULT_FEATURES.copy()

# Imputation candidates: every configured column (default, categorical, ID),
# de-duplicated, with the target left out.
knn_features = [
    col for col in set(DEFAULT_FEATURES + CAT_COLS + ID_COLS) if col != target_col
]

# The numeric / non-numeric split is decided from the TRAIN dtypes and then
# reused unchanged for the test frame.
numeric_features = list(train[knn_features].select_dtypes(include='number').columns)
numeric_lookup = set(numeric_features)
non_numeric_features = [col for col in knn_features if col not in numeric_lookup]


def _join_with_non_numeric(imputed_numeric, source_frame, passthrough_cols):
    """Return `imputed_numeric` side by side with `source_frame[passthrough_cols]`.

    The passthrough columns are given a fresh 0..n-1 index before the
    column-wise concat; `imputed_numeric` is used exactly as received.
    """
    passthrough = source_frame[passthrough_cols].reset_index(drop=True)
    return pd.concat([imputed_numeric, passthrough], axis=1)


# --- KNN imputation (numeric columns only) ---
# NOTE(review): train and test are imputed by two independent knn_impute calls;
# confirm that is intended rather than fitting on train and applying to test.
train_numeric_imputed = knn_impute(train[numeric_features])
test_numeric_imputed = knn_impute(test[numeric_features])

train_imputed = _join_with_non_numeric(train_numeric_imputed, train, non_numeric_features)
test_imputed = _join_with_non_numeric(test_numeric_imputed, test, non_numeric_features)

# Re-attach the label positionally (.values ignores the index).
train_imputed[target_col] = train[target_col].values
if target_col in test.columns:
    test_imputed[target_col] = test[target_col].values

# --- Model A: LGBM on the imputed training frame ---
X_train_imp = train_imputed[lgbm_features]
y_train_imp = train_imputed[target_col]
# NOTE(review): X_test_imp is prepared here but is not passed to
# lgbm_classification in this cell. The accuracy printed below is whatever
# lgbm_classification returns from (X_train_imp, y_train_imp) alone.
X_test_imp = test_imputed[lgbm_features]
model_imp, acc_imp, preds_imp, y_test_imp = lgbm_classification(X_train_imp, y_train_imp)
print(f'Imputed Data - Test Accuracy: {acc_imp:.4f}')

# --- Model B: LGBM on the raw training frame, dropping incomplete rows ---
# Rows with a missing value in any numeric feature or in the target are
# removed, so this model sees fewer rows than Model A.
train_dropna = train.dropna(subset=[*numeric_features, target_col])
X_train_noimp = train_dropna[lgbm_features]
y_train_noimp = train_dropna[target_col]
model_noimp, acc_noimp, preds_noimp, y_test_noimp = lgbm_classification(X_train_noimp, y_train_noimp)
print(f'Non-Imputed Data - Test Accuracy: {acc_noimp:.4f}')


  return pd.read_csv(file_path)


Train shape: (100000, 28), Test shape: (50000, 27)
Target column: Credit_Mix
[LGBM] Dropping non-numeric columns: ['Month', 'Age', 'Annual_Income']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 2
[LightGBM] [Info] Start training from score -1.666074
[LightGBM] [Info] Start training from score -1.408387
[LightGBM] [Info] Start training from score -1.007926
[LightGBM] [Info] Start training from score -1.601966
Imputed Data - Test Accuracy: 0.5606
[LGBM] Dropping non-numeric columns: ['Month', 'Age', 'Annual_Income']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory i

In [3]:
# Side-by-side evaluation of the two models: text reports first, then the
# confusion matrices. Each row is (heading, metric function, true labels,
# predictions); the metric is only computed when its turn comes, right
# after its heading is printed.
evaluation_steps = (
    ('Imputed Data Classification Report:', classification_report, y_test_imp, preds_imp),
    ('Non-Imputed Data Classification Report:', classification_report, y_test_noimp, preds_noimp),
    ('Imputed Data Confusion Matrix:', confusion_matrix, y_test_imp, preds_imp),
    ('Non-Imputed Data Confusion Matrix:', confusion_matrix, y_test_noimp, preds_noimp),
)
for heading, metric_fn, labels_true, labels_pred in evaluation_steps:
    print(heading)
    print(metric_fn(labels_true, labels_pred))

Imputed Data Classification Report:
              precision    recall  f1-score   support

         Bad       0.59      0.58      0.59      3870
        Good       0.69      0.59      0.64      4774
    Standard       0.51      0.84      0.63      7281
           _       0.14      0.00      0.00      4075

    accuracy                           0.56     20000
   macro avg       0.48      0.50      0.47     20000
weighted avg       0.49      0.56      0.50     20000

Non-Imputed Data Classification Report:
              precision    recall  f1-score   support

         Bad       0.59      0.62      0.60      3259
        Good       0.70      0.64      0.67      4088
    Standard       0.52      0.82      0.64      6245
           _       0.05      0.00      0.00      3408

    accuracy                           0.57     17000
   macro avg       0.47      0.52      0.48     17000
weighted avg       0.48      0.57      0.51     17000

Imputed Data Confusion Matrix:
[[2260    7 1596    7]
