## 여러 Machine Learning 모델 사용

### LightGBM

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import classification_report, precision_recall_curve, auc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch 
import os
import torch

import warnings
warnings.filterwarnings("ignore")

In [2]:
## Load the training data.
data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/Dataset/return_feature_train.csv"
# low_memory=False parses the file in one pass so each column gets a single
# inferred dtype instead of chunk-by-chunk (possibly mixed) inference.
data = pd.read_csv(data_path, low_memory=False)

## Load the held-out test data.
test_data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/Dataset/return_feature_test.csv"
# Use the same parser setting as the training file so both frames infer
# their column dtypes the same way.
test_data = pd.read_csv(test_data_path, low_memory=False)

In [3]:
# NOTE: despite its name, ``keep_features`` lists the columns that are
# REMOVED from both datasets below.
keep_features = [
    'grade',
    'term_months',
    'total_pymnt',
    'total_pymnt_inv',
    'total_il_high_credit_limit',
    'loan_amnt',
]

# Work on copies so the frames loaded from disk stay untouched.
data_classification = data.copy().drop(columns=keep_features)
test_classification = test_data.copy().drop(columns=keep_features)

In [4]:
# Separate predictors from the 'loan_status' target for the training frame.
# The target is selected with a list so it stays a one-column DataFrame.
data_x, data_y = (
    data_classification.drop(columns='loan_status'),
    data_classification[['loan_status']],
)

# Same separation for the held-out test frame.
X_test, y_test = (
    test_classification.drop(columns='loan_status'),
    test_classification[['loan_status']],
)

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, auc

from sklearn.utils import shuffle

# Every call to ``method_evaluation`` appends one metrics record here.
results_summary = []

# Share of the training data held out as the validation split, per method.
_VALIDATION_FRACTION = {
    'Base': 0.2,
    'Undersampling': 0.13,
    'SMOTE': 0.33,
    'table-gan': 0.33,
    'Smotified-gan': 0.2,
}

# GAN-based methods: (CSV of generated rows, seed passed to ``shuffle``).
# Only 'table-gan' shuffles with a fixed seed; 'Smotified-gan' passes None,
# which is the value ``shuffle`` uses when no seed is supplied.
_GAN_SOURCES = {
    'table-gan': (
        "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan/samples/return_feature/return_feature_OI_11_00_fake.csv",
        42,
    ),
    'Smotified-gan': (
        "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan/samples/oversampled/oversampled_OI_11_00_fake.csv",
        None,
    ),
}

# Columns removed from the generated rows before they join the training set.
_GAN_DROPPED_COLUMNS = ['grade', 'term_months', 'total_pymnt', 'total_pymnt_inv',
                        'total_il_high_credit_limit', 'loan_amnt']


def _split_target(frame):
    """Split a combined frame into (features, one-column 'loan_status' frame)."""
    return frame.drop(columns='loan_status'), frame[['loan_status']]


def _load_generated_rows(path):
    """Read generated rows from ``path`` and label every one as class 1."""
    fake = pd.read_csv(path)
    fake['loan_status'] = 1
    return fake.drop(columns=_GAN_DROPPED_COLUMNS)


def _prepare_training_set(method, X_train, y_train, feature_columns):
    """Apply the resampling/augmentation step of ``method`` to the train split."""
    if method == 'Base':
        return X_train, y_train

    if method == 'Undersampling':
        X_under, y_under = RandomUnderSampler().fit_resample(X_train, y_train)
        combined = pd.concat(
            [
                pd.DataFrame(X_under, columns=feature_columns),
                pd.DataFrame(y_under, columns=['loan_status']),
            ],
            axis=1,
        )
        return _split_target(combined)

    if method == 'SMOTE':
        X_smote, y_smote = SMOTE().fit_resample(X_train, y_train)
        return _split_target(pd.concat([X_smote, y_smote], axis=1))

    # Remaining methods append GAN-generated class-1 rows to the real rows.
    fake_path, shuffle_seed = _GAN_SOURCES[method]
    generated = _load_generated_rows(fake_path)
    real = pd.concat([X_train, y_train], axis=1)
    train_total = shuffle(pd.concat([real, generated]), random_state=shuffle_seed)
    return _split_target(train_total)


def method_evaluation(model, data_x, data_y, X_test, y_test, method):
    """Train ``model`` with one imbalance-handling method and record test metrics.

    The training data is split (stratified on ``data_y``) into train and
    validation parts, the train part is resampled or augmented according to
    ``method``, the model is fitted, and class-wise precision/recall/F1 plus
    the area under the precision-recall curve are computed on the test set
    using a 0.5 probability threshold.

    Args:
        model: Classifier exposing ``fit(X, y, eval_set=...)`` and
            ``predict_proba(X)``.
        data_x: Training features.
        data_y: Training target ('loan_status').
        X_test: Test features.
        y_test: Test target; must support ``.squeeze()``.
        method: One of 'Base', 'Undersampling', 'SMOTE', 'table-gan',
            'Smotified-gan'.

    Returns:
        The metrics record (dict) that was appended to ``results_summary``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Fail early: previously an unknown method skipped every branch and only
    # crashed later on the unbound prediction variables.
    if method not in _VALIDATION_FRACTION:
        raise ValueError(
            f"Unknown preprocessing method: {method!r}; "
            f"expected one of {list(_VALIDATION_FRACTION)}"
        )

    print(f"Preprocessing method : {method}")

    X_train, X_val, y_train, y_val = train_test_split(
        data_x, data_y, test_size=_VALIDATION_FRACTION[method], stratify=data_y
    )
    X_train, y_train = _prepare_training_set(method, X_train, y_train, data_x.columns)

    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],  # track train & validation loss
    )

    # Class-1 probabilities are needed for the ROC / precision-recall curves.
    y_prob = model.predict_proba(X_test)[:, 1]

    threshold = 0.5
    y_pred = (y_prob >= threshold).astype(int)

    precision_0 = precision_score(y_test, y_pred, pos_label=0)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    f1_0 = f1_score(y_test, y_pred, pos_label=0)

    precision_1 = precision_score(y_test, y_pred, pos_label=1)
    recall_1 = recall_score(y_test, y_pred, pos_label=1)
    f1_1 = f1_score(y_test, y_pred, pos_label=1)

    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    au_prc = auc(recall, precision)

    record = {
        'Method': method,
        'Recall_0': recall_0,
        'Precision_0': precision_0,
        'F1_0': f1_0,
        'Recall_1': recall_1,
        'Precision_1': precision_1,
        'F1_1': f1_1,
        'AU_PRC': au_prc,
        'y_prob': y_prob,
        'y_test': y_test.squeeze(),
    }
    results_summary.append(record)
    return record

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from tqdm import tqdm

# Repeat every imbalance-handling method ``n_runs`` times and collect the
# per-run metric records, grouped by method, in ``all_results``.
Methods = ['Base', 'Undersampling', 'SMOTE', 'table-gan','Smotified-gan']
n_runs = 100

all_results = []

X_test = test_classification.drop(columns='loan_status')
y_test = test_classification['loan_status']

# Loop-invariant inputs: built once instead of on every run.
data_x = data_classification.drop(columns='loan_status')
data_y = data_classification['loan_status']

params = {
    'objective': 'binary',
    'num_leaves': 31,
    'n_estimators': 500,
    'class_weight': None,
    'random_state': 42,
}

for method in Methods:
    print(f"\n===== {method} 시작 =====")

    method_results = []

    for run in tqdm(range(n_runs), desc=f'{method} Repeats'):
        # A fresh, unfitted classifier for every run.
        lgbm_model = LGBMClassifier(**params)

        # Pass the model built above; the original passed ``model``, a name
        # this cell never defines, leaving ``lgbm_model`` unused.
        method_evaluation(lgbm_model, data_x, data_y, X_test, y_test, method)

        last_result = results_summary[-1]  # only the record appended by this run
        method_results.append(last_result)

    all_results.append((method, method_results))


===== Base 시작 =====


Base Repeats:   0%|          | 0/100 [00:00<?, ?it/s]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012883 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   1%|          | 1/100 [00:01<02:40,  1.62s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012987 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2976
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   2%|▏         | 2/100 [00:03<02:34,  1.57s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013952 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   3%|▎         | 3/100 [00:04<02:32,  1.57s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012618 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2976
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   4%|▍         | 4/100 [00:06<02:31,  1.58s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2975
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   5%|▌         | 5/100 [00:07<02:28,  1.56s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2972
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   6%|▌         | 6/100 [00:09<02:26,  1.55s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2975
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   7%|▋         | 7/100 [00:10<02:24,  1.56s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   8%|▊         | 8/100 [00:12<02:24,  1.57s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:   9%|▉         | 9/100 [00:14<02:22,  1.56s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2972
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:  10%|█         | 10/100 [00:15<02:19,  1.55s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:  11%|█         | 11/100 [00:17<02:19,  1.56s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:  12%|█▏        | 12/100 [00:18<02:17,  1.56s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


Base Repeats:  13%|█▎        | 13/100 [00:20<02:14,  1.55s/it]

Preprocessing method : Base
[LightGBM] [Info] Number of positive: 174231, number of negative: 718935
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2974
[LightGBM] [Info] Number of data points in the train set: 893166, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.195071 -> initscore=-1.417389
[LightGBM] [Info] Start training from score -1.417389


In [None]:
# Aggregate the per-run records of each method into mean / standard deviation
# of class-1 recall, class-1 F1 and AU-PRC.
summary_stats = []

for method, results in all_results:
    df = pd.DataFrame(results)
    recall_1 = df['Recall_1']
    f1_1 = df['F1_1']
    au_prc = df['AU_PRC']

    summary_stats.append(dict(
        Method=method,
        Recall_1_Mean=recall_1.mean(),
        Recall_1_Std=recall_1.std(),
        F1_1_Mean=f1_1.mean(),
        F1_1_Std=f1_1.std(),
        AU_PRC_Mean=au_prc.mean(),
        AU_PRC_Std=au_prc.std(),
    ))

stats_df = pd.DataFrame(summary_stats)
print(stats_df)

          Method  Recall_1_Mean  Recall_1_Std  F1_1_Mean  F1_1_Std  \
0           Base       0.000000           NaN   0.000000       NaN   
1  Undersampling       0.909644           NaN   0.717852       NaN   
2          SMOTE       0.819811           NaN   0.738830       NaN   
3      table-gan       1.000000           NaN   0.326458       NaN   
4  Smotified-gan       1.000000           NaN   0.326458       NaN   

   AU_PRC_Mean  AU_PRC_Std  
0     0.765373         NaN  
1     0.766754         NaN  
2     0.763563         NaN  
3     0.671002         NaN  
4     0.703048         NaN  


In [None]:
stats_df.head()

Unnamed: 0,Method,Recall_1_Mean,Recall_1_Std,F1_1_Mean,F1_1_Std,AU_PRC_Mean,AU_PRC_Std
0,Base,0.0,,0.0,,0.765373,
1,Undersampling,0.909644,,0.717852,,0.766754,
2,SMOTE,0.819811,,0.73883,,0.763563,
3,table-gan,1.0,,0.326458,,0.671002,
4,Smotified-gan,1.0,,0.326458,,0.703048,


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One small frame per method holding the class-1 F1 score of every run,
# stacked into a single long-form frame for plotting.
per_method_frames = [
    pd.DataFrame({'Method': method, 'F1_1': [r['F1_1'] for r in results]})
    for method, results in all_results
]
all_f1_df = pd.concat(per_method_frames)

# Box plot of the class-1 F1 scores, one box per method.
plt.figure(figsize=(8, 6))
sns.boxplot(data=all_f1_df, x='Method', y='F1_1')
plt.title('F1-Score (Class 1) Distribution by Method')
plt.grid(True)
plt.show()


In [None]:
# Bar chart of mean AU-PRC per method, with the standard deviation drawn as
# error bars.
x = stats_df['Method']
y = stats_df['AU_PRC_Mean']
err = stats_df['AU_PRC_Std']

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x, y, yerr=err, capsize=5)
ax.set_ylabel('AU-PRC (Mean ± Std)')
ax.set_title('AU-PRC by Sampling Method')
ax.grid(axis='y')
plt.show()
