In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve, auc
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch 
import os
import torch

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pick the torch device once: the GPU when CUDA is usable, otherwise the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda') if is_cuda else torch.device('cpu')

print(device)

cuda


In [None]:
## Load the original (training) data
data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Real_Datasets/train_category.csv"
data = pd.read_csv(data_path, low_memory=False)

## Load the held-out test data
test_data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Real_Datasets/test_category.csv"
test_data = pd.read_csv(test_data_path)

# Fixed-seed sample of 540,000 real rows with loan_status == 0. These rows are
# later combined with synthetic rows to build the augmented training sets.
# Note: DataFrame.sample raises if fewer than 540,000 such rows exist.
data_0 = data[data['loan_status']==0]
data_0_sampled = data_0.sample(540000,random_state=42)

# Disabled: separate loader for train_category_1.csv, kept for reference.
# It is written as comments rather than a string literal so that the cell does
# not echo the text as its output value.
# label1_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Real_Datasets/train_category_1.csv"
# label1 = pd.read_csv(label1_path, low_memory=False)

In [5]:
# Resample the training data with imblearn's RandomUnderSampler (fixed seed),
# then glue the resampled features and label back into a single frame.
rus = RandomUnderSampler(random_state=42)
data_x, data_y = data.drop(columns='loan_status'), data[['loan_status']]
X_under, y_under = rus.fit_resample(data_x, data_y)

data_base = pd.concat([X_under, y_under], axis=1)

In [6]:
# Earlier variant of the list, kept for reference:
#keep_features = ['grade', 'term_months', 'total_pymnt', 'total_pymnt_inv','total_il_high_credit_limit', 'loan_amnt']
# Despite its name, this list holds the columns that are REMOVED from both the
# training and the test frame before classification.
keep_features = ['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']

# Work on copies so that data_base and test_data stay untouched.
data_classification = data_base.copy().drop(columns=keep_features)
test_classification = test_data.copy().drop(columns=keep_features)

In [7]:
## Encode the training data
Labelencoding_features = ['term_months', 'sub_grade']
onehot_features = ['debt_settlement_flag', 'home_ownership', 'purpose']

# Label encoding: one encoder per column, fitted on the training data and kept
# in `label_encoders` so other frames can be transformed with the same mapping.
label_encoders = {name: LabelEncoder() for name in Labelencoding_features}
for col, le in label_encoders.items():
    data_classification[col] = le.fit_transform(data_classification[col])

# One-hot encoding: categories not seen during fit are ignored at transform time.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoded = onehot_encoder.fit_transform(data_classification[onehot_features])
onehot_encoded_df = pd.DataFrame(
    onehot_encoded,
    index=data_classification.index,
    columns=onehot_encoder.get_feature_names_out(onehot_features),
)

# Merge: swap the raw categorical columns for their one-hot expansion.
data_classification = pd.concat(
    [data_classification.drop(columns=onehot_features), onehot_encoded_df],
    axis=1,
)

print("✅ Encoding 완료! 결과 shape:", data_classification.shape)

✅ Encoding 완료! 결과 shape: (435578, 35)


In [8]:
## Encode the test data with the encoders fitted on the training data
for col in Labelencoding_features:
    le = label_encoders[col]
    # transform only (no refit), so the test data shares the training mapping
    test_classification[col] = le.transform(test_classification[col])

# One-hot encoding (reuses the onehot_encoder fitted on the training data)
onehot_encoded_test = onehot_encoder.transform(test_classification[onehot_features])
onehot_encoded_test_df = pd.DataFrame(
    onehot_encoded_test,
    index=test_classification.index,
    columns=onehot_encoder.get_feature_names_out(onehot_features),
)

# Merge: swap the raw categorical columns for their one-hot expansion.
test_classification = pd.concat(
    [test_classification.drop(columns=onehot_features), onehot_encoded_test_df],
    axis=1,
)

print("✅ 테스트 데이터 인코딩 완료! 결과 shape:", test_classification.shape)

✅ 테스트 데이터 인코딩 완료! 결과 shape: (744306, 35)


In [9]:
def encode_features(df, label_encoders, onehot_encoder, label_cols, onehot_cols):
    """Return an encoded copy of ``df`` using already-fitted encoders.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is copied first, so the caller's frame is not modified.
    label_encoders : dict
        Maps each name in ``label_cols`` to a fitted encoder exposing ``transform``.
    onehot_encoder : object
        Fitted encoder exposing ``transform`` and ``get_feature_names_out``.
    label_cols : list of str
        Columns replaced in place by their label-encoded values.
    onehot_cols : list of str
        Columns removed and replaced by their one-hot expansion, which is
        appended after the remaining columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame, carrying the same index as ``df``.
    """
    encoded = df.copy()

    # Label encoding: overwrite each column with its encoded values.
    for name in label_cols:
        encoded[name] = label_encoders[name].transform(encoded[name])

    # One-hot encoding: build the indicator columns on the same index so the
    # column-wise concat below lines up row by row.
    indicator_df = pd.DataFrame(
        onehot_encoder.transform(encoded[onehot_cols]),
        columns=onehot_encoder.get_feature_names_out(onehot_cols),
        index=encoded.index,
    )

    return pd.concat([encoded.drop(columns=onehot_cols), indicator_df], axis=1)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

def _build_augmented_train(synthetic):
    """Build an encoded training set from real class-0 rows plus synthetic rows.

    Concatenates the notebook-level ``data_0_sampled`` frame with ``synthetic``,
    shuffles with a fixed seed, drops the columns excluded from classification,
    snaps ``term_months`` to 36 or 60 and encodes the result with the encoders
    fitted on the training data.

    Parameters
    ----------
    synthetic : pandas.DataFrame
        Synthetic rows, already carrying a ``loan_status`` column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Feature frame and 1-D ``loan_status`` target.
    """
    drop_cols = ['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt', 'funded_amnt']
    combined = (
        pd.concat([data_0_sampled, synthetic], axis=0)
        .sample(frac=1, random_state=42)
        .drop(columns=drop_cols)
    )
    # Map each term to whichever of 36 / 60 is nearer (ties go to 60), so the
    # label encoder fitted on the training data can transform it.
    combined['term_months'] = combined['term_months'].apply(
        lambda x: 36 if abs(x - 36) < abs(x - 60) else 60
    )
    combined = encode_features(
        combined, label_encoders, onehot_encoder, Labelencoding_features, onehot_features
    )
    return combined.drop(columns='loan_status'), combined['loan_status']


def ml_utility(model, X_test, y_test, method):
    """Fit ``model`` on the training set selected by ``method`` and score it.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_test, y_test :
        Encoded test features and the matching labels.
    method : str
        ``"non-augmented"`` trains on ``data_classification``; ``"SMOTE-NC"``
        and the GAN names (``table-gan``, ``vae-tablegan``, ``ctgan``,
        ``vae-ctgan``, ``ctabgan``, ``vae-ctabgan``) train on the sampled real
        class-0 rows combined with the corresponding synthetic CSV.

    Returns
    -------
    list of dict
        A single-element list holding the method name and the rounded
        accuracy, F1, AUC and class-1 recall / precision on the test set.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above.
    """
    if method == "non-augmented":
        X_train = data_classification.drop(columns='loan_status')
        y_train = data_classification['loan_status']

    elif method == 'SMOTE-NC':
        print("smote-nc")
        smote_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/base/smote-nc.csv"
        # Combine with the synthetic data; the file is used as loaded, so it is
        # expected to carry its own loan_status column.
        X_train, y_train = _build_augmented_train(pd.read_csv(smote_path))

    else:
        fake_base = {
            'table-gan': 'tablegan/tablegan.csv',
            'vae-tablegan': 'tablegan/vae-tablegan.csv',
            'ctgan': 'ctgan/ctgan.csv',
            'vae-ctgan': 'ctgan/vae-ctgan.csv',
            'ctabgan': 'ctabgan/ctabgan.csv',
            'vae-ctabgan': 'ctabgan/vae-ctabgan.csv'
        }

        if method not in fake_base:
            raise ValueError(f"Unknown method: {method}")
        print(method)
        fake_path = f"C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/{fake_base[method]}"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1  # every synthetic GAN row is labelled as default

        X_train, y_train = _build_augmented_train(fake)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # score of the positive class
    report = classification_report(y_test, y_pred, output_dict=True)

    return [{
        "Method": method,
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "F1": round(f1_score(y_test, y_pred), 4),
        "AUC": round(roc_auc_score(y_test, y_proba), 4),
        "Class1 Recall": round(report["1"]["recall"], 4),
        "Class1 Precision": round(report["1"]["precision"], 4)
    }]

In [None]:
from xgboost import XGBClassifier
from tqdm import tqdm
import pandas as pd

# Training-set variants to compare.
methods = [
    "non-augmented", "SMOTE-NC",
    "table-gan", "vae-tablegan",
    "ctgan", "vae-ctgan",
    "ctabgan", "vae-ctabgan",
]

X_test = test_classification.drop(columns='loan_status')
y_test = test_classification['loan_status']

# Collect one metrics record per method and build the results table once at
# the end, instead of re-concatenating a DataFrame on every iteration.
result_rows = []

for method in tqdm(methods, desc="Methods"):
    # A fresh, identically configured classifier for every method.
    model = XGBClassifier(
        n_estimators=500,
        learning_rate=0.01,
        objective='binary:logistic',
        eval_metric="logloss",
        random_state=42,
        verbosity=0  # 0 = silent
    )

    result_table = ml_utility(model, X_test, y_test, method)  # result_table: list of dict
    result_rows.extend(result_table)

# DataFrame holding the results of all methods
ml_utility_df = pd.DataFrame(result_rows)

Methods:  12%|█▎        | 1/8 [00:01<00:12,  1.74s/it]

smote-nc


Methods:  25%|██▌       | 2/8 [00:06<00:20,  3.42s/it]

method


Methods:  38%|███▊      | 3/8 [00:10<00:18,  3.75s/it]

method


Methods:  50%|█████     | 4/8 [00:14<00:15,  3.97s/it]

method


Methods:  62%|██████▎   | 5/8 [00:19<00:12,  4.14s/it]

method


Methods:  75%|███████▌  | 6/8 [00:23<00:08,  4.20s/it]

method


Methods:  88%|████████▊ | 7/8 [00:28<00:04,  4.30s/it]

method


Methods: 100%|██████████| 8/8 [00:32<00:00,  4.03s/it]


In [17]:
# Plain-text dump of the per-method metrics (at most the first 10 rows).
print(ml_utility_df.head(10))

          Method  Accuracy      F1     AUC  Class1 Recall  Class1 Precision
0  non-augmented    0.8754  0.7415  0.9480         0.9162            0.6228
1       SMOTE-NC    0.8815  0.7380  0.9304         0.8556            0.6488
2      table-gan    0.8366  0.3679  0.5752         0.2438            0.7495
3   vae-tablegan    0.8323  0.2460  0.5701         0.1403            0.9996
4          ctgan    0.8726  0.7347  0.9376         0.9045            0.6185
5      vae-ctgan    0.8323  0.2460  0.5701         0.1403            0.9996
6        ctabgan    0.8933  0.7426  0.9337         0.7887            0.7015
7    vae-ctabgan    0.8883  0.7566  0.9456         0.8901            0.6580


In [18]:
# ML-utility gap of every method against the baseline (baseline = non-augmented).
# The gap is stored as an absolute value, so it does not say whether a method
# scored above or below the baseline.
is_baseline = ml_utility_df["Method"] == "non-augmented"
baseline = ml_utility_df[is_baseline].iloc[0]
metric_names = ["Accuracy", "F1", "AUC", "Class1 Recall", "Class1 Precision"]

for metric in metric_names:
    diff_col = f"{metric} Diff"
    ml_utility_df[diff_col] = (ml_utility_df[metric] - baseline[metric]).abs()

# Pin the baseline's own gaps to exactly 0 (readability).
for metric in [f"{name} Diff" for name in metric_names]:
    ml_utility_df.loc[is_baseline, metric] = 0.0

# Save the results (disabled)
#ml_utility_df.to_csv("ml_utility_results_with_diff.csv", index=False)

In [20]:
# Display the metrics table (at most the first 10 rows) as the cell's output.
ml_utility_df.head(10)

Unnamed: 0,Method,Accuracy,F1,AUC,Class1 Recall,Class1 Precision,Accuracy Diff,F1 Diff,AUC Diff,Class1 Recall Diff,Class1 Precision Diff
0,non-augmented,0.8754,0.7415,0.948,0.9162,0.6228,0.0,0.0,0.0,0.0,0.0
1,SMOTE-NC,0.8815,0.738,0.9304,0.8556,0.6488,0.0061,0.0035,0.0176,0.0606,0.026
2,table-gan,0.8366,0.3679,0.5752,0.2438,0.7495,0.0388,0.3736,0.3728,0.6724,0.1267
3,vae-tablegan,0.8323,0.246,0.5701,0.1403,0.9996,0.0431,0.4955,0.3779,0.7759,0.3768
4,ctgan,0.8726,0.7347,0.9376,0.9045,0.6185,0.0028,0.0068,0.0104,0.0117,0.0043
5,vae-ctgan,0.8323,0.246,0.5701,0.1403,0.9996,0.0431,0.4955,0.3779,0.7759,0.3768
6,ctabgan,0.8933,0.7426,0.9337,0.7887,0.7015,0.0179,0.0011,0.0143,0.1275,0.0787
7,vae-ctabgan,0.8883,0.7566,0.9456,0.8901,0.658,0.0129,0.0151,0.0024,0.0261,0.0352


In [None]:
"""from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance, pearsonr
import numpy as np
import pandas as pd

def compute_classification_metrics(y_true, y_pred_proba):
    y_pred = (y_pred_proba >= 0.5).astype(int)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "auc": roc_auc_score(y_true, y_pred_proba)
    }

def compute_jsd(p, q):
    p = np.clip(p, 1e-10, 1)
    q = np.clip(q, 1e-10, 1)
    return jensenshannon(p, q)**2  # Square to match traditional JSD

def compute_stat_similarity(real, synth):
    jsd = np.mean([
        compute_jsd(real[col].value_counts(normalize=True).reindex(index=synth[col].value_counts().index, fill_value=0).values,
                    synth[col].value_counts(normalize=True).reindex(index=synth[col].value_counts().index, fill_value=0).values)
        for col in real.columns
    ])

    wd = np.mean([
        wasserstein_distance(real[col], synth[col]) for col in real.select_dtypes(include=[np.number]).columns
    ])

    corr_diff = np.linalg.norm(real.corr().values - synth.corr().values)
    return jsd, wd, corr_diff
"""