In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from tqdm import tqdm

import torch 
import os
import torch

import warnings
warnings.filterwarnings("ignore")

In [2]:
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

print(device)

cuda


In [12]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance

def compute_jsd(p, q):
    p = np.clip(p, 1e-10, 1)
    q = np.clip(q, 1e-10, 1)
    return jensenshannon(p, q)**2  # JSD 공식상 제곱

def compute_stat_similarity(real, synth):
    jsd = np.mean([
        compute_jsd(
            real[col].value_counts(normalize=True).reindex(index=synth[col].value_counts().index, fill_value=0).values,
            synth[col].value_counts(normalize=True).reindex(index=synth[col].value_counts().index, fill_value=0).values
        )
        for col in real.columns if real[col].dtype == 'object' or real[col].nunique() < 50
    ])

    wd = np.mean([
        wasserstein_distance(real[col], synth[col])
        for col in real.select_dtypes(include=[np.number]).columns
    ])

    # corr_diff (loan_status 제외)
    real_corr = real.drop(columns='loan_status').corr(numeric_only=True)
    synth_corr = synth.drop(columns='loan_status').corr(numeric_only=True)

    common_cols = real_corr.columns.intersection(synth_corr.columns)
    real_corr = real_corr.loc[common_cols, common_cols]
    synth_corr = synth_corr.loc[common_cols, common_cols]

    if real_corr.isnull().values.any() or synth_corr.isnull().values.any():
        corr_diff = np.nan
    else:
        corr_diff = np.linalg.norm(real_corr.values - synth_corr.values)

    return jsd, wd, corr_diff

In [13]:
# real = class 1만 있는 진짜 데이터셋
## Load original class == Default data
real_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Real_Datasets/train_category_1.csv"
real = pd.read_csv(real_path, low_memory=False)

methods = {
    'smote-nc': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/base/smote-nc.csv',
    'tablegan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/tablegan.csv',
    'vae-tablegan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/vae-tablegan.csv',
    'ctgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/ctgan.csv',
    'vae-ctgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/vae-ctgan.csv',
    'ctabgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/ctabgan.csv',
    'vae-ctabgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/vae-ctabgan.csv'
}

results = []

for method, path in tqdm(methods.items(), desc="Statistical Similarity"):
    synth = pd.read_csv(path)
    synth = synth[real.columns] 

    jsd, wd, corr_diff = compute_stat_similarity(real, synth)
    results.append({
        "Method": method,
        "JSD": round(jsd, 4),
        "WD": round(wd, 4),
        "Corr Diff": round(corr_diff, 4)
    })

df_stats = pd.DataFrame(results).sort_values(by='JSD')  # or WD


Statistical Similarity: 100%|██████████| 7/7 [00:22<00:00,  3.26s/it]


In [14]:
df_stats.head(10)

Unnamed: 0,Method,JSD,WD,Corr Diff
6,vae-ctabgan,0.0021,527.7055,1.825
5,ctabgan,0.0052,666.4967,2.8536
3,ctgan,0.0088,831.5996,1.7103
0,smote-nc,0.029,58.6947,0.3444
1,tablegan,0.1037,8857.3239,2.1983
2,vae-tablegan,0.1387,10871.4948,2.3196
4,vae-ctgan,0.1387,10871.4948,2.3196


In [8]:
print(real.corr(numeric_only=True))
print(synth.corr(numeric_only=True))


                      last_fico_range_high  annual_inc       dti  \
last_fico_range_high              1.000000    0.010041  0.025992   
annual_inc                        0.010041    1.000000 -0.036986   
dti                               0.025992   -0.036986  1.000000   
mo_sin_old_rev_tl_op              0.191363    0.028487  0.045199   
revol_util                       -0.065966    0.012079  0.106729   
int_rate                         -0.087662   -0.013861  0.107078   
installment                       0.079469    0.078539  0.023934   
avg_cur_bal                       0.091423    0.065267 -0.037283   
revol_bal                         0.059197    0.065566  0.114844   
total_pymnt                       0.064581    0.055076  0.011020   
total_pymnt_inv                   0.063238    0.055068  0.011472   
funded_amnt                       0.093320    0.084343  0.017415   
loan_amnt                         0.094000    0.084359  0.017169   
credit_history_years              0.171817    0.