In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from tqdm import tqdm

import torch 
import os
import torch

import warnings
warnings.filterwarnings("ignore")

In [22]:
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

print(device)

cuda


In [23]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance

def compute_jsd(p, q):
    p = np.clip(p, 1e-10, 1)
    q = np.clip(q, 1e-10, 1)
    return jensenshannon(p, q)**2  # JSD 공식상 제곱

def compute_stat_similarity(real, synth):
    jsd = np.mean([
        compute_jsd(
            real[col].value_counts(normalize=True).reindex(index=synth[col].value_counts().index, fill_value=0).values,
            synth[col].value_counts(normalize=True).reindex(index=synth[col].value_counts().index, fill_value=0).values
        )
        for col in real.columns if real[col].dtype == 'object' or real[col].nunique() < 50
    ])

    wd = np.mean([
        wasserstein_distance(real[col], synth[col])
        for col in real.select_dtypes(include=[np.number]).columns
    ])

    # corr_diff (loan_status 제외)
    real_corr = real.drop(columns='loan_status').corr(numeric_only=True)
    synth_corr = synth.drop(columns='loan_status').corr(numeric_only=True)

    common_cols = real_corr.columns.intersection(synth_corr.columns)
    real_corr = real_corr.loc[common_cols, common_cols]
    synth_corr = synth_corr.loc[common_cols, common_cols]

    if real_corr.isnull().values.any() or synth_corr.isnull().values.any():
        corr_diff = np.nan
    else:
        corr_diff = np.linalg.norm(real_corr.values - synth_corr.values)

    return jsd, wd, corr_diff

In [24]:
# real = class 1만 있는 진짜 데이터셋
## Load original class == Default data
real_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Real_Datasets/train_category_1.csv"
real = pd.read_csv(real_path, low_memory=False)

methods = {
    'smote-nc': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/base/smote-nc.csv',
    'tablegan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/tablegan.csv',
    'vae-tablegan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/vae-tablegan.csv',
    'ctgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/ctgan.csv',
    'vae-ctgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/vae-ctgan.csv',
    'ctabgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/ctabgan.csv',
    'vae-ctabgan': 'C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/vae-ctabgan.csv'
}

results = []

n_runs = 100  # 반복 횟수

for method, path in tqdm(methods.items(), desc="Statistical Similarity (100 runs)"):
    synth = pd.read_csv(path)
    synth = synth[real.columns]

    jsd_list, wd_list, corr_list = [], [], []

    for _ in range(n_runs):
        # 매번 섞거나 재샘플할 필요 없으면 그대로 비교
        jsd, wd, corr_diff = compute_stat_similarity(real, synth)
        jsd_list.append(jsd)
        wd_list.append(wd)
        corr_list.append(corr_diff)

    results.append({
        "Method": method,
        "JSD Mean": np.mean(jsd_list),
        "JSD Std": np.std(jsd_list),
        "WD Mean": np.mean(wd_list),
        "WD Std": np.std(wd_list),
        "Corr Diff Mean": np.mean(corr_list),
        "Corr Diff Std": np.std(corr_list)
    })

df_stats_agg = pd.DataFrame(results).sort_values(by='JSD Mean')  # 정렬 기준은 원하는 대로

Statistical Similarity (100 runs): 100%|██████████| 7/7 [30:09<00:00, 258.51s/it]


In [25]:
df_stats_agg.head(10)

Unnamed: 0,Method,JSD Mean,JSD Std,WD Mean,WD Std,Corr Diff Mean,Corr Diff Std
6,vae-ctabgan,0.00205,4.336809e-19,527.705457,0.0,1.82502,0.0
5,ctabgan,0.005227,1.734723e-18,666.496742,1.136868e-13,2.85363,8.881784e-16
3,ctgan,0.008825,3.469447e-18,831.599573,0.0,1.710297,0.0
0,smote-nc,0.028975,0.0,58.694674,0.0,0.344389,1.665335e-16
1,tablegan,0.103673,2.775558e-17,8857.323937,0.0,2.198281,4.440892e-16
2,vae-tablegan,0.138717,5.5511150000000004e-17,10871.494838,1.818989e-12,2.319589,0.0
4,vae-ctgan,0.138717,5.5511150000000004e-17,10871.494838,1.818989e-12,2.319589,0.0
