In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, precision_recall_curve, auc
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch 
import os
import torch

import warnings
# Install a global "ignore" filter: once this cell has run, warnings raised by
# any library (pandas, sklearn, torch, ...) are suppressed for every later cell.
warnings.filterwarnings("ignore")

In [None]:
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
if is_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Show which device was selected.
print(device)

In [None]:
# Identifiers of the data variants to evaluate. The evaluation loop
# interpolates each name into the path "./data/<method>/synthetic.csv",
# so every entry must match a folder name under ./data.
methods = [
    "non_augmented", "undersampling", "smote_nc",
    "tablegan", "vae_tablegan",
    "ctgan", "vae_ctgan",
    "ctab_gan", "vae_ctabgan",
]

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from scipy.spatial.distance import jensenshannon
from scipy.stats import wasserstein_distance, pearsonr
import numpy as np
import pandas as pd

def compute_classification_metrics(y_true, y_pred_proba, threshold=0.5):
    """Score binary predictions given positive-class probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed unchanged to the sklearn metric functions.
    y_pred_proba : array-like
        Predicted score/probability of the positive class for each sample.
        Lists, pandas Series and NumPy arrays are all accepted.
    threshold : float, default 0.5
        Scores greater than or equal to this value are labelled 1, others 0.

    Returns
    -------
    dict
        ``{"accuracy": ..., "f1_score": ..., "auc": ...}`` where accuracy and
        F1 are computed on the thresholded labels and AUC on the raw scores.
    """
    # Coerce first so that plain Python lists support the element-wise
    # comparison below (a bare list would raise TypeError on ``>=``).
    scores = np.asarray(y_pred_proba)
    y_pred = (scores >= threshold).astype(int)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "auc": roc_auc_score(y_true, scores),
    }

def compute_jsd(p, q):
    """Return the Jensen-Shannon divergence between two frequency vectors.

    Each input is first clipped element-wise into ``[1e-10, 1]``. SciPy's
    ``jensenshannon`` yields the Jensen-Shannon *distance* (the square root of
    the divergence), so the result is squared before being returned.
    """
    lower, upper = 1e-10, 1
    p_clipped = np.clip(p, lower, upper)
    q_clipped = np.clip(q, lower, upper)
    js_distance = jensenshannon(p_clipped, q_clipped)
    # distance -> divergence
    return js_distance ** 2

def compute_stat_similarity(real, synth):
    """Compare a real and a synthetic DataFrame with three summary statistics.

    Parameters
    ----------
    real, synth : pandas.DataFrame
        ``synth`` must contain every column of ``real``; the numeric columns
        of ``real`` are expected to be numeric in ``synth`` as well.

    Returns
    -------
    tuple
        ``(jsd, wd, corr_diff)`` where

        * ``jsd`` is the mean over all columns of ``real`` of ``compute_jsd``
          applied to the two columns' normalised value frequencies,
        * ``wd`` is the mean Wasserstein distance over the numeric columns of
          ``real``,
        * ``corr_diff`` is the Frobenius norm of the difference between the
          two correlation matrices of those numeric columns.
    """
    per_column_jsd = []
    for col in real.columns:
        real_freq = real[col].value_counts(normalize=True)
        synth_freq = synth[col].value_counts(normalize=True)
        # Align both frequency vectors on the UNION of observed values.
        # Using only the synthetic values would silently drop real categories
        # the generator never produced and understate the divergence.
        support = real_freq.index.union(synth_freq.index, sort=False)
        per_column_jsd.append(
            compute_jsd(
                real_freq.reindex(support, fill_value=0).values,
                synth_freq.reindex(support, fill_value=0).values,
            )
        )
    jsd = np.mean(per_column_jsd)

    numeric_cols = real.select_dtypes(include=[np.number]).columns

    wd = np.mean([
        wasserstein_distance(real[col], synth[col]) for col in numeric_cols
    ])

    # Correlations are only defined for numeric data; selecting the same
    # columns in the same order from both frames keeps the two matrices
    # aligned cell by cell and avoids errors on non-numeric columns.
    real_corr = real[numeric_cols].corr().values
    synth_corr = synth[numeric_cols].corr().values
    corr_diff = np.linalg.norm(real_corr - synth_corr)

    return jsd, wd, corr_diff


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the random forests (and therefore every reported number) are
# reproducible across runs and the real-data baseline is the same for all methods.
RANDOM_STATE = 42

# Load data. The real train/test splits do not depend on the method being
# evaluated, so they are read once instead of on every loop iteration.
real_train = pd.read_csv("./data/real/train.csv")
real_test = pd.read_csv("./data/real/test.csv")

# Split features / label.
X_real_train = real_train.drop("label", axis=1)
y_real_train = real_train["label"]
X_real_test = real_test.drop("label", axis=1)
y_real_test = real_test["label"]

# Baseline: train on real data, score on the real test split. This is also
# method-independent, so it is fitted once and reused for every comparison.
clf_real = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_real_train, y_real_train)
real_pred = clf_real.predict_proba(X_real_test)[:, 1]
metrics_real = compute_classification_metrics(y_real_test, real_pred)

results = []

for method in methods:
    print(f"🔍 Evaluating {method}...")

    # Load this method's synthetic training data and split features / label.
    synth_train = pd.read_csv(f"./data/{method}/synthetic.csv")
    X_synth = synth_train.drop("label", axis=1)
    y_synth = synth_train["label"]

    # Train the model (ML utility: real vs synth) - fit on synthetic data,
    # evaluate on the same real test split as the baseline.
    clf_synth = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_synth, y_synth)
    synth_pred = clf_synth.predict_proba(X_real_test)[:, 1]
    metrics_synth = compute_classification_metrics(y_real_test, synth_pred)

    # Absolute gap between the synthetic-trained and real-trained model, per metric.
    utility_diff = {
        k: abs(metrics_synth[k] - metrics_real[k]) for k in metrics_real
    }

    # Statistical similarity between real and synthetic feature distributions.
    jsd, wd, corr_diff = compute_stat_similarity(X_real_train, X_synth)

    # Accuracy gap is reported in percentage points; the others as raw differences.
    results.append({
        "Method": method,
        "Accuracy": round(utility_diff["accuracy"] * 100, 2),
        "F1-score": round(utility_diff["f1_score"], 3),
        "AUC": round(utility_diff["auc"], 3),
        "Avg JSD": round(jsd, 3),
        "Avg WD": round(wd, 2),
        "Diff. Corr.": round(corr_diff, 2)
    })
