## 생성형 데이터와 original 데이터 간의 feature별 분포 차이
- Wasserstein distance
- MMD (RBF kernel), Correlation Matrix Distance, PCA projection

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import classification_report, precision_recall_curve, auc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch 
import os
import torch

import warnings
warnings.filterwarnings("ignore")

In [2]:
## Locations of the three CSV inputs: original training data, held-out test
## data, and the synthetic ("fake") samples.
data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/Dataset/return_feature_train.csv"
test_data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/Dataset/return_feature_test.csv"
fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/tablegan/samples/return_feature/return_feature_OI_11_00_fake.csv"

## Original training data (low_memory=False makes pandas read the file in one
## pass instead of in chunks).
data = pd.read_csv(data_path, low_memory=False)

## Test data.
test_data = pd.read_csv(test_data_path)

## Synthetic data: every generated row is assigned loan_status = 1.
fake = pd.read_csv(fake_path)
fake['loan_status'] = 1

In [4]:
# Class counts of the full training table (only displayed when this is the
# last expression of the cell).
data.loan_status.value_counts()

# 80/20 split that preserves the loan_status class ratio in both parts.
X_train, X_val = train_test_split(
    data,
    test_size=0.2,
    stratify=data.loan_status,
)

In [5]:
# Class counts of the training part of the split.
X_train.loan_status.value_counts()

loan_status
0    718935
1    174231
Name: count, dtype: int64

In [None]:
# NOTE: despite its name, `keep_features` lists the columns that are REMOVED
# from every table below.
keep_features = ['grade', 'term_months', 'total_pymnt', 'total_pymnt_inv']

# Copy first so the loaded frames (`data`, `test_data`, `fake`) stay untouched,
# then drop the listed columns from each copy.
data_classification = data.copy().drop(columns=keep_features)
test_classification = test_data.copy().drop(columns=keep_features)
fake_Classification = fake.copy().drop(columns=keep_features)

In [None]:
from scipy.stats import wasserstein_distance
import pandas as pd

def compute_wasserstein_topk(real_df: pd.DataFrame, fake_df: pd.DataFrame, top_k: int = 5, visualize: bool = True) -> pd.DataFrame:
    """
    Compute the per-feature Wasserstein distance and return the top-k features.

    For every column, the 1-D Wasserstein distance between the real and the
    generated values is computed (missing values are dropped per column).
    The mean distance over all columns is printed, and the ``top_k`` columns
    with the largest distance are returned.

    Parameters:
    - real_df (pd.DataFrame): original data
    - fake_df (pd.DataFrame): generated data; must have the same columns, in
      the same order, as ``real_df``
    - top_k (int): number of features with the largest distance to return
    - visualize (bool): when True, draw a horizontal bar plot of the result

    Returns:
    - pd.DataFrame: columns "Feature" and "Wasserstein Distance", sorted by
      distance in descending order, at most ``top_k`` rows
    """
    # Validate the frames that were actually passed in, not notebook globals.
    assert list(real_df.columns) == list(fake_df.columns), "컬럼 이름이 일치해야 합니다."

    distances = {
        col: wasserstein_distance(real_df[col].dropna(), fake_df[col].dropna())
        for col in real_df.columns
    }

    dist_series = pd.Series(distances, name="Wasserstein Distance")
    mean_distance = dist_series.mean()
    print(f"📏 평균 Wasserstein Distance: {mean_distance:.4f}")

    topk_df = dist_series.sort_values(ascending=False).head(top_k).reset_index()
    topk_df.columns = ["Feature", "Wasserstein Distance"]

    # Visualisation
    if visualize:
        # Imported here so the function does not depend on a later notebook
        # cell having already imported seaborn.
        import matplotlib.pyplot as plt
        import seaborn as sns

        plt.figure(figsize=(10, 6))
        sns.barplot(data=topk_df, x="Wasserstein Distance", y="Feature", palette="viridis")
        plt.title(f"Top {top_k} Features with Highest Wasserstein Distance")
        plt.xlabel("Wasserstein Distance")
        plt.ylabel("Feature")
        plt.tight_layout()
        plt.show()

    return topk_df


In [None]:
# Compare the feature-dropped original table with the feature-dropped synthetic table.
topk_result = compute_wasserstein_topk(data_classification, fake_Classification, top_k=3, visualize=True)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.decomposition import PCA

# ✅ 1. MMD visualisation helper
def visualize_mmd_matrix(real_df, fake_df, gamma=1.0):
    """Print an RBF-kernel MMD estimate and plot the real/fake kernel matrices.

    The printed value is mean(K_rr) + mean(K_ff) - 2 * mean(K_rf), where each
    mean runs over the whole kernel matrix (diagonal entries included).

    Parameters:
    - real_df: original samples, one row per sample
    - fake_df: generated samples, one row per sample
    - gamma: RBF kernel coefficient forwarded to ``rbf_kernel``

    Returns:
    - None; the value is printed and the figure is shown.
    """
    k_rr = rbf_kernel(real_df, real_df, gamma=gamma)
    k_ff = rbf_kernel(fake_df, fake_df, gamma=gamma)
    k_rf = rbf_kernel(real_df, fake_df, gamma=gamma)

    # Average over every kernel entry.
    mmd_value = k_rr.mean() + k_ff.mean() - 2 * k_rf.mean()
    print(f"📏 MMD (RBF, γ={gamma}): {mmd_value:.4f}")

    # Heatmaps of the leading 100 x 100 corner of each within-set kernel matrix.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (k_rr, 'Blues', "Real Kernel Matrix"),
        (k_ff, 'Oranges', "Fake Kernel Matrix"),
    )
    for ax, (kernel, cmap, title) in zip(axes, panels):
        sns.heatmap(kernel[:100, :100], ax=ax, cmap=cmap)
        ax.set_title(title)
    plt.suptitle("MMD - RBF Kernel Matrix Visualization (Top 100 samples)")
    plt.tight_layout()
    plt.show()

# ✅ 2. Correlation matrix distance visualisation helper
def visualize_correlation_difference(real_df, fake_df):
    """Print the Frobenius distance between two correlation matrices and plot them.

    Three heatmaps are drawn: the real correlation matrix, the fake
    correlation matrix, and their element-wise absolute difference.

    Parameters:
    - real_df: original data
    - fake_df: generated data

    Returns:
    - None; the distance is printed and the figure is shown.
    """
    real_corr = real_df.corr()
    fake_corr = fake_df.corr()
    delta = real_corr - fake_corr
    diff_corr = delta.abs()

    # A constant column (e.g. a label set to a single value) has an undefined
    # correlation, which shows up as NaN and would turn the whole norm into
    # NaN. Take the Frobenius norm over the entries defined in both matrices.
    values = delta.to_numpy(dtype=float)
    defined = np.isfinite(values)
    if defined.any():
        distance = float(np.sqrt(np.square(values[defined]).sum()))
    else:
        # Nothing comparable: report NaN rather than a misleading 0.
        distance = float("nan")
    print(f"📏 Correlation Matrix Distance (Frobenius Norm): {distance:.4f}")

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    sns.heatmap(real_corr, ax=axes[0], cmap='coolwarm', center=0, annot=False)
    axes[0].set_title("Real Correlation")
    sns.heatmap(fake_corr, ax=axes[1], cmap='coolwarm', center=0, annot=False)
    axes[1].set_title("Fake Correlation")
    sns.heatmap(diff_corr, ax=axes[2], cmap='YlOrRd')
    axes[2].set_title("Difference (|Real - Fake|)")
    plt.tight_layout()
    plt.show()

# ✅ 3. PCA visualisation helper
def visualize_pca_projection(real_df, fake_df):
    """Scatter-plot real and fake rows in the plane of their first two principal components.

    A single 2-component PCA is fitted on the real rows stacked on top of the
    fake rows; real points are labelled 0 (blue) and fake points 1 (orange).

    Parameters:
    - real_df: original data
    - fake_df: generated data

    Returns:
    - None; the figure is shown.
    """
    # 0 for every real row followed by 1 for every fake row, matching the
    # stacking order used for the PCA input.
    origin = [
        flag
        for flag, frame in enumerate((real_df, fake_df))
        for _ in range(len(frame))
    ]
    projected = PCA(n_components=2).fit_transform(pd.concat([real_df, fake_df]))
    first_pc, second_pc = projected[:, 0], projected[:, 1]

    plt.figure(figsize=(6, 6))
    sns.scatterplot(
        x=first_pc,
        y=second_pc,
        hue=origin,
        palette=['blue', 'orange'],
        alpha=0.6,
    )
    plt.title("PCA Projection (Blue: Real, Orange: Fake)")
    plt.xlabel("Principal Component 1")
    plt.ylabel("Principal Component 2")
    plt.legend(title="Data")
    plt.tight_layout()
    plt.show()


In [None]:
real_df = data_classification
fake_df = fake_Classification

# The RBF kernel matrices are n x n, which cannot be materialised for the full
# tables, so the MMD view is computed on an equally sized, seeded random
# subsample of each table.
mmd_n = min(len(real_df), len(fake_df), 2000)
visualize_mmd_matrix(
    real_df.sample(n=mmd_n, random_state=0),
    fake_df.sample(n=mmd_n, random_state=0),
    gamma=1.0,
)
visualize_correlation_difference(real_df, fake_df)
visualize_pca_projection(real_df, fake_df)