In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import classification_report, precision_recall_curve, auc
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, else the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda') if is_cuda else torch.device('cpu')

# Echo the choice so the cell output records where the run executed.
print(device)

cuda


In [None]:
# Load the training table.
data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Real_Datasets/train_category_1.csv"
df = pd.read_csv(data_path)

# One-hot encode every object-typed column; the list of encoded columns is
# kept so generated samples can be decoded back later.
categorical_cols = list(df.select_dtypes(include='object').columns)
df_encoded = pd.get_dummies(df, columns=categorical_cols)

# Scale the float64/int64 columns (the target excluded) into [0, 1].
# The fitted scaler is kept so the scaling can be inverted on generated samples.
continuous_cols = (
    df_encoded
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .difference(['loan_status'])
)
scaler = MinMaxScaler()
scaler.fit(df_encoded[continuous_cols])
df_encoded[continuous_cols] = scaler.transform(df_encoded[continuous_cols])

# Split into target vector and float32 feature matrix.
y = df_encoded["loan_status"].values
X = df_encoded.drop(columns=["loan_status"]).values.astype(np.float32)

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps ``input_dim`` features through 128 and 64 hidden units;
    two linear heads then produce the mean and log-variance of a
    ``latent_dim``-wide Gaussian. The decoder mirrors the encoder
    (latent -> 64 -> 128 -> ``input_dim``) and ends in a plain linear layer,
    so reconstructions are unbounded.

    Args:
        input_dim: Number of input (and reconstructed) features.
        latent_dim: Width of the latent code. Defaults to 32.
    """

    def __init__(self, input_dim, latent_dim=32):
        super().__init__()

        # Encoder trunk: input_dim -> 128 -> 64.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128), nn.ReLU(),
            nn.Linear(128, 64), nn.ReLU(),
        )

        # Heads parameterising the approximate posterior.
        self.fc_mu = nn.Linear(64, latent_dim)
        self.fc_logvar = nn.Linear(64, latent_dim)

        # Decoder: latent_dim -> 64 -> 128 -> input_dim (no final activation).
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64), nn.ReLU(),
            nn.Linear(64, 128), nn.ReLU(),
            nn.Linear(128, input_dim),
        )

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x):
        """Encode ``x``, sample a latent code and decode it.

        Returns:
            Tuple ``(reconstruction, mu, logvar)``.
        """
        hidden = self.encoder(x)
        mu, logvar = self.fc_mu(hidden), self.fc_logvar(hidden)
        latent = self.reparameterize(mu, logvar)
        return self.decoder(latent), mu, logvar

In [None]:
# Model, optimiser and reconstruction criterion.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vae = VAE(input_dim=X.shape[1])
vae = vae.to(device)
optimizer = optim.Adam(vae.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Features only (no labels) are fed to the VAE, in shuffled mini-batches of 64.
X_tensor = torch.tensor(X, dtype=torch.float32)
train_loader = DataLoader(
    dataset=TensorDataset(X_tensor),
    batch_size=64,
    shuffle=True,
)

def vae_loss(recon_x, x, mu, logvar):
    """Return the VAE objective: reconstruction loss plus KL term.

    Args:
        recon_x: Decoder output for the batch.
        x: Original batch.
        mu: Posterior means from the encoder.
        logvar: Posterior log-variances from the encoder.

    Returns:
        Scalar tensor ``loss_fn(recon_x, x) + KL``, where ``loss_fn`` is the
        module-level criterion and the KL term is averaged over every element
        of ``mu`` / ``logvar`` (batch and latent dimensions alike).
    """
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.mean(kl_elements)
    return loss_fn(recon_x, x) + kl_term

In [None]:
# Train the VAE for 500 epochs over the shuffled mini-batches.
for epoch in tqdm(range(500),desc="Epoch :"):
    vae.train()
    total_loss = 0.0
    for batch in train_loader:
        # TensorDataset yields one-element batches; element 0 is the feature tensor.
        x = batch[0].to(device)
        optimizer.zero_grad()
        recon, mu, logvar = vae(x)
        loss = vae_loss(recon, x, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # `total_loss` is a sum of per-batch losses, so it scales with the number
    # of batches; the per-batch average is reported alongside it so runs with
    # different dataset or batch sizes stay comparable.
    avg_loss = total_loss / max(len(train_loader), 1)
    # tqdm.write prints without breaking the progress bar, unlike a bare print().
    tqdm.write(f"Epoch {epoch+1}, Loss: {total_loss:.4f} (avg/batch: {avg_loss:.4f})")

Epoch ::  20%|██        | 1/5 [00:07<00:30,  7.73s/it]

Epoch 1, Loss: 157.7943


Epoch ::  40%|████      | 2/5 [00:15<00:22,  7.58s/it]

Epoch 2, Loss: 156.5616


Epoch ::  60%|██████    | 3/5 [00:23<00:15,  7.70s/it]

Epoch 3, Loss: 156.5106


Epoch ::  80%|████████  | 4/5 [00:30<00:07,  7.78s/it]

Epoch 4, Loss: 156.4790


Epoch :: 100%|██████████| 5/5 [00:38<00:00,  7.77s/it]

Epoch 5, Loss: 156.4593





In [None]:
def inverse_one_hot(samples_df, original_df, categorical_cols):
    """Collapse one-hot column groups back into single categorical columns.

    For each name in ``categorical_cols``, every column of ``samples_df`` that
    starts with ``"<name>_"`` is treated as one group. Each row is assigned the
    category whose column holds the largest value, the decoded column is
    appended under ``<name>``, and the group's one-hot columns are dropped.

    Args:
        samples_df: DataFrame containing the one-hot (or score) columns.
            It is not modified; a copy is returned.
        original_df: Unused; kept so existing callers keep working.
        categorical_cols: Names of the original categorical columns.

    Returns:
        A new DataFrame with the same index as ``samples_df`` in which each
        one-hot group found is replaced by one column of category labels.
        Names in ``categorical_cols`` with no matching columns are skipped.
    """
    decoded_df = samples_df.copy()

    for col in categorical_cols:
        # Columns that were generated by one-hot encoding `col`.
        prefix = col + '_'
        one_hot_cols = [c for c in samples_df.columns if c.startswith(prefix)]
        if not one_hot_cols:
            continue

        # Strip the exact "<col>_" prefix. Splitting on the first underscore
        # would cut inside column names that themselves contain underscores
        # (e.g. "sub_grade_D2" -> "grade_D2" instead of "D2").
        col_categories = np.array([c[len(prefix):] for c in one_hot_cols], dtype=object)

        # Restore the category: the position of the largest value in each row.
        winners = np.argmax(samples_df[one_hot_cols].to_numpy(), axis=1)

        # Assign positionally (plain array) so a non-default index on
        # samples_df cannot misalign the decoded values into NaN.
        decoded_df[col] = col_categories[winners]

        # Remove the one-hot columns.
        decoded_df = decoded_df.drop(columns=one_hot_cols)

    return decoded_df


In [None]:
# Generate synthetic rows by decoding standard-normal latent codes.
vae.eval()
with torch.no_grad():
    # Read the latent width from the model instead of hard-coding 32, so this
    # cell keeps working if the VAE is built with a different latent_dim.
    latent_dim = vae.fc_mu.out_features
    z = torch.randn(540, latent_dim).to(device)
    samples = vae.decoder(z).cpu().numpy()

# 🔄 Inverse Transform
# Column layout of the decoder output matches the feature matrix (target removed).
columns = df_encoded.drop(columns=["loan_status"]).columns
samples_df = pd.DataFrame(samples, columns=columns)
# Every generated row is labelled with loan_status = 1.
samples_df["loan_status"] = 1

# Undo the min-max scaling, then collapse the one-hot groups into categories.
samples_df[continuous_cols] = scaler.inverse_transform(samples_df[continuous_cols])
samples_decoded = inverse_one_hot(samples_df, df, categorical_cols)

samples_decoded.head()

Unnamed: 0,last_fico_range_high,annual_inc,dti,mo_sin_old_rev_tl_op,revol_util,int_rate,installment,avg_cur_bal,revol_bal,total_pymnt,...,funded_amnt,loan_amnt,credit_history_years,term_months,loan_status,debt_settlement_flag,sub_grade,home_ownership,purpose,grade
0,569.839783,67685.351562,17.982821,167.109604,0.539079,0.159572,479.713867,9764.25293,12579.801758,8788.825195,...,16137.469727,16148.141602,15.316967,46.069893,1,settlement_flag_N,grade_D2,ownership_RENT,debt_consolidation,C
1,569.880859,68485.632812,17.968756,167.164444,0.539242,0.159577,479.752472,9760.482422,12581.774414,8794.366211,...,16133.069336,16149.683594,15.31299,46.070869,1,settlement_flag_N,grade_D2,ownership_RENT,debt_consolidation,C
2,569.839783,67685.351562,17.982821,167.109604,0.539079,0.159572,479.713867,9764.25293,12579.801758,8788.825195,...,16137.469727,16148.141602,15.316967,46.069893,1,settlement_flag_N,grade_D2,ownership_RENT,debt_consolidation,C
3,569.839783,67685.351562,17.982821,167.109604,0.539079,0.159572,479.713867,9764.25293,12579.801758,8788.825195,...,16137.469727,16148.141602,15.316967,46.069893,1,settlement_flag_N,grade_D2,ownership_RENT,debt_consolidation,C
4,569.839783,67685.351562,17.982821,167.109604,0.539079,0.159572,479.713867,9764.25293,12579.801758,8788.825195,...,16137.469727,16148.141602,15.316967,46.069893,1,settlement_flag_N,grade_D2,ownership_RENT,debt_consolidation,C


In [None]:
# Persist the decoded synthetic samples as VAE.csv (row index not written).
save_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Fake_Datasets/"
samples_decoded.to_csv(save_path + "VAE.csv", index=False)

In [None]:
# Read the generated samples back from disk to inspect what was written.
sample_data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/VAE-CTAB-GAN/Fake_Datasets/VAE.csv"
fake = pd.read_csv(filepath_or_buffer=sample_data_path)

In [None]:
# Summarise the reloaded synthetic table: column names, non-null counts and dtypes.
fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   last_fico_range_high  0 non-null      float64
 1   annual_inc            0 non-null      float64
 2   dti                   0 non-null      float64
 3   mo_sin_old_rev_tl_op  0 non-null      float64
 4   revol_util            0 non-null      float64
 5   int_rate              0 non-null      float64
 6   installment           0 non-null      float64
 7   avg_cur_bal           0 non-null      float64
 8   revol_bal             0 non-null      float64
 9   total_pymnt           0 non-null      float64
 10  total_pymnt_inv       0 non-null      float64
 11  funded_amnt           0 non-null      float64
 12  loan_amnt             0 non-null      float64
 13  credit_history_years  0 non-null      float64
 14  term_months           0 non-null      float64
 15  loan_status           5