In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import classification_report, precision_recall_curve, auc
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch 
import os
import torch

import warnings
warnings.filterwarnings("ignore")

In [14]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda') if is_cuda else torch.device('cpu')

print(device)

cuda


In [15]:
# Load the train and test CSVs of the base portfolio.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, which avoids silently mixed-type columns.
data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/base/train_category.csv"
data = pd.read_csv(data_path, low_memory=False)

# Read the test split with the same setting as the train split so both frames
# get consistent dtypes (any DtypeWarning would be hidden by the warnings filter above).
test_data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/base/test_category.csv"
test_data = pd.read_csv(test_data_path, low_memory=False)

In [16]:
# Columns removed before classification. Despite its name, `keep_features`
# lists the columns that are dropped, not the ones that are kept.
# Earlier candidate list, left for reference:
#keep_features = ['grade', 'term_months', 'total_pymnt', 'total_pymnt_inv','total_il_high_credit_limit', 'loan_amnt']
keep_features = ['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']

# Copy first so that the raw `data` / `test_data` frames are never touched
# by the encoding steps applied to the classification frames.
data_classification = data.copy().drop(columns=keep_features)
test_classification = test_data.copy().drop(columns=keep_features)

In [17]:
## Encode the training data
# Columns that get integer codes vs. columns that get one indicator column per category.
Labelencoding_features = ['term_months', 'sub_grade']
onehot_features = ['debt_settlement_flag', 'home_ownership', 'purpose']

# Dense output; categories not seen during fit are encoded as all zeros at transform time.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit one LabelEncoder per column and keep it so other frames can reuse the same mapping.
label_encoders = {}
for col in Labelencoding_features:
    le = LabelEncoder()
    data_classification[col] = le.fit_transform(data_classification[col])
    label_encoders[col] = le

# Fit the one-hot encoder on the training frame and wrap the result in a
# DataFrame that shares the training frame's index.
onehot_encoded = onehot_encoder.fit_transform(data_classification[onehot_features])
onehot_encoded_df = pd.DataFrame(
    data=onehot_encoded,
    index=data_classification.index,
    columns=onehot_encoder.get_feature_names_out(onehot_features),
)

# Swap the raw categorical columns for their indicator columns.
data_classification = pd.concat(
    [data_classification.drop(columns=onehot_features), onehot_encoded_df],
    axis=1,
)

print("✅ Encoding 완료! 결과 shape:", data_classification.shape)

✅ Encoding 완료! 결과 shape: (1116458, 35)


In [18]:
## Encode the test data with the encoders fitted on the training data
for col in Labelencoding_features:
    le = label_encoders[col]
    test_classification[col] = le.transform(test_classification[col])

# One-hot encoding: reuse the onehot_encoder that was fitted on the training frame.
onehot_encoded_test = onehot_encoder.transform(test_classification[onehot_features])
onehot_encoded_test_df = pd.DataFrame(
    data=onehot_encoded_test,
    index=test_classification.index,
    columns=onehot_encoder.get_feature_names_out(onehot_features),
)

# Swap the raw categorical columns for their indicator columns.
test_classification = pd.concat(
    [test_classification.drop(columns=onehot_features), onehot_encoded_test_df],
    axis=1,
)

print("✅ 테스트 데이터 인코딩 완료! 결과 shape:", test_classification.shape)

✅ 테스트 데이터 인코딩 완료! 결과 shape: (744306, 35)


In [19]:
def encode_features(df, label_encoders, onehot_encoder, label_cols, onehot_cols):
    """Apply already-fitted label and one-hot encoders to a frame.

    Parameters
    ----------
    df : DataFrame to encode. It is copied first, so the caller's frame is left as it was.
    label_encoders : mapping of column name -> fitted encoder exposing ``transform``.
    onehot_encoder : fitted encoder exposing ``transform`` and ``get_feature_names_out``.
    label_cols : columns replaced by ``label_encoders[col].transform(column)``.
    onehot_cols : columns removed and replaced by the one-hot encoder's output columns.

    Returns
    -------
    A new DataFrame: the remaining (label-encoded) columns followed by the
    one-hot columns, aligned on the index of ``df``.
    """
    encoded = df.copy()

    # Replace each label column with its encoded values.
    for name in label_cols:
        encoded[name] = label_encoders[name].transform(encoded[name])

    # Build the indicator columns on the same index so the concat lines up row by row.
    indicators = pd.DataFrame(
        onehot_encoder.transform(encoded[onehot_cols]),
        index=encoded.index,
        columns=onehot_encoder.get_feature_names_out(onehot_cols),
    )

    return pd.concat([encoded.drop(columns=onehot_cols), indicators], axis=1)

In [20]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle

def load_data_by_method(method):
    """Assemble the classifier's train/test data for one sampling strategy.

    The test features/labels always come from the module-level
    ``test_classification`` frame. The training data depends on ``method``:

    * ``'Base'``: every row of ``data_classification``.
    * ``'Undersampling'``: a stratified subset of ``data_classification``
      (``train_test_split`` with ``test_size=0.13``, seed 42; the held-out part
      is discarded), balanced with ``RandomUnderSampler(random_state=42)``.
    * ``'SMOTE-NC'``: the pre-generated ``smotenc_data.csv``, with the grade /
      payment / amount columns dropped and then passed through ``encode_features``.
    * any key of the synthetic-data table below: a stratified subset of the
      real rows (``test_size=0.2``, seed 42) concatenated with the rows of the
      matching CSV, all of which are labelled ``loan_status = 1``, then
      shuffled with seed 42.

    Returns
    -------
    (X_train, y_train, X_test, y_test). ``y_test`` is a one-column DataFrame;
    ``y_train`` is one as well, except for ``'Undersampling'`` where it is
    whatever ``fit_resample`` returns.

    Raises
    ------
    ValueError
        If ``method`` is none of the names listed above.
    """
    target = 'loan_status'

    def split_target(frame):
        # Features are every column except the label; the label stays a one-column frame.
        return frame.drop(columns=target), frame[[target]]

    X_test, y_test = split_target(test_classification)

    if method == 'Base':
        X_train, y_train = split_target(data_classification)
        return X_train, y_train, X_test, y_test

    if method == 'Undersampling':
        real_x, real_y = split_target(data_classification)
        # Keep the stratified train part only, then balance the classes on it.
        kept_x, _, kept_y, _ = train_test_split(
            real_x, real_y, test_size=0.13, random_state=42, stratify=real_y
        )
        sampler = RandomUnderSampler(random_state=42)
        X_train, y_train = sampler.fit_resample(kept_x, kept_y)
        return X_train, y_train, X_test, y_test

    if method == 'SMOTE-NC':
        smote_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/smotenc/smotenc_data.csv"
        smote_frame = pd.read_csv(smote_path)
        smote_frame = smote_frame.drop(
            columns=['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt', 'funded_amnt']
        )
        smote_frame = encode_features(
            smote_frame, label_encoders, onehot_encoder, Labelencoding_features, onehot_features
        )
        X_train, y_train = split_target(smote_frame)
        return X_train, y_train, X_test, y_test

    # Everything else must name one of the synthetic-data generators below.
    fake_base = {
        'table-gan': 'tablegan/tablegan.csv',
        'Smotified-tablegan': 'tablegan/smotified-tablegan.csv',
        'vae-tablegan': 'tablegan/vae-tablegan.csv',
        'ctgan': 'ctgan/ctgan.csv',
        'smotified-ctgan': 'ctgan/smotified-ctgan.csv',
        'vae-ctgan': 'ctgan/vae-ctgan.csv',
        'smotified-vae-ctgan': 'ctgan/smotified-vae-ctgan.csv',
        'ctabgan': 'ctabgan/ctabgan.csv',
        'smotified-ctabgan': 'ctabgan/smotified_ctabgan.csv',
        'vae-ctabgan': 'ctabgan/vae-ctabgan.csv',
        'smotified-vae-ctabgan': 'ctabgan/smotified-vae-ctabgan.csv'
    }
    if method not in fake_base:
        raise ValueError(f"Unknown method: {method}")

    fake = pd.read_csv(f"C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/{fake_base[method]}")
    fake['loan_status'] = 1  # every synthetic row is labelled as a default

    fake = fake.drop(columns=['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt', 'funded_amnt'])

    def snap_term(x):
        # Generated terms are continuous; snap each to the nearer of 36 / 60 (ties go to 60).
        return 36 if abs(x - 36) < abs(x - 60) else 60

    fake['term_months'] = fake['term_months'].apply(snap_term)
    fake = encode_features(fake, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

    # Stratified subset of the real rows, extended with the synthetic rows and shuffled.
    real_x, real_y = split_target(data_classification)
    X_base, _, y_base, _ = train_test_split(real_x, real_y, test_size=0.2, stratify=real_y, random_state=42)
    train_total = pd.concat([pd.concat([X_base, y_base], axis=1), fake])
    train_total = shuffle(train_total, random_state=42)

    X_train, y_train = split_target(train_total)
    return X_train, y_train, X_test, y_test

In [21]:
# One dict per evaluated method: return statistics of the selected loans.
summary_with_sharpe_top = []

# One dict per evaluated method: grade composition of the selected loans.
summary_with_grade_top = []

def linear_portfolio_top(Method, top_10_indices):
    """Record return statistics and grade mix for a selected set of test loans.

    Parameters
    ----------
    Method : label stored under the 'Method' key of both summary rows.
    top_10_indices : object whose ``.index`` holds the labels of the selected
        rows of the module-level ``test_data`` frame.

    Side effects
    ------------
    Appends one dict to ``summary_with_sharpe_top`` and one to
    ``summary_with_grade_top``. 'Average Return' is stored multiplied by 100,
    while 'Std Dev' is stored as the raw (unscaled) standard deviation.
    Returns nothing.
    """
    picked = test_data.loc[top_10_indices.index].copy()

    # Realised return per loan: (amount received by investors - amount funded) / amount funded.
    realized = (picked['total_pymnt_inv'] - picked['funded_amnt']) / picked['funded_amnt']
    mean_return = realized.mean()
    volatility = realized.std()

    # Guard against division by zero when all selected returns are identical.
    if volatility != 0:
        sharpe = mean_return / volatility
    else:
        sharpe = np.nan

    summary_with_sharpe_top.append({
        'Method': Method,
        'Average Return': mean_return * 100,
        'Std Dev': volatility,
        'Sharpe Ratio': sharpe
    })

    # Count the selected loans per grade; F and G are reported together.
    grades = picked['grade']
    tally = {letter: int((grades == letter).sum()) for letter in ('A', 'B', 'C', 'D', 'E', 'F', 'G')}

    summary_with_grade_top.append({
        'Method': Method,
        'A': tally['A'],
        'B': tally['B'],
        'C': tally['C'],
        'D': tally['D'],
        'E': tally['E'],
        'F+G': tally['F'] + tally['G'],
        'Total': len(picked)
    })

In [22]:
## Invest in the top 10%
from imblearn.over_sampling import SMOTE
# Columns removed together when select_fully_paid builds its regression
# feature matrix (used in its 'Undersampling' branch).
drop2 = ['loan_status', 'return']

def select_fully_paid(y_pred, method):
    scaler = StandardScaler()
    reg = LinearRegression()

    fully_paid_indices = (y_pred == 0)
    test_regression = test_data[fully_paid_indices]
    test_regression['return'] = (test_regression['total_pymnt_inv'] - test_regression['funded_amnt'])/test_regression['funded_amnt']

    train_regression = data.copy()

    if method == 'Base':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']

        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        data_x = train_regression.drop(columns='return')
        data_y = train_regression[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)

        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))
        return method, top_10_percent
    
    elif method == 'Undersampling':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['loan_amnt'])/train_regression['funded_amnt']
        drop_features = ['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        X_train = train_regression.drop(columns='loan_status')
        y_train = train_regression[['loan_status']]

        undersampler = RandomUnderSampler(random_state=42)
        X_under, y_under = undersampler.fit_resample(X_train, y_train)

        data_under = pd.concat([pd.DataFrame(X_under, columns=X_train.columns), pd.DataFrame(y_under, columns=['loan_status'])], axis=1)

        data_x = data_under.drop(columns=drop2)  
        data_y = data_under[['return']]  

        test_x = test_regression.drop(columns = drop2)
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)

        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent
    
    elif method == 'SMOTE-NC':
        smote_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/smotenc/smotenc_data.csv"
        data_smote = pd.read_csv(smote_path)
        
        data_smote['return'] = (data_smote['total_pymnt_inv'] - data_smote['funded_amnt'])/data_smote['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']

        train_regression = data_smote.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        X_train = train_regression.drop(columns='return')
        y_train = train_regression[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(X_train)

        X_train = data_x_scaled
        #y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent
    
    elif method == 'table-gan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/tablegan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent
        
    elif method == 'Smotified-tablegan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/smotified-tablegan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'vae-tablegan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/tablegan/vae-tablegan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'ctgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/ctgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'smotified-ctgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/smotified-ctgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'vae-ctgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/vae-ctgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)

        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'smotified-vae-ctgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctgan/smotified-vae-ctgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)
        
        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 

    elif method == 'ctabgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/ctabgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)
        
        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'smotified-ctabgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/smotified_ctabgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)
        
        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 

    elif method == 'vae-ctabgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/vae-ctabgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)
        
        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 
    
    elif method == 'smotified-vae-ctabgan':
        train_regression['return'] = (train_regression['total_pymnt_inv'] - train_regression['funded_amnt'])/train_regression['funded_amnt']
        drop_features = ['loan_status','grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt','funded_amnt']
        train_regression = train_regression.drop(columns=drop_features)
        test_regression = test_regression.drop(columns=drop_features)
        
        fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios/ctabgan/smotified-vae-ctabgan.csv"
        fake = pd.read_csv(fake_path)
        fake['loan_status'] = 1

        fake_regression = fake.copy()

        fake_regression['return'] = (fake_regression['total_pymnt_inv'] - fake_regression['funded_amnt'])/fake_regression['funded_amnt']
        fake_regression = fake_regression.drop(columns=drop_features)
        fake_regression['term_months'] = fake_regression['term_months'].apply(lambda x:36 if abs(x-36)<abs(x-60) else 60)
        
        train_regression = encode_features(train_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        test_regression = encode_features(test_regression, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        fake_regression = encode_features(fake_regression,label_encoders, onehot_encoder, Labelencoding_features, onehot_features)

        train_total = pd.concat([train_regression, fake_regression])

        data_x = train_total.drop(columns='return')
        data_y = train_total[['return']]

        test_x = test_regression.drop(columns = 'return')
        test_y = test_regression[['return']]

        data_x_scaled = scaler.fit_transform(data_x)

        X_train = data_x_scaled
        y_train = data_y

        test_x_scaled = scaler.transform(test_x)

        reg.fit(X_train, y_train)

        #print(reg.coef_)
        y_pred = reg.predict(test_x_scaled)

        print("RMSE:", mean_squared_error(test_y, y_pred, squared=False))
        print("R²:", r2_score(test_y, y_pred))

        test_regression['predicted_return'] = reg.predict(test_x_scaled)
        top_10_percent = test_regression.sort_values(by='predicted_return', ascending=False).head(int(len(test_regression)*0.1))

        return method, top_10_percent 

In [23]:
from sklearn.utils import shuffle

# Accumulator: return_evaluation appends one dict per evaluated method
# (method name plus per-class precision/recall/F1 and macro/weighted F1).
classification_metrics_summary = []

# Root folder holding the pre-generated resampled / synthetic datasets.
_PORTFOLIO_DIR = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/portfolios"

# Pre-computed SMOTE-NC oversampled training data.
_SMOTENC_CSV = f"{_PORTFOLIO_DIR}/smotenc/smotenc_data.csv"

# GAN-based augmentation method -> CSV of generated rows that are appended to the real training split.
_SYNTHETIC_CSV_BY_METHOD = {
    'table-gan': f"{_PORTFOLIO_DIR}/tablegan/tablegan.csv",
    'Smotified-tablegan': f"{_PORTFOLIO_DIR}/tablegan/smotified-tablegan.csv",
    'vae-tablegan': f"{_PORTFOLIO_DIR}/tablegan/vae-tablegan.csv",
    'ctgan': f"{_PORTFOLIO_DIR}/ctgan/ctgan.csv",
    'smotified-ctgan': f"{_PORTFOLIO_DIR}/ctgan/smotified-ctgan.csv",
    'vae-ctgan': f"{_PORTFOLIO_DIR}/ctgan/vae-ctgan.csv",
    'smotified-vae-ctgan': f"{_PORTFOLIO_DIR}/ctgan/smotified-vae-ctgan.csv",
    'ctabgan': f"{_PORTFOLIO_DIR}/ctabgan/ctabgan.csv",
    # NOTE: this file name uses an underscore, unlike its siblings.
    'smotified-ctabgan': f"{_PORTFOLIO_DIR}/ctabgan/smotified_ctabgan.csv",
    'vae-ctabgan': f"{_PORTFOLIO_DIR}/ctabgan/vae-ctabgan.csv",
    'smotified-vae-ctabgan': f"{_PORTFOLIO_DIR}/ctabgan/smotified-vae-ctabgan.csv",
}

# Columns removed from the externally generated frames before classification.
_CLASSIFICATION_DROP_FEATURES = ['grade', 'total_pymnt', 'total_pymnt_inv', 'loan_amnt', 'funded_amnt']


def _nearest_term(months):
    """Snap a generated term length to whichever of 36 / 60 is closer (ties and NaN map to 60)."""
    return 36 if abs(months - 36) < abs(months - 60) else 60


def _load_synthetic_rows(csv_path):
    """Read generated rows, label them all as class 1, and encode them with the notebook's encoders."""
    fake = pd.read_csv(csv_path)
    fake['loan_status'] = 1
    fake = fake.drop(columns=_CLASSIFICATION_DROP_FEATURES)
    # Generators emit continuous term lengths; restore the two categorical values.
    fake['term_months'] = fake['term_months'].apply(_nearest_term)
    return encode_features(fake, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)


def _training_set_for(method, data_x, data_y):
    """Return the ``(X_train, y_train)`` pair that the model is fitted on for ``method``.

    Raises:
        ValueError: if ``method`` is not one of the supported preprocessing names.
    """
    known_methods = ('Base', 'Undersampling', 'SMOTE-NC', *_SYNTHETIC_CSV_BY_METHOD)
    if method not in known_methods:
        raise ValueError(f"Unknown preprocessing method {method!r}; expected one of {list(known_methods)}")

    if method == 'Undersampling':
        # This branch holds out 13% (not 20%) before undersampling the remainder.
        X_train, _, y_train, _ = train_test_split(data_x, data_y, test_size=0.13,
                                                  random_state=42, stratify=data_y)
        return RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)

    if method == 'SMOTE-NC':
        # The oversampled CSV replaces the data passed in by the caller.
        smote = pd.read_csv(_SMOTENC_CSV).drop(columns=_CLASSIFICATION_DROP_FEATURES)
        smote = encode_features(smote, label_encoders, onehot_encoder, Labelencoding_features, onehot_features)
        data_x = smote.drop(columns='loan_status')
        data_y = smote[['loan_status']]

    # Only the 80% training part of the split is used; the hold-out part is discarded.
    X_train, _, y_train, _ = train_test_split(data_x, data_y, test_size=0.2,
                                              random_state=42, stratify=data_y)
    if method in ('Base', 'SMOTE-NC'):
        return X_train, y_train

    # GAN-based methods: append the generated rows to the real training split.
    synthetic_rows = _load_synthetic_rows(_SYNTHETIC_CSV_BY_METHOD[method])
    real_rows = pd.concat([X_train, y_train], axis=1)
    # Seeded so every method is reproducible (previously only 'table-gan' fixed the seed).
    train_total = shuffle(pd.concat([real_rows, synthetic_rows]), random_state=42)
    return train_total.drop(columns='loan_status'), train_total[['loan_status']]


def return_evaluation(model, data_x, data_y, X_test, y_test, method):
    """Fit ``model`` on the training set built for ``method`` and record its test metrics.

    The function builds the method-specific training set, fits the model,
    thresholds the predicted class-1 probability on ``X_test`` at 0.5, appends a
    row of precision/recall/F1 figures to ``classification_metrics_summary``,
    and finally hands the predictions to ``select_fully_paid`` and
    ``linear_portfolio_top`` for the portfolio step.

    Args:
        model: unfitted classifier exposing ``fit`` and ``predict_proba``.
        data_x: training features (ignored for 'SMOTE-NC', which loads its own data).
        data_y: training target aligned with ``data_x`` (ignored for 'SMOTE-NC').
        X_test: test features passed to ``model.predict_proba``.
        y_test: true test labels; reported as "Fully Paid" / "Default".
        method: 'Base', 'Undersampling', 'SMOTE-NC', or one of the GAN-based
            names listed in ``_SYNTHETIC_CSV_BY_METHOD``.

    Returns:
        None. Results are stored through side effects only.

    Raises:
        ValueError: if ``method`` is not a supported name.
    """
    print(f"Preprocessing method : {method}")

    X_train, y_train = _training_set_for(method, data_x, data_y)
    model.fit(X_train, y_train)

    # Class-1 probability (also what ROC / precision-recall curves would need).
    y_prob = model.predict_proba(X_test)[:, 1]

    threshold = 0.5
    y_pred = (y_prob >= threshold).astype(int)

    report = classification_report(
        y_test, y_pred,
        target_names=["Fully Paid", "Default"],
        output_dict=True,
        zero_division=0
    )

    classification_metrics_summary.append({
        "Method": method,
        "Class 0 Precision": report["Fully Paid"]["precision"],
        "Class 0 Recall": report["Fully Paid"]["recall"],
        "Class 0 F1": report["Fully Paid"]["f1-score"],
        "Class 1 Precision": report["Default"]["precision"],
        "Class 1 Recall": report["Default"]["recall"],
        "Class 1 F1": report["Default"]["f1-score"],
        "Macro Avg F1": report["macro avg"]["f1-score"],
        "Weighted Avg F1": report["weighted avg"]["f1-score"],
    })

    # Pick the top-10% selection from the predictions.
    Method, top_10_indices = select_fully_paid(y_pred, method)

    # Build the portfolio from that selection.
    linear_portfolio_top(Method, top_10_indices)

In [25]:
# Preprocessing / augmentation strategies to compare, in evaluation order.
methods = [
    'Base',
    'Undersampling',
    'SMOTE-NC',
    'table-gan',
    'Smotified-tablegan',
    'vae-tablegan',
    'ctgan',
    'smotified-ctgan',
    'vae-ctgan',
    'smotified-vae-ctgan',
    'ctabgan',
    'smotified-ctabgan',
    'vae-ctabgan',
    'smotified-vae-ctabgan',
]

# Test split: target column and remaining feature columns.
y_test = test_classification['loan_status']
X_test = test_classification.drop(columns='loan_status')

returns_df = pd.DataFrame()

# The real training features / target are identical for every method, so derive them once.
data_y = data_classification['loan_status']
data_x = data_classification.drop(columns='loan_status')

for method in tqdm(methods, desc="Methods"):
    # Fresh, unfitted classifier for each method so no fitted state carries over.
    model = LogisticRegression(
        C=1.0,
        penalty='l2',
        max_iter=500,
        random_state=42,
        n_jobs=-1,
    )
    return_evaluation(model, data_x, data_y, X_test, y_test, method)

# Classification results
classification_metrics_df = pd.DataFrame(classification_metrics_summary)

# Top-10% investment strategy
returns_with_sharpe_df_top = pd.DataFrame(summary_with_sharpe_top)
portfolio_grade_distribution_df_top = pd.DataFrame(summary_with_grade_top)

Methods:   0%|          | 0/14 [00:00<?, ?it/s]

Preprocessing method : Base
RMSE: 0.2255712121825778
R²: 0.28348672641562445


Methods:   7%|▋         | 1/14 [00:36<07:49, 36.14s/it]

Preprocessing method : Undersampling
RMSE: 0.21079269931409053
R²: -0.04847987488476457


Methods:  14%|█▍        | 2/14 [00:51<04:49, 24.08s/it]

Preprocessing method : SMOTE-NC
RMSE: 0.21524131555684947
R²: -0.06276604139076869


Methods:  21%|██▏       | 3/14 [01:50<07:18, 39.86s/it]

Preprocessing method : table-gan
RMSE: 0.2106338311480332
R²: -0.017375804058554545


Methods:  29%|██▊       | 4/14 [02:38<07:11, 43.11s/it]

Preprocessing method : Smotified-tablegan
RMSE: 0.2064672399371244
R²: 0.22230682630258836


Methods:  36%|███▌      | 5/14 [03:31<06:59, 46.61s/it]

Preprocessing method : vae-tablegan
RMSE: 0.20923083189113342
R²: 0.22784978341760453


Methods:  43%|████▎     | 6/14 [04:26<06:34, 49.37s/it]

Preprocessing method : ctgan
RMSE: 0.20540199079020285
R²: 0.008906048811501255


Methods:  50%|█████     | 7/14 [05:18<05:51, 50.27s/it]

Preprocessing method : smotified-ctgan
RMSE: 0.20796240169901162
R²: 0.010394615771601168


Methods:  57%|█████▋    | 8/14 [06:08<05:01, 50.26s/it]

Preprocessing method : vae-ctgan
RMSE: 0.20519944528585637
R²: 0.22324815672306264


Methods:  64%|██████▍   | 9/14 [07:01<04:15, 51.10s/it]

Preprocessing method : smotified-vae-ctgan
RMSE: 0.21376216311387972
R²: 0.22563337113009752


Methods:  71%|███████▏  | 10/14 [07:53<03:25, 51.48s/it]

Preprocessing method : ctabgan
RMSE: 0.323154602418979
R²: -1.3946201628102295


Methods:  79%|███████▊  | 11/14 [08:42<02:31, 50.62s/it]

Preprocessing method : smotified-ctabgan
RMSE: 0.3019524904887662
R²: -1.126324592373455


Methods:  86%|████████▌ | 12/14 [09:30<01:39, 49.91s/it]

Preprocessing method : vae-ctabgan
RMSE: 0.19336389328567696
R²: 0.13849251471993995


Methods:  93%|█████████▎| 13/14 [10:18<00:49, 49.28s/it]

Preprocessing method : smotified-vae-ctabgan
RMSE: 0.19826597637981846
R²: 0.07049206945088338


Methods: 100%|██████████| 14/14 [11:06<00:00, 47.61s/it]


In [27]:
classification_metrics_df.head(15)

Unnamed: 0,Method,Class 0 Precision,Class 0 Recall,Class 0 F1,Class 1 Precision,Class 1 Recall,Class 1 F1,Macro Avg F1,Weighted Avg F1
0,Base,0.839542,0.971838,0.900859,0.667763,0.23356,0.346075,0.623467,0.792637
1,Undersampling,0.90953,0.777164,0.838153,0.425498,0.681015,0.523754,0.680954,0.776823
2,SMOTE-NC,0.906636,0.783277,0.840454,0.427275,0.667165,0.520929,0.680691,0.778124
3,table-gan,0.905963,0.779301,0.837872,0.422487,0.666221,0.517071,0.677472,0.775293
4,Smotified-tablegan,0.88114,0.865677,0.87334,0.483161,0.518148,0.500043,0.686692,0.800521
5,vae-tablegan,0.876648,0.874374,0.87551,0.487114,0.492327,0.489707,0.682608,0.800251
6,ctgan,0.908033,0.777465,0.837692,0.423688,0.675079,0.520625,0.679158,0.775841
7,smotified-ctgan,0.906964,0.789204,0.843996,0.433624,0.665946,0.525242,0.684619,0.781817
8,vae-ctgan,0.882601,0.863263,0.872825,0.482557,0.526186,0.503428,0.688127,0.800767
9,smotified-vae-ctgan,0.874212,0.877105,0.875656,0.48587,0.479234,0.48253,0.679093,0.798969


In [28]:
returns_with_sharpe_df_top.head(15)

Unnamed: 0,Method,Average Return,Std Dev,Sharpe Ratio
0,Base,18.759043,0.147587,1.271047
1,Undersampling,14.080652,0.108609,1.296458
2,SMOTE-NC,14.063036,0.109973,1.27877
3,table-gan,13.368777,0.102612,1.302851
4,Smotified-tablegan,16.496026,0.122514,1.346464
5,vae-tablegan,16.50629,0.122995,1.342025
6,ctgan,14.334002,0.1028,1.394355
7,smotified-ctgan,14.288137,0.102756,1.390488
8,vae-ctgan,16.381848,0.121415,1.349244
9,smotified-vae-ctgan,16.663233,0.12548,1.327958


In [29]:
portfolio_grade_distribution_df_top.head(10)

Unnamed: 0,Method,A,B,C,D,E,F+G,Total
0,Base,9883,18525,20245,11582,6955,2162,69352
1,Undersampling,16611,17229,12256,4156,875,65,51192
2,SMOTE-NC,17438,17315,11782,4258,893,73,51759
3,table-gan,15292,18042,12646,4681,799,75,51535
4,Smotified-tablegan,13077,18977,15894,7643,2673,596,58860
5,vae-tablegan,13300,19245,16058,7751,2774,628,59756
6,ctgan,16208,18056,12418,4001,590,23,51296
7,smotified-ctgan,17112,18206,11877,4110,770,57,52132
8,vae-ctgan,13139,19045,15882,7537,2506,489,58598
9,smotified-vae-ctgan,13292,19148,15976,7752,2997,944,60109


In [30]:
grade_columns = ['A', 'B', 'C', 'D', 'E', 'F+G']

# Work on a copy so the count table stays available for other cells.
portfolio_grade = portfolio_grade_distribution_df_top.copy()

# Convert each grade count into a percentage of the row's 'Total', rounded to 2 decimals.
portfolio_grade[grade_columns] = (
    portfolio_grade[grade_columns]
    .div(portfolio_grade['Total'], axis=0)
    .mul(100)
    .round(2)
)

# Print the result
print(portfolio_grade[grade_columns])


        A      B      C      D      E   F+G
0   14.25  26.71  29.19  16.70  10.03  3.12
1   32.45  33.66  23.94   8.12   1.71  0.13
2   33.69  33.45  22.76   8.23   1.73  0.14
3   29.67  35.01  24.54   9.08   1.55  0.15
4   22.22  32.24  27.00  12.99   4.54  1.01
5   22.26  32.21  26.87  12.97   4.64  1.05
6   31.60  35.20  24.21   7.80   1.15  0.04
7   32.82  34.92  22.78   7.88   1.48  0.11
8   22.42  32.50  27.10  12.86   4.28  0.83
9   22.11  31.86  26.58  12.90   4.99  1.57
10   5.40  28.74  45.63  18.74   1.47  0.02
11  71.66  23.07   4.77   0.46   0.04  0.00
12  29.14  33.41  25.84   9.75   1.75  0.11
13  42.37  32.15  18.40   5.96   1.05  0.07


In [31]:
portfolio_grade.head(10)

Unnamed: 0,Method,A,B,C,D,E,F+G,Total
0,Base,14.25,26.71,29.19,16.7,10.03,3.12,69352
1,Undersampling,32.45,33.66,23.94,8.12,1.71,0.13,51192
2,SMOTE-NC,33.69,33.45,22.76,8.23,1.73,0.14,51759
3,table-gan,29.67,35.01,24.54,9.08,1.55,0.15,51535
4,Smotified-tablegan,22.22,32.24,27.0,12.99,4.54,1.01,58860
5,vae-tablegan,22.26,32.21,26.87,12.97,4.64,1.05,59756
6,ctgan,31.6,35.2,24.21,7.8,1.15,0.04,51296
7,smotified-ctgan,32.82,34.92,22.78,7.88,1.48,0.11,52132
8,vae-ctgan,22.42,32.5,27.1,12.86,4.28,0.83,58598
9,smotified-vae-ctgan,22.11,31.86,26.58,12.9,4.99,1.57,60109
