## Multi Layer Perceptron

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.metrics import roc_curve, auc, log_loss
from sklearn.metrics import classification_report, precision_recall_curve, auc
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings("ignore")

In [2]:
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

print(device)

cuda


In [17]:
## Load original data
data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/Dataset/return_feature_train.csv"
data = pd.read_csv(data_path, low_memory=False)

##load test data
test_data_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/Dataset/return_feature_test.csv"
test_data = pd.read_csv(test_data_path)

## Load original data
fake_path = "C:/Users/GCU/Lending_club/Data_Analysis_lending-club/CTGAN/ctgan_output/synthetic_ctgan_data_class1.csv"
fake = pd.read_csv(fake_path, low_memory=False)

In [18]:
keep_features = ['grade', 'term_months', 'total_pymnt', 'total_pymnt_inv','total_il_high_credit_limit', 'loan_amnt']

data_classification = data.copy()
test_classification = test_data.copy()
fake_classification = fake.copy()

data_classification = data_classification.drop(columns = keep_features)
test_classification = test_classification.drop(columns = keep_features)
fake_classification = fake_classification.drop(columns = keep_features)

In [19]:
data_x = data_classification.drop(columns='loan_status')
data_y = data_classification[['loan_status']]

X_train, X_val, y_train, y_val = train_test_split(data_x, data_y, train_size=0.8, test_size=0.2, random_state=42, stratify=data_y)

train_classification = pd.concat([X_train, y_train], axis=1)

In [20]:
from sklearn.utils import shuffle

train_data = pd.concat([train_classification, fake_classification])
train_data['loan_status'].value_counts()
train_data = shuffle(train_data, random_state=42)

In [21]:
X_train = train_data.drop(columns='loan_status')
y_train = train_data[['loan_status']]

X_test = test_classification.drop(columns='loan_status')
y_test = test_classification[['loan_status']]

In [22]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [23]:
class ClassificationDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = ClassificationDataset(X_train_tensor, y_train_tensor)
val_dataset = ClassificationDataset(X_val_tensor, y_val_tensor)
test_dataset = ClassificationDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=500, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=500)
test_loader = DataLoader(test_dataset, batch_size=500)

In [29]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super(MLPClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

model = MLPClassifier(input_dim=X_train_tensor.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
from tqdm import tqdm
import copy

patience = 7
best_val_loss = float('inf')
patience_counter = 0
best_model_state = None

for epoch in tqdm(range(100), desc="Training Epochs"):
    model.train()
    train_losses = []

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = sum(train_losses) / len(train_losses)

    # --- Validation ---
    model.eval()
    val_losses = []
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            output = model(X_batch)
            loss = criterion(output, y_batch)
            val_losses.append(loss.item())

            preds = (output >= 0.5).float()
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    avg_val_loss = sum(val_losses) / len(val_losses)
    val_acc = accuracy_score(all_labels, all_preds)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    # --- Early Stopping Check ---
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        best_model_state = copy.deepcopy(model.state_dict())  # 모델 저장
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"⏹ Early stopping triggered at epoch {epoch+1}.")
            break

# --- 최적 모델 로드 (선택) ---
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print("✅ Best model restored (based on lowest validation loss).")


Training Epochs:  20%|██        | 1/5 [00:18<01:13, 18.32s/it]

Epoch 1 | Train Loss: 0.3753 | Val Loss: 0.2965 | Val Acc: 0.8806


Training Epochs:  40%|████      | 2/5 [00:36<00:54, 18.09s/it]

Epoch 2 | Train Loss: 0.3145 | Val Loss: 0.3470 | Val Acc: 0.8616


Training Epochs:  60%|██████    | 3/5 [00:53<00:35, 17.71s/it]

Epoch 3 | Train Loss: 0.3062 | Val Loss: 0.3160 | Val Acc: 0.8703


Training Epochs:  80%|████████  | 4/5 [01:10<00:17, 17.45s/it]

Epoch 4 | Train Loss: 0.3005 | Val Loss: 0.3116 | Val Acc: 0.8725


Training Epochs: 100%|██████████| 5/5 [01:27<00:00, 17.59s/it]

Epoch 5 | Train Loss: 0.2978 | Val Loss: 0.3123 | Val Acc: 0.8689





In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score

model.eval()

all_preds = []
all_probs = []
all_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        y_prob = model(X_batch)  
        y_pred = (y_prob >= 0.5).float() 

        all_probs.extend(y_prob.cpu().numpy())
        all_preds.extend(y_pred.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

y_true = np.array(all_labels).flatten()
y_pred_cls = np.array(all_preds).flatten()
y_prob_cls = np.array(all_probs).flatten()

acc = accuracy_score(y_true, y_pred_cls)
precision = precision_score(y_true, y_pred_cls)
recall = recall_score(y_true, y_pred_cls)
f1 = f1_score(y_true, y_pred_cls)
au_prc = average_precision_score(y_true, y_prob_cls)  

print(f"🔍 Evaluation Metrics:")
print(f" - Accuracy:  {acc:.4f}")
print(f" - Precision: {precision:.4f}")
print(f" - Recall:    {recall:.4f}")
print(f" - F1-Score:  {f1:.4f}")
print(f" - AU-PRC:    {au_prc:.4f}")

🔍 Evaluation Metrics:
 - Accuracy:  0.8691
 - Precision: 0.6146
 - Recall:    0.8823
 - F1-Score:  0.7245
 - AU-PRC:    0.7430


In [33]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred_cls, digits=4))

              precision    recall  f1-score   support

         0.0     0.9681    0.8659    0.9142    599114
         1.0     0.6146    0.8823    0.7245    145192

    accuracy                         0.8691    744306
   macro avg     0.7913    0.8741    0.8193    744306
weighted avg     0.8991    0.8691    0.8772    744306

