##### Model Training

In [None]:
# For data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


# Suppress every warning for the remainder of the session
import warnings
warnings.simplefilter('ignore')

# Render DataFrames with no cap on the number of visible columns
pd.set_option('display.max_columns', None)

In [5]:
# Loading the dataset
# The CSV location can be overridden through the CREDIT_DATA_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original location is used unchanged.
import os

DATA_PATH = os.environ.get(
    "CREDIT_DATA_PATH",
    r"C:\Users\hp\OneDrive\Documents\GitHub\credit_line_eligibility\data\cleaned_data.csv",
)
df0 = pd.read_csv(DATA_PATH)


In [7]:
# Preview the first rows of the loaded frame to sanity-check columns and values
df0.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc
0,10000.0,36,11.44,10,141326,1.202703,117413,1,2094,1.089146,16.0,0.0,1.434536,41.8,25.0,0.0
1,8000.0,36,11.99,4,173740,0.060161,117413,1,207128,0.623256,17.0,0.0,0.681703,53.3,27.0,3.0
2,15600.0,36,10.49,0,141326,-0.796125,117893,1,73637,-0.513208,13.0,0.0,0.079328,92.2,26.0,0.0
3,7200.0,36,6.49,6,141326,-0.319423,117413,1,73637,-2.12021,6.0,0.0,-0.739714,21.5,13.0,0.0
4,24375.0,60,17.27,9,173740,-0.281432,111005,0,73637,1.893119,13.0,0.0,0.92793,69.8,43.0,1.0


In [8]:
# Separate the target column from the feature matrix.
# New objects are built instead of resetting indexes in place, so re-running
# this cell never mutates df0 (or a Series selected out of it).
X = df0.drop(columns=['loan_status'])
# NOTE(review): the previewed head lists an 'Unnamed: 0' column, which looks
# like a row index written out with the CSV - TODO confirm. A row counter
# carries no signal and should not be a model feature; errors='ignore' makes
# this a no-op when the column is absent.
X = X.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)
y = df0['loan_status'].reset_index(drop=True)

# Step 1: Split data before standardization (stratified to keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit & transform on training data
X_test_scaled = scaler.transform(X_test)  # Only transform test data (NO fitting)


In [9]:
# Step 3: Oversample the standardized training set with SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Step 4: Report the class counts before and after resampling
for _caption, _targets in (
    ("Before SMOTE:", y_train),
    ("After SMOTE:", y_train_resampled),
):
    print(_caption, Counter(_targets))

Before SMOTE: Counter({1: 224616, 0: 52432})
After SMOTE: Counter({0: 224616, 1: 224616})


In [10]:
import torch

# GPU diagnostics only: nothing else in the visible cells uses torch.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True when a usable CUDA device is present
print(torch.cuda.device_count())  # Number of GPUs
if cuda_available:
    print(torch.cuda.get_device_name(0))  # GPU model name
else:
    # get_device_name(0) raises when no CUDA device exists; report instead of
    # aborting a Restart & Run All on CPU-only machines.
    print("No CUDA device detected")


True
1
NVIDIA GeForce MX110


In [None]:
# Pipeline variant from imbalanced-learn: applies samplers such as SMOTE only
# while fitting, never when predicting/scoring.
from imblearn.pipeline import Pipeline as ImbPipeline

# Split train further into train-val.
# NOTE(review): these arrays come from the SMOTE-resampled data, so X_val
# contains synthetic points. The tuning below no longer uses them; the split is
# retained only so the names stay defined for cells outside this view.
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.2,
    random_state=42, stratify=y_train_resampled,
)


def objective(trial):
    """Score one random-forest configuration for Optuna.

    SMOTE is placed inside the cross-validated pipeline, so each fold is
    oversampled using its own training part only and is scored on untouched
    (real, imbalanced) samples. Running cross-validation on data that was
    resampled beforehand lets synthetic neighbours of training points land in
    the scoring fold and inflates the F1 estimate.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameters.

    Returns
    -------
    float
        Mean F1-score over 3 cross-validation folds of the scaled training set.
    """
    # Define the hyperparameter search space (same ranges and draw order as before)
    rf_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),  # Number of trees
        "max_depth": trial.suggest_int("max_depth", 3, 30),  # Tree depth
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),  # Min samples to split a node
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),  # Min samples per leaf
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),  # Feature selection method
    }

    # Oversample -> random forest; resampling happens per training fold only
    pipeline = ImbPipeline(steps=[
        ("smote", SMOTE(sampling_strategy='auto', random_state=42)),
        ("rf", RandomForestClassifier(**rf_params, random_state=42, n_jobs=-1)),
    ])

    # Use cross-validation on the un-resampled training data to evaluate performance
    score = cross_val_score(pipeline, X_train_scaled, y_train, cv=3, scoring="f1").mean()
    return score  # Maximizing F1-score


# Create study and run optimization.
# The sampler is seeded so the sequence of suggested hyperparameters is
# reproducible across Restart & Run All.
study = optuna.create_study(
    direction="maximize",  # Maximize F1-score
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)  # Run 30 trials

# Print best hyperparameters
print("Best Hyperparameters:", study.best_params)
