##### Model Training

In [1]:
# For data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)

#For ignoring warnings
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loading the dataset
df0 = pd.read_csv(r"C:\Users\hp\OneDrive\Documents\GitHub\credit_line_eligibility\data\cleaned_data.csv")


In [3]:
df0.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc
0,10000.0,36,11.44,10,141326,1.202703,117413,1,2094,1.089146,16.0,0.0,1.434536,41.8,25.0,0.0
1,8000.0,36,11.99,4,173740,0.060161,117413,1,207128,0.623256,17.0,0.0,0.681703,53.3,27.0,3.0
2,15600.0,36,10.49,0,141326,-0.796125,117893,1,73637,-0.513208,13.0,0.0,0.079328,92.2,26.0,0.0
3,7200.0,36,6.49,6,141326,-0.319423,117413,1,73637,-2.12021,6.0,0.0,-0.739714,21.5,13.0,0.0
4,24375.0,60,17.27,9,173740,-0.281432,111005,0,73637,1.893119,13.0,0.0,0.92793,69.8,43.0,1.0


In [5]:
df0.shape

(346311, 16)

In [7]:
X = df0.drop(columns=['loan_status'])
X.reset_index(inplace=True, drop=True)
y = df0['loan_status']
y.reset_index(drop=True, inplace=True)

# Step 1: Split data before standardization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit & transform on training data
X_test_scaled = scaler.transform(X_test)  # Only transform test data (NO fitting)


In [10]:
# Step 3: Apply SMOTE on the standardized training set
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Step 4: Print class distributions
print("Before SMOTE:", Counter(y_train))  
print("After SMOTE:", Counter(y_train_resampled)) 

Before SMOTE: Counter({1: 224616, 0: 52432})
After SMOTE: Counter({0: 224616, 1: 224616})


In [24]:
X_train_resampled.shape

(449232, 15)

In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
# from optuna.integration import OptunaPruningCallback
from sklearn.utils import resample

# Reduce dataset size for trials
X_train_sample, y_train_sample = resample(X_train, y_train, 
                                          n_samples=50000, 
                                          random_state=42, stratify=y_train)

# Define Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    # min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    # Optimize CPU usage
    # from multiprocessing import cpu_count
    # n_cores = cpu_count() // 2

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        # min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    # Cross-validation with pruning
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    # pruning_callback = OptunaPruningCallback(trial, "f1", interval=1)

    score = cross_val_score(rf, X_train_sample, y_train_sample, 
                            cv=cv, scoring="f1").mean()

    return score

# Use TPESampler for faster trials
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)

print("Best Hyperparameters:", study.best_params)


[I 2025-03-16 17:59:46,630] A new study created in memory with name: no-name-e15eb489-33fd-46b0-901a-ee128ac5c303
[I 2025-03-16 18:00:02,388] Trial 0 finished with value: 0.8997532625622422 and parameters: {'n_estimators': 250, 'max_depth': 15, 'min_samples_split': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8997532625622422.
[I 2025-03-16 18:00:04,325] Trial 1 finished with value: 0.8954791962159154 and parameters: {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8997532625622422.
[I 2025-03-16 18:00:24,206] Trial 2 finished with value: 0.8980172337606758 and parameters: {'n_estimators': 350, 'max_depth': 13, 'min_samples_split': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.8997532625622422.
[I 2025-03-16 18:00:46,429] Trial 3 finished with value: 0.8991127766679959 and parameters: {'n_estimators': 250, 'max_depth': 14, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 0 with va

Best Hyperparameters: {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 3, 'max_features': 'sqrt'}


In [28]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_scaled, y_test)


In [None]:
import numpy as np
print("Train class distribution:", np.bincount(y_train_resampled))  # Train data target class distribution fter SMOTE
print("Test class distribution:", np.bincount(y_test_resampled))  # Test data target class distribution after SMOTE

Train class distribution: [224616 224616]
Test class distribution: [56155 56155]


In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Train RF with best parameters
best_params = study.best_params
rf_best = RandomForestClassifier(class_weight="balanced",**best_params, random_state=42, n_jobs=-1)

rf_best.fit(X_train_resampled, y_train_resampled)

# # Evaluate on validation set
y_pred = rf_best.predict(X_test_resampled)

# Step 3: Calculate performance metrics
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class
recall = recall_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class
f1 = f1_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))



Accuracy: 0.8012
Precision: 0.7777
Recall: 0.8434
F1-score: 0.8092

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79     56155
           1       0.78      0.84      0.81     56155

    accuracy                           0.80    112310
   macro avg       0.80      0.80      0.80    112310
weighted avg       0.80      0.80      0.80    112310



In [25]:
import numpy as np
print("Train class distribution:", np.bincount(y_train_resampled))  # After SMOTE
print("Test class distribution:", np.bincount(y_test))  # Original test data


Train class distribution: [224616 224616]
Test class distribution: [13108 56155]
