##### Model Training

In [2]:
# For data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import xgboost as xgb
import lightgbm as lgb


# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)

#For ignoring warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Loading the dataset
df0 = pd.read_csv(r"C:\Users\hp\OneDrive\Documents\GitHub\credit_line_eligibility\data\cleaned_data.csv")


In [5]:
df0.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc
0,10000.0,36,11.44,10,141326,1.202703,117413,1,2094,1.089146,16.0,0.0,1.434536,41.8,25.0,0.0
1,8000.0,36,11.99,4,173740,0.060161,117413,1,207128,0.623256,17.0,0.0,0.681703,53.3,27.0,3.0
2,15600.0,36,10.49,0,141326,-0.796125,117893,1,73637,-0.513208,13.0,0.0,0.079328,92.2,26.0,0.0
3,7200.0,36,6.49,6,141326,-0.319423,117413,1,73637,-2.12021,6.0,0.0,-0.739714,21.5,13.0,0.0
4,24375.0,60,17.27,9,173740,-0.281432,111005,0,73637,1.893119,13.0,0.0,0.92793,69.8,43.0,1.0


In [5]:
df0.shape

(346311, 16)

In [6]:
X = df0.drop(columns=['loan_status'])
X.reset_index(inplace=True, drop=True)
y = df0['loan_status']
y.reset_index(drop=True, inplace=True)

# Step 1: Split data before standardization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit & transform on training data
X_test_scaled = scaler.transform(X_test)  # Only transform test data (NO fitting)


In [7]:
# Step 3: Apply SMOTE on the standardized training set
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Step 4: Print class distributions
print("Before SMOTE:", Counter(y_train))  
print("After SMOTE:", Counter(y_train_resampled)) 

Before SMOTE: Counter({1: 224616, 0: 52432})
After SMOTE: Counter({0: 224616, 1: 224616})


In [8]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_scaled, y_test)

In [9]:

print("Train class distribution:", np.bincount(y_train_resampled))  # Train data target class distribution fter SMOTE
print("Test class distribution:", np.bincount(y_test_resampled))  # Test data target class distribution after SMOTE

Train class distribution: [224616 224616]
Test class distribution: [56155 56155]


In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
# from optuna.integration import OptunaPruningCallback
from sklearn.utils import resample

# Reduce dataset size for trials
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, 
                                          n_samples=50000, 
                                          random_state=42, stratify=y_train)

# Define Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    # min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    # Optimize CPU usage
    # from multiprocessing import cpu_count
    # n_cores = cpu_count() // 2

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        # min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    # Cross-validation with pruning
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    # pruning_callback = OptunaPruningCallback(trial, "f1", interval=1)

    score = cross_val_score(rf, X_train_sample, y_train_sample, 
                            cv=cv, scoring="f1").mean()

    return score

# Use TPESampler for faster trials
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)

print("Best Hyperparameters:", study.best_params)


[I 2025-03-16 17:59:46,630] A new study created in memory with name: no-name-e15eb489-33fd-46b0-901a-ee128ac5c303
[I 2025-03-16 18:00:02,388] Trial 0 finished with value: 0.8997532625622422 and parameters: {'n_estimators': 250, 'max_depth': 15, 'min_samples_split': 3, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8997532625622422.
[I 2025-03-16 18:00:04,325] Trial 1 finished with value: 0.8954791962159154 and parameters: {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 6, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8997532625622422.
[I 2025-03-16 18:00:24,206] Trial 2 finished with value: 0.8980172337606758 and parameters: {'n_estimators': 350, 'max_depth': 13, 'min_samples_split': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.8997532625622422.
[I 2025-03-16 18:00:46,429] Trial 3 finished with value: 0.8991127766679959 and parameters: {'n_estimators': 250, 'max_depth': 14, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 0 with va

Best Hyperparameters: {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 3, 'max_features': 'sqrt'}


In [None]:


# Train RF with best parameters
best_params = study.best_params
rf_best = RandomForestClassifier(class_weight="balanced",**best_params, random_state=42, n_jobs=-1)

rf_best.fit(X_train_resampled, y_train_resampled)

# # Evaluate on validation set
y_pred = rf_best.predict(X_test_resampled)

# Step 3: Calculate performance metrics
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class
recall = recall_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class
f1 = f1_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))



Accuracy: 0.8012
Precision: 0.7777
Recall: 0.8434
F1-score: 0.8092

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79     56155
           1       0.78      0.84      0.81     56155

    accuracy                           0.80    112310
   macro avg       0.80      0.80      0.80    112310
weighted avg       0.80      0.80      0.80    112310



In [10]:
X_train_resampled.shape

(449232, 15)

In [12]:
y_train_resampled.shape

(449232,)

In [34]:

from optuna.integration import XGBoostPruningCallback
from sklearn.utils import resample

# Step 1: Create a fixed validation set (50K rows)
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=50000, random_state=42, stratify=y_train_resampled)

X_train_sample, X_val, y_train_sample, y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)

# Step 2: Define the Optuna objective function
def objective(trial):
    # Sample 50K rows from training set for faster trials
    # sample_idx = np.random.choice(len(X_train_subset), 50000, replace=False)
    # X_sample, y_sample = X_train_subset.loc[sample_idx], y_train_subset.loc[sample_idx]

    # sample_idx = np.random.choice(X_train_subset.index, 50000, replace=False)
    # X_sample, y_sample = X_train_subset.loc[sample_idx], y_train_subset.loc[sample_idx]


    # Define hyperparameter search space
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",  # GPU optimization
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_loguniform("gamma", 1e-3, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
        "n_estimators": 500,  # High number for early stopping
        "eval_metric": "logloss",
        "early_stopping_rounds":20,
        "callbacks":[XGBoostPruningCallback(trial, "validation_0-logloss")],
    }
    
    # pruning_callback = XGBoostPruningCallback(trial, "validation_0-logloss")
    # Train the model
    model = xgb.XGBClassifier(**params)
    model.fit(
        X_train_sample,
        y_train_sample,
        eval_set=[(X_val, y_val)],  # Fixed validation set
        # early_stopping_rounds=20,
        verbose=False,
        # callbacks=[XGBoostPruningCallback(trial, "validation_0-logloss")],
    )

    # Predict on validation set
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return accuracy

# Step 3: Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best parameters
print("Best Parameters:", study.best_params)


[I 2025-03-17 02:12:04,564] A new study created in memory with name: no-name-2edec595-1157-4e45-b985-a137e23f7395
[I 2025-03-17 02:12:12,629] Trial 0 finished with value: 0.8661333333333333 and parameters: {'learning_rate': 0.028933336025850177, 'max_depth': 7, 'min_child_weight': 7, 'gamma': 0.004334347278173934, 'subsample': 0.5968127960422207, 'colsample_bytree': 0.9762145025499056, 'lambda': 0.039945988595207524, 'alpha': 0.003986516988356314}. Best is trial 0 with value: 0.8661333333333333.
[I 2025-03-17 02:12:19,718] Trial 1 finished with value: 0.8710666666666667 and parameters: {'learning_rate': 0.07374945929636939, 'max_depth': 9, 'min_child_weight': 9, 'gamma': 0.01247504906835454, 'subsample': 0.8887360211354072, 'colsample_bytree': 0.9291942903300328, 'lambda': 0.12595287878881797, 'alpha': 0.014605250785984198}. Best is trial 1 with value: 0.8710666666666667.
[I 2025-03-17 02:12:28,093] Trial 2 finished with value: 0.8694 and parameters: {'learning_rate': 0.059431266728222

Best Parameters: {'learning_rate': 0.03645909660365636, 'max_depth': 10, 'min_child_weight': 8, 'gamma': 0.0023865400336785664, 'subsample': 0.9417076475248576, 'colsample_bytree': 0.5190744717617447, 'lambda': 3.372854403465213, 'alpha': 0.8000788489612762}


In [37]:
# Train final model using best params on full training data
best_params = study.best_params
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train_resampled, y_train_resampled)

y_pred = final_model.predict(X_test_resampled)
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred)
recall = recall_score(y_test_resampled, y_pred)
f1 = f1_score(y_test_resampled, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))


Accuracy: 0.8500
Precision: 0.8082
Recall: 0.9179
F1-score: 0.8596

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.78      0.84     56155
           1       0.81      0.92      0.86     56155

    accuracy                           0.85    112310
   macro avg       0.86      0.85      0.85    112310
weighted avg       0.86      0.85      0.85    112310



In [30]:
from optuna.integration.lightgbm import LightGBMPruningCallback
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score

# Step 1: Create a fixed validation set (50K rows)
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=50000, random_state=42, stratify=y_train_resampled)

X_train_sample, X_val, y_train_sample, y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)


def objective(trial):
    param_grid = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "verbose":-1,
        "boosting_type": 'gbdt',  # Can also try 'dart'
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300, step=10),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 200),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 10.0),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
    }
    
    train_data = lgb.Dataset(X_train_sample, label=y_train_sample)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # pruning_callback = 
    
    model = lgb.train(
        param_grid,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=["train", "valid_0"],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)],
        # verbose_eval=False
    )
    
    preds = model.predict(X_val)  # Keep raw probability scores
    return roc_auc_score(y_val, preds)
    

from optuna.pruners import HyperbandPruner
# Run Optuna optimization
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner)
study.optimize(objective, n_trials=30)
# optuna.logging.set_verbosity(optuna.logging.WARNING)

# Best parameters
print("Best Parameters:", study.best_params)


Best Parameters: {'learning_rate': 0.1647139156131148, 'num_leaves': 80, 'max_depth': 9, 'min_data_in_leaf': 27, 'lambda_l1': 7.056780416992359, 'lambda_l2': 2.743192045079168e-07, 'feature_fraction': 0.9268769558547962, 'bagging_fraction': 0.9900576847529032, 'bagging_freq': 3}


In [31]:
# Train final LightGBM model with best parameters
best_params = study.best_params
best_model = lgb.LGBMClassifier(**best_params)
best_model.fit(X_train_resampled, y_train_resampled)

# Predictions on test set
y_pred = best_model.predict(X_test_resampled)

# Evaluate results
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred)
recall = recall_score(y_test_resampled, y_pred)
f1 = f1_score(y_test_resampled, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_resampled, y_pred))


Accuracy: 0.8796
Precision: 0.8196
Recall: 0.9735
F1-score: 0.8899

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.79      0.87     56155
           1       0.82      0.97      0.89     56155

    accuracy                           0.88    112310
   macro avg       0.89      0.88      0.88    112310
weighted avg       0.89      0.88      0.88    112310

