##### Model Training

In [1]:
# For data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import xgboost as xgb
import lightgbm as lgb


# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)

#For ignoring warnings
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loading the dataset
df0 = pd.read_csv(r"C:\Users\hp\OneDrive\Documents\GitHub\credit_line_eligibility\data\cleaned_data.csv")


In [3]:
df0.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc
0,10000.0,36,11.44,10,141326,1.202703,117413,1,2094,1.089146,16.0,0.0,1.434536,41.8,25.0,0.0
1,8000.0,36,11.99,4,173740,0.060161,117413,1,207128,0.623256,17.0,0.0,0.681703,53.3,27.0,3.0
2,15600.0,36,10.49,0,141326,-0.796125,117893,1,73637,-0.513208,13.0,0.0,0.079328,92.2,26.0,0.0
3,7200.0,36,6.49,6,141326,-0.319423,117413,1,73637,-2.12021,6.0,0.0,-0.739714,21.5,13.0,0.0
4,24375.0,60,17.27,9,173740,-0.281432,111005,0,73637,1.893119,13.0,0.0,0.92793,69.8,43.0,1.0


In [4]:
df0.shape

(346311, 16)

In [4]:
X = df0.drop(columns=['loan_status'])
X.reset_index(inplace=True, drop=True)
y = df0['loan_status']
y.reset_index(drop=True, inplace=True)

# Step 1: Split data before standardization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 2: Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit & transform on training data
X_test_scaled = scaler.transform(X_test)  # Only transform test data (NO fitting)


In [5]:
# Step 3: Apply SMOTE on the standardized training set
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Step 4: Print class distributions
print("Before SMOTE:", Counter(y_train))  
print("After SMOTE:", Counter(y_train_resampled)) 

Before SMOTE: Counter({1: 224616, 0: 52432})
After SMOTE: Counter({0: 224616, 1: 224616})


In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_scaled, y_test)

In [7]:

print("Train class distribution:", np.bincount(y_train_resampled))  # Train data target class distribution fter SMOTE
print("Test class distribution:", np.bincount(y_test_resampled))  # Test data target class distribution after SMOTE

Train class distribution: [224616 224616]
Test class distribution: [56155 56155]


In [32]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils import resample

# Reduce dataset size for trials
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=70000, random_state=42, stratify=y_train_resampled)

rf_X_train, rf_X_val, rf_y_train, rf_y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)

# Define Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 400, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    # min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    # ccp_alpha = trial.suggest_int("ccp_alpha", 0.01, 0.1)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])


    rf = RandomForestClassifier(
        criterion="gini",
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        ccp_alpha=0,
        max_features=max_features,
        # min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,
        verbose=0
    )

    # Cross-validation with pruning
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    score = cross_val_score(rf, rf_X_train, rf_y_train, 
                            cv=cv, scoring="f1").mean()

    return score

# Use TPESampler for faster trials
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)
print("Best F1-Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)


[I 2025-03-17 18:49:54,905] A new study created in memory with name: no-name-65a847b2-e413-499b-a67e-924dd0c77d59
[I 2025-03-17 18:50:17,536] Trial 0 finished with value: 0.7241242634686512 and parameters: {'n_estimators': 400, 'max_depth': 6, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7241242634686512.
[I 2025-03-17 18:50:37,314] Trial 1 finished with value: 0.7915476014721432 and parameters: {'n_estimators': 250, 'max_depth': 15, 'min_samples_split': 6, 'max_features': 'log2'}. Best is trial 1 with value: 0.7915476014721432.
[I 2025-03-17 18:50:54,838] Trial 2 finished with value: 0.781662777789692 and parameters: {'n_estimators': 150, 'max_depth': 13, 'min_samples_split': 6, 'max_features': 'log2'}. Best is trial 1 with value: 0.7915476014721432.
[I 2025-03-17 18:51:15,784] Trial 3 finished with value: 0.7838363320552346 and parameters: {'n_estimators': 200, 'max_depth': 13, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 1 with val

Best F1-Score: 0.7929674295377108
Best Hyperparameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 7, 'max_features': 'log2'}


In [33]:

# Train RF with best parameters
rf_best_params = study.best_params
rf_best = RandomForestClassifier(class_weight="balanced",**rf_best_params, random_state=42, n_jobs=-1)

rf = rf_best.fit(X_train_resampled, y_train_resampled)
joblib.dump(rf, 'rf_model.pkl')

# # Evaluate on validation set
y_pred = rf_best.predict(X_test_resampled)
rf_preds = rf_best.predict_proba(rf_X_val)[:, 1]

# Step 3: Calculate performance metrics
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class
recall = recall_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class
f1 = f1_score(y_test_resampled, y_pred, average="binary")  # Use "macro" for multi-class

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))



Accuracy: 0.8022
Precision: 0.7785
Recall: 0.8447
F1-score: 0.8102

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79     56155
           1       0.78      0.84      0.81     56155

    accuracy                           0.80    112310
   macro avg       0.80      0.80      0.80    112310
weighted avg       0.80      0.80      0.80    112310



In [9]:
from optuna.integration import XGBoostPruningCallback
from sklearn.utils import resample

# Step 1: Create a fixed validation set (50K rows)
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=70000, random_state=42, stratify=y_train_resampled)

xgb_X_train, X_val, xgb_y_train, y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)

# Step 2: Define the Optuna objective function
def objective(trial):

    # Define hyperparameter search space
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",  # GPU optimization
        "verbosity":0,
        "verbose":-1,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_loguniform("gamma", 1e-3, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
        "n_estimators": 500,  # High number for early stopping
        "eval_metric": "logloss",
        "early_stopping_rounds":20,
        "callbacks":[XGBoostPruningCallback(trial, "validation_0-logloss")],
    }
    
    # pruning_callback = XGBoostPruningCallback(trial, "validation_0-logloss")
    # Train the model
    model = xgb.XGBClassifier(**params)
    model.fit(xgb_X_train, xgb_y_train, eval_set=[(X_val, y_val)], verbose=False )

    # Predict on validation set
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return accuracy

# Step 3: Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best parameters
print("Best Accuracy:", study.best_value)
print("Best Parameters:", study.best_params)


[I 2025-03-17 23:06:29,596] A new study created in memory with name: no-name-c923feec-2f33-4287-b8af-c6042f05eedc
[I 2025-03-17 23:06:35,382] Trial 0 finished with value: 0.8654285714285714 and parameters: {'learning_rate': 0.2114729622921267, 'max_depth': 10, 'min_child_weight': 7, 'gamma': 0.052083058687922665, 'subsample': 0.766224522035279, 'colsample_bytree': 0.5166934576430975, 'lambda': 0.21238877544971269, 'alpha': 0.9943149138302296}. Best is trial 0 with value: 0.8654285714285714.
[I 2025-03-17 23:06:51,522] Trial 1 finished with value: 0.8665714285714285 and parameters: {'learning_rate': 0.025984259057600816, 'max_depth': 9, 'min_child_weight': 2, 'gamma': 0.07082640533601654, 'subsample': 0.6628063512271276, 'colsample_bytree': 0.9734993191701268, 'lambda': 3.2459732353526616, 'alpha': 0.9862393883645287}. Best is trial 1 with value: 0.8665714285714285.
[I 2025-03-17 23:06:55,486] Trial 2 finished with value: 0.8661904761904762 and parameters: {'learning_rate': 0.2682570181

Best Accuracy: 0.8665714285714285
Best Parameters: {'learning_rate': 0.025984259057600816, 'max_depth': 9, 'min_child_weight': 2, 'gamma': 0.07082640533601654, 'subsample': 0.6628063512271276, 'colsample_bytree': 0.9734993191701268, 'lambda': 3.2459732353526616, 'alpha': 0.9862393883645287}


In [10]:
# Train final model using best params on full training data
xgb_best_params = study.best_params

xgb_best = xgb.XGBClassifier(**xgb_best_params)
xgb=xgb_best.fit(X_train_resampled, y_train_resampled)
joblib.dump(xgb, 'xgb_model.pkl')

y_pred = xgb_best.predict(X_test_resampled)
xgb_pred = xgb_best.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred)
recall = recall_score(y_test_resampled, y_pred)
f1 = f1_score(y_test_resampled, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))


Accuracy: 0.8233
Precision: 0.7976
Recall: 0.8665
F1-score: 0.8306

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.78      0.82     56155
           1       0.80      0.87      0.83     56155

    accuracy                           0.82    112310
   macro avg       0.83      0.82      0.82    112310
weighted avg       0.83      0.82      0.82    112310



In [18]:
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from optuna.pruners import HyperbandPruner

# Step 1: Create a fixed validation set (50K rows)
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=70000, random_state=42, stratify=y_train_resampled)

lgb_X_train, lgb_X_val, lgb_y_train, lgb_y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)


def objective(trial):
    param_grid = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "verbose":-1,
        "boosting_type": 'gbdt', 
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 400, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 200),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-3, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-3, 10.0),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
    }
    
    train_data = lgb.Dataset(lgb_X_train, label=lgb_y_train)
    val_data = lgb.Dataset(lgb_X_val, label=lgb_y_val)
    
    model = lgb.train(
        param_grid,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=["train", "valid_0"],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)],
    )
    
    preds = model.predict(lgb_X_val)
    return roc_auc_score(lgb_y_val, preds)
    

# Run Optuna optimization
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner)
study.optimize(objective, n_trials=20)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Best parameters
print("Best ROC-AUC Score:", study.best_value)
print("Best Parameters:", study.best_params)


Best ROC-AUC Score: 0.9287084535147392
Best Parameters: {'learning_rate': 0.1386185612536086, 'num_leaves': 260, 'max_depth': 13, 'min_data_in_leaf': 122, 'lambda_l1': 0.109871274254064, 'lambda_l2': 0.003365798596021806, 'feature_fraction': 0.46518892933626377, 'bagging_fraction': 0.829746096276152, 'bagging_freq': 5}


In [19]:
# Train final LightGBM model with best parameters
lgb_best_params = study.best_params
lgb_best = lgb.LGBMClassifier(**lgb_best_params)
lgb_best.fit(X_train_resampled, y_train_resampled)
joblib.dump(lgb_best, 'lgbm_model.pkl')

# Predictions on test set
y_pred = lgb_best.predict(X_test_resampled)
lgb_preds = lgb_best.predict_proba(lgb_X_val)[:, 1]

# Evaluate results
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred)
recall = recall_score(y_test_resampled, y_pred)
f1 = f1_score(y_test_resampled, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_resampled, y_pred))


Accuracy: 0.8798
Precision: 0.8203
Recall: 0.9728
F1-score: 0.8900

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.79      0.87     56155
           1       0.82      0.97      0.89     56155

    accuracy                           0.88    112310
   macro avg       0.89      0.88      0.88    112310
weighted avg       0.89      0.88      0.88    112310

