##### Model Training

In [1]:
# For data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import xgboost as xgb
import lightgbm as lgb


# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)

#For ignoring warnings
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Loading the dataset
df0 = pd.read_csv(r"C:\Users\hp\OneDrive\Documents\GitHub\credit_line_eligibility\data\cleaned_data.csv")



In [3]:
df0.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,open_acc,pub_rec,revol_bal,revol_util,total_acc,mort_acc
0,10000.0,36,11.44,10,141326,1.202703,117413,1,2094,1.089146,16.0,0.0,1.434536,41.8,25.0,0.0
1,8000.0,36,11.99,4,173740,0.060161,117413,1,207128,0.623256,17.0,0.0,0.681703,53.3,27.0,3.0
2,15600.0,36,10.49,0,141326,-0.796125,117893,1,73637,-0.513208,13.0,0.0,0.079328,92.2,26.0,0.0
3,7200.0,36,6.49,6,141326,-0.319423,117413,1,73637,-2.12021,6.0,0.0,-0.739714,21.5,13.0,0.0
4,24375.0,60,17.27,9,173740,-0.281432,111005,0,73637,1.893119,13.0,0.0,0.92793,69.8,43.0,1.0


In [4]:
df0.shape

(346311, 16)

In [3]:
X = df0.drop(columns=['loan_status'])
X.reset_index(inplace=True, drop=True)
y = df0['loan_status']
y.reset_index(drop=True, inplace=True)

# Split data before standardization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit & transform on training data
X_test_scaled = scaler.transform(X_test)  # Only transform test data (NO fitting)


In [4]:
# Apply SMOTE on the standardized training set
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Before SMOTE:", Counter(y_train))  
print("After SMOTE:", Counter(y_train_resampled)) 

Before SMOTE: Counter({1: 224616, 0: 52432})
After SMOTE: Counter({0: 224616, 1: 224616})


In [5]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_scaled, y_test)

In [8]:

print("Train class distribution:", np.bincount(y_train_resampled))  # Train data target class distribution fter SMOTE
print("Test class distribution:", np.bincount(y_test_resampled))  # Test data target class distribution after SMOTE

Train class distribution: [224616 224616]
Test class distribution: [56155 56155]


In [6]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.utils import resample

# Reduce dataset size for trials
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=70000, random_state=42, stratify=y_train_resampled)

rf_X_train, rf_X_val, rf_y_train, rf_y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)

# Define Optuna objective function
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 450, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    # min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])


    rf = RandomForestClassifier(
        criterion="gini",
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        ccp_alpha=0,
        max_features=max_features,
        # min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,
        verbose=0
    )

    # Cross-validation with pruning
    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    score = cross_val_score(rf, rf_X_train, rf_y_train, 
                            cv=cv, scoring="f1").mean()

    return score

# Use TPESampler for faster trials
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=30)
print("Best F1-Score:", study.best_value)
print("Best Hyperparameters:", study.best_params)


[I 2025-03-23 23:27:56,453] A new study created in memory with name: no-name-3954e4b9-edce-476e-b7c0-f249021b7b4d
[I 2025-03-23 23:27:58,875] Trial 0 finished with value: 0.6786992545319713 and parameters: {'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 3, 'max_features': 'log2'}. Best is trial 0 with value: 0.6786992545319713.
[I 2025-03-23 23:28:10,338] Trial 1 finished with value: 0.7893296435232611 and parameters: {'n_estimators': 100, 'max_depth': 15, 'min_samples_split': 4, 'max_features': 'log2'}. Best is trial 1 with value: 0.7893296435232611.
[I 2025-03-23 23:28:46,966] Trial 2 finished with value: 0.77899176294587 and parameters: {'n_estimators': 450, 'max_depth': 12, 'min_samples_split': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.7893296435232611.
[I 2025-03-23 23:29:20,298] Trial 3 finished with value: 0.7916289156687217 and parameters: {'n_estimators': 350, 'max_depth': 15, 'min_samples_split': 9, 'max_features': 'sqrt'}. Best is trial 3 with value

Best F1-Score: 0.7933657127506177
Best Hyperparameters: {'n_estimators': 300, 'max_depth': 15, 'min_samples_split': 4, 'max_features': 'log2'}


In [7]:

# Train RF with best parameters
rf_best_params = study.best_params
# rf_best = RandomForestClassifier(class_weight="balanced",**rf_best_params, random_state=42, n_jobs=-1)
rf_best = RandomForestClassifier(n_estimators=450, max_depth=15, min_samples_split=2, max_features='log2', random_state=42, n_jobs=-1, )
rf_best.fit(X_train_resampled, y_train_resampled)
# joblib.dump(rf, 'rf_model.pkl')

# # Evaluate on validation set
y_pred = rf_best.predict(X_test_resampled)
rf_preds = rf_best.predict_proba(rf_X_val)[:, 1]

# Step 3: Calculate performance metrics
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred, average="binary")  
recall = recall_score(y_test_resampled, y_pred, average="binary")  
f1 = f1_score(y_test_resampled, y_pred, average="binary")  

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))



Accuracy: 0.8027
Precision: 0.7795
Recall: 0.8443
F1-score: 0.8106

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79     56155
           1       0.78      0.84      0.81     56155

    accuracy                           0.80    112310
   macro avg       0.80      0.80      0.80    112310
weighted avg       0.80      0.80      0.80    112310



In [8]:
from optuna.integration import XGBoostPruningCallback
from sklearn.utils import resample

# Step 1: Create a fixed validation set (50K rows)
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=70000, random_state=42, stratify=y_train_resampled)

xgb_X_train, X_val, xgb_y_train, y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)

# Step 2: Define the Optuna objective function
def objective(trial):

    # Define hyperparameter search space
    params = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "tree_method": "gpu_hist",  
        "verbosity":0,
        "verbose":-1,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_loguniform("gamma", 1e-3, 1.0),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-3, 10.0),
        "n_estimators": 500,  # High number for early stopping
        "eval_metric": "logloss",
        "early_stopping_rounds":20,
        "callbacks":[XGBoostPruningCallback(trial, "validation_0-logloss")],
    }
    
    # pruning_callback = XGBoostPruningCallback(trial, "validation_0-logloss")
    # Train the model
    model = xgb.XGBClassifier(**params)
    model.fit(xgb_X_train, xgb_y_train, eval_set=[(X_val, y_val)], verbose=False )

    # Predict on validation set
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    return accuracy

# Step 3: Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# Best parameters
print("Best Accuracy:", study.best_value)
print("Best Parameters:", study.best_params)


[I 2025-03-23 23:49:25,321] A new study created in memory with name: no-name-0aaded34-f144-4e3a-b91a-81f3cb3ddb19
[I 2025-03-23 23:49:37,393] Trial 0 finished with value: 0.8698095238095238 and parameters: {'learning_rate': 0.17164585799609702, 'max_depth': 3, 'min_child_weight': 4, 'gamma': 0.1663809904508759, 'subsample': 0.8529723780595613, 'colsample_bytree': 0.7626704544140603, 'lambda': 1.8490638687045047, 'alpha': 5.082865735735715}. Best is trial 0 with value: 0.8698095238095238.
[I 2025-03-23 23:49:51,483] Trial 1 finished with value: 0.8755714285714286 and parameters: {'learning_rate': 0.04914497686858078, 'max_depth': 9, 'min_child_weight': 3, 'gamma': 0.09506709211237088, 'subsample': 0.7680194135457485, 'colsample_bytree': 0.6559431632042786, 'lambda': 0.1149361484163164, 'alpha': 0.03799279121710413}. Best is trial 1 with value: 0.8755714285714286.
[I 2025-03-23 23:49:59,674] Trial 2 finished with value: 0.8717619047619047 and parameters: {'learning_rate': 0.1052927229395

Best Accuracy: 0.8755714285714286
Best Parameters: {'learning_rate': 0.04914497686858078, 'max_depth': 9, 'min_child_weight': 3, 'gamma': 0.09506709211237088, 'subsample': 0.7680194135457485, 'colsample_bytree': 0.6559431632042786, 'lambda': 0.1149361484163164, 'alpha': 0.03799279121710413}


In [9]:
# Train final model using best params on full training data
xgb_best_params = study.best_params

xgb_best = xgb.XGBClassifier(**xgb_best_params)
xgb=xgb_best.fit(X_train_resampled, y_train_resampled)
joblib.dump(xgb, 'xgb_model.pkl')

y_pred = xgb_best.predict(X_test_resampled)
xgb_preds = xgb_best.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred)
recall = recall_score(y_test_resampled, y_pred)
f1 = f1_score(y_test_resampled, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Optional: Full classification report
print("\nClassification Report:\n", classification_report(y_test_resampled, y_pred))


Accuracy: 0.8536
Precision: 0.8106
Recall: 0.9227
F1-score: 0.8630

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.78      0.84     56155
           1       0.81      0.92      0.86     56155

    accuracy                           0.85    112310
   macro avg       0.86      0.85      0.85    112310
weighted avg       0.86      0.85      0.85    112310



In [10]:
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from optuna.pruners import HyperbandPruner

# Step 1: Create a fixed validation set (50K rows)
X_train_sample, y_train_sample = resample(X_train_resampled, y_train_resampled, n_samples=70000, random_state=42, stratify=y_train_resampled)

lgb_X_train, lgb_X_val, lgb_y_train, lgb_y_val = train_test_split(
    X_train_sample, y_train_sample, test_size=0.3, random_state=42, stratify=y_train_sample)


def objective(trial):
    param_grid = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "verbose":-1,
        "boosting_type": 'gbdt', 
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 400, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 200),
        "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-3, 10.0),
        "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-3, 10.0),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
    }
    
    train_data = lgb.Dataset(lgb_X_train, label=lgb_y_train)
    val_data = lgb.Dataset(lgb_X_val, label=lgb_y_val)
    
    model = lgb.train(
        param_grid,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=["train", "valid_0"],
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(0)],
    )
    
    preds = model.predict(lgb_X_val)
    return roc_auc_score(lgb_y_val, preds)
    

# Run Optuna optimization
study = optuna.create_study(direction="maximize", pruner=HyperbandPruner)
study.optimize(objective, n_trials=20)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Best parameters
print("Best ROC-AUC Score:", study.best_value)
print("Best Parameters:", study.best_params)


[I 2025-03-23 23:53:07,895] A new study created in memory with name: no-name-b25bc9e1-19e4-46f2-969d-4a29b4c7c0a2
[I 2025-03-23 23:53:16,059] Trial 0 finished with value: 0.9274200272108842 and parameters: {'learning_rate': 0.05584472170764126, 'num_leaves': 280, 'max_depth': 11, 'min_data_in_leaf': 28, 'lambda_l1': 0.004397542784986295, 'lambda_l2': 0.0013933404443800933, 'feature_fraction': 0.49653241086684463, 'bagging_fraction': 0.4571894076494749, 'bagging_freq': 3}. Best is trial 0 with value: 0.9274200272108842.
[I 2025-03-23 23:53:18,757] Trial 1 finished with value: 0.9101128752834468 and parameters: {'learning_rate': 0.14897308503788276, 'num_leaves': 40, 'max_depth': 3, 'min_data_in_leaf': 166, 'lambda_l1': 0.010760762934719358, 'lambda_l2': 0.0012378398795271862, 'feature_fraction': 0.9944615743321128, 'bagging_fraction': 0.42596074387253374, 'bagging_freq': 1}. Best is trial 0 with value: 0.9274200272108842.
[I 2025-03-23 23:53:23,973] Trial 2 finished with value: 0.898969

Best ROC-AUC Score: 0.9290236916099773
Best Parameters: {'learning_rate': 0.10472599203129926, 'num_leaves': 300, 'max_depth': 13, 'min_data_in_leaf': 34, 'lambda_l1': 1.5851860466735266, 'lambda_l2': 0.01831785872517951, 'feature_fraction': 0.4028746801190124, 'bagging_fraction': 0.5062031265497579, 'bagging_freq': 4}


In [11]:
# Train final LightGBM model with best parameters
lgb_best_params = study.best_params
lgb_best = lgb.LGBMClassifier(**lgb_best_params)
lgb_best.fit(X_train_resampled, y_train_resampled)
joblib.dump(lgb_best, 'lgbm_model.pkl')

# Predictions on test set
y_pred = lgb_best.predict(X_test_resampled)
lgb_preds = lgb_best.predict_proba(lgb_X_val)[:, 1]

# Evaluate results
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred)
recall = recall_score(y_test_resampled, y_pred)
f1 = f1_score(y_test_resampled, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_resampled, y_pred))


Accuracy: 0.8801
Precision: 0.8208
Recall: 0.9727
F1-score: 0.8903

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.79      0.87     56155
           1       0.82      0.97      0.89     56155

    accuracy                           0.88    112310
   macro avg       0.89      0.88      0.88    112310
weighted avg       0.89      0.88      0.88    112310



In [25]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split your dataset
X_train, X_val, y_train, y_val = train_test_split(X_train_sample, y_train_sample, 
                                                  test_size=0.2, stratify=y_train_sample, random_state=42)

# Base models 
rf = rf_best
xgb = xgb_best
lgbm = lgb_best

# Meta-learner
meta_learner = LogisticRegression(max_iter=1000, random_state=42)

# Define the stacking classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', rf),
        ('xgb', xgb),
        ('lgbm', lgbm)
    ],
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
    passthrough=True
)

# Fit the model
stacking_clf.fit(X_train, y_train)

# Predict
y_pred = stacking_clf.predict(X_val)

# Evaluate
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.93      0.81      0.87      7000
           1       0.84      0.94      0.88      7000

    accuracy                           0.88     14000
   macro avg       0.88      0.88      0.88     14000
weighted avg       0.88      0.88      0.88     14000

