In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
import joblib
import shap
import optuna
import warnings
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

warnings.filterwarnings('ignore', category=FutureWarning)
print("--- Model Training & Optimization Script Initialized ---")

  from .autonotebook import tqdm as notebook_tqdm


--- Model Training & Optimization Script Initialized ---


In [2]:
# Cell 2: Load a Large, Representative Sample
try:
    file_path = '../data/lending_club_accepted.csv'
    n_rows_to_sample = 300000  # Increased sample size for accuracy
    print(f"Loading a sample of {n_rows_to_sample} rows...")
    df = pd.read_csv(file_path, nrows=n_rows_to_sample, low_memory=False)
    print("Sample loaded successfully!")
except FileNotFoundError:
    print("ERROR: Data file not found. Please place 'lending_club_accepted.csv' in the 'data' folder.")
    df = None

Loading a sample of 300000 rows...
Sample loaded successfully!


In [3]:
# Cell 3: Advanced Preprocessing & Feature Engineering
if df is not None:
    df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off', 'Default'])]
    df['is_default'] = df['loan_status'].apply(lambda x: 1 if x in ['Charged Off', 'Default'] else 0)
    
    df['emp_length'].replace({'< 1 year': '0 years', '10+ years': '10 years'}, inplace=True)
    df['emp_length'] = df['emp_length'].str.replace(r'\D', '', regex=True).astype(float)
    df['emp_length'].fillna(df['emp_length'].median(), inplace=True)
    
    df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], errors='coerce')
    df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')
    df['credit_history_length'] = (df['issue_d'] - df['earliest_cr_line']).dt.days / 365.25
    df.drop(columns=['earliest_cr_line', 'issue_d'], inplace=True)
    
    # --- Feature Engineering ---
    print("Performing Advanced Feature Engineering...")
    df['loan_to_income_ratio'] = df['loan_amnt'] / (df['annual_inc'] + 1) # Add 1 to avoid division by zero
    df['interest_to_income_ratio'] = (df['installment'] * 12) / (df['annual_inc'] + 1)
    df['revol_util_to_open_acc'] = df['revol_util'] / (df['open_acc'] + 1)
    print("Advanced Feature Engineering complete.")

  df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], errors='coerce')
  df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')


Performing Advanced Feature Engineering...
Advanced Feature Engineering complete.


In [4]:
# Cell 4: Feature Selection
if df is not None:
    features_to_use = [
        'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_length',
        'home_ownership', 'annual_inc', 'verification_status', 'purpose', 'dti', 'open_acc',
        'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status',
        'application_type', 'mort_acc', 'pub_rec_bankruptcies', 'credit_history_length',
        'loan_to_income_ratio', 'interest_to_income_ratio', 'revol_util_to_open_acc' # New features
    ]
    target = 'is_default'
    df_model = df[features_to_use + [target]].copy().dropna()
    X = df_model[features_to_use]
    y = df_model[target]
    print("Features selected.")

Features selected.


In [5]:
# Cell 5: Create Preprocessing Pipeline & Split Data
if df is not None:
    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
        ], remainder='passthrough')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print("Preprocessing pipeline created and data split.")

Preprocessing pipeline created and data split.


In [6]:
# Cell 6: High-Efficiency Model Bake-Off with StratifiedKFold
if df is not None:
    print("\n--- Model Bake-Off: Comparing Performance with Stratified 3-Fold CV ---")
    
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1),
        'LightGBM': lgb.LGBMClassifier(class_weight='balanced', random_state=42),
        'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
    
    # Use StratifiedKFold for robust evaluation on imbalanced data
    cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    
    for name, model in models.items():
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
        
        # This is inherently parallel if n_jobs is set in the model
        scores = cross_val_score(pipeline, X_train, y_train, cv=cv_strategy, scoring='roc_auc', n_jobs=-1)
        print(f"{name} - Mean CV ROC AUC: {np.mean(scores):.4f} (Std: {np.std(scores):.4f})")
    
    # For this example, we will proceed to optimize XGBoost as it's typically the top performer.
    best_model_name = 'XGBoost'
    print(f"\n Proceeding to optimize champion candidate: {best_model_name}")


--- Model Bake-Off: Comparing Performance with Stratified 3-Fold CV ---
RandomForest - Mean CV ROC AUC: 0.7215 (Std: 0.0012)
LightGBM - Mean CV ROC AUC: 0.7372 (Std: 0.0018)
XGBoost - Mean CV ROC AUC: 0.7287 (Std: 0.0015)

 Proceeding to optimize champion candidate: XGBoost


In [7]:
# Cell 7: Advanced Hyperparameter Tuning of the Champion Model
if df is not None:
    print(f"\n--- Starting Advanced Hyperparameter Optimization for {best_model_name} ---")
    
    def objective(trial):
        params = {
            'objective': 'binary:logistic', 'eval_metric': 'auc', 'use_label_encoder': False, 'random_state': 42,
            'n_estimators': trial.suggest_int('n_estimators', 500, 2000, step=100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'max_depth': trial.suggest_int('max_depth', 5, 12),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 10.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 15),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 15)
        }
        model = xgb.XGBClassifier(**params)
        pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
        pipeline.fit(X_train, y_train)
        return roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=40) # A more extensive search
    print("Optimization Finished.")
    print(f"Best trial ROC AUC: {study.best_value:.4f}")
    print("Best hyperparameters found:", study.best_params)
    
    final_params = study.best_params
    final_model = xgb.XGBClassifier(**final_params, random_state=42, use_label_encoder=False, eval_metric='logloss')

[I 2025-07-07 00:33:07,841] A new study created in memory with name: no-name-c476b5ad-64cb-49b0-b473-32a0aec3e4e5



--- Starting Advanced Hyperparameter Optimization for XGBoost ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-07 00:33:22,288] Trial 0 finished with value: 0.728433310231672 and parameters: {'n_estimators': 500, 'learning_rate': 0.07482006318810987, 'max_depth': 9, 'subsample': 0.8067432385325863, 'colsample_bytree': 0.8525996795861523, 'gamma': 1.2980904736099084e-07, 'min_child_weight': 12, 'scale_pos_weight': 2.865668784817782}. Best is trial 0 with value: 0.728433310231672.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-07-07 00:33:46,016] Trial 1 finished with value: 0.7287852583309601 and parameters: {'n_estimators': 1200, 'learning_rate': 0.09247810742054613, 'max_depth': 5, 'subsample': 0.8441374931913039, 'colsample_bytree': 0.9496099122822859, 'gamma': 2.0178948337706793e-08, 'min_child_weight': 1, 'scale_pos_weight': 5.6483080042070934}. Best is trial 1 with value: 0.7287852583309601.
Parameters: { "use_label_encoder" } are

Optimization Finished.
Best trial ROC AUC: 0.7421
Best hyperparameters found: {'n_estimators': 1000, 'learning_rate': 0.016311976803135272, 'max_depth': 5, 'subsample': 0.9051643549876769, 'colsample_bytree': 0.8357097839119043, 'gamma': 0.0007502146362241052, 'min_child_weight': 5, 'scale_pos_weight': 1.1331208163138011}


In [8]:
# Cell 8: Retrain and Save the Final Champion Model and Artifacts
if df is not None:
    print(f"\n--- Retraining and Saving Final Model ({best_model_name}) ---")
    
    final_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', final_model)])
    final_pipeline.fit(X_train, y_train)

    print("\n--- Final Model Performance on Unseen Test Set ---")
    y_pred_proba = final_pipeline.predict_proba(X_test)[:, 1]
    y_pred = final_pipeline.predict(X_test)
    final_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Final Test Set ROC AUC: {final_auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    joblib.dump(final_pipeline, '../models/best_model.joblib')
    print("✅ Final model pipeline saved.")
    
    X_train_processed = final_pipeline.named_steps['preprocessor'].transform(X_train)
    model_for_shap = final_pipeline.named_steps['classifier']
    explainer = shap.TreeExplainer(model_for_shap, X_train_processed, feature_perturbation="interventional")
    
    joblib.dump(explainer, '../models/shap_explainer.joblib')
    processed_feature_names = final_pipeline.named_steps['preprocessor'].get_feature_names_out()
    joblib.dump(processed_feature_names, '../models/processed_feature_names.joblib')
    print("✅ SHAP explainer and feature names for  model saved.")
    print("\n--- SCRIPT COMPLETE ---")


--- Retraining and Saving Final Model (XGBoost) ---


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Final Model Performance on Unseen Test Set ---
Final Test Set ROC AUC: 0.7421

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.96      0.89     42439
           1       0.56      0.19      0.28     10695

    accuracy                           0.81     53134
   macro avg       0.69      0.58      0.59     53134
weighted avg       0.77      0.81      0.77     53134

✅ Final model pipeline saved.
✅ SHAP explainer and feature names for  model saved.

--- SCRIPT COMPLETE ---
