# Loan Risk Model Training
This notebook covers the end-to-end pipeline for loan default prediction: data loading, feature engineering, preprocessing, training an XGBoost classifier with previously tuned hyperparameters, evaluation on a held-out test set, and artifact export.

In [None]:
import pandas as pd
import numpy as np
import joblib
import shap
import optuna
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay
import xgboost as xgb

# Silence every warning category for the rest of the session and apply the
# seaborn theme used by the charts in this notebook.
warnings.simplefilter('ignore')
sns.set_theme(palette="muted", style="whitegrid")
print("Environment initialized.")

### Load Dataset
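We load only the first 200,000 rows of the LendingClub accepted-loans file to keep load time and memory usage manageable. If that file is not present, a smaller local sample file is read in full instead.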

In [None]:
# Prefer the full accepted-loans export (capped at 200,000 rows); fall back to
# the local sample file when the main export is not on disk. A missing sample
# file is not handled and raises FileNotFoundError.
try:
    raw_data = pd.read_csv('../data/lending_club_accepted.csv', nrows=200000, low_memory=False)
except FileNotFoundError:
    print("Source data missing. Looking for local sample...")
    # low_memory=False here as well, so column dtypes are inferred from the
    # whole file exactly as on the main path (chunked inference can yield
    # mixed-type object columns, and the resulting DtypeWarning is suppressed
    # by the notebook's global warning filter).
    raw_data = pd.read_csv('../data/lending_club_sample.csv', low_memory=False)
    print(f"Using sample with {len(raw_data)} rows.")
else:
    print(f"Loaded {len(raw_data)} rows from main dataset.")

### Feature Engineering
Converting raw loan data into predictive features.

In [None]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Filter to finished loans and derive the model's engineered features.

    Rows whose ``loan_status`` is not 'Fully Paid', 'Charged Off' or 'Default'
    are dropped. The input frame is not modified; a filtered copy is returned.

    Columns added:
        target: 1 for 'Charged Off' / 'Default', 0 for 'Fully Paid'.
        credit_history_length: years between ``earliest_cr_line`` and
            ``issue_d`` (days / 365.25); NaN when either date fails to parse.
        loan_to_income: ``loan_amnt / (annual_inc + 1)``.
        interest_to_income: ``installment * 12 / (annual_inc + 1)``.
        utilization_efficiency: ``revol_util / (open_acc + 1)``.

    Columns rewritten:
        emp_length: first integer found in the text as a float, with
            '< 1 year' mapped to 0; missing values are filled with the median
            of the retained rows.

    Args:
        df: Raw loan records. Must contain ``loan_status``, ``emp_length``
            (text), ``earliest_cr_line``, ``issue_d``, ``loan_amnt``,
            ``annual_inc``, ``installment``, ``revol_util`` and ``open_acc``.
            The arithmetic assumes the amount/ratio columns are already
            numeric -- TODO confirm for ``revol_util``.

    Returns:
        The filtered frame with the columns above added and
        ``earliest_cr_line`` / ``issue_d`` removed.
    """
    default_statuses = ['Charged Off', 'Default']
    df = df[df['loan_status'].isin(['Fully Paid', *default_statuses])].copy()
    df['target'] = df['loan_status'].isin(default_statuses).astype(int)

    # Clean employment history
    raw_emp = df['emp_length']
    # expand=False yields a Series rather than a one-column DataFrame.
    emp_years = raw_emp.str.extract(r'(\d+)', expand=False).astype(float)
    # '< 1 year' would otherwise parse as 1 and be indistinguishable from
    # '1 year'; treat it as zero full years of employment.
    emp_years = emp_years.mask(raw_emp.str.contains('<', regex=False, na=False), 0.0)
    # NOTE(review): the median is taken over every retained row, i.e. before
    # any train/test split the caller performs afterwards.
    df['emp_length'] = emp_years.fillna(emp_years.median())

    # Time-based features
    df['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'], errors='coerce')
    df['issue_d'] = pd.to_datetime(df['issue_d'], errors='coerce')
    df['credit_history_length'] = (df['issue_d'] - df['earliest_cr_line']).dt.days / 365.25

    # Financial ratios (+1 in each denominator avoids division by zero)
    df['loan_to_income'] = df['loan_amnt'] / (df['annual_inc'] + 1)
    df['interest_to_income'] = (df['installment'] * 12) / (df['annual_inc'] + 1)
    df['utilization_efficiency'] = df['revol_util'] / (df['open_acc'] + 1)

    return df.drop(columns=['earliest_cr_line', 'issue_d'])

# Apply the feature-engineering step to the loaded frame.
df_clean = raw_data.pipe(engineer_features)
print("Feature engineering complete.")

### Preprocessing Pipeline

In [None]:
# Model inputs: raw loan columns plus the engineered ratio/duration columns
# (credit_history_length, loan_to_income, interest_to_income,
# utilization_efficiency).
features = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_length',
    'home_ownership', 'annual_inc', 'verification_status', 'purpose', 'dti', 'open_acc',
    'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status',
    'application_type', 'mort_acc', 'pub_rec_bankruptcies', 'credit_history_length',
    'loan_to_income', 'interest_to_income', 'utilization_efficiency'
]

X = df_clean[features]
y = df_clean['target']

# Split the columns by dtype. Every non-numeric column is treated as
# categorical (rather than only `object`-typed ones), because ColumnTransformer
# drops any column not assigned to a transformer (remainder='drop' by default);
# string- or category-typed features would otherwise vanish silently.
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = [col for col in X.columns if col not in numeric_cols]

# Scale numeric columns; one-hot encode categorical ones, encoding categories
# unseen during fit as all zeros.
processor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
])

# 80/20 hold-out split, stratified on the target so both parts keep the same
# default rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

### Final Model Training
Training the optimized XGBoost classifier.

In [None]:
# Fixed hyperparameters for the final XGBoost classifier.
params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=3,  # Adjusting for class imbalance
    random_state=42,
    eval_metric='auc',
)

# Chain preprocessing and the classifier so both are fitted on the training
# split only and travel together as one estimator.
classifier = xgb.XGBClassifier(**params)
final_pipe = Pipeline(steps=[('preprocessor', processor), ('classifier', classifier)])
final_pipe.fit(X_train, y_train)
print("Model training complete.")

### Performance Metrics
Evaluating the model on the held-out test set: per-class precision and recall, overall accuracy, and ROC-AUC.

In [None]:
# Hard class labels, and the second predict_proba column (probability of
# class 1), for the held-out test split.
y_pred = final_pipe.predict(X_test)
y_proba = final_pipe.predict_proba(X_test)[:, 1]

report_text = classification_report(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print("--- Classification Report ---")
print(report_text)

print(f"Overall Accuracy: {test_accuracy:.4f}")
print(f"ROC-AUC Score: {test_auc:.4f}")

### Visual Evaluation

In [None]:
# One figure, two side-by-side panels: ROC curve (left), confusion matrix (right).
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
roc_ax, cm_ax = ax

# Left panel: ROC curve of the fitted pipeline, with the diagonal as the
# chance-level reference line.
RocCurveDisplay.from_estimator(final_pipe, X_test, y_test, ax=roc_ax, color='#da7756')
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
roc_ax.set_title("Receiver Operating Characteristic (ROC)")

# Right panel: confusion matrix of the predicted labels as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges', ax=cm_ax)
cm_ax.set(title="Confusion Matrix", xlabel="Predicted Label", ylabel="True Label")

plt.tight_layout()
plt.show()

### Feature Importance
Identifying the top predictors of default.

In [None]:
# Pair the transformed column names with the classifier's importance scores.
fitted_steps = final_pipe.named_steps
feature_names = fitted_steps['preprocessor'].get_feature_names_out()
importances = fitted_steps['classifier'].feature_importances_
# argsort is ascending, so the last 15 positions are the largest importances.
indices = np.argsort(importances)[-15:]

bar_positions = range(len(indices))
# Keep only the text after the last '__' (drops the 'num__' / 'cat__'
# transformer prefix) so the tick labels stay short.
bar_labels = [name.split('__')[-1] for name in feature_names[indices]]

plt.figure(figsize=(10, 8))
plt.barh(bar_positions, importances[indices], align='center', color='#da7756')
plt.yticks(bar_positions, bar_labels)
plt.title('Top 15 Feature Importances')
plt.xlabel('Relative Importance')
plt.show()

### Export Artifacts

In [None]:
from pathlib import Path

# Persist the fitted pipeline and the transformed feature names.
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(final_pipe, '../models/best_model.joblib')
joblib.dump(feature_names, '../models/processed_feature_names.joblib')
print("Artifacts exported to /models/")