# Credit Risk Assessment - XGBoost Model Training

This notebook demonstrates how to train and evaluate an XGBoost model for credit risk prediction using real lending club data.

**Dataset**: Lending Club loan data  
**Target**: Binary classification (0 = No default, 1 = Default)

## 0. Installation of Packages

In [1]:
import sys
!{sys.executable} -m pip install xgboost scikit-learn pandas numpy matplotlib seaborn scipy tqdm joblib
print("âœ“ Packages installed successfully!")
print("You can now run the rest of the notebook.")

Defaulting to user installation because normal site-packages is not writeable
âœ“ Packages installed successfully!
You can now run the rest of the notebook.


## 1. Setup and Imports

In [2]:
import sys
from pathlib import Path

# Add project root to path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from src.config import RAW_DATA_DIR, DATASET_CONFIG, MODELS_DIR, XGBOOST_PARAMS
from src.data_loader import load_data
from src.preprocessor import DataPreprocessor
from src.feature_engineer import FeatureEngineer
from src.models.xgboost_model import XGBoostModel
from src.evaluation import ModelEvaluator
from src.utils import setup_logging, set_seed

# Setup
setup_logging()
set_seed(42)

print("âœ“ Imports successful!")

âœ“ Imports successful!


## 2. Load Data

Load training and test datasets. You can adjust `nrows` to control sample size.

In [3]:
# Load training data
print("Loading training data...")
train_path = RAW_DATA_DIR / DATASET_CONFIG['train_dataset']
train_df = load_data(train_path, optimize=True, nrows=50000)  # Adjust nrows as needed

print(f"\nLoaded {len(train_df):,} rows from {DATASET_CONFIG['train_dataset']}")
print(f"Features: {train_df.shape[1]}")

# Show target distribution
target_col = DATASET_CONFIG['target_column']
print(f"\nTarget distribution:")
print(train_df[target_col].value_counts())

# Show first few rows
train_df.head()

2025-11-27 23:43:32,672 - credit_risk_fyp.data_loader - INFO - Loading dataset from: c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\data\raw\lending_club_train.csv


Loading training data...

File size: 0.21 GB
Loaded 50,000 rows

Optimizing data types...
Memory usage before optimization: 76.06 MB
Memory usage after optimization: 60.75 MB
Memory decreased by 20.1%

Memory usage: 60.75 MB


2025-11-27 23:43:33,904 - credit_risk_fyp.data_loader - INFO - Successfully loaded 50,000 rows and 103 columns



Top 10 memory-consuming columns:
  desc                          :     3.63 MB
  title                         :     3.17 MB
  purpose                       :     3.05 MB
  emp_title                     :     3.02 MB
  term                          :     2.81 MB
  application_type              :     2.81 MB
  earliest_cr_line              :     2.72 MB
  emp_length                    :     2.65 MB
  home_ownership                :     2.62 MB
  zip_code                      :     2.57 MB

Loaded 50,000 rows from lending_club_train.csv
Features: 103

Target distribution:
default
0    40037
1     9963
Name: count, dtype: int64


Unnamed: 0,id,default,loan_amnt,term,emp_title,emp_length,home_ownership,annual_inc,pymnt_plan,desc,...,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,disbursement_method
0,1,0,10000,36 months,pharmacist,6 years,MORTGAGE,130000.0,n,,...,,,,,,,,,,Cash
1,2,0,20000,36 months,TIGI Linea - Unilever,5 years,MORTGAGE,51769.859375,n,Borrower added on 09/04/12 > To consolidate ...,...,,,,,,,,,,Cash
2,3,1,28000,60 months,Manager of Program Management,2 years,RENT,136000.0,n,,...,,,,,,,,,,Cash
3,4,1,32875,60 months,Oracle Corporation,< 1 year,OWN,106000.0,n,,...,,,,,,,,,,Cash
4,5,1,10000,36 months,Enrichment Coordinator,3 years,RENT,123000.0,n,,...,,,,,,,,,,Cash


In [4]:
# Load test data
print("Loading test data...")
test_path = RAW_DATA_DIR / DATASET_CONFIG['test_dataset']
test_df = load_data(test_path, optimize=True, nrows=10000)  # Adjust nrows as needed

print(f"\nLoaded {len(test_df):,} rows from {DATASET_CONFIG['test_dataset']}")
print(f"Features: {test_df.shape[1]}")

# Show target distribution
print(f"\nTarget distribution:")
print(test_df[target_col].value_counts())

2025-11-27 23:43:33,947 - credit_risk_fyp.data_loader - INFO - Loading dataset from: c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\data\raw\lending_club_test.csv


Loading test data...

File size: 0.06 GB
Loaded 10,000 rows

Optimizing data types...


2025-11-27 23:43:34,248 - credit_risk_fyp.data_loader - INFO - Successfully loaded 10,000 rows and 103 columns


Memory usage before optimization: 15.21 MB
Memory usage after optimization: 12.13 MB
Memory decreased by 20.2%

Memory usage: 12.13 MB

Top 10 memory-consuming columns:
  desc                          :     0.73 MB
  title                         :     0.63 MB
  purpose                       :     0.61 MB
  emp_title                     :     0.60 MB
  term                          :     0.56 MB
  application_type              :     0.56 MB
  earliest_cr_line              :     0.54 MB
  emp_length                    :     0.53 MB
  home_ownership                :     0.52 MB
  zip_code                      :     0.51 MB

Loaded 10,000 rows from lending_club_test.csv
Features: 103

Target distribution:
Series([], Name: count, dtype: int64)


## 3. Prepare Data Splits

In [5]:
id_col = DATASET_CONFIG['id_column']

# Training data: Split into train and validation
X_train_full = train_df.drop(columns=[target_col, id_col])
y_train_full = train_df[target_col]

# Split training data: 85% train, 15% validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, 
    test_size=0.15, 
    random_state=42, 
    stratify=y_train_full
)

# Test data
X_test = test_df.drop(columns=[target_col, id_col])
y_test = test_df[target_col]

print(f"Train set: {len(X_train):,} samples")
print(f"Validation set: {len(X_val):,} samples")
print(f"Test set: {len(X_test):,} samples")
print(f"\nTrain target distribution: {y_train.value_counts().to_dict()}")
print(f"Test target distribution: {y_test.value_counts().to_dict()}")

Train set: 42,500 samples
Validation set: 7,500 samples
Test set: 10,000 samples

Train target distribution: {0: 34031, 1: 8469}
Test target distribution: {}


## 4. Preprocessing

In [6]:
print("Preprocessing data...")
preprocessor = DataPreprocessor()

# Combine X_train and y_train for preprocessing
train_df_combined = X_train.copy()
train_df_combined[target_col] = y_train

# Fit on training data
X_train_processed, _ = preprocessor.fit_transform(train_df_combined)
X_val_processed, _ = preprocessor.transform(X_val)
X_test_processed, _ = preprocessor.transform(X_test)

print(f"\nFeatures after preprocessing: {X_train_processed.shape[1]}")
print(f"Train shape: {X_train_processed.shape}")
print(f"Val shape: {X_val_processed.shape}")
print(f"Test shape: {X_test_processed.shape}")

# Save preprocessor
preprocessor_path = MODELS_DIR / 'preprocessor.pkl'
preprocessor.save(preprocessor_path)
print(f"\nâœ“ Preprocessor saved to {preprocessor_path}")

Preprocessing data...


2025-11-27 23:43:34,398 - credit_risk_fyp.preprocessor - INFO - Fitting preprocessor on training data...
2025-11-27 23:43:34,408 - credit_risk_fyp.preprocessor - INFO - Creating binary target variable...
2025-11-27 23:43:34,433 - credit_risk_fyp.preprocessor - INFO - Removed 0 rows with uncertain/excluded status
2025-11-27 23:43:34,436 - credit_risk_fyp.preprocessor - INFO - Target distribution:
  Non-default (0): 34,031
  Default (1): 8,469
2025-11-27 23:43:34,486 - credit_risk_fyp.preprocessor - INFO - Identified 36 columns with >50.0% missing
2025-11-27 23:43:34,505 - credit_risk_fyp.preprocessor - INFO - Identified 51 numerical and 14 categorical features
2025-11-27 23:43:34,506 - credit_risk_fyp.preprocessor - INFO - Fitting imputers for missing values...
2025-11-27 23:43:34,910 - credit_risk_fyp.preprocessor - INFO - Fitting encoders for 14 categorical features...
2025-11-27 23:43:35,180 - credit_risk_fyp.preprocessor - INFO - Fitting scaler for numerical features...
2025-11-27 2


Features after preprocessing: 65
Train shape: (42500, 65)
Val shape: (7500, 65)
Test shape: (10000, 65)
âœ“ Saved object to c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\models\preprocessor.pkl

âœ“ Preprocessor saved to c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\models\preprocessor.pkl


## 5. Feature Engineering (Optional)

In [7]:
print("Engineering features...")
feature_engineer = FeatureEngineer()

# Fit on training data
X_train_final = feature_engineer.fit_transform(X_train_processed)
X_val_final = feature_engineer.transform(X_val_processed)
X_test_final = feature_engineer.transform(X_test_processed)

print(f"\nFeatures after engineering: {X_train_final.shape[1]}")
print(f"Train shape: {X_train_final.shape}")
print(f"Val shape: {X_val_final.shape}")
print(f"Test shape: {X_test_final.shape}")

# Save feature engineer
fe_path = MODELS_DIR / 'feature_engineer.pkl'
feature_engineer.save(fe_path)
print(f"\nâœ“ Feature engineer saved to {fe_path}")

2025-11-27 23:43:36,280 - credit_risk_fyp.feature_engineer - INFO - Fitting feature engineer...
2025-11-27 23:43:36,284 - credit_risk_fyp.feature_engineer - INFO - Feature engineer fitting complete
2025-11-27 23:43:36,285 - credit_risk_fyp.feature_engineer - INFO - Engineering features...
2025-11-27 23:43:36,307 - credit_risk_fyp.feature_engineer - INFO - Creating ratio features...
2025-11-27 23:43:36,312 - credit_risk_fyp.feature_engineer - INFO - Creating time-based features...
2025-11-27 23:43:36,327 - credit_risk_fyp.feature_engineer - INFO - Creating credit behavior features...
2025-11-27 23:43:36,332 - credit_risk_fyp.feature_engineer - INFO - Creating interaction features...
2025-11-27 23:43:36,334 - credit_risk_fyp.feature_engineer - INFO - Creating aggregation features...
2025-11-27 23:43:36,348 - credit_risk_fyp.feature_engineer - INFO - Creating binned features...
2025-11-27 23:43:36,361 - credit_risk_fyp.feature_engineer - INFO - Created 20 new features (total: 85)
2025-11-

Engineering features...

Features after engineering: 85
Train shape: (42500, 85)
Val shape: (7500, 85)
Test shape: (10000, 85)
âœ“ Saved object to c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\models\feature_engineer.pkl

âœ“ Feature engineer saved to c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\models\feature_engineer.pkl


## 6. Train XGBoost Model

You can modify the parameters below to experiment with different configurations.

In [8]:
# Option 1: Use default parameters from config
params = XGBOOST_PARAMS.copy()

# Option 2: Use custom parameters (uncomment to use)
# params = {
#     'objective': 'binary:logistic',
#     'eval_metric': 'auc',
#     'tree_method': 'gpu_hist',  # Use 'hist' for CPU
#     'max_depth': 8,
#     'learning_rate': 0.05,
#     'n_estimators': 500,
#     'subsample': 0.8,
#     'colsample_bytree': 0.8,
#     'random_state': 42
# }

print("Training XGBoost model...")
print(f"Parameters: {params}")
print("\nThis may take a few minutes...\n")

# Initialize and train
xgb_model = XGBoostModel(params=params)
xgb_model.train(
    X_train_final, y_train,
    X_val_final, y_val,
    verbose=True
)

# Save model
model_path = MODELS_DIR / 'xgboost_model.pkl'
xgb_model.save_model(model_path)
print(f"\nâœ“ Model saved to {model_path}")

2025-11-27 23:43:36,440 - credit_risk_fyp.models.xgboost - INFO - Training XGBoost model...
2025-11-27 23:43:36,441 - credit_risk_fyp.models.xgboost - INFO - Training set size: 42,500 samples, 85 features
2025-11-27 23:43:36,446 - credit_risk_fyp.models.xgboost - INFO - Class distribution - 0: 34,031, 1: 8,469
2025-11-27 23:43:36,448 - credit_risk_fyp.models.xgboost - INFO - Scale pos weight: 4.0183


Training XGBoost model...
Parameters: {'objective': 'binary:logistic', 'eval_metric': 'auc', 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', 'max_depth': 8, 'learning_rate': 0.05, 'n_estimators': 1000, 'subsample': 0.8, 'colsample_bytree': 0.8, 'min_child_weight': 3, 'gamma': 0.1, 'reg_alpha': 0.1, 'reg_lambda': 1.0, 'scale_pos_weight': 5, 'random_state': 42, 'n_jobs': -1, 'early_stopping_rounds': 50, 'verbose_eval': 50}

This may take a few minutes...



2025-11-27 23:43:36,504 - credit_risk_fyp.models.xgboost - INFO - Validation set size: 7,500 samples


XGBoostError: Invalid Input: 'gpu_hist', valid values are: {'approx', 'auto', 'exact', 'hist'}

## 7. Make Predictions

In [None]:
print("Making predictions...")

# Predictions on validation set
y_val_proba = xgb_model.predict_proba(X_val_final)
y_val_pred = xgb_model.predict(X_val_final)

# Predictions on test set
y_test_proba = xgb_model.predict_proba(X_test_final)
y_test_pred = xgb_model.predict(X_test_final)

print(f"\nâœ“ Validation predictions: {len(y_val_proba):,}")
print(f"âœ“ Test predictions: {len(y_test_proba):,}")

# Show sample predictions
print("\nSample predictions (first 10):")
results_df = pd.DataFrame({
    'True_Label': y_test[:10].values,
    'Predicted_Probability': y_test_proba[:10],
    'Predicted_Label': y_test_pred[:10]
})
results_df

## 8. Evaluate Model

In [None]:
evaluator = ModelEvaluator()

# Evaluate on validation set
print("="*80)
print("VALIDATION SET RESULTS")
print("="*80)

val_metrics = evaluator.evaluate(
    y_val, y_val_proba,
    threshold=0.5,
    model_name="XGBoost_Validation"
)

# Display validation metrics
print("\nValidation Metrics:")
for metric, value in val_metrics.items():
    if isinstance(value, (int, float)) and metric != 'threshold':
        print(f"  {metric}: {value:.4f}" if isinstance(value, float) else f"  {metric}: {value}")

In [None]:
# Evaluate on test set
print("="*80)
print("TEST SET RESULTS")
print("="*80)

test_metrics = evaluator.evaluate(
    y_test, y_test_proba,
    threshold=0.5,
    model_name="XGBoost_Test"
)

# Display test metrics
print("\nTest Metrics:")
for metric, value in test_metrics.items():
    if isinstance(value, (int, float)) and metric != 'threshold':
        print(f"  {metric}: {value:.4f}" if isinstance(value, float) else f"  {metric}: {value}")

## 9. Visualizations

In [None]:
# Generate all evaluation plots for validation set
print("Generating validation plots...")
val_figures = evaluator.evaluate_all_plots(
    y_val, y_val_proba,
    model_name="XGBoost_Validation",
    threshold=0.5,
    save=True
)
print("âœ“ Validation plots saved")

In [None]:
# Generate all evaluation plots for test set
print("Generating test plots...")
test_figures = evaluator.evaluate_all_plots(
    y_test, y_test_proba,
    model_name="XGBoost_Test",
    threshold=0.5,
    save=True
)
print("âœ“ Test plots saved")

## 10. Feature Importance

In [None]:
# Get feature importance
feature_importance = xgb_model.get_feature_importance(importance_type='gain')

print("Top 20 Most Important Features:")
print("="*80)
feature_importance.head(20)

In [None]:
# Plot feature importance
xgb_model.plot_feature_importance(
    top_n=20, 
    importance_type='gain',
    save_path=evaluator.figures_dir / 'xgboost_feature_importance.png'
)

## 11. Threshold Optimization

In [None]:
# Find optimal thresholds for different metrics
print("Threshold Optimization (on validation set):")
print("="*80)

for metric in ['f1', 'precision', 'recall']:
    optimal_threshold, optimal_score = evaluator.optimize_threshold(
        y_val, y_val_proba, metric=metric
    )
    print(f"{metric.upper():<15} Threshold: {optimal_threshold:.4f}, Score: {optimal_score:.4f}")

## 12. Generate Comparison Report

In [None]:
# Create comparison report
results_dict = {
    'XGBoost_Validation': val_metrics,
    'XGBoost_Test': test_metrics
}

report_df = evaluator.generate_report(
    results_dict,
    output_path=evaluator.reports_dir / 'xgboost_evaluation_report.csv'
)

print("\nEvaluation Report:")
report_df[['roc_auc', 'accuracy', 'precision', 'recall', 'f1_score', 'n_samples', 'n_positive', 'n_negative']]

## 13. Summary

In [None]:
print("="*80)
print("TRAINING COMPLETE!")
print("="*80)

print(f"\nðŸ“Š DATASETS USED:")
print(f"  Training data: {DATASET_CONFIG['train_dataset']} ({len(train_df):,} rows)")
print(f"  Test data: {DATASET_CONFIG['test_dataset']} ({len(test_df):,} rows)")
print(f"  Train samples: {len(X_train):,}")
print(f"  Validation samples: {len(X_val):,}")
print(f"  Test samples: {len(X_test):,}")

print(f"\nâœ“ Model saved to: {model_path}")
print(f"âœ“ Figures saved to: {evaluator.figures_dir}")
print(f"âœ“ Reports saved to: {evaluator.reports_dir}")

print(f"\nðŸ“ˆ PERFORMANCE SUMMARY:")
print(f"  Validation AUC: {val_metrics['roc_auc']:.4f}")
print(f"  Test AUC: {test_metrics['roc_auc']:.4f}")
print(f"  Test Accuracy: {test_metrics['accuracy']:.4f}")
print(f"  Test F1-Score: {test_metrics['f1_score']:.4f}")
print("="*80)