# Credit Risk Model Comparison

This notebook compares all trained models and identifies the best performer based on:
1. **TPR (True Positive Rate / Recall)** - The share of actual defaults the model catches (higher is better)
2. **FPR (False Positive Rate)** - The share of good loans the model incorrectly flags as risky (lower is better)
3. **Other metrics** - AUC-ROC, Precision, F1-Score

## Step 1: Setup and Load Results

In [1]:
import sys
from pathlib import Path
import pickle
import pandas as pd
import warnings
# Suppress every warning category from this point on so the cell outputs stay compact.
# NOTE(review): this also hides deprecation/convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

# Setup paths
# The project root is taken to be two directory levels above the current working
# directory, so the result depends on where the kernel was started.
project_root = Path.cwd().parent.parent
# Put <project_root>/credit_risk_fyp at the front of the import search path;
# presumably this is what makes the `src` package below importable — confirm layout.
sys.path.insert(0, str(project_root / 'credit_risk_fyp'))

# RESULTS_DIR is the directory later cells read the metric pickles from and
# write the comparison CSV to.
from src.config import RESULTS_DIR

print("Loading model results...\n")

Loading model results...



In [2]:
# Function to safely load model results
def load_model_results(model_name, file_name):
    """Load one model's pickled results from RESULTS_DIR.

    Args:
        model_name: Display name used in the printed status message.
        file_name: Name of the pickle file inside RESULTS_DIR.

    Returns:
        The unpickled object, or None when the file is missing, empty, or not
        a readable pickle, so the caller can filter out unavailable models.
    """
    try:
        with open(RESULTS_DIR / file_name, 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code. Only point this at
            # result files written by this project's own training runs.
            data = pickle.load(f)
    except FileNotFoundError:
        print(f"‚úó {model_name} not found - please train this model first")
        return None
    except (pickle.UnpicklingError, EOFError) as exc:
        # An empty or truncated file (e.g. from an interrupted training run) is
        # reported and skipped like a missing one instead of aborting the whole
        # comparison. Other errors still propagate so real problems stay visible.
        print(f"‚úó {model_name} results file is unreadable ({type(exc).__name__}) - please retrain this model")
        return None

    print(f"‚úì {model_name} loaded")
    return data

# Load all models (base + ensemble)
# Display name -> metrics file; dict order fixes both the load order and the
# order of the status messages.
result_files = {
    'Logistic Regression': 'logistic_regression_metrics.pkl',
    'Random Forest': 'random_forest_metrics.pkl',
    'XGBoost': 'xgboost_metrics.pkl',
    'Neural Network': 'neural_network_metrics.pkl',
    'Stacking Ensemble': 'stacking_ensemble_metrics.pkl',
    'Weighted Ensemble': 'weighted_ensemble_metrics.pkl',
}
load_attempts = {
    display_name: load_model_results(display_name, pickle_name)
    for display_name, pickle_name in result_files.items()
}

# Filter out models that weren't loaded (the loader returns None for those)
models = {
    display_name: results
    for display_name, results in load_attempts.items()
    if results is not None
}

separator = '=' * 80
print(f"\n{separator}")
print(f"Successfully loaded {len(models)} models")
print(f"{separator}")

‚úì Logistic Regression loaded
‚úì Random Forest loaded
‚úó XGBoost not found - please train this model first
‚úì Neural Network loaded
‚úì Stacking Ensemble loaded
‚úì Weighted Ensemble loaded

Successfully loaded 5 models


## Step 2: Build Comparison Table

In [3]:
# Baseline results (from original analysis)
baseline = {
    'AUC-ROC': 0.7086,
    'Precision': 0.3425,
    'Recall (TPR)': 0.5223,
    'F1-Score': 0.4138,
    'FPR': 0.2000,
    'Threshold': 0.5000
}

# Placeholder for a metric that is absent from a results file.
# NaN is used instead of 0 because 0 looks like a genuine score and, for FPR
# (lower is better), would make that model win the idxmin() ranking later on.
# pandas skips NaN in idxmin()/idxmax() by default.
MISSING_METRIC = float('nan')

# Build comparison DataFrame: one dict per row, converted in a single call
rows = []

# Add baseline
rows.append({
    'Model': 'Baseline',
    **baseline
})

# Add trained models
for model_name, data in models.items():
    tm = data['test_metrics']

    # Handle different metric key names (base models vs ensemble models)
    auc = tm.get('roc_auc', tm.get('auc_roc', MISSING_METRIC))
    precision = tm.get('precision', MISSING_METRIC)
    recall = tm.get('recall', MISSING_METRIC)
    f1 = tm.get('f1_score', MISSING_METRIC)
    fpr = tm.get('false_positive_rate', tm.get('fpr', MISSING_METRIC))

    rows.append({
        'Model': model_name,
        'AUC-ROC': auc,
        'Precision': precision,
        'Recall (TPR)': recall,
        'F1-Score': f1,
        'FPR': fpr,
        # Falls back to 0.5 when no tuned threshold was stored with the results
        'Threshold': data.get('optimal_threshold', 0.5)
    })

comparison_df = pd.DataFrame(rows)

print("\n" + "="*80)
# The count is derived from what was actually loaded; a hard-coded number goes
# stale as soon as one results file is missing.
print(f"COMPLETE MODEL COMPARISON (BASELINE + {len(models)} TRAINED MODELS)")
print("="*80)
print(comparison_df.to_string(index=False))

# Save to CSV
csv_path = RESULTS_DIR / 'final_model_comparison.csv'
comparison_df.to_csv(csv_path, index=False)
print(f"\n‚úì Saved to: {csv_path}")


COMPLETE MODEL COMPARISON (ALL 6 MODELS)
              Model  AUC-ROC  Precision  Recall (TPR)  F1-Score      FPR  Threshold
           Baseline 0.708600   0.342500      0.522300  0.413800 0.200000   0.500000
Logistic Regression 0.636047   0.264237      0.643219  0.374591 0.445828   0.270530
      Random Forest 0.710572   0.344460      0.559179  0.426309 0.264897   0.414141
     Neural Network 0.720545   0.343169      0.606292  0.438271 0.288863   0.426963
  Stacking Ensemble 0.718666   0.319555      0.670664  0.432862 0.355481   0.212588
  Weighted Ensemble 0.721113   0.322776      0.673423  0.436388 0.351709   0.355688

‚úì Saved to: c:\Users\Faheem\Desktop\Umair FYP\FYP2025\credit_risk_fyp\results\final_model_comparison.csv


## Step 3: Identify Best Models by Metric

In [4]:
section_rule = "=" * 80
print("\n" + section_rule)
print("BEST MODELS BY METRIC")
print(section_rule)

# TPR: the largest recall means the most actual defaults are caught
best_tpr_idx = comparison_df['Recall (TPR)'].idxmax()
tpr_winner = comparison_df.loc[best_tpr_idx, 'Model']
tpr_value = comparison_df.loc[best_tpr_idx, 'Recall (TPR)']
print(f"\nüéØ BEST TPR (Catch Most Defaults):")
print(f"   Model: {tpr_winner}")
print(f"   TPR: {tpr_value:.4f} ({tpr_value*100:.2f}%)")
print(f"   ‚Üí Out of 100 defaults, catches {tpr_value*100:.1f}")

# FPR: the smallest value means the fewest good loans are flagged
best_fpr_idx = comparison_df['FPR'].idxmin()
fpr_winner = comparison_df.loc[best_fpr_idx, 'Model']
fpr_value = comparison_df.loc[best_fpr_idx, 'FPR']
print(f"\nüéØ BEST FPR (Fewest False Alarms):")
print(f"   Model: {fpr_winner}")
print(f"   FPR: {fpr_value:.4f} ({fpr_value*100:.2f}%)")
print(f"   ‚Üí Out of 100 good loans, incorrectly flags only {fpr_value*100:.1f}")

# AUC-ROC: the largest value means the best overall discrimination
best_auc_idx = comparison_df['AUC-ROC'].idxmax()
auc_winner = comparison_df.loc[best_auc_idx, 'Model']
auc_value = comparison_df.loc[best_auc_idx, 'AUC-ROC']
print(f"\nüéØ BEST AUC-ROC (Overall Performance):")
print(f"   Model: {auc_winner}")
print(f"   AUC-ROC: {auc_value:.4f}")

# F1-Score: the largest value means the best precision/recall balance
best_f1_idx = comparison_df['F1-Score'].idxmax()
f1_winner = comparison_df.loc[best_f1_idx, 'Model']
f1_value = comparison_df.loc[best_f1_idx, 'F1-Score']
print(f"\nüéØ BEST F1-Score (Balanced Performance):")
print(f"   Model: {f1_winner}")
print(f"   F1-Score: {f1_value:.4f}")

# Precision: the largest value means the fewest false positives among flagged loans
best_precision_idx = comparison_df['Precision'].idxmax()
precision_winner = comparison_df.loc[best_precision_idx, 'Model']
precision_value = comparison_df.loc[best_precision_idx, 'Precision']
print(f"\nüéØ BEST Precision (Most Accurate Predictions):")
print(f"   Model: {precision_winner}")
print(f"   Precision: {precision_value:.4f}")


BEST MODELS BY METRIC

üéØ BEST TPR (Catch Most Defaults):
   Model: Weighted Ensemble
   TPR: 0.6734 (67.34%)
   ‚Üí Out of 100 defaults, catches 67.3

üéØ BEST FPR (Fewest False Alarms):
   Model: Baseline
   FPR: 0.2000 (20.00%)
   ‚Üí Out of 100 good loans, incorrectly flags only 20.0

üéØ BEST AUC-ROC (Overall Performance):
   Model: Weighted Ensemble
   AUC-ROC: 0.7211

üéØ BEST F1-Score (Balanced Performance):
   Model: Neural Network
   F1-Score: 0.4383

üéØ BEST Precision (Most Accurate Predictions):
   Model: Random Forest
   Precision: 0.3445


## Step 4: TPR vs FPR Trade-off Analysis

In [5]:
tradeoff_rule = "=" * 80
print("\n" + tradeoff_rule)
print("TPR vs FPR TRADE-OFF ANALYSIS")
print(tradeoff_rule)
print("\nGoal: Maximize TPR (catch defaults) while minimizing FPR (false alarms)\n")

# Combined score per model: recall minus false positive rate (larger is better).
# The column is stored on comparison_df itself because the final cell reads it back.
tpr_column = comparison_df['Recall (TPR)']
fpr_column = comparison_df['FPR']
comparison_df['TPR-FPR Score'] = tpr_column - fpr_column

# Best score first
ranked = comparison_df.sort_values('TPR-FPR Score', ascending=False)

print(f"{'Rank':<6} {'Model':<25} {'TPR':<10} {'FPR':<10} {'TPR-FPR Score':<15}")
print("-" * 80)
for position, (_, entry) in enumerate(ranked.iterrows(), start=1):
    name = entry['Model']
    tpr = entry['Recall (TPR)']
    fpr_rate = entry['FPR']
    score = entry['TPR-FPR Score']
    print(f"{position:<6} {name:<25} {tpr:<10.4f} {fpr_rate:<10.4f} {score:<15.4f}")

# The top row of the ranking is the model with the widest TPR-FPR gap
best_tradeoff_model = ranked['Model'].iloc[0]
print(f"\nüèÜ BEST OVERALL (TPR-FPR Balance): {best_tradeoff_model}")


TPR vs FPR TRADE-OFF ANALYSIS

Goal: Maximize TPR (catch defaults) while minimizing FPR (false alarms)

Rank   Model                     TPR        FPR        TPR-FPR Score  
--------------------------------------------------------------------------------
1      Baseline                  0.5223     0.2000     0.3223         
2      Weighted Ensemble         0.6734     0.3517     0.3217         
3      Neural Network            0.6063     0.2889     0.3174         
4      Stacking Ensemble         0.6707     0.3555     0.3152         
5      Random Forest             0.5592     0.2649     0.2943         
6      Logistic Regression       0.6432     0.4458     0.1974         

üèÜ BEST OVERALL (TPR-FPR Balance): Baseline


## Step 5: Final Recommendation

In [6]:
report_rule = "=" * 80
print("\n" + report_rule)
print("FINAL RECOMMENDATIONS")
print(report_rule)

# Headline figures; the row count includes the baseline row
n_rows = len(comparison_df)
top_auc_model = comparison_df.loc[best_auc_idx, 'Model']
print("\nüìä SUMMARY:")
print(f"   ‚Ä¢ Models evaluated: {n_rows}")
print(f"   ‚Ä¢ Best overall (AUC-ROC): {top_auc_model}")
print(f"   ‚Ä¢ Best TPR-FPR balance: {best_tradeoff_model}")

print("\nüí° USE CASE RECOMMENDATIONS:")

# 1. Highest recall, shown together with the FPR it costs
high_tpr_model = comparison_df.loc[best_tpr_idx, 'Model']
high_tpr_recall = comparison_df.loc[best_tpr_idx, 'Recall (TPR)']
high_tpr_fpr = comparison_df.loc[best_tpr_idx, 'FPR']
print(f"\n   1Ô∏è‚É£  For CATCHING MOST DEFAULTS (High TPR):")
print(f"       ‚Üí Use: {high_tpr_model}")
print(f"       ‚Üí TPR: {high_tpr_recall:.2%} | FPR: {high_tpr_fpr:.2%}")

# 2. Lowest false positive rate, shown together with the recall it achieves
low_fpr_model = comparison_df.loc[best_fpr_idx, 'Model']
low_fpr_rate = comparison_df.loc[best_fpr_idx, 'FPR']
low_fpr_recall = comparison_df.loc[best_fpr_idx, 'Recall (TPR)']
print(f"\n   2Ô∏è‚É£  For MINIMIZING FALSE ALARMS (Low FPR):")
print(f"       ‚Üí Use: {low_fpr_model}")
print(f"       ‚Üí FPR: {low_fpr_rate:.2%} | TPR: {low_fpr_recall:.2%}")

# 3. Best F1, shown together with that model's AUC-ROC
balanced_model = comparison_df.loc[best_f1_idx, 'Model']
balanced_f1 = comparison_df.loc[best_f1_idx, 'F1-Score']
balanced_auc = comparison_df.loc[best_f1_idx, 'AUC-ROC']
print(f"\n   3Ô∏è‚É£  For BALANCED PERFORMANCE (Best F1):")
print(f"       ‚Üí Use: {balanced_model}")
print(f"       ‚Üí F1: {balanced_f1:.4f} | AUC-ROC: {balanced_auc:.4f}")

# 4. Top of the TPR-FPR ranking; its index label is taken from the sorted frame
print(f"\n   4Ô∏è‚É£  For OVERALL BEST (TPR-FPR Balance):")
print(f"       ‚Üí Use: {best_tradeoff_model}")
best_idx = ranked.index[0]
best_tradeoff_score = comparison_df.loc[best_idx, 'TPR-FPR Score']
print(f"       ‚Üí TPR-FPR Score: {best_tradeoff_score:.4f}")

print("\n" + report_rule)
print("‚úÖ ANALYSIS COMPLETE!")
print(report_rule)


FINAL RECOMMENDATIONS

üìä SUMMARY:
   ‚Ä¢ Models evaluated: 6
   ‚Ä¢ Best overall (AUC-ROC): Weighted Ensemble
   ‚Ä¢ Best TPR-FPR balance: Baseline

üí° USE CASE RECOMMENDATIONS:

   1Ô∏è‚É£  For CATCHING MOST DEFAULTS (High TPR):
       ‚Üí Use: Weighted Ensemble
       ‚Üí TPR: 67.34% | FPR: 35.17%

   2Ô∏è‚É£  For MINIMIZING FALSE ALARMS (Low FPR):
       ‚Üí Use: Baseline
       ‚Üí FPR: 20.00% | TPR: 52.23%

   3Ô∏è‚É£  For BALANCED PERFORMANCE (Best F1):
       ‚Üí Use: Neural Network
       ‚Üí F1: 0.4383 | AUC-ROC: 0.7205

   4Ô∏è‚É£  For OVERALL BEST (TPR-FPR Balance):
       ‚Üí Use: Baseline
       ‚Üí TPR-FPR Score: 0.3223

‚úÖ ANALYSIS COMPLETE!
