# Credit Risk Prediction - Model Comparison

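This notebook compares three classifiers (Logistic Regression, Random Forest and Gradient Boosting) for credit default prediction, ranking them by ROC AUC on a held-out 20% test split.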

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc, precision_recall_curve, classification_report
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

In [None]:
# Load data (path is relative to the notebook's working directory)
df = pd.read_csv('../data/features/credit_features.csv')
print(f'Dataset Shape: {df.shape}')

# Fail early with a clear message when the label is absent or incomplete,
# rather than deep inside the stratified split or a model fit.
if 'default' not in df.columns:
    raise KeyError("Target column 'default' not found in the feature table")
if df['default'].isna().any():
    raise ValueError("Target column 'default' contains missing values")

# Prepare features: every numeric column except customer_id and the label.
# The pandas dtype predicates are used instead of matching the literal names
# 'int64' / 'float64', so narrower or nullable numeric columns (int32,
# float32, Int64, ...) are no longer dropped silently. Boolean columns stay
# excluded, exactly as before.
exclude_cols = ['customer_id', 'default']
feature_cols = [
    col for col in df.columns
    if col not in exclude_cols
    and pd.api.types.is_numeric_dtype(df[col])
    and not pd.api.types.is_bool_dtype(df[col])
]
if not feature_cols:
    raise ValueError('No numeric feature columns found in the feature table')

# NOTE(review): missing feature values are imputed with 0 - confirm that 0 is
# a sensible stand-in for "unknown" for every feature.
# to_numpy(dtype=float) guarantees a plain float matrix even when nullable
# numeric columns are present.
X = df[feature_cols].fillna(0).to_numpy(dtype=float)
y = df['default'].values

# Split data: 80/20 hold-out, stratified on the label so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f'Training set: {len(X_train)}, Test set: {len(X_test)}')

## 1. Train Models

In [None]:
# Scale features for Logistic Regression. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
# NOTE(review): Gradient Boosting is the only model configured without
# class_weight='balanced' - confirm that is intentional.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
}

# Train and evaluate. For each model, `results` stores the fitted estimator,
# its test-set scores for the second class, the ROC curve points and the AUC.
results = {}

for name, model in models.items():
    print(f'Training {name}...')

    # Pick the inputs by estimator type, not by display name: matching on the
    # label would silently feed unscaled data to Logistic Regression if the
    # dictionary key were ever renamed. Tree-based models use the raw features.
    if isinstance(model, LogisticRegression):
        X_fit, X_eval = X_train_scaled, X_test_scaled
    else:
        X_fit, X_eval = X_train, X_test

    model.fit(X_fit, y_train)
    # Column 1 is the probability of the second entry in model.classes_
    # (presumably default == 1 - verify against the label encoding).
    y_pred_proba = model.predict_proba(X_eval)[:, 1]

    # Calculate metrics on the held-out test split
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    results[name] = {
        'model': model,
        'y_pred_proba': y_pred_proba,
        'fpr': fpr,
        'tpr': tpr,
        'auc': roc_auc
    }

    print(f'  AUC: {roc_auc:.4f}')

## 2. ROC Curve Comparison

In [None]:
# Overlay the test-set ROC curve of every model stored in `results`.
fig, ax = plt.subplots(figsize=(10, 8))

colors = ['blue', 'green', 'red', 'purple']
# Index the palette modulo its length instead of zipping against it: zip()
# stops at the shorter input, so a fifth model would silently go unplotted.
for idx, (name, result) in enumerate(results.items()):
    ax.plot(result['fpr'], result['tpr'], color=colors[idx % len(colors)], lw=2,
            label=f"{name} (AUC = {result['auc']:.3f})")

# Diagonal = chance-level classifier, drawn as a visual baseline.
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)  # small headroom so curves touching TPR = 1 stay visible
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve Comparison', fontsize=14)
ax.legend(loc='lower right', fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Model Comparison Summary

In [None]:
# Build the comparison table: one row per fitted model, ordered from the
# highest to the lowest test-set AUC. Gini is derived as 2 * AUC - 1.
summary_rows = [
    {'Model': model_name, 'AUC-ROC': entry['auc'], 'Gini': 2*entry['auc'] - 1}
    for model_name, entry in results.items()
]
comparison_df = pd.DataFrame(
    summary_rows, columns=['Model', 'AUC-ROC', 'Gini']
).sort_values('AUC-ROC', ascending=False)

print('Model Comparison:')
print('='*50)
# Bare last expression so the notebook renders the table richly.
comparison_df

## 4. Feature Importance (Best Model)

In [None]:
# Get best model: the first row of comparison_df, i.e. the highest test AUC.
# Any of the compared models can rank first, including Logistic Regression.
# NOTE(review): the winner is selected on test-set AUC, so that same AUC is an
# optimistic estimate of its performance; consider selecting via
# cross-validation on the training split instead.
best_model_name = comparison_df.iloc[0]['Model']
best_model = results[best_model_name]['model']

print(f'Best Model: {best_model_name}')

# Tree ensembles expose feature_importances_; linear models expose coef_.
# Handling both means this section still produces a chart when a linear model
# ranks first, instead of silently showing nothing.
if hasattr(best_model, 'feature_importances_'):
    importance = best_model.feature_importances_
    importance_label = 'Importance'
elif hasattr(best_model, 'coef_'):
    # Absolute coefficient, averaged over the rows of coef_ (a single row for
    # a binary target). In this notebook Logistic Regression is fitted on
    # standardised features, so the magnitudes are comparable across columns.
    importance = np.abs(np.atleast_2d(best_model.coef_)).mean(axis=0)
    importance_label = 'Mean |coefficient|'
else:
    importance = None
    importance_label = None

if importance is None:
    print(f'{best_model_name} does not expose feature importances or coefficients.')
else:
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': importance
    }).sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 8))
    top_15 = importance_df.head(15)
    ax.barh(top_15['feature'], top_15['importance'], color='steelblue')
    ax.set_xlabel(importance_label)
    ax.set_title(f'Top 15 Feature Importance ({best_model_name})', fontsize=14)
    ax.invert_yaxis()  # largest bar at the top
    fig.tight_layout()
    plt.show()

## 5. Conclusion

In [None]:
# Final summary: the name and headline metrics of the top-ranked model, read
# from the first row of the comparison table.
top_row = comparison_df.iloc[0]
summary_lines = [
    'Model Comparison Complete!',
    '='*50,
    f'Best Performing Model: {best_model_name}',
    f'AUC-ROC: {top_row["AUC-ROC"]:.4f}',
    f'Gini Coefficient: {top_row["Gini"]:.4f}',
    '\nRecommendation: Use the best model for production deployment.',
]
print('\n'.join(summary_lines))