# Credit Risk Modeling with Imbalanced Learning

Advanced credit risk modeling focusing on class imbalance handling.

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')

from data_science.imbalanced_learning import ImbalancedLearningHandler
from data_science.visualization import FinancialVisualizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

## Load and Prepare Credit Risk Dataset

In [None]:
# Simulate a credit-risk dataset in which defaults are the rare class.
np.random.seed(42)  # fixed seed: every draw below comes from the same global stream
n_samples = 5000

# Applicant features. The draw order matters for reproducibility because each
# call advances the shared random state.
data = dict(
    age=np.random.randint(18, 70, n_samples),
    income=np.random.lognormal(10.5, 0.6, n_samples),
    debt_ratio=np.random.beta(2, 5, n_samples),
    credit_history_length=np.random.randint(0, 30, n_samples),
    num_credit_lines=np.random.poisson(3, n_samples),
    credit_utilization=np.random.beta(2, 3, n_samples),
)
df = pd.DataFrame(data)

# Per-applicant default probability: a 1% base rate plus an additive uplift
# for each risk flag that is raised.
risk_uplifts = [
    (0.15, df['debt_ratio'] > 0.4),            # high debt ratio
    (0.10, df['credit_utilization'] > 0.8),    # high utilization
    (0.05, df['credit_history_length'] < 2),   # short history
    (0.08, df['income'] < 30000),              # low income
]
default_prob = 0.01
for uplift, flagged in risk_uplifts:
    # Accumulate left to right so the floating-point sums match term by term.
    default_prob = default_prob + uplift * flagged

# One Bernoulli draw per applicant decides the observed default label.
df['default'] = np.random.binomial(n=1, p=default_prob, size=n_samples)

print(f"Dataset shape: {df.shape}")
print(f"Default rate: {df['default'].mean():.2%}")
df.head()

## Imbalanced Learning Analysis

In [None]:
# Set up the project helpers; later cells reuse both `imb` and `viz`.
imb = ImbalancedLearningHandler()
viz = FinancialVisualizer()

# Ask the handler to summarise the target column, then list every entry it
# reports, one "name: value" pair per line.
imbalance_analysis = imb.analyze_imbalance(df['default'])
print("Class Imbalance Analysis:")
for metric, reading in imbalance_analysis.items():
    print(f"{metric}: {reading}")

# Chart the target's class distribution under a descriptive title.
class_fig = viz.plot_class_distribution(
    df['default'],
    "Credit Default Distribution",
)
class_fig.show()

## Compare Sampling Techniques

In [None]:
# Model inputs (every simulated column except the label) and the binary target.
features = [
    'age',
    'income',
    'debt_ratio',
    'credit_history_length',
    'num_credit_lines',
    'credit_utilization',
]
X, y = df[features], df['default']

# 70/30 split, stratified on the target so both parts keep the same default rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Let the handler evaluate each of its sampling strategies on this split.
comparison = imb.compare_sampling_methods(X_train, y_train, X_test, y_test)

# Flatten the per-method results into one table row per sampling method.
# NOTE(review): the report entry keyed '1' is presumably the default class - confirm.
summary_rows = []
for method_name, outcome in comparison.items():
    minority_report = outcome['classification_report']['1']
    summary_rows.append({
        'Method': method_name,
        'ROC-AUC': outcome['roc_auc'],
        'F1-Score': minority_report['f1-score'],
        'Precision': minority_report['precision'],
        'Recall': minority_report['recall'],
        'Training_Samples': outcome['training_samples'],
    })

# Explicit column list keeps the column order (and the columns themselves,
# even with no rows) fixed.
results_df = pd.DataFrame(
    summary_rows,
    columns=['Method', 'ROC-AUC', 'F1-Score', 'Precision', 'Recall', 'Training_Samples'],
)

print("Sampling Methods Comparison:")
print(results_df.round(3))

## Optimal Threshold Analysis

In [None]:
# Tune the decision threshold on held-out training data, then score the final
# SMOTE-trained random forest on the untouched test set.
#
# The threshold is chosen on a validation split rather than on the test set:
# selecting it on y_test and then reporting metrics on y_test would make the
# classification report below optimistically biased.
#
# NOTE(review): SMOTE is hard-coded here; it is not picked from `comparison`.
# Confirm it really is the best row of the comparison table before relying on it.
from sklearn.ensemble import RandomForestClassifier

# Carve a stratified validation split out of the training data. The test set
# stays unseen until the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42, stratify=y_train
)

# Resample only the fitting portion so the validation split keeps the
# original class balance.
X_fit_smote, y_fit_smote = imb.apply_smote(X_fit, y_fit)
rf_tuning = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_tuning.fit(X_fit_smote, y_fit_smote)

# Find optimal threshold from validation-set probabilities of the positive class
val_prob = rf_tuning.predict_proba(X_val)[:, 1]
threshold_analysis = imb.get_optimal_threshold(y_val.values, val_prob)

# The values printed here describe the validation split, not the test set.
print("Optimal Threshold Analysis:")
for key, value in threshold_analysis.items():
    print(f"{key}: {value:.3f}")

# Final model: same configuration, trained on the full resampled training set.
X_smote, y_smote = imb.apply_smote(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_smote, y_smote)

# Get test-set probabilities of the positive class (column 1)
y_prob = rf.predict_proba(X_test)[:, 1]

# Apply the validation-tuned threshold to the test-set probabilities
y_pred_optimal = (y_prob >= threshold_analysis['optimal_threshold']).astype(int)

print("\nClassification Report with Optimal Threshold:")
print(classification_report(y_test, y_pred_optimal))

## Model Performance Visualization

In [None]:
# Build the visualizer's performance figures from the true test labels, the
# thresholded predictions and the predicted probabilities.
performance_plots = viz.plot_model_performance(
    y_test.values,
    y_pred_optimal,
    y_prob,
)

# Show each figure, then confirm by name that it was displayed.
for plot_name in performance_plots:
    performance_plots[plot_name].show()
    print(f"\n{plot_name} displayed")

## Feature Importance for Credit Risk

In [None]:
# Pair every input column with the fitted forest's importance score and rank
# the pairs from most to least important.
# NOTE(review): assumes `features` is in the same column order the model was
# trained on - confirm against the resampled training data.
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Feature Importance for Credit Risk:")
print(feature_importance)

# Hand the ranked names and scores to the visualizer as plain Python lists.
ranked_names = feature_importance['feature'].to_list()
ranked_scores = feature_importance['importance'].to_list()
importance_fig = viz.plot_feature_importance(ranked_names, ranked_scores)
importance_fig.show()

## Summary

This notebook demonstrated advanced credit risk modeling with focus on:

1. **Severe Class Imbalance**: Synthetic data with an expected default rate of roughly 8% (the exact rate is printed in the data-preparation cell)
2. **Multiple Sampling Techniques**: SMOTE, ADASYN, combined methods
3. **Optimal Threshold Selection**: Maximizing F1-score for imbalanced data
4. **Performance Visualization**: ROC curves, confusion matrices
5. **Feature Importance**: Key risk factors identification

Key findings:
- SMOTE typically performs best for credit risk datasets
- Optimal threshold differs from default 0.5
- Debt ratio and credit utilization are top risk factors
- Balanced models improve minority class detection