# Credit Risk Prediction - Feature Analysis

This notebook analyzes the engineered features and their predictive power.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): presumably meant to keep cell output tidy, but a blanket
# 'ignore' also hides warnings that may matter — consider narrowing it.
warnings.filterwarnings('ignore')

# Apply the seaborn-style white grid theme shipped with matplotlib to all figures.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline beneath the cell that produces them.
%matplotlib inline

In [None]:
# Load the engineered feature table from disk.
df = pd.read_csv('../data/features/credit_features.csv')
print(f'Dataset Shape: {df.shape}')

# Count feature columns by skipping the identifier/target columns that are
# actually present, instead of assuming both exist and subtracting 2.
non_feature_cols = ['customer_id', 'default']
n_features = sum(col not in non_feature_cols for col in df.columns)
print(f'Features: {n_features}')

# Last expression: rich preview of the first rows.
df.head()

## 1. Feature Overview

In [None]:
# Engineered features, grouped by the role they play in the analysis.
# The three list names are reused by later cells, so they stay as-is.
risk_indicators = [
    'high_dti',
    'high_utilization',
    'low_credit_score',
    'has_delinquencies',
    'has_bankruptcies',
    'new_borrower',
    'high_interest',
    'large_loan',
]

financial_ratios = [
    'payment_to_income',
    'available_credit_ratio',
    'credit_per_line',
    'balance_per_line',
    'debt_to_credit',
]

interaction_features = [
    'score_dti_interaction',
    'age_employment_ratio',
    'score_util_interaction',
    'income_loan_ratio',
    'delinquency_severity',
]

# Report how many features fall into each category.
feature_groups = {
    'Risk Indicators': risk_indicators,
    'Financial Ratios': financial_ratios,
    'Interaction Features': interaction_features,
}

print('Feature Categories:')
for group_name, members in feature_groups.items():
    print(f'{group_name}: {len(members)}')

## 2. Feature Importance Analysis

In [None]:
# Calculate mutual information between every numeric feature and the target.
exclude_cols = ['customer_id', 'default']

# Select features by dtype *family* rather than by exact dtype name, so that
# int32/float32/nullable numeric columns are not silently dropped. Booleans
# stay excluded, matching the previous int64/float64-only behaviour.
feature_cols = [
    col for col in df.columns
    if col not in exclude_cols
    and pd.api.types.is_numeric_dtype(df[col])
    and not pd.api.types.is_bool_dtype(df[col])
]

# mutual_info_classif rejects both NaN and +/-inf. Ratio-style features may
# contain inf (e.g. from a zero denominator), so map those to NaN first and
# then impute everything missing with 0.
X = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
y = df['default']

# NOTE(review): all columns are scored as continuous here; binary flags could
# be passed via `discrete_features` for a more appropriate estimator.
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_df = (
    pd.DataFrame({'feature': feature_cols, 'mi_score': mi_scores})
    .sort_values('mi_score', ascending=False)
)

# Plot the 20 highest-scoring features, strongest at the top.
top_20 = mi_df.head(20)
fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(top_20['feature'], top_20['mi_score'], color='steelblue')
ax.set_xlabel('Mutual Information Score')
ax.set_title('Top 20 Features by Mutual Information', fontsize=14)
ax.invert_yaxis()  # barh draws the first row at the bottom by default
fig.tight_layout()
plt.show()

## 3. Risk Indicator Analysis

In [None]:
# Default rate for each binary risk flag that is present in the data.
available_indicators = [col for col in risk_indicators if col in df.columns]

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

for ax, indicator in zip(axes, available_indicators):
    default_rate = df.groupby(indicator)['default'].mean()

    # Derive labels and colours from the flag values actually observed, so a
    # flag with a single level (all 0 or all 1) still plots instead of raising
    # a shape mismatch against a hard-coded ['No', 'Yes'].
    # Assumes flags are coded 0/1 or False/True — TODO confirm.
    labels = ['Yes' if flag else 'No' for flag in default_rate.index]
    bar_colors = ['red' if flag else 'green' for flag in default_rate.index]

    ax.bar(labels, default_rate.values, color=bar_colors)
    ax.set_title(indicator.replace('_', ' ').title())
    ax.set_ylabel('Default Rate')

    # Annotate each bar with its rate, slightly above the bar top.
    for pos, rate in enumerate(default_rate.values):
        ax.text(pos, rate + 0.01, f'{rate:.1%}', ha='center')

# Hide grid panels that have no indicator to show (some flags may be missing).
for unused_ax in axes[len(available_indicators):]:
    unused_ax.set_visible(False)

plt.suptitle('Default Rate by Risk Indicator', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 4. Composite Risk Score Analysis

In [None]:
# Analyze the composite risk score: its distribution per outcome and the
# default rate across score deciles. Skipped when the column is absent.
if 'composite_risk_score' in df.columns:
    fig, (ax_dist, ax_decile) = plt.subplots(1, 2, figsize=(14, 5))

    # Distribution by default status.
    sns.kdeplot(data=df, x='composite_risk_score', hue='default', ax=ax_dist, fill=True)
    ax_dist.set_title('Risk Score Distribution by Default Status')

    # Relabel seaborn's own legend rather than calling ax.legend([...]):
    # seaborn adds the hue artists in reverse level order, so positional
    # labels passed to ax.legend end up attached to the wrong curves.
    # Assumes `default` is coded so the non-default level sorts first (0/1).
    hue_legend = ax_dist.get_legend()
    if hue_legend is not None:
        hue_legend.set_title('')
        for entry, label in zip(hue_legend.get_texts(), ['No Default', 'Default']):
            entry.set_text(label)

    # Default rate by risk score decile. The decile is kept in a local Series
    # instead of a temporary column, so `df` is never mutated. With
    # duplicates='drop', heavily tied scores yield fewer bins rather than a
    # "Bin edges must be unique" error.
    risk_decile = pd.qcut(
        df['composite_risk_score'], 10, labels=False, duplicates='drop'
    ) + 1
    decile_default = df['default'].groupby(risk_decile).mean()

    ax_decile.bar(
        decile_default.index.astype(int).astype(str),
        decile_default.values,
        color='coral',
    )
    ax_decile.set_title('Default Rate by Risk Score Decile')
    ax_decile.set_xlabel('Risk Score Decile (1=Lowest Risk, 10=Highest Risk)')
    ax_decile.set_ylabel('Default Rate')

    plt.tight_layout()
    plt.show()

## 5. Feature Correlation with Target

In [None]:
# Pearson correlation of each numeric feature with the default flag,
# ordered by absolute strength (NaN correlations sort last).
correlations = df[feature_cols + ['default']].corr()['default'].drop('default')
correlations = correlations.sort_values(key=abs, ascending=False)

# Plot only the 20 strongest; red marks positive, blue non-positive correlation.
top_corr = correlations.head(20)
bar_colors = ['red' if value > 0 else 'blue' for value in top_corr.values]

fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(top_corr.index, top_corr.values, color=bar_colors)
ax.set_xlabel('Correlation with Default')
ax.set_title('Top 20 Feature Correlations with Default', fontsize=14)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
# Put the strongest feature at the top, consistent with the mutual
# information chart (barh draws the first row at the bottom by default).
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## 6. Summary

In [None]:
# Text recap: feature count plus the five strongest features under each metric.
summary_rule = '=' * 50
print('Feature Analysis Summary')
print(summary_rule)
print(f'Total Features: {len(feature_cols)}')

# mi_df is already sorted by descending mutual information.
print('\nTop 5 Features by Mutual Information:')
best_by_mi = mi_df.head(5)
for name, score in zip(best_by_mi['feature'], best_by_mi['mi_score']):
    print(f'  - {name}: {score:.4f}')

# correlations is already sorted by descending absolute correlation.
print('\nTop 5 Features by Correlation:')
for name, value in correlations.iloc[:5].items():
    print(f'  - {name}: {value:.4f}')