# Credit Risk Prediction - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the credit risk dataset.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# NOTE(review): this blanket filter silences every warning for the rest of the
# session, including any deprecation warnings raised by later cells; consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set style: matplotlib's whitegrid theme plus seaborn's 'husl' palette as the default colours
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
# IPython magic: render figures inline in the notebook output
%matplotlib inline

In [None]:
# Load the raw credit dataset (path is relative to the notebook's directory)
raw_data_path = '../data/raw/credit_data.csv'
df = pd.read_csv(raw_data_path)
print('Dataset Shape: {}'.format(df.shape))
# Preview the first rows as the cell output
df.head()

## 1. Dataset Overview

In [None]:
# Basic info: column dtypes and non-null counts, followed by summary statistics
print('Dataset Information:')
print('='*50)
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
df.info()
print('\n' + '='*50)
print('\nStatistical Summary:')
# Last expression of the cell: rendered as the cell's rich output
df.describe()

In [None]:
# Missing values: per-column null counts and their share of all rows (in percent)
null_counts = df.isna().sum()
null_share = null_counts.div(len(df)).mul(100).round(2)
missing_df = pd.DataFrame({'Missing Count': null_counts, 'Missing %': null_share})
# Show only the columns that contain at least one null
missing_df.loc[missing_df['Missing Count'] > 0]

## 2. Target Variable Analysis

In [None]:
# Default distribution: class counts as a bar chart and class shares as a pie chart
status_labels = ['No Default', 'Default']
status_colors = ['green', 'red']

# value_counts() orders its result by frequency, not by class value. Sorting by
# the class value keeps the counts aligned with the fixed label/colour order
# above even when defaults outnumber non-defaults.
# NOTE(review): assumes `default` is coded 0 = no default, 1 = default (the
# default-rate line at the end of this cell relies on the same coding) — confirm.
default_counts = df['default'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
bars = axes[0].bar(status_labels, default_counts.values, color=status_colors)
axes[0].set_title('Default Distribution', fontsize=14)
axes[0].set_ylabel('Count')
# Annotate each bar with its count. The padding is given in points, so the text
# sits just above the bar whatever the magnitude of the counts (a fixed offset
# in data units would drift with the dataset size).
axes[0].bar_label(bars, labels=[f'{v:,}' for v in default_counts.values],
                  padding=3, fontsize=12)
# Leave headroom above the tallest bar for its annotation
axes[0].margins(y=0.1)

# Pie chart
axes[1].pie(default_counts.values, labels=status_labels,
            autopct='%1.1f%%', colors=status_colors, explode=[0, 0.1])
axes[1].set_title('Default Rate', fontsize=14)

plt.tight_layout()
plt.show()

print(f'Default Rate: {df["default"].mean()*100:.2f}%')

## 3. Numerical Features Analysis

In [None]:
# Distribution of key numerical features, split by default status
numeric_cols = ['age', 'income', 'credit_score', 'loan_amount', 'interest_rate', 'dti_ratio']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for i, col in enumerate(numeric_cols):
    ax = axes[i]

    # Histogram with KDE, one colour per value of `default`
    sns.histplot(data=df, x=col, hue='default', kde=True, ax=ax, bins=30)
    ax.set_title(f'{col.replace("_", " ").title()} Distribution', fontsize=12)

    # Rename the entries of the legend seaborn already attached rather than
    # calling ax.legend([...]): that call pairs the labels with the artists on
    # the axes purely by drawing order, which can attach the wrong label to a
    # colour (or label two artists of the same class).
    # NOTE(review): assumes seaborn lists the hue levels in ascending order
    # (0 then 1) and that 0 means "no default" — confirm.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        for entry, label in zip(hue_legend.get_texts(), ['No Default', 'Default']):
            entry.set_text(label)

plt.tight_layout()
plt.show()

In [None]:
# Box plots by default status: one panel per numeric feature
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    pretty_name = col.replace("_", " ").title()
    sns.boxplot(data=df, x='default', y=col, ax=ax, palette=['green', 'red'])
    ax.set_title(f'{pretty_name} by Default Status')
    # Replace the raw class values on the x axis with readable names
    ax.set_xticklabels(['No Default', 'Default'])

plt.tight_layout()
plt.show()

## 4. Categorical Features Analysis

In [None]:
# Categorical distributions: default rate per category level against the overall rate
cat_cols = ['home_ownership', 'loan_purpose', 'education', 'marital_status']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for ax, col in zip(axes, cat_cols):
    # Mean of `default` within each level, highest first
    default_rate = df.groupby(col)['default'].mean().sort_values(ascending=False)
    pretty_name = col.replace("_", " ").title()

    ax.bar(default_rate.index, default_rate.values, color='steelblue')
    ax.set_title(f'Default Rate by {pretty_name}', fontsize=12)
    ax.set_ylabel('Default Rate')
    ax.tick_params(axis='x', rotation=45)
    # Dashed reference line at the mean of `default` over the whole frame
    ax.axhline(y=df['default'].mean(), color='red', linestyle='--', label='Overall Rate')
    ax.legend()

plt.tight_layout()
plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix over every numeric column of the frame
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

# Boolean mask that hides the upper triangle (diagonal included) of the heatmap
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    mask=upper_triangle,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# Correlation with target, ordered by absolute correlation (largest first)
target_corr = corr_matrix['default'].drop('default').sort_values(key=abs, ascending=False)

# Red for strictly positive correlations, blue for everything else
bar_colors = np.where(target_corr.values > 0, 'red', 'blue').tolist()

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(target_corr.index, target_corr.values, color=bar_colors)
ax.set_xlabel('Correlation with Default')
ax.set_title('Feature Correlation with Default', fontsize=14)
# Thin vertical line marking zero correlation
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
fig.tight_layout()
plt.show()

## 6. Key Risk Factors

In [None]:
# Credit score and DTI ratio vs default rate
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bin credit scores.
# include_lowest=True keeps a score sitting exactly on the lowest edge (300) in
# the 'Poor' tier; pd.cut's intervals are otherwise open on the left and such a
# row would become NaN and silently fall out of the chart.
# The tiers are held in a standalone Series instead of being written into df,
# so re-running this cell never changes the frame that other cells read.
credit_score_tier = pd.cut(df['credit_score'],
                           bins=[300, 580, 670, 740, 800, 850],
                           labels=['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
                           include_lowest=True).rename('credit_score_bin')

# observed=False is spelled out so every tier appears in the result (a tier
# with no rows yields NaN) regardless of the pandas version's default.
default_by_score = df.groupby(credit_score_tier, observed=False)['default'].agg(['mean', 'count'])

axes[0].bar(default_by_score.index.astype(str), default_by_score['mean'], color='coral')
axes[0].set_title('Default Rate by Credit Score Tier', fontsize=12)
axes[0].set_ylabel('Default Rate')
axes[0].set_xlabel('Credit Score Tier')

# DTI Ratio analysis.
# NOTE(review): the bin edges assume dti_ratio is expressed in percent — confirm.
# include_lowest=True keeps a ratio of exactly 0 in the '<20%' bucket, and the
# open-ended top edge keeps ratios above 100 in the '>50%' bucket instead of
# dropping them as NaN.
dti_band = pd.cut(df['dti_ratio'],
                  bins=[0, 20, 35, 50, np.inf],
                  labels=['<20%', '20-35%', '35-50%', '>50%'],
                  include_lowest=True).rename('dti_bin')
default_by_dti = df.groupby(dti_band, observed=False)['default'].mean()

axes[1].bar(default_by_dti.index.astype(str), default_by_dti.values, color='steelblue')
axes[1].set_title('Default Rate by DTI Ratio', fontsize=12)
axes[1].set_ylabel('Default Rate')
axes[1].set_xlabel('DTI Ratio Range')

plt.tight_layout()
plt.show()

## 7. Key Insights Summary

In [None]:
# Summary statistics by default status: per-class mean of each key metric
summary_cols = ['credit_score', 'income', 'loan_amount', 'dti_ratio',
                'interest_rate', 'delinquencies_2yr']
summary = (
    df.groupby('default')
    .agg(dict.fromkeys(summary_cols, 'mean'))
    .round(2)
    # Positional relabelling of the two groups, in the order groupby returns them
    .set_axis(['No Default', 'Default'], axis=0)
)

print('Average Values by Default Status:')
print('='*60)
# Last expression of the cell: rendered as the cell's rich output
summary

In [None]:
# Clean up temporary columns; errors='ignore' makes this a no-op when they are absent
df.drop(columns=['credit_score_bin', 'dti_bin'], inplace=True, errors='ignore')

# Closing banner and the fixed list of findings, printed one per line
closing_lines = [
    'EDA Complete!',
    'Key Findings:',
    '1. Lower credit scores strongly correlate with higher default rates',
    '2. Higher DTI ratios indicate increased default risk',
    '3. Delinquency history is a strong predictor of future default',
    '4. Higher interest rates are associated with riskier borrowers',
]
print('\n'.join(closing_lines))