# SafeLend - Exploratory Data Analysis

This notebook contains exploratory data analysis for the SafeLend credit risk prediction project.

## Overview
- Data exploration and visualization
- Missing value analysis
- Feature distribution analysis
- Correlation analysis
- Target variable analysis


In [None]:
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Plot styling applied to every figure created later in the notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the raw application tables (paths are relative to the notebook's folder)
print("Loading data...")
application_train = pd.read_csv('../Data/raw/application_train.csv')
application_test = pd.read_csv('../Data/raw/application_test.csv')

print(f"Training data shape: {application_train.shape}")
print(f"Test data shape: {application_test.shape}")

# Basic data info.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\nTraining data info:")
application_train.info()
print("\nTest data info:")
application_test.info()


In [None]:
# Dataset overview: size of each split, missing-value table, dtype breakdown
print("=== DATASET OVERVIEW ===")
for split_label, split_frame in (("Training", application_train), ("Test", application_test)):
    applicant_count, feature_count = split_frame.shape
    print(f"{split_label} set: {applicant_count:,} applicants, {feature_count} features")

# Per-column missing counts for both splits
print("\n=== MISSING VALUES ANALYSIS ===")
train_missing = application_train.isna().sum()
test_missing = application_test.isna().sum()

# Counts and percentages side by side; the frame is indexed by column name
missing_df = pd.DataFrame({
    'Train_Missing': train_missing,
    'Test_Missing': test_missing,
    'Train_Percent': train_missing.div(len(application_train)).mul(100).round(2),
    'Test_Percent': test_missing.div(len(application_test)).mul(100).round(2),
})

# Keep only columns with at least one missing training value, worst first
has_train_gaps = missing_df['Train_Missing'].gt(0)
missing_df = missing_df.loc[has_train_gaps].sort_values(by='Train_Percent', ascending=False)
print("Top 15 columns with missing values:")
print(missing_df.head(15))

# How many columns of each dtype each split contains
print("\n=== DATA TYPES ANALYSIS ===")
for dtype_header, split_frame in (
    ("Training data types:", application_train),
    ("\nTest data types:", application_test),
):
    print(dtype_header)
    print(split_frame.dtypes.value_counts())


In [None]:
# Target variable analysis
print("=== TARGET VARIABLE ANALYSIS ===")

# Target distribution.
# value_counts() orders its rows by frequency, not by class label. The bar
# colours and tick labels used below are positional (first bar = class 0,
# second bar = class 1), so both series are re-sorted by label to keep the
# bars, colours and labels in agreement whichever class is more frequent.
target_counts = application_train['TARGET'].value_counts().sort_index()
target_pct = application_train['TARGET'].value_counts(normalize=True).sort_index() * 100

print("Target distribution:")
for value, count in target_counts.items():
    pct = target_pct[value]
    label = "Default" if value == 1 else "Repay"
    print(f"  {label}: {count:,} ({pct:.2f}%)")

# Visualize target distribution: absolute counts on the left, percentages on the right
class_colors = ['green', 'red']
class_tick_labels = ['Repay (0)', 'Default (1)']

fig, (ax_count, ax_pct) = plt.subplots(1, 2, figsize=(12, 5))

target_counts.plot(kind='bar', color=class_colors, ax=ax_count)
ax_count.set_title('Target Distribution (Count)')
ax_count.set_xlabel('Target')
ax_count.set_ylabel('Count')
ax_count.set_xticks([0, 1])
ax_count.set_xticklabels(class_tick_labels, rotation=0)

target_pct.plot(kind='bar', color=class_colors, ax=ax_pct)
ax_pct.set_title('Target Distribution (Percentage)')
ax_pct.set_xlabel('Target')
ax_pct.set_ylabel('Percentage')
ax_pct.set_xticks([0, 1])
ax_pct.set_xticklabels(class_tick_labels, rotation=0)

plt.tight_layout()
plt.show()

# Class imbalance analysis (label-based lookups: 1 = default, 0 = repay)
print(f"\nClass imbalance ratio: {target_counts[1] / target_counts[0]:.3f}")
print(f"Minority class percentage: {target_pct[1]:.2f}%")


In [None]:
# Missing values analysis
print("=== MISSING VALUES VISUALIZATION ===")

# The 20 columns ranked highest in missing_df (sorted by training missing rate)
top_missing_cols = missing_df.head(20).index

# Missing counts of those columns, computed separately for each split
train_missing_subset = application_train[top_missing_cols].isnull().sum()
test_missing_subset = application_test[top_missing_cols].isnull().sum()

plt.figure(figsize=(15, 8))

# One stacked panel per split: (subplot position, counts, bar colour, title)
missing_panels = (
    (1, train_missing_subset, 'skyblue', 'Missing Values in Training Set (Top 20 Columns)'),
    (2, test_missing_subset, 'lightcoral', 'Missing Values in Test Set (Top 20 Columns)'),
)
for panel_position, panel_counts, panel_color, panel_title in missing_panels:
    plt.subplot(2, 1, panel_position)
    panel_counts.plot(kind='bar', color=panel_color)
    plt.title(panel_title)
    plt.ylabel('Missing Count')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# How many columns exceed each training missing-rate threshold
print("\nMissing values patterns:")
for pct_threshold in (50, 25, 10):
    print(f"Columns with >{pct_threshold}% missing: {(missing_df['Train_Percent'] > pct_threshold).sum()}")

# Training columns that are fully populated
no_missing = application_train.isnull().sum().eq(0).sum()
print(f"Columns with no missing values: {no_missing}")


In [None]:
# Feature distribution analysis
print("=== FEATURE DISTRIBUTION ANALYSIS ===")

# Select key numerical features for analysis
key_features = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
                'DAYS_BIRTH', 'DAYS_EMPLOYED', 'CNT_FAM_MEMBERS', 'CNT_CHILDREN']

# Filter features that exist in the dataset
existing_features = [f for f in key_features if f in application_train.columns]
print(f"Analyzing {len(existing_features)} key features: {existing_features}")

# Grid layout: 3 panels per row, as many rows as needed (ceiling division)
n_features = len(existing_features)
n_cols = 3
n_rows = (n_features + n_cols - 1) // n_cols

# The class masks do not depend on the feature, so they are built once
# instead of being recomputed on every loop iteration.
repay_mask = application_train['TARGET'] == 0
default_mask = application_train['TARGET'] == 1

# With no matching feature n_rows is 0, which would request a zero-height
# figure; skip plotting entirely in that case.
if existing_features:
    plt.figure(figsize=(15, 5 * n_rows))

    for i, feature in enumerate(existing_features):
        plt.subplot(n_rows, n_cols, i + 1)

        # Non-missing values of this feature for each target class
        repay_data = application_train.loc[repay_mask, feature].dropna()
        default_data = application_train.loc[default_mask, feature].dropna()

        # Both histograms share one set of 50 bin edges spanning the combined
        # value range. Letting each class pick its own bins produces bars of
        # different widths and offsets, so the overlaid densities would not be
        # directly comparable.
        bin_edges = np.histogram_bin_edges(
            np.concatenate([repay_data.to_numpy(), default_data.to_numpy()]),
            bins=50,
        )

        plt.hist(repay_data, bins=bin_edges, alpha=0.7, label='Repay', color='green', density=True)
        plt.hist(default_data, bins=bin_edges, alpha=0.7, label='Default', color='red', density=True)

        plt.title(f'{feature} Distribution')
        plt.xlabel(feature)
        plt.ylabel('Density')
        plt.legend()
        plt.yscale('log')  # Log scale for better visualization

    plt.tight_layout()
    plt.show()

# Statistical summary (computed over the whole training set, both classes together)
print("\nStatistical Summary of Key Features:")
for feature in existing_features:
    feature_values = application_train[feature]
    missing_mask = feature_values.isnull()
    print(f"\n{feature}:")
    print(f"  Mean: {feature_values.mean():.2f}")
    print(f"  Median: {feature_values.median():.2f}")
    print(f"  Std: {feature_values.std():.2f}")
    print(f"  Min: {feature_values.min():.2f}")
    print(f"  Max: {feature_values.max():.2f}")
    print(f"  Missing: {missing_mask.sum():,} ({missing_mask.mean()*100:.1f}%)")


In [None]:
# Correlation analysis
print("=== CORRELATION ANALYSIS ===")

# Numeric columns, leaving out the SK_ID_CURR and TARGET columns
non_feature_columns = {'SK_ID_CURR', 'TARGET'}
numerical_features = [
    column_name
    for column_name in application_train.select_dtypes(include=[np.number]).columns
    if column_name not in non_feature_columns
]

# Pairwise correlations among the numeric features plus TARGET
correlation_matrix = application_train[[*numerical_features, 'TARGET']].corr()

# TARGET column of the matrix, ranked by absolute correlation (strongest first)
target_corr = (
    correlation_matrix['TARGET']
    .drop('TARGET')
    .sort_values(key=abs, ascending=False)
)
strongest_20 = target_corr.head(20)
print("Top 20 features most correlated with TARGET:")
print(strongest_20)

# Heatmap restricted to the 15 strongest features and TARGET itself
top_features = [*target_corr.head(15).index, 'TARGET']
corr_subset = correlation_matrix.loc[top_features, top_features]

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_subset,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'shrink': 0.8},
)
plt.title('Correlation Matrix - Top Features with Target')
plt.tight_layout()
plt.show()

# Horizontal bars for the 20 strongest correlations: red if positive, blue otherwise
bar_colors = ['red' if corr_value > 0 else 'blue' for corr_value in strongest_20]

plt.figure(figsize=(12, 8))
strongest_20.plot(kind='barh', color=bar_colors)
plt.title('Top 20 Features by Correlation with Target')
plt.xlabel('Correlation with Target')
plt.ylabel('Features')
plt.tight_layout()
plt.show()


In [None]:
# Categorical features analysis
print("=== CATEGORICAL FEATURES ANALYSIS ===")

# Object-dtype columns are treated as the categorical features
categorical_features = application_train.select_dtypes(include=['object']).columns.tolist()
print(f"Found {len(categorical_features)} categorical features:")
print(categorical_features)

# Text report for the first 10 categorical features
for feature in categorical_features[:10]:
    feature_column = application_train[feature]
    distinct_count = feature_column.nunique()

    print(f"\n{feature}:")
    value_counts = feature_column.value_counts()
    print(f"  Unique values: {distinct_count}")
    print(f"  Most common: {value_counts.head(3).to_dict()}")

    # Default rate by category, skipped for features with more than 20 categories
    if distinct_count > 20:
        continue
    default_rates = application_train.groupby(feature)['TARGET'].mean().sort_values(ascending=False)
    print(f"  Default rates: {default_rates.head(5).to_dict()}")

# Features with between 2 and 10 distinct values are plotted
categorical_to_plot = [
    f for f in categorical_features
    if 1 < application_train[f].nunique() <= 10
]
n_cat = len(categorical_to_plot)

if n_cat > 0:
    # Two panels per row; negated floor division gives the ceiling
    n_cols = 2
    n_rows = -(-n_cat // n_cols)

    plt.figure(figsize=(15, 5 * n_rows))

    for panel_number, feature in enumerate(categorical_to_plot, start=1):
        plt.subplot(n_rows, n_cols, panel_number)

        # Row-normalised crosstab: share of each TARGET value within every category
        crosstab = pd.crosstab(
            application_train[feature],
            application_train['TARGET'],
            normalize='index',
        )
        crosstab.plot(kind='bar', stacked=True, ax=plt.gca(), color=['green', 'red'])
        plt.title(f'{feature} vs Target')
        plt.xlabel(feature)
        plt.ylabel('Proportion')
        plt.legend(['Repay', 'Default'])
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
# Summary and insights.
# Reads results produced by earlier cells: target_counts, target_pct,
# missing_df, no_missing, numerical_features and categorical_features.
print("=== EDA SUMMARY AND INSIGHTS ===")

print("1. DATASET OVERVIEW:")
print(f"   - Training set: {application_train.shape[0]:,} applicants with {application_train.shape[1]} features")
print(f"   - Test set: {application_test.shape[0]:,} applicants with {application_test.shape[1]} features")
print(f"   - Class imbalance: {target_counts[1] / target_counts[0]:.3f} (default/repay ratio)")

print("\n2. TARGET VARIABLE:")
print(f"   - Default rate: {target_pct[1]:.2f}%")
print("   - This is a highly imbalanced dataset requiring special handling")

print("\n3. MISSING VALUES:")
print(f"   - Columns with >50% missing: {(missing_df['Train_Percent'] > 50).sum()}")
print(f"   - Columns with >25% missing: {(missing_df['Train_Percent'] > 25).sum()}")
print(f"   - Columns with no missing: {no_missing}")

print("\n4. FEATURE TYPES:")
print(f"   - Numerical features: {len(numerical_features)}")
print(f"   - Categorical features: {len(categorical_features)}")

print("\n5. KEY INSIGHTS:")
print("   - Strong class imbalance requires careful model evaluation")
print("   - Many features have high missing value rates")
print("   - Feature engineering will be crucial for model performance")
print("   - Correlation analysis shows some features are more predictive than others")

print("\n6. RECOMMENDATIONS:")
print("   - Use stratified sampling for model validation")
print("   - Implement robust missing value handling")
print("   - Focus on feature engineering and selection")
print("   - Consider ensemble methods to handle class imbalance")
print("   - Use appropriate evaluation metrics (ROC-AUC, PR-AUC)")

# Machine-readable version of the headline numbers; values are cast to plain
# Python int/float so json can serialise them.
summary_data = {
    'dataset_size': {
        'train_samples': application_train.shape[0],
        'test_samples': application_test.shape[0],
        'total_features': application_train.shape[1]
    },
    'target_distribution': {
        'default_rate': float(target_pct[1]),
        'class_imbalance_ratio': float(target_counts[1] / target_counts[0])
    },
    'missing_values': {
        'high_missing_columns': int((missing_df['Train_Percent'] > 50).sum()),
        # Counted on the raw missing count, not on Train_Percent: the percentage
        # is rounded to two decimals, so a column with only a handful of missing
        # rows shows 0.00 and would be left out of a "> 0" test on it.
        'total_missing_columns': int((missing_df['Train_Missing'] > 0).sum())
    },
    'feature_types': {
        'numerical': len(numerical_features),
        'categorical': len(categorical_features)
    }
}

# Save summary to file.
# The output folder is created first so that open() does not fail with
# FileNotFoundError when ../Data/processed does not exist yet.
summary_path = '../Data/processed/eda_summary.json'
Path(summary_path).parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, 'w') as f:
    json.dump(summary_data, f, indent=2)

print(f"\n✅ EDA Summary saved to: {summary_path}")


In [None]:
# TODO: Numerical features analysis (placeholder — this cell is currently empty)


In [None]:
# TODO: Categorical features analysis (placeholder — this cell is currently empty)


In [None]:
# TODO: Correlation analysis (placeholder — this cell is currently empty)


In [None]:
# TODO: Feature importance analysis (placeholder — this cell is currently empty)


In [None]:
# TODO: Outlier analysis (placeholder — this cell is currently empty)


In [None]:
# Summary statistics
