
# Home Credit Default Risk: Comprehensive EDA Report
**Generated by Antigravity Agent**

## 0. Goal & Constraints
**Goal**: Generate a reproducible, structured, and visual EDA report for the Home Credit Default Risk dataset (`application_train.csv`).
**Focus**: Data Quality, Target Distribution, Missingness, Numerical/Categorical distributions, Correlations, and Bias signals.
**Constraints**: No missing value imputation, no outlier removal (report only).



In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import roc_auc_score, mutual_info_score
import os
import warnings
import json

# --- Global notebook configuration ---
# Silence library warnings so they do not clutter the rendered report.
warnings.filterwarnings('ignore')

# Plot look-and-feel: seaborn grid theme on a wide default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Limit how many columns / rows pandas renders for a displayed frame.
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)

# --- Constants shared by every chapter ---
TRAIN_PATH, TEST_PATH = 'application_train.csv', 'application_test.csv'
ID_COL, TARGET_COL = 'SK_ID_CURR', 'TARGET'
RANDOM_SEED = 42

# --- Output directories (created on first run, reused afterwards) ---
os.makedirs('tables', exist_ok=True)
os.makedirs('figures', exist_ok=True)

print("Environment setup complete.")



In [None]:

def save_figure(fig, filename):
    """Write *fig* into the ``figures/`` directory under *filename*.

    The image is saved with a tight bounding box. The figure is not closed
    here, so the notebook can still render it afterwards.
    """
    destination = f'figures/{filename}'
    fig.savefig(destination, bbox_inches='tight')

def save_table(df, filename):
    """Export *df* as CSV into the ``tables/`` directory and print where it went."""
    destination = f'tables/{filename}'
    df.to_csv(destination)
    print(f"Saved table: tables/{filename}")

def plot_distribution(df, col, target_col=TARGET_COL, sample_size=50000):
    """Build a two-panel figure describing one feature and return it.

    Left panel: histogram of ``col`` with a KDE overlay.
    Right panel: ``col`` against ``target_col``. A bar plot of the mean
    target (95% CI) is used when the column has object dtype or fewer than 20
    distinct values (NaN counts as a value); otherwise a box plot of ``col``
    split by target is drawn.

    Frames longer than ``sample_size`` rows are down-sampled with
    ``RANDOM_SEED`` before plotting. The choice of right-hand panel is always
    made on the full ``df``.
    """
    # Plot from a reproducible sample when the frame is large.
    if len(df) > sample_size:
        plot_df = df.sample(sample_size, random_state=RANDOM_SEED)
    else:
        plot_df = df

    fig, (dist_ax, target_ax) = plt.subplots(1, 2, figsize=(16, 5))

    # Panel 1: marginal distribution of the feature.
    sns.histplot(data=plot_df, x=col, kde=True, ax=dist_ax, color='skyblue')
    dist_ax.set_title(f'Distribution of {col}')

    # Panel 2: relationship with the target.
    column = df[col]
    treat_as_categorical = column.dtype == 'object' or len(column.unique()) < 20
    if not treat_as_categorical:
        # High-cardinality numeric feature: compare its spread per target class.
        sns.boxplot(data=plot_df, x=target_col, y=col, ax=target_ax)
    else:
        # Categorical / low-cardinality feature: mean target per level.
        sns.barplot(data=plot_df, x=col, y=target_col, ax=target_ax, errorbar=('ci', 95))
        target_ax.tick_params(axis='x', rotation=45)

    target_ax.set_title(f'{col} vs {target_col}')
    plt.tight_layout()
    return fig

def calc_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
    """Calculate the Population Stability Index (PSI) between two samples.

    Both samples are bucketed with edges derived from ``expected`` and the
    PSI is ``sum((e - a) * ln(e / a))`` over the per-bucket fractions ``e``
    (expected) and ``a`` (actual).

    Args:
        expected: Reference sample (array-like of numbers).
        actual: Sample to compare against the reference.
        buckettype: ``'bins'`` for equal-width buckets spanning the range of
            ``expected``; ``'quantiles'`` for buckets at equally spaced
            percentiles of ``expected``.
        buckets: Number of buckets (must be >= 1).
        axis: Unused; kept so existing call sites keep working.

    Returns:
        The PSI as a float (0.0 when both samples fill the buckets
        identically).

    Raises:
        ValueError: If ``buckets`` < 1, ``buckettype`` is not recognised, or
            either sample has no non-NaN values.
    """
    if buckets < 1:
        raise ValueError("buckets must be >= 1")

    # Work on flat float copies; NaNs cannot be bucketed, so they are dropped
    # from both the counts and the denominators.
    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    expected = expected[~np.isnan(expected)]
    actual = actual[~np.isnan(actual)]
    if expected.size == 0 or actual.size == 0:
        raise ValueError("calc_psi needs at least one non-NaN value in each sample")

    if buckettype == 'bins':
        edges = np.linspace(expected.min(), expected.max(), buckets + 1)
    elif buckettype == 'quantiles':
        edges = np.percentile(expected, np.linspace(0, 100, buckets + 1))
    else:
        raise ValueError(f"Unknown buckettype: {buckettype!r} (use 'bins' or 'quantiles')")

    # Open the two outer buckets so that `actual` values falling outside the
    # range of `expected` are counted in the edge buckets instead of being
    # dropped (which would make the actual fractions sum to less than 1).
    edges[0], edges[-1] = -np.inf, np.inf

    expected_percents = np.histogram(expected, edges)[0] / expected.size
    actual_percents = np.histogram(actual, edges)[0] / actual.size

    # Empty buckets get a small floor so the log ratio stays finite.
    floor = 0.0001
    expected_percents = np.where(expected_percents == 0, floor, expected_percents)
    actual_percents = np.where(actual_percents == 0, floor, actual_percents)

    contributions = (expected_percents - actual_percents) * np.log(expected_percents / actual_percents)
    return float(np.sum(contributions))




## Chapter A: Data Loading & Meta Info
- **Goal**: Load data, check basic properties (shape, memory, duplicates), and generate a schema summary.



In [None]:

# Load the training data; abort early when the file is not available.
if not os.path.exists(TRAIN_PATH):
    raise FileNotFoundError(f"{TRAIN_PATH} not found. Please ensure the file is in the working directory.")
df_train = pd.read_csv(TRAIN_PATH)
print(f"Loaded {TRAIN_PATH}: {df_train.shape}")

# In-memory footprint in MB (deep=True also measures object/string contents).
memory_usage = df_train.memory_usage(deep=True).sum() / 1024**2
print(f"Memory Usage: {memory_usage:.2f} MB")

# ID uniqueness: any gap between row count and distinct IDs means duplicates.
n_rows = len(df_train)
n_unique_ids = df_train[ID_COL].nunique()
print(f"Unique IDs: {n_unique_ids}")
print(f"Duplicate IDs: {n_rows - n_unique_ids}")

# Schema table: one record per column of the freshly loaded frame.
schema_d = []
for col_name, series in df_train.items():
    distinct = series.nunique()
    missing = series.isnull().sum()
    examples = series.dropna().unique()[:3] if distinct > 0 else []

    # Coarse type inference: object dtype wins, then a name-based date heuristic.
    if series.dtype == 'object':
        kind = 'categorical'
    elif 'DAYS' in col_name or 'DATE' in col_name:
        kind = 'date-like'
    else:
        kind = 'numeric'

    schema_d.append({
        'col_name': col_name,
        'dtype': series.dtype,
        'n_unique': distinct,
        'pct_unique': distinct / n_rows,
        'n_missing': missing,
        'pct_missing': missing / n_rows,
        'example_values': str(list(examples)),
        'inferred_type': kind
    })

df_schema = pd.DataFrame(schema_d)
save_table(df_schema, 'schema_table.csv')
display(df_schema.head())




## Chapter B: Target Variable Overview
- **Goal**: Analyze the distribution and imbalance of the target variable.



In [None]:

# Class counts and class shares of the target.
target_pct = df_train[TARGET_COL].value_counts(normalize=True)
target_counts = df_train[TARGET_COL].value_counts()

print("Target Distribution:")
print(target_counts)
print("\nTarget Percentage:")
print(target_pct)

# Count plot, each bar labelled with its height and its share of all rows.
plt.figure(figsize=(8, 5))
ax = sns.countplot(x=TARGET_COL, data=df_train)
plt.title('Target Variable Distribution (Imbalance)')
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_height}\n({bar_height/n_rows:.1%})',
                (bar_center, bar_height),
                ha='center', va='bottom')
save_figure(plt.gcf(), 'chapter_b_target_dist.png')
plt.show()

# Baselines: accuracy of always predicting the majority class, and the
# share of the positive class (0 when label 1 never occurs).
majority_class_acc = target_pct.max()
if 1 in target_pct:
    prevalence = target_pct[1]
else:
    prevalence = 0
print(f"Majority Class Baseline Accuracy: {majority_class_acc:.4f}")
print(f"Positive Class Prevalence: {prevalence:.4f}")




## Chapter C: Missingness Intelligence
- **Goal**: Analyze missing value patterns and their relationship with the target.



In [None]:

# Rank columns by missing share and keep the 30 worst that have any gaps.
missing_ranking = df_schema.sort_values('pct_missing', ascending=False)
has_missing = missing_ranking['pct_missing'] > 0
top_missing = missing_ranking.loc[has_missing].head(30)
display(top_missing[['col_name', 'pct_missing', 'n_missing']])

# Bar chart of those columns.
plt.figure(figsize=(10, 8))
sns.barplot(data=top_missing, y='col_name', x='pct_missing')
plt.title('Top 30 Missing Features')
save_figure(plt.gcf(), 'chapter_c_missing_top30.png')
plt.show()

# Number of missing cells per row, stored on the frame as a helper column.
df_train['n_missing_row'] = df_train.isnull().sum(axis=1)
plt.figure(figsize=(10, 5))
sns.histplot(df_train['n_missing_row'], bins=30)
plt.title('Distribution of Missing Values per Row')
save_figure(plt.gcf(), 'chapter_c_row_missingness.png')
plt.show()

# Missingness vs target: compare the target rate where a value is absent
# with the rate where it is present, for columns with > 5% missing values.
above_threshold = missing_ranking['pct_missing'] > 0.05
high_missing_cols = missing_ranking.loc[above_threshold, 'col_name'].tolist()

missing_signals = []
for col in high_missing_cols[:20]:  # the ranking is sorted, so these are the 20 most-missing
    is_missing = df_train[col].isnull()
    rate_when_missing = df_train.loc[is_missing, TARGET_COL].mean()
    rate_when_present = df_train.loc[~is_missing, TARGET_COL].mean()

    missing_signals.append({
        'feature': col,
        'missing_rate': is_missing.mean(),
        'target_rate_missing': rate_when_missing,
        'target_rate_present': rate_when_present,
        # The small epsilon keeps the ratio finite when the present-rate is 0.
        'lift': rate_when_missing / (rate_when_present + 1e-6)
    })

df_missing_signals = pd.DataFrame(missing_signals).sort_values('lift', ascending=False)
save_table(df_missing_signals, 'missingness_signals.csv')
display(df_missing_signals.head(10))




## Chapter D: Numerical Features EDA
- **Goal**: Distribution, Statistics, and Target Relationship for numerical features.



In [None]:

# Identify numerical feature columns from the schema table, leaving out the
# ID, the target and the row-missingness helper column.
non_feature_cols = {ID_COL, TARGET_COL, 'n_missing_row'}
numeric_cols = [
    c
    for c in df_schema.loc[df_schema['inferred_type'] == 'numeric', 'col_name']
    if c not in non_feature_cols
]

# D1. Numerical Summary: describe() plus missing share, skew and kurtosis.
num_summary = df_train[numeric_cols].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).T
num_summary['missing%'] = df_train[numeric_cols].isnull().mean()
num_summary['skew'] = df_train[numeric_cols].skew()
num_summary['kurtosis'] = df_train[numeric_cols].kurtosis()
save_table(num_summary, 'numeric_summary.csv')
display(num_summary.head())

# D3. Numeric Signal Ranking (AUC/KS)
numeric_signals = []
skipped_numeric = []  # features whose metrics could not be computed, with the reason
for col in numeric_cols:
    # Metrics are computed on rows where both the feature and the target exist.
    valid_data = df_train[[col, TARGET_COL]].dropna()
    if len(valid_data) < 100 or valid_data[col].nunique() < 2:
        continue

    try:
        # Univariate AUC: the raw feature value is used as the score.
        auc = roc_auc_score(valid_data[TARGET_COL], valid_data[col])

        # KS statistic between the feature distributions of the two classes.
        s0 = valid_data[valid_data[TARGET_COL] == 0][col]
        s1 = valid_data[valid_data[TARGET_COL] == 1][col]
        ks_stat, _ = stats.ks_2samp(s0, s1)
    except Exception as exc:
        # Best effort: one failing feature must not stop the ranking, but it
        # is recorded and reported instead of being dropped silently.
        skipped_numeric.append((col, repr(exc)))
        continue

    numeric_signals.append({
        'feature': col,
        # An AUC below 0.5 means the feature ranks in the opposite direction;
        # fold it so the value measures separation strength only.
        'auc': max(auc, 1 - auc),
        'ks_stat': ks_stat
    })

if skipped_numeric:
    print(f"Skipped {len(skipped_numeric)} numeric feature(s) (AUC/KS could not be computed):")
    for skipped_col, reason in skipped_numeric:
        print(f"  {skipped_col}: {reason}")

# Explicit columns keep the sort working even when no feature qualified.
df_num_signals = pd.DataFrame(
    numeric_signals, columns=['feature', 'auc', 'ks_stat']
).sort_values('auc', ascending=False)
save_table(df_num_signals, 'numeric_signal_ranking.csv')
print("Top Numeric Signals by AUC:")
display(df_num_signals.head(10))

# Plot Top 5 Features
for col in df_num_signals['feature'].head(5):
    fig = plot_distribution(df_train, col)
    save_figure(fig, f'chapter_d_dist_{col}.png')
    plt.show()




## Chapter E: Categorical Features EDA
- **Goal**: Analyze categorical features, cardinality, and target rates.



In [None]:

is_categorical = df_schema['inferred_type'] == 'categorical'
cat_cols = df_schema.loc[is_categorical, 'col_name'].tolist()

# E1 & E2. Per-feature cardinality, rare-level share and target-rate spread.
cat_stats = []
total_count = len(df_train)

for col in cat_cols:
    n_unique = df_train[col].nunique()

    # Size and target rate of every level, largest level first.
    grp = (
        df_train.groupby(col)[TARGET_COL]
        .agg(['count', 'mean'])
        .sort_values('count', ascending=False)
    )

    # Rare levels: those holding less than 1% of all rows.
    rare_mask = (grp['count'] / total_count) < 0.01
    n_rare = rare_mask.sum()
    pct_rare = grp.loc[rare_mask, 'count'].sum() / total_count

    # Signal strength: max - min target rate over levels with > 100 rows;
    # 0 when fewer than two levels are that large.
    valid_cats = grp[grp['count'] > 100]
    if len(valid_cats) > 1:
        tr_range = valid_cats['mean'].max() - valid_cats['mean'].min()
    else:
        tr_range = 0

    cat_stats.append({
        'feature': col,
        'n_unique': n_unique,
        'n_rare_cats': n_rare,
        'pct_rare_samples': pct_rare,
        'target_rate_range': tr_range
    })

    # Features with more than 50 levels are not plotted.
    if n_unique > 50:
        continue

    # Default rate of the 10 largest levels; saved to disk and closed
    # without being shown inline.
    top_cats = grp.head(10).index
    plot_data = df_train[df_train[col].isin(top_cats)]

    plt.figure(figsize=(12, 5))
    sns.barplot(data=plot_data, x=col, y=TARGET_COL, order=top_cats, errorbar=('ci', 95))
    plt.title(f'{col} Default Rate (Top Categories)')
    plt.xticks(rotation=45)
    save_figure(plt.gcf(), f'chapter_e_{col}_target_rate.png')
    plt.close()

df_cat_stats = pd.DataFrame(cat_stats).sort_values('target_rate_range', ascending=False)
save_table(df_cat_stats, 'categorical_signal_ranking.csv')
display(df_cat_stats)




## Chapter F: Correlation & Redundancy
- **Goal**: Identify collinearity and strong correlations.



In [None]:

# Correlate only the 30 strongest numeric features (by AUC ranking) to keep
# the matrix readable.
top_numeric = df_num_signals['feature'].head(30).tolist()
corr_mat = df_train[top_numeric].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_mat, cmap='coolwarm', center=0, annot=False)
plt.title('Correlation Matrix (Top 30 Numeric Features)')
save_figure(plt.gcf(), 'chapter_f_corr_matrix.png')
plt.show()

# Strong pairs: keep the strict upper triangle so each pair appears once and
# the diagonal is excluded, then flatten it into a long table.
upper_triangle = np.triu(np.ones(corr_mat.shape, dtype=bool), k=1)
corr_pairs = corr_mat.where(upper_triangle).stack().to_frame('corr').reset_index()
corr_pairs.columns = ['feat_a', 'feat_b', 'corr']
corr_pairs['abs_corr'] = corr_pairs['corr'].abs()

is_strong = corr_pairs['abs_corr'] > 0.7
high_corr = corr_pairs.loc[is_strong].sort_values('abs_corr', ascending=False)
save_table(high_corr, 'high_correlation_pairs.csv')
display(high_corr.head(15))




## Chapter G: Outliers & Sentinel Values
- **Goal**: Scan for potential sentinel values (e.g. 365243 in DAYS columns) and extreme outliers.



In [None]:

# Values known to be used as placeholders rather than real measurements.
KNOWN_SENTINELS = {'DAYS_EMPLOYED': 365243}

# The schema table labels DAYS*/DATE* columns 'date-like', so they are absent
# from numeric_cols. They are added to the scan here; otherwise the
# DAYS_EMPLOYED sentinel check below could never run.
date_like_cols = [
    c
    for c in df_schema.loc[df_schema['inferred_type'] == 'date-like', 'col_name']
    if c in df_train.columns and pd.api.types.is_numeric_dtype(df_train[c])
]
scan_cols = list(dict.fromkeys(list(numeric_cols) + date_like_cols))  # ordered, de-duplicated

sentinel_candidates = []

for col in scan_cols:
    series = df_train[col]

    # Most frequent value; mode() is empty when the column is entirely missing.
    modes = series.mode()
    if modes.empty:
        continue
    mode_val = modes.iloc[0]
    mode_count = int((series == mode_val).sum())
    mode_pct = mode_count / len(df_train)

    notes = []
    flagged_val, flagged_count = mode_val, mode_count

    # Case: documented sentinel present in the column -> report that value
    # with its own count, not whatever the mode happens to be.
    known_val = KNOWN_SENTINELS.get(col)
    if known_val is not None:
        known_count = int((series == known_val).sum())
        if known_count > 0:
            notes.append(f"Known Sentinel: {known_val}")
            flagged_val, flagged_count = known_val, known_count

    # Case: a frequent mode (> 5% of rows) lying more than one standard
    # deviation away from the median.
    if mode_pct > 0.05 and abs(mode_val - series.median()) > series.std():
        notes.append(f"High freq mode: {mode_val} ({mode_pct:.1%})")

    if notes:
        sentinel_candidates.append({
            'feature': col,
            'suspicious_value': flagged_val,
            'count': flagged_count,
            'pct': flagged_count / len(df_train),
            'notes': " | ".join(notes)
        })

# Explicit columns keep the table header stable even when nothing is flagged.
df_sentinels = pd.DataFrame(
    sentinel_candidates,
    columns=['feature', 'suspicious_value', 'count', 'pct', 'notes']
)
save_table(df_sentinels, 'potential_sentinels.csv')
display(df_sentinels)

# Compare the target rate of sentinel vs non-sentinel rows for DAYS_EMPLOYED.
if 'DAYS_EMPLOYED' in df_train.columns:
    df_train['is_365243'] = df_train['DAYS_EMPLOYED'] == 365243
    print(df_train.groupby('is_365243')[TARGET_COL].agg(['count', 'mean']))




## Chapter H: Train vs Test Drift
- **Goal**: Compare distribution between Train and Test sets (if Test set exists).



In [None]:

if not os.path.exists(TEST_PATH):
    print("Application Test file not found. Skipping Drift Analysis.")
else:
    df_test = pd.read_csv(TEST_PATH)
    print(f"Loaded Test Data: {df_test.shape}")

    drift_results = []

    # Two-sample KS test between train and test for the 20 strongest numeric
    # features that also exist in the test file.
    for col in top_numeric[:20]:
        if col not in df_test.columns:
            continue

        train_values = df_train[col].dropna()
        test_values = df_test[col].dropna()
        ks_stat, p_val = stats.ks_2samp(train_values, test_values)

        drift_results.append({
            'feature': col,
            'metric': 'KS',
            'value': ks_stat,
            # A KS statistic above 0.1 is flagged as drift.
            'drift_detected': ks_stat > 0.1
        })

    df_drift = pd.DataFrame(drift_results).sort_values('value', ascending=False)
    save_table(df_drift, 'drift_analysis.csv')
    display(df_drift.head())




## Chapter I: Bias / Fairness EDA
- **Goal**: Explore default rates across sensitive attributes (Gender, Age).



In [None]:

# 1. Gender: group sizes and default rate per gender code.
if 'CODE_GENDER' in df_train.columns:
    print("Gender Analysis:")
    g_grp = df_train.groupby('CODE_GENDER')[TARGET_COL].agg(['count', 'mean'])
    display(g_grp)

    plt.figure(figsize=(6, 4))
    sns.barplot(data=df_train, x='CODE_GENDER', y=TARGET_COL, errorbar=('ci', 95))
    plt.title('Default Rate by Gender')
    save_figure(plt.gcf(), 'chapter_i_gender_bias.png')
    plt.show()

# 2. Age: DAYS_BIRTH divided by -365 gives AGE, which is then binned.
if 'DAYS_BIRTH' in df_train.columns:
    age_edges = [20, 30, 40, 50, 60, 70, 100]
    df_train['AGE'] = df_train['DAYS_BIRTH'] / -365
    df_train['AGE_BIN'] = pd.cut(df_train['AGE'], bins=age_edges)

    print("Age Analysis:")
    age_grp = df_train.groupby('AGE_BIN')[TARGET_COL].agg(['count', 'mean'])
    display(age_grp)

    plt.figure(figsize=(10, 4))
    sns.barplot(data=df_train, x='AGE_BIN', y=TARGET_COL, errorbar=('ci', 95))
    plt.title('Default Rate by Age Group')
    save_figure(plt.gcf(), 'chapter_i_age_bias.png')
    plt.show()




## Executive Summary & Outputs
The analysis is complete. Key tables have been saved to the `tables/` directory and figures to the `figures/` directory.
The cell below writes a short machine-readable summary to `summary.json`.



In [None]:

# Machine-readable summary of the run.
summary_json = {
    "n_rows": int(len(df_train)),
    # df_schema holds one row per column of the file as loaded. Counting
    # df_train.columns here would also count the helper columns added during
    # the analysis (n_missing_row, is_365243, AGE, AGE_BIN).
    "n_cols": int(len(df_schema)),
    "target_prevalence": float(prevalence),
    "top_missing_cols": top_missing['col_name'].head(5).tolist(),
    # The signal rankings are optional: fall back to an empty list when the
    # corresponding chapter was not run.
    "top_numeric_signals": df_num_signals['feature'].head(5).tolist() if 'df_num_signals' in locals() else [],
    "top_categorical_signals": df_cat_stats['feature'].head(5).tolist() if 'df_cat_stats' in locals() else []
}

# Explicit encoding so the output does not depend on the platform default.
with open('summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary_json, f, indent=4)

print("Summary JSON saved.")
print("To generate HTML report, try using: jupyter nbconvert --to html eda_report.ipynb")

