
# EMI Predict AI – Full Exploratory Data Analysis (EDA)

This notebook implements a **complete EDA** for the EMI Predict AI – Intelligent Financial Risk Assessment project.

It covers the following key areas:
1. Dataset shape and structure  
2. Missing values and duplicates  
3. Summary statistics (numeric & categorical)  
4. Target variable analysis (classification & regression)  
5. Scenario-based analysis  
6. Correlation analysis  
7. Engineered features and financial behaviour  
8. Risk factor analysis  
9. Outliers and business insights support



In [None]:

# ============================================================
# 0. IMPORTS & LOAD DATA
# ============================================================
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seaborn is an optional dependency: when it cannot be imported, `sns` is
# bound to None so later code can test for it before using seaborn plots.
try:
    import seaborn as sns
    sns.set(style="whitegrid")
except ImportError:
    sns = None

# Notebook-wide matplotlib defaults: figure size and grid lines.
plt.rcParams.update({"figure.figsize": (10, 6), "axes.grid": True})

# Resolve the project configuration (DATA_PATH, TARGET_CLASS, TARGET_REG) and
# the feature-engineering helper, then load the dataset.
#
# Two layouts are attempted, in order:
#   1. package layout  -> src.config / src.feature_engineering
#   2. flat layout     -> config.py / feature_engineering.py on the import path
DATA_PATH = None
TARGET_CLASS = None
TARGET_REG = None
engineer_features = None

try:
    from src.config import DATA_PATH as SRC_DATA_PATH, TARGET_CLASS as SRC_TARGET_CLASS, TARGET_REG as SRC_TARGET_REG
    from src.feature_engineering import engineer_features as src_engineer_features
    # Bind the public names only after BOTH imports succeeded, so a partial
    # failure leaves everything unset and the fallback below still runs.
    DATA_PATH = SRC_DATA_PATH
    TARGET_CLASS = SRC_TARGET_CLASS
    TARGET_REG = SRC_TARGET_REG
    engineer_features = src_engineer_features
    print("Using src.config and src.feature_engineering")
except Exception as e:
    # Best-effort: any failure while importing the package layout is reported
    # and the flat layout is tried next.
    print("src.* import failed:", e)

if DATA_PATH is None:
    try:
        # Fallback: flat modules (config.py, feature_engineering.py). This must
        # NOT import from src.* again, otherwise it repeats the failed attempt.
        from config import DATA_PATH as LOCAL_DATA_PATH, TARGET_CLASS as LOCAL_TARGET_CLASS, TARGET_REG as LOCAL_TARGET_REG
        from feature_engineering import engineer_features as local_engineer_features
        DATA_PATH = LOCAL_DATA_PATH
        TARGET_CLASS = LOCAL_TARGET_CLASS
        TARGET_REG = LOCAL_TARGET_REG
        engineer_features = local_engineer_features
        print("Using local config.py and feature_engineering.py")
    except Exception as e:
        print("Local config import failed:", e)

print("DATA_PATH:", DATA_PATH)
print("TARGET_CLASS:", TARGET_CLASS)
print("TARGET_REG:", TARGET_REG)

# Without a data path and the feature-engineering helper the rest of the
# notebook cannot run, so stop here with an explicit error.
if DATA_PATH is None or engineer_features is None:
    raise RuntimeError("Could not resolve DATA_PATH or engineer_features. Please check your project structure.")

df = pd.read_csv(DATA_PATH)
print("Data loaded! Shape:", df.shape)

df.head()


## 1. Dataset Overview

In [None]:

# ------------------------------------------------------------
# 1. DATASET OVERVIEW: dimensions, column dtypes, first rows
# ------------------------------------------------------------
n_rows_cols = df.shape
print("Shape of dataset (rows, columns):", n_rows_cols)

# Heading and dtype listing are emitted through a single print call,
# separated by a newline.
print("\nColumn data types:", df.dtypes, sep="\n")

print("\nFirst 5 rows:")
df.head(5)


## 2. Missing Values & Duplicates

In [None]:

# ------------------------------------------------------------
# 2. MISSING VALUES & DUPLICATES
# ------------------------------------------------------------
# Per-column count of missing cells, largest first.
null_mask = df.isna()
missing_counts = null_mask.sum().sort_values(ascending=False)

# The same counts expressed as a percentage of all rows (2 decimals).
missing_pct = (missing_counts / len(df) * 100).round(2)

# Combine both measures into one table indexed by column name.
missing_df = missing_counts.to_frame(name="missing_count")
missing_df["missing_pct"] = missing_pct

print("Missing values per column (top 20):")
missing_df.head(20)


In [None]:

# Bar chart of the percentage of missing values, restricted to columns that
# actually have missing data. Seaborn is used when available; otherwise the
# same bars are drawn with plain matplotlib so the plot is never skipped just
# because an optional dependency is absent.
non_zero = missing_df[missing_df["missing_count"] > 0]

if non_zero.empty:
    # Nothing to draw: report this case on its own instead of mixing it with
    # the "seaborn not installed" case.
    print("No missing values found.")
else:
    plt.figure(figsize=(10, 5))
    if sns is not None:
        sns.barplot(x=non_zero.index, y="missing_pct", data=non_zero)
    else:
        # Column labels are cast to str so matplotlib treats them as categories.
        plt.bar(non_zero.index.astype(str), non_zero["missing_pct"])
    plt.xticks(rotation=90)
    plt.ylabel("% Missing")
    plt.title("Missing Values (%) per Column")
    plt.tight_layout()
    plt.show()


In [None]:

# Count rows that are exact repeats of an earlier row.
duplicate_mask = df.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")


## 3. Summary Statistics – Numeric Features

In [None]:

# ------------------------------------------------------------
# 3. SUMMARY STATISTICS FOR NUMERIC FEATURES
# ------------------------------------------------------------
# Names of all columns whose dtype is a numpy numeric type.
numeric_only = df.select_dtypes(include=[np.number])
numeric_cols = numeric_only.columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# One row per numeric column, one column per summary statistic.
numeric_summary = df[numeric_cols].describe()
numeric_summary.transpose()


## 4. Categorical Features Overview

In [None]:

# ------------------------------------------------------------
# 4. CATEGORICAL FEATURE OVERVIEW
# ------------------------------------------------------------
# Columns stored as object or pandas categorical dtype.
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
print("Categorical columns:", cat_cols)

# For each categorical column: number of distinct values and the five most
# frequent values with their counts.
summary_cats = [
    {
        "column": name,
        "n_unique": df[name].nunique(),
        "top_values": df[name].value_counts().head(5).to_dict(),
    }
    for name in cat_cols
]

pd.DataFrame(summary_cats)


## 5. Target Variable (Classification) – `emi_eligibility`

In [None]:

# ------------------------------------------------------------
# 5. TARGET (CLASSIFICATION): class counts, percentages, count plot
# ------------------------------------------------------------
if TARGET_CLASS not in df.columns:
    print("TARGET_CLASS not found in dataframe.")
else:
    class_counts = df[TARGET_CLASS].value_counts()
    class_pct = (df[TARGET_CLASS].value_counts(normalize=True) * 100).round(2)

    print("Target (classification):", TARGET_CLASS)
    print("\nValue counts:")
    print(class_counts)
    print("\nPercentage distribution:")
    print(class_pct)

    if sns is not None:
        # Bars are ordered from the most to the least frequent class.
        plt.figure()
        sns.countplot(x=TARGET_CLASS, data=df, order=class_counts.index)
        plt.title("Distribution of EMI Eligibility Classes")
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()


## 6. Target Variable (Regression) – `max_monthly_emi`

In [None]:

# ------------------------------------------------------------
# 6. TARGET (REGRESSION): summary statistics and distribution plots
# ------------------------------------------------------------
if TARGET_REG not in df.columns:
    print("TARGET_REG not found in dataframe.")
else:
    reg_values = df[TARGET_REG]
    print("Target (regression):", TARGET_REG)
    display(reg_values.describe())

    if sns is None:
        # Plain pandas/matplotlib histogram when seaborn is unavailable.
        reg_values.hist(bins=50)
        plt.title("Distribution of Max Monthly EMI")
        plt.xlabel(TARGET_REG)
        plt.ylabel("Count")
        plt.show()
    else:
        # Left panel: histogram with KDE. Right panel: boxplot.
        fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
        hist_ax, box_ax = axes

        sns.histplot(reg_values, kde=True, ax=hist_ax)
        hist_ax.set_title("Distribution of Max Monthly EMI")
        hist_ax.set_xlabel(TARGET_REG)

        sns.boxplot(x=reg_values, ax=box_ax)
        box_ax.set_title("Boxplot of Max Monthly EMI")

        plt.tight_layout()
        plt.show()


## 7. EMI Scenario-Based Analysis

In [None]:

# ------------------------------------------------------------
# 7. EMI SCENARIO ANALYSIS: frequencies and scenario/eligibility crosstab
# ------------------------------------------------------------
scenario_col = "emi_scenario"

if scenario_col not in df.columns:
    print("Column 'emi_scenario' not found in dataframe.")
else:
    scenario_counts = df[scenario_col].value_counts()
    scenario_pct = (df[scenario_col].value_counts(normalize=True) * 100).round(2)

    print("Scenario value counts:")
    print(scenario_counts)
    print("\nScenario percentage distribution:")
    print(scenario_pct)

    if sns is not None:
        # Bars ordered from the most to the least frequent scenario.
        plt.figure()
        sns.countplot(x=scenario_col, data=df, order=scenario_counts.index)
        plt.xticks(rotation=45)
        plt.title("Distribution of EMI Scenarios")
        plt.tight_layout()
        plt.show()

    # Cross-tabulate scenario against the classification target, if present.
    if TARGET_CLASS in df.columns:
        cross_tab = pd.crosstab(df[scenario_col], df[TARGET_CLASS])
        print("\nScenario vs Eligibility (counts):")
        display(cross_tab)

        if sns is not None:
            plt.figure(figsize=(8, 6))
            sns.heatmap(cross_tab, annot=True, fmt="d", cmap="Blues")
            plt.title("EMI Scenario vs Eligibility")
            plt.ylabel("Scenario")
            plt.xlabel("Eligibility Class")
            plt.tight_layout()
            plt.show()


## 8. Correlation Analysis

In [None]:

# ============================================================
# 8. CORRELATION ANALYSIS (NUMERIC)
# ============================================================
# Pairwise correlation matrix over the numeric columns collected earlier in
# `numeric_cols`.
corr = df[numeric_cols].corr()

# Rank the other features by their correlation with the regression target.
if TARGET_REG in corr.columns:
    # Exclude the target's own entry (its self-correlation would otherwise
    # occupy the top slot of the ranking) and drop undefined (NaN)
    # correlations so they cannot show up in a short ranking.
    target_corr = corr[TARGET_REG].drop(labels=TARGET_REG).dropna()

    print("Top positive correlations with max_monthly_emi:")
    print(target_corr.sort_values(ascending=False).head(10))

    print("\nMost negative correlations with max_monthly_emi:")
    print(target_corr.sort_values(ascending=True).head(10))

if sns is not None:
    # Diverging colour map centred on zero so the sign of each correlation
    # is visible at a glance.
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, cmap="coolwarm", center=0)
    plt.title("Correlation Heatmap (Numeric Features)")
    plt.tight_layout()
    plt.show()
else:
    print("Seaborn not available; skipping correlation heatmap.")


## 9. Engineered Financial Features

In [None]:
# ============================================================
# 9. APPLY ENGINEERED FEATURES
#    (ensure key numeric columns are actually numeric)
# ============================================================

# Columns that should be numeric but might have been read as strings
numeric_like_cols = [
    "monthly_salary",
    "school_fees",
    "college_fees",
    "travel_expenses",
    "groceries_utilities",
    "other_monthly_expenses",
    "monthly_rent",
    "current_emi_amount",
    "credit_score",
    "max_monthly_emi",
    "requested_amount",
    "requested_tenure",
    "bank_balance",
    "emergency_fund",
]

for col in numeric_like_cols:
    if col not in df.columns:
        continue
    converted = pd.to_numeric(df[col], errors="coerce")
    # errors="coerce" turns every unparsable value into NaN. Report how many
    # previously present values were lost so the conversion is not silent.
    n_coerced = int(converted.isna().sum() - df[col].isna().sum())
    if n_coerced > 0:
        print(f"Column '{col}': {n_coerced} non-numeric value(s) coerced to NaN")
    df[col] = converted

# Feature engineering runs on a copy so `df` itself is not modified by it.
df_feat = engineer_features(df.copy())
print("Original shape:", df.shape)
print("With engineered features:", df_feat.shape)

# Engineered columns this notebook looks for in the output of
# engineer_features; only those actually present are analysed.
engineered_cols = [
    "total_education_fees",
    "total_living_expenses",
    "total_expenses",
    "disposable_income",
    "dti_ratio",
    "expense_to_income_ratio",
    "savings_rate",
    "emi_burden_ratio",
    "existing_loans_flag",
    "high_credit_score_flag",
    "medium_credit_score_flag",
    "affordability_score",
]

available_engineered = [c for c in engineered_cols if c in df_feat.columns]
print("Available engineered features:", available_engineered)

# describe() raises on a frame without columns, so guard the empty case.
if available_engineered:
    display(df_feat[available_engineered].describe().T)
else:
    print("No engineered features found in the engineered dataframe.")


### 9.1 Engineered Feature Distributions

In [None]:

# Histogram (with KDE) for every available engineered feature whose dtype is
# not object; skipped entirely when seaborn is missing.
if sns is None:
    print("Seaborn not available; skipping engineered-feature distributions.")
else:
    plottable_cols = [
        name for name in available_engineered if df_feat[name].dtype != "O"
    ]
    for col in plottable_cols:
        plt.figure()
        sns.histplot(df_feat[col], kde=True)
        plt.title(f"Distribution of {col}")
        plt.tight_layout()
        plt.show()


## 10. Income vs EMI Eligibility

In [None]:

# ------------------------------------------------------------
# 10. INCOME vs ELIGIBILITY: per-class summary and boxplot
# ------------------------------------------------------------
income_col = "monthly_salary"

income_inputs_present = (
    income_col in df_feat.columns and TARGET_CLASS in df_feat.columns
)

if not income_inputs_present:
    print("Either income_col or TARGET_CLASS not found.")
else:
    print("Income stats by eligibility:")
    income_by_class = df_feat.groupby(TARGET_CLASS)[income_col]
    display(income_by_class.describe())

    if sns is not None:
        plt.figure(figsize=(8, 5))
        sns.boxplot(x=TARGET_CLASS, y=income_col, data=df_feat)
        plt.title("Monthly Salary by EMI Eligibility")
        plt.tight_layout()
        plt.show()


## 11. Credit Score vs EMI Eligibility

In [None]:

# ============================================================
# 11. CREDIT SCORE vs ELIGIBILITY
# ============================================================
# Summary statistics and a boxplot of credit score, grouped by the
# classification target. Requires both columns in the engineered dataframe.
credit_col = "credit_score"

if credit_col in df_feat.columns and TARGET_CLASS in df_feat.columns:
    print("Credit score stats by eligibility:")
    display(df_feat.groupby(TARGET_CLASS)[credit_col].describe())

    if sns is not None:
        plt.figure(figsize=(8, 5))
        sns.boxplot(x=TARGET_CLASS, y=credit_col, data=df_feat)
        plt.title("Credit Score by EMI Eligibility")
        plt.tight_layout()
        plt.show()
else:
    # The message previously contained a stray tab between "not" and "found".
    print("Either credit_col or TARGET_CLASS not found.")


## 12. DTI Ratio vs EMI Eligibility

In [None]:

# ------------------------------------------------------------
# 12. DTI RATIO vs ELIGIBILITY: per-class summary and boxplot
# ------------------------------------------------------------
dti_col = "dti_ratio"

dti_inputs_present = (
    dti_col in df_feat.columns and TARGET_CLASS in df_feat.columns
)

if not dti_inputs_present:
    print("Either dti_col or TARGET_CLASS not found.")
else:
    print("DTI ratio stats by eligibility:")
    dti_by_class = df_feat.groupby(TARGET_CLASS)[dti_col]
    display(dti_by_class.describe())

    if sns is not None:
        plt.figure(figsize=(8, 5))
        sns.boxplot(x=TARGET_CLASS, y=dti_col, data=df_feat)
        plt.title("DTI Ratio by EMI Eligibility")
        plt.tight_layout()
        plt.show()


## 13. Expenses, Rent & Eligibility

In [None]:

# ------------------------------------------------------------
# 13. EXPENSES vs ELIGIBILITY: per-class summary and boxplot per column
# ------------------------------------------------------------
expense_cols = ["total_expenses", "expense_to_income_ratio", "monthly_rent"]

for col in expense_cols:
    # Skip silently when either the expense column or the target is absent.
    if col not in df_feat.columns or TARGET_CLASS not in df_feat.columns:
        continue

    print(f"\n=== {col} by eligibility ===")
    display(df_feat.groupby(TARGET_CLASS)[col].describe())

    # Boxplot only with seaborn available and a non-object column.
    if sns is None or df_feat[col].dtype == "O":
        continue

    plt.figure(figsize=(8, 5))
    sns.boxplot(x=TARGET_CLASS, y=col, data=df_feat)
    plt.title(f"{col} by EMI Eligibility")
    plt.tight_layout()
    plt.show()


## 14. Existing Loans vs Eligibility

In [None]:

# ------------------------------------------------------------
# 14. EXISTING LOANS vs ELIGIBILITY: crosstab and heatmap
# ------------------------------------------------------------
loans_inputs_present = (
    "existing_loans" in df_feat.columns and TARGET_CLASS in df_feat.columns
)

if not loans_inputs_present:
    print("existing_loans column or TARGET_CLASS not found.")
else:
    print("Existing loans vs eligibility (counts):")
    # Rows: existing_loans values. Columns: eligibility classes.
    ctab = pd.crosstab(df_feat["existing_loans"], df_feat[TARGET_CLASS])
    display(ctab)

    if sns is not None:
        plt.figure(figsize=(6, 4))
        sns.heatmap(ctab, annot=True, fmt="d", cmap="Blues")
        plt.title("Existing Loans vs Eligibility")
        plt.ylabel("Existing Loans")
        plt.xlabel("Eligibility")
        plt.tight_layout()
        plt.show()


## 15. Disposable Income Analysis

In [None]:

# ------------------------------------------------------------
# 15. DISPOSABLE INCOME: overall summary, histogram, per-class boxplot
# ------------------------------------------------------------
disp_col = "disposable_income"

if disp_col not in df_feat.columns:
    print("disposable_income column not found in engineered dataframe.")
else:
    disp_values = df_feat[disp_col]
    print("Overall disposable income stats:")
    display(disp_values.describe())

    # Both plots need seaborn; the boxplot additionally needs the target.
    if sns is not None:
        plt.figure()
        sns.histplot(disp_values, kde=True)
        plt.title("Distribution of Disposable Income")
        plt.tight_layout()
        plt.show()

        if TARGET_CLASS in df_feat.columns:
            plt.figure(figsize=(8, 5))
            sns.boxplot(x=TARGET_CLASS, y=disp_col, data=df_feat)
            plt.title("Disposable Income by EMI Eligibility")
            plt.tight_layout()
            plt.show()


## 16. Engineered Features vs Eligibility

In [None]:

# ------------------------------------------------------------
# 16. ENGINEERED FEATURES vs ELIGIBILITY: summary and boxplot per feature
# ------------------------------------------------------------
if TARGET_CLASS not in df_feat.columns or sns is None:
    print("Either TARGET_CLASS missing or seaborn not available.")
else:
    for col in available_engineered:
        # Object-dtype columns are skipped.
        if df_feat[col].dtype == "O":
            continue

        print(f"\n=== {col} by eligibility ===")
        per_class = df_feat.groupby(TARGET_CLASS)[col]
        display(per_class.describe())

        plt.figure(figsize=(8, 5))
        sns.boxplot(x=TARGET_CLASS, y=col, data=df_feat)
        plt.title(f"{col} by EMI Eligibility")
        plt.tight_layout()
        plt.show()


## 17. Scenario-Based Rejection, Outliers & Business Insight Support

In [None]:
# 17.a Share of each eligibility class within every EMI scenario
scenario_col = "emi_scenario"
if scenario_col in df_feat.columns and TARGET_CLASS in df_feat.columns:
    # Number of rows for every (scenario, class) pair, as a flat table.
    pair_sizes = df_feat.groupby([scenario_col, TARGET_CLASS]).size()
    scenario_stats = pair_sizes.reset_index(name="count")

    # Divide each pair count by the total row count of its scenario.
    per_scenario_total = scenario_stats.groupby(scenario_col)["count"].transform("sum")
    scenario_stats["proportion"] = scenario_stats["count"] / per_scenario_total

    print("Scenario-wise eligibility proportions:")
    display(scenario_stats.head(20))

    if sns is not None:
        # Grouped bars: one group per scenario, one bar per class.
        plt.figure(figsize=(10, 6))
        sns.barplot(data=scenario_stats, x=scenario_col, y="proportion", hue=TARGET_CLASS)
        plt.title("Eligibility Proportion by EMI Scenario")
        plt.ylabel("Proportion")
        plt.tight_layout()
        plt.show()
