In [10]:
# -*- coding: utf-8 -*-
# Train data generator (preserves exact columns; writes training_data_aligned.csv)

import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

np.random.seed(111)

OUT_DIR = "credit_risk_output"
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------------
#  NEW: Load external reference datasets
# -------------------------------
LOAN_DATA_PATH = "Loan_default (1).csv"
TELCO_DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Safe loading with error handling
try:
    loan_df = pd.read_csv(LOAN_DATA_PATH)
    print(f"[INFO] Loaded loan dataset: {loan_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {LOAN_DATA_PATH} not found. Using fallback distributions.")
    loan_df = None

try:
    telco_df = pd.read_csv(TELCO_DATA_PATH)
    print(f"[INFO] Loaded telco dataset: {telco_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {TELCO_DATA_PATH} not found. Using fallback distributions.")
    telco_df = None

# Pre-calculate distributions from loan dataset
if loan_df is not None:
    loan_age_stats = loan_df["age"].dropna().describe() if "age" in loan_df.columns else None
    loan_income_stats = loan_df["income"].dropna().describe() if "income" in loan_df.columns else None
    loan_amount_stats = loan_df["loan_amount"].dropna().describe() if "loan_amount" in loan_df.columns else None
else:
    loan_age_stats = loan_income_stats = loan_amount_stats = None

# Pre-calculate distributions from telco dataset
if telco_df is not None:
    telco_tenure_stats = telco_df["tenure"].dropna().describe() if "tenure" in telco_df.columns else None
    telco_charges_stats = telco_df["MonthlyCharges"].dropna().describe() if "MonthlyCharges" in telco_df.columns else None
else:
    telco_tenure_stats = telco_charges_stats = None

# -------------------------------
# Core model features (updated to include loan application amount)
# -------------------------------
MODEL_FEATURES = [
    "age", "monthly_income_inr", "monthly_expenses_inr", "monthly_savings_inr",
    "outstanding_loan_amount_inr", "loan_amount_applied_inr", "years_current_employment", "banking_relationship_years",
    "timeliness_score", "repayment_ability_score", "financial_health_score",
    "payment_reliability_score", "stability_index"
]

TARGET_REG = "probability_of_default"
TARGET_CLS = "risk_category"

# Updated column list with new loan application amount
ALL_COLUMNS = [
    "applicant_id", "application_date", "age", "gender", "education_level",
    "employment_type", "marital_status", "family_size", "number_of_dependents",
    "location_type", "monthly_income_inr", "spouse_income_inr", "monthly_expenses_inr",
    "monthly_savings_inr", "monthly_utility_bills_inr", "property_value_inr",
    "vehicle_value_inr", "total_investments_inr", "outstanding_loan_amount_inr",
    "loan_amount_applied_inr", "years_current_employment", "banking_relationship_years",
    "monthly_business_revenue_inr", "daily_mobile_hours", "monthly_digital_transactions",
    "avg_transaction_amount_inr", "social_media_accounts_count", "mobile_app_usage_intensity_score",
    "digital_payment_adoption_score", "utility_payment_regularity_score",
    "location_stability_score", "mobile_banking_usage_score", "payment_reliability_score",
    "financial_health_score", "stability_index", "timeliness_score",
    "repayment_ability_score", "probability_of_default", "data_completeness_pct",
    "consent_status", "explainability_support_flag", "city", "risk_category"
]

GUJARAT_CITIES = [
    "Ahmedabad", "Surat", "Vadodara", "Rajkot", "Bhavnagar", "Jamnagar", "Junagadh",
    "Gandhinagar", "Nadiad", "Morbi", "Anand", "Mehsana", "Navsari", "Bharuch",
    "Vapi", "Valsad", "Patan", "Godhra", "Porbandar", "Palanpur", "Veraval", "Surendranagar"
]
EDUCATION_LEVELS = ["High School", "Diploma", "Graduate", "Post Graduate", "Professional"]
EMPLOYMENT_TYPES = ["Salaried", "Self Employed", "Business Owner", "Professional"]
LOCATION_TYPES = ["Metro", "Tier1", "Tier2"]

# NEW: City to location type mapping as specified
CITY_TO_LOCATION_TYPE = {
    "Ahmedabad": "Metro",
    "Surat": "Metro",
    "Vadodara": "Metro",
    "Rajkot": "Tier1",
    "Bhavnagar": "Tier1",
    "Jamnagar": "Tier1",
    "Junagadh": "Tier2",
    "Gandhinagar": "Metro",
    "Nadiad": "Tier2",
    "Morbi": "Tier2",
    "Anand": "Tier2",
    "Mehsana": "Tier2",
    "Navsari": "Tier2",
    "Bharuch": "Tier2",
    "Vapi": "Tier2",
    "Valsad": "Tier2",
    "Patan": "Tier2",
    "Godhra": "Tier2",
    "Porbandar": "Tier2",
    "Palanpur": "Tier2",
    "Veraval": "Tier2",
    "Surendranagar": "Tier1"
}

# -------------------------------
# NEW: Sampling functions from datasets
# -------------------------------
def _sample_age():
    """Sample age from loan dataset or fallback to original distribution"""
    if loan_age_stats is not None:
        sampled_age = int(np.random.normal(loan_age_stats["mean"], loan_age_stats["std"]))
        return max(18, min(75, sampled_age))  # Clip to reasonable range
    else:
        # Fallback to original distribution
        return int(np.random.normal(36, 12))

def _sample_income():
    """Sample income from loan dataset or fallback to original distribution"""
    if loan_income_stats is not None:
        sampled_income = int(np.random.normal(loan_income_stats["mean"], loan_income_stats["std"]))
        return max(15000, min(500000, sampled_income))  # Clip to reasonable range
    else:
        # Fallback to original distribution
        return int(np.random.normal(55000, 25000))

def _sample_loan_amount():
    """Sample loan amount from loan dataset or fallback to original distribution"""
    if loan_amount_stats is not None:
        sampled_amount = int(np.random.normal(loan_amount_stats["mean"], loan_amount_stats["std"]))
        return max(15000, min(6000000, sampled_amount))  # Clip to reasonable range
    else:
        # Fallback to original distribution
        return int(np.random.normal(250000, 80000))

def _sample_tenure():
    """Sample tenure/relationship years from telco dataset or fallback"""
    if telco_tenure_stats is not None:
        sampled_tenure = np.random.normal(telco_tenure_stats["mean"], telco_tenure_stats["std"])
        return max(0.5, min(35.0, sampled_tenure))  # Clip to reasonable range
    else:
        # Fallback to original distribution
        return np.random.uniform(0.5, 15.0)

def _sample_digital_transactions():
    """Sample digital transaction count based on telco usage patterns"""
    if telco_charges_stats is not None:
        # Use monthly charges as proxy for digital activity
        base_transactions = int(telco_charges_stats["mean"] / 2)  # Rough conversion
        noise = np.random.normal(0, telco_charges_stats["std"] / 4)
        return max(5, int(base_transactions + noise))
    else:
        # Fallback to original distribution
        base = int(np.random.normal(60, 20))
        return max(5, base)

def _sample_monthly_charges():
    """Sample monthly charges/utility bills from telco dataset"""
    if telco_charges_stats is not None:
        charges = np.random.normal(telco_charges_stats["mean"], telco_charges_stats["std"])
        return max(500, int(charges * 60))  # Convert USD to INR approximately
    else:
        return int(np.random.normal(2500, 800))

# -------------------------------
# NEW: Function to generate realistic loan application amount
# -------------------------------
def _generate_loan_application_amount(income, age, segment, property_value, employment_type):
    """
    Generate realistic loan application amount based on customer profile
    """
    # Base application amount from dataset sampling
    base_amount = _sample_loan_amount()

    # Income-based adjustment (higher income = higher applications)
    income_multiplier = min(3.0, max(0.3, income / 50000))

    # Age-based loan purpose patterns
    if age < 30:
        # Young people: personal loans, vehicle loans, small home loans
        purpose_multiplier = np.random.choice([0.6, 1.2, 2.0], p=[0.5, 0.3, 0.2])
    elif age < 45:
        # Prime age: home loans, business loans, higher amounts
        purpose_multiplier = np.random.choice([1.0, 2.5, 4.0], p=[0.3, 0.4, 0.3])
    else:
        # Mature: consolidation, business expansion, moderate amounts
        purpose_multiplier = np.random.choice([0.8, 1.8, 3.0], p=[0.4, 0.4, 0.2])

    # Property ownership influence
    if property_value > 0:
        # Existing property owners tend to apply for larger amounts
        property_multiplier = np.random.uniform(1.2, 2.0)
    else:
        property_multiplier = np.random.uniform(0.7, 1.3)

    # Employment type influence
    employment_multipliers = {
        "Salaried": np.random.uniform(0.8, 1.4),
        "Professional": np.random.uniform(1.0, 1.8),
        "Business Owner": np.random.uniform(1.2, 2.5),
        "Self Employed": np.random.uniform(0.6, 1.6)
    }
    emp_multiplier = employment_multipliers.get(employment_type, 1.0)

    # Segment-based adjustments
    segment_adjustments = {
        "excellent": np.random.uniform(1.5, 3.0),  # Apply for larger amounts
        "good": np.random.uniform(1.0, 2.0),
        "fair": np.random.uniform(0.7, 1.5),
        "poor": np.random.uniform(0.4, 1.0),
        "bad": np.random.uniform(0.3, 0.8)  # Smaller applications due to desperation
    }
    segment_multiplier = segment_adjustments.get(segment, 1.0)

    # Calculate final application amount
    application_amount = int(base_amount * income_multiplier * purpose_multiplier *
                           property_multiplier * emp_multiplier * segment_multiplier)

    # Realistic bounds (minimum 25k, maximum 8x annual income)
    min_amount = 25000
    max_amount = min(8000000, int(income * 12 * 8))  # Max 8x annual income

    return max(min_amount, min(max_amount, application_amount))

# -------------------------------
# Utility functions (unchanged)
# -------------------------------
def _clip(v, lo, hi):
    return max(lo, min(hi, v))

def _sigmoid(x):
    import numpy as _np
    return 1.0 / (1.0 + _np.exp(-_np.clip(x, -500, 500)))

def _risk_category_from_p(p):
    if p <= 0.18: return "Low Risk"
    elif p <= 0.42: return "Medium Risk"
    elif p <= 0.68: return "High Risk"
    else: return "Very High Risk"

def _generate_applicant_id():
    prefix = np.random.choice(["A", "B", "C", "D", "E", "F"])
    number = np.random.randint(1000, 9999)
    return f"{prefix}{number:04d}"

def _generate_application_date():
    start_date = datetime(2024, 6, 1)
    end_date = datetime(2025, 7, 31)
    days_diff = (end_date - start_date).days
    random_days = np.random.randint(0, max(1, days_diff))
    return (start_date + timedelta(days=int(random_days))).strftime("%Y-%m-%d")

def calculate_data_driven_scores(row):
    """
    Calculate scores based on actual financial data including loan application amount.
    """
    import numpy as np

    # Extract key financial metrics
    income = max(1.0, float(row["monthly_income_inr"]))
    expenses = float(row["monthly_expenses_inr"])
    savings = float(row["monthly_savings_inr"])
    loan_amount = float(row["outstanding_loan_amount_inr"])
    application_amount = float(row["loan_amount_applied_inr"])  # NEW
    age = int(row["age"])
    emp_years = float(row["years_current_employment"])
    bank_years = float(row["banking_relationship_years"])
    property_value = float(row["property_value_inr"])
    investments = float(row["total_investments_inr"])

    # Calculate ratios including new application-based metrics
    dti_ratio = loan_amount / (12.0 * income) if income > 0 else 0
    application_to_income_ratio = application_amount / (12.0 * income) if income > 0 else 0
    loan_utilization_ratio = loan_amount / max(1.0, application_amount)  # How much of applied amount is outstanding
    expense_ratio = expenses / income if income > 0 else 1
    savings_ratio = savings / income if income > 0 else 0

    def _clip_local(value, min_val, max_val):
        return max(min_val, min(max_val, value))

    # 1. TIMELINESS SCORE (5-95) - Enhanced with application amount consideration
    timeliness_base = (
        min(emp_years * 8, 40) +
        min(bank_years * 6, 30) +
        min((age - 18) * 0.8, 20) +
        5
    )
    # Enhanced penalties including application behavior
    timeliness_penalty = (dti_ratio * 15 +
                         max(0, expense_ratio - 0.7) * 20 +
                         max(0, application_to_income_ratio - 1.5) * 10)  # Penalty for over-application
    timeliness_score = _clip_local(int(timeliness_base - timeliness_penalty + np.random.randint(-8, 9)), 5, 95)

    # 2. REPAYMENT ABILITY SCORE (5-90) - Enhanced with application amount factors
    repayment_base = (
        min(np.log(income/25000) * 15, 30) +
        max(0, savings_ratio * 40) +
        min(emp_years * 2, 20)
    )
    # Enhanced penalties considering application vs outstanding ratio
    repayment_penalty = (dti_ratio * 25 +
                        max(0, expense_ratio - 0.8) * 15 +
                        max(0, application_to_income_ratio - 2.0) * 12 +  # Over-application penalty
                        max(0, loan_utilization_ratio - 0.9) * 8)  # High utilization penalty
    repayment_score = _clip_local(int(repayment_base - repayment_penalty + np.random.randint(-6, 7)), 5, 90)

    # 3. FINANCIAL HEALTH SCORE (10-95) - Enhanced with application amount
    asset_ratio = (property_value + investments) / max(income * 12, 1)
    financial_base = (
        min(np.log(income/20000) * 12, 25) +
        min(asset_ratio * 20, 30) +
        max(0, savings_ratio * 25) +
        min(bank_years * 1.5, 15)
    )
    # Consider application amount in financial health assessment
    financial_penalty = (dti_ratio * 20 +
                        max(0, expense_ratio - 0.75) * 18 +
                        max(0, application_to_income_ratio - 1.8) * 10)  # Over-application indicates poor planning
    financial_score = _clip_local(int(financial_base - financial_penalty + np.random.randint(-10, 11)), 10, 95)

    # 4. PAYMENT RELIABILITY SCORE (10-95) - Enhanced with loan behavior
    reliability_base = (
        min(emp_years * 4, 35) +
        max(0, (1 - expense_ratio) * 30) +
        min(income/5000, 20) +
        10
    )
    # Reliability affected by loan application vs utilization pattern
    reliability_penalty = (dti_ratio * 30 +
                          max(0, expense_ratio - 0.85) * 25 +
                          abs(loan_utilization_ratio - 0.7) * 8)  # Optimal utilization around 70%
    reliability_score = _clip_local(int(reliability_base - reliability_penalty + np.random.randint(-7, 8)), 10, 95)

    # 5. STABILITY INDEX (5-90) - Include application amount pattern
    stability_base = (
        min(emp_years * 3, 25) +
        min(bank_years * 2, 15) +
        min((age - 20) * 0.6, 20) +
        min(asset_ratio * 15, 20) +
        10
    )
    # Stability considers reasonable application amounts
    stability_penalty = (dti_ratio * 18 +
                        max(0, expense_ratio - 0.8) * 12 +
                        max(0, application_to_income_ratio - 2.5) * 8)  # Very high applications indicate instability
    stability_score = _clip_local(int(stability_base - stability_penalty + np.random.randint(-12, 13)), 5, 90)

    # 6. UTILITY PAYMENT REGULARITY SCORE (25-95) - Enhanced
    utility_base = (
        85 -
        dti_ratio * 25 -
        max(0, expense_ratio - 0.6) * 20 +
        min(savings_ratio * 15, 10)
    )
    utility_score = _clip_local(int(utility_base + np.random.randint(-8, 9)), 25, 95)

    # 7. LOCATION STABILITY SCORE (30-120) - Enhanced
    location_base = (
        bank_years * 8 +
        emp_years * 5 +
        (1 if property_value > 0 else 0) * 15 +
        min((age - 18) * 1.2, 25) +
        30
    )
    location_score = _clip_local(int(location_base + np.random.randint(-10, 11)), 30, 120)

    return {
        "timeliness_score": timeliness_score,
        "repayment_ability_score": repayment_score,
        "financial_health_score": financial_score,
        "payment_reliability_score": reliability_score,
        "stability_index": stability_score,
        "utility_payment_regularity_score": utility_score,
        "location_stability_score": location_score
    }

def _pd_from_features(row):
    """
    Enhanced probability of default calculation including loan application amount
    """
    income = max(1.0, float(row["monthly_income_inr"]))
    expenses = float(row["monthly_expenses_inr"])
    loan_amount = float(row["outstanding_loan_amount_inr"])
    application_amount = float(row["loan_amount_applied_inr"])  # NEW

    dti = loan_amount / (12.0 * income)
    app_to_income = application_amount / (12.0 * income)  # NEW RATIO
    exp_ratio = expenses / income

    s_avg = np.mean([
        _clip(row["timeliness_score"], 0, 100),
        _clip(row["repayment_ability_score"], 0, 100),
        _clip(row["financial_health_score"], 0, 100),
        _clip(row["payment_reliability_score"], 0, 100),
        _clip(row["stability_index"], 0, 100),
    ]) / 100.0

    tenure = max(0.0, float(row["years_current_employment"]))
    bank_rel = max(0.0, float(row["banking_relationship_years"]))
    age = int(row["age"])

    # Enhanced probability calculation with application amount factor
    x = (
        1.6 * dti +
        1.1 * exp_ratio +
        0.4 * app_to_income +  # NEW: Application amount impact on default probability
        -2.2 * s_avg +
        -0.18 * np.log1p(tenure) +
        -0.22 * np.log1p(bank_rel) +
        0.06 * (age < 25) +
        0.04 * (age > 60) +
        0.07 * _sigmoid((income - 100000.0) / 50000.0) * (dti > 0.7) +
        0.05 * (app_to_income > 3.0) - 0.3  # NEW: Penalty for excessive applications
    )

    edu = row.get("education_level", "Graduate")
    x += {"High School": 0.05, "Diploma": 0.02, "Graduate": -0.01, "Post Graduate": -0.03, "Professional": -0.05}.get(edu, 0.0)
    emp = row.get("employment_type", "Salaried")
    x += {"Salaried": -0.02, "Professional": -0.03, "Self Employed": 0.03, "Business Owner": 0.01}.get(emp, 0.0)

    # NEW: Light locality effect for improved model accuracy
    loc = row.get("location_type", "Tier2")
    x += {"Metro": -0.03, "Tier1": 0.00, "Tier2": 0.02}.get(loc, 0.0)

    base_pd = _sigmoid(x)
    pd = _clip(float(base_pd + np.random.normal(0, 0.08)), 0.01, 0.92)
    return pd

def _generate_profile(segment, age_mu=36, inc_mu=55000, inc_sigma=30000):
    if segment == "excellent":
        # Use sampled base values and adjust for segment
        age = max(28, min(65, _sample_age() + 5))
        base_income = _sample_income()
        income = int(max(60000, base_income * 1.5))
        exp_ratio = np.clip(np.random.normal(0.42, 0.09), 0.25, 0.62)
        dti = np.clip(np.random.normal(0.18, 0.07), 0.02, 0.38)
        education = np.random.choice(["Graduate", "Post Graduate", "Professional"], p=[0.3, 0.5, 0.2])
        employment = np.random.choice(["Salaried", "Professional", "Business Owner"], p=[0.4, 0.4, 0.2])
    elif segment == "good":
        age = max(25, min(58, _sample_age() + 2))
        base_income = _sample_income()
        income = int(max(40000, base_income * 1.15))
        exp_ratio = np.clip(np.random.normal(0.57, 0.11), 0.38, 0.78)
        dti = np.clip(np.random.normal(0.38, 0.13), 0.08, 0.72)
        education = np.random.choice(["Diploma", "Graduate", "Post Graduate"], p=[0.2, 0.6, 0.2])
        employment = np.random.choice(["Salaried", "Professional", "Self Employed"], p=[0.5, 0.3, 0.2])
    elif segment == "fair":
        age = max(22, min(55, _sample_age() - 1))
        base_income = _sample_income()
        income = int(max(25000, base_income * 0.88))
        exp_ratio = np.clip(np.random.normal(0.72, 0.12), 0.52, 0.88)
        dti = np.clip(np.random.normal(0.65, 0.22), 0.28, 1.25)
        education = np.random.choice(["High School", "Diploma", "Graduate"], p=[0.3, 0.5, 0.2])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.6, 0.4])
    elif segment == "poor":
        age = max(20, min(52, _sample_age() - 3))
        base_income = _sample_income()
        income = int(max(18000, base_income * 0.65))
        exp_ratio = np.clip(np.random.normal(0.83, 0.09), 0.68, 0.97)
        dti = np.clip(np.random.normal(1.15, 0.38), 0.65, 2.2)
        education = np.random.choice(["High School", "Diploma"], p=[0.7, 0.3])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.4, 0.6])
    else:  # bad
        age = max(18, min(48, _sample_age() - 5))
        base_income = _sample_income()
        income = int(max(15000, base_income * 0.52))
        exp_ratio = np.clip(np.random.normal(0.92, 0.07), 0.78, 1.00)
        dti = np.clip(np.random.normal(1.85, 0.55), 1.1, 3.8)
        education = np.random.choice(["High School", "Diploma"], p=[0.8, 0.2])
        employment = "Self Employed"

    gender = np.random.choice(["Male", "Female"], p=[0.55, 0.45])
    marital_status = np.random.choice(["Single", "Married"], p=[0.25, 0.75]) if age >= 28 else np.random.choice(["Single", "Married"], p=[0.7, 0.3])
    if marital_status == "Married":
        family_size = np.random.randint(2, 6)
        dependents = max(0, family_size - 2)
        spouse_income = int(np.random.uniform(0.2, 0.8) * income) if np.random.rand() < 0.6 else 0
    else:
        family_size, dependents, spouse_income = 1, 0, 0

    # CHANGED: Use deterministic city-to-location mapping
    city = np.random.choice(GUJARAT_CITIES)
    location_type = CITY_TO_LOCATION_TYPE.get(city, "Tier2")

    expenses = int(_clip(income * exp_ratio + np.random.randint(-3000, 3000), 2000, max(2500, income - 500)))
    savings = max(0, income - expenses - np.random.randint(0, 4000))

    # Asset generation (needed for loan application calculation)
    if income > 80000 and age > 30:
        property_value = int(income * np.random.uniform(25, 55) * (age / 40))
        vehicle_value = int(income * np.random.uniform(2, 8))
        investments = int(income * np.random.uniform(8, 25) * ((age - 25) / 20))
    elif income > 50000:
        property_value = int(income * np.random.uniform(15, 35) * (age / 40)) if np.random.rand() < 0.4 else 0
        vehicle_value = int(income * np.random.uniform(1, 5)) if np.random.rand() < 0.6 else 0
        investments = int(income * np.random.uniform(3, 12)) if np.random.rand() < 0.5 else 0
    else:
        property_value = 0
        vehicle_value = int(income * np.random.uniform(0.5, 3)) if np.random.rand() < 0.3 else 0
        investments = int(income * np.random.uniform(0.5, 5)) if np.random.rand() < 0.2 else 0

    # Generate loan application amount FIRST (needed for outstanding loan calculation)
    loan_application_amount = _generate_loan_application_amount(
        income, age, segment, property_value, employment
    )

    # Outstanding loan amount is typically less than or equal to application amount
    utilization_rate = np.clip(np.random.beta(2, 2), 0.3, 1.0)  # Most people use 30-100% of approved amount
    loan_amt = int(loan_application_amount * utilization_rate)

    # Ensure DTI constraints are still met
    max_affordable_loan = int(dti * 12 * income)
    loan_amt = min(loan_amt, max_affordable_loan)
    loan_amt = max(15000, loan_amt)  # Minimum loan amount

    # Use sampled utility bills
    utility_bills = _sample_monthly_charges()
    utility_bills = int(utility_bills * family_size * np.random.uniform(0.8, 1.2))

    # Use sampled tenure values
    emp_years = round(_clip(_sample_tenure() * np.random.uniform(0.6, 1.0), 0.5, 35), 1)
    bank_years = round(_clip(_sample_tenure() - np.random.uniform(-2.0, 3.0), 0.5, max(0.5, age - 18)), 1)

    if employment in ["Business Owner", "Self Employed"]:
        business_revenue = int(income * np.random.uniform(1.2, 2.8))
    else:
        business_revenue = 0

    if age < 35 and education in ["Graduate", "Post Graduate", "Professional"]:
        mobile_hours = round(np.random.uniform(6, 12), 1)
        digital_transactions = _sample_digital_transactions() + np.random.randint(20, 40)
        social_media = np.random.randint(4, 8)
        app_usage_score = np.random.randint(65, 95)
        digital_payment_score = np.random.randint(70, 95)
        mobile_banking_score = np.random.randint(60, 90)
    elif age < 50:
        mobile_hours = round(np.random.uniform(3, 8), 1)
        digital_transactions = _sample_digital_transactions()
        social_media = np.random.randint(2, 6)
        app_usage_score = np.random.randint(40, 75)
        digital_payment_score = np.random.randint(45, 80)
        mobile_banking_score = np.random.randint(35, 70)
    else:
        mobile_hours = round(np.random.uniform(1, 5), 1)
        digital_transactions = max(10, _sample_digital_transactions() - 20)
        social_media = np.random.randint(1, 4)
        app_usage_score = np.random.randint(20, 55)
        digital_payment_score = np.random.randint(25, 60)
        mobile_banking_score = np.random.randint(20, 50)

    avg_transaction = int((income + expenses) / max(1, digital_transactions) * np.random.uniform(0.5, 2.0))

    # Create the initial row with basic financial data
    row = {
        "applicant_id": _generate_applicant_id(),
        "application_date": _generate_application_date(),
        "age": age,
        "gender": gender,
        "education_level": education,
        "employment_type": employment,
        "marital_status": marital_status,
        "family_size": family_size,
        "number_of_dependents": dependents,
        "location_type": location_type,
        "monthly_income_inr": income,
        "spouse_income_inr": spouse_income,
        "monthly_expenses_inr": expenses,
        "monthly_savings_inr": savings,
        "monthly_utility_bills_inr": utility_bills,
        "property_value_inr": property_value,
        "vehicle_value_inr": vehicle_value,
        "total_investments_inr": investments,
        "outstanding_loan_amount_inr": loan_amt,
        "loan_amount_applied_inr": loan_application_amount,  # NEW COLUMN
        "years_current_employment": emp_years,
        "banking_relationship_years": bank_years,
        "monthly_business_revenue_inr": business_revenue,
        "daily_mobile_hours": mobile_hours,
        "monthly_digital_transactions": digital_transactions,
        "avg_transaction_amount_inr": avg_transaction,
        "social_media_accounts_count": social_media,
        "mobile_app_usage_intensity_score": app_usage_score,
        "digital_payment_adoption_score": digital_payment_score,
        "mobile_banking_usage_score": mobile_banking_score,
        "data_completeness_pct": np.random.randint(85, 100),
        "consent_status": "Full Consent",
        "explainability_support_flag": 1,
        "city": city,
    }

    # Calculate data-driven scores based on actual financial data
    scores = calculate_data_driven_scores(row)

    # Update the row with calculated scores
    row.update(scores)

    # Calculate probability of default using the new scores
    pd_val = _pd_from_features(row)
    row["probability_of_default"] = pd_val
    row["risk_category"] = _risk_category_from_p(pd_val)

    return row

def generate_enhanced_training_data(n_rows=12000, seed=111):
    np.random.seed(seed)
    seg_mix = {"excellent": 0.28, "good": 0.32, "fair": 0.24, "poor": 0.11, "bad": 0.05}
    counts = {k: int(v * n_rows) for k, v in seg_mix.items()}
    diff = n_rows - sum(counts.values())
    if diff != 0:
        counts["good"] += diff

    rows = []
    for seg, cnt in counts.items():
        for _ in range(cnt):
            rows.append(_generate_profile(seg))

    df = pd.DataFrame(rows)
    df = df[ALL_COLUMNS]
    out_csv = os.path.join(OUT_DIR, "training_data_aligned.csv")
    df.to_csv(out_csv, index=False)
    print(f"[TRAIN] ✅ Enhanced training set: {out_csv} shape={df.shape}")
    print(f"[TRAIN] Risk distribution: {df['risk_category'].value_counts().to_dict()}")
    print(f"[TRAIN] PD range: {df['probability_of_default'].min():.3f} - {df['probability_of_default'].max():.3f}")
    print(f"[TRAIN] Application amount range: ₹{df['loan_amount_applied_inr'].min():,.0f} - ₹{df['loan_amount_applied_inr'].max():,.0f}")

    # Print dataset integration summary
    print(f"\n[INFO] Dataset Integration Summary:")
    print(f"  - Loan dataset: {'✅ Loaded' if loan_df is not None else '❌ Not found'}")
    print(f"  - Telco dataset: {'✅ Loaded' if telco_df is not None else '❌ Not found'}")
    if loan_df is not None:
        print(f"  - Age sampling: μ={loan_age_stats['mean']:.1f}, σ={loan_age_stats['std']:.1f}" if loan_age_stats is not None else "")
        print(f"  - Income sampling: μ={loan_income_stats['mean']:.0f}, σ={loan_income_stats['std']:.0f}" if loan_income_stats is not None else "")
        print(f"  - Loan amount sampling: μ={loan_amount_stats['mean']:.0f}, σ={loan_amount_stats['std']:.0f}" if loan_amount_stats is not None else "")
    if telco_df is not None:
        print(f"  - Tenure sampling: μ={telco_tenure_stats['mean']:.1f}, σ={telco_tenure_stats['std']:.1f}" if telco_tenure_stats is not None else "")
        print(f"  - Charges sampling: μ={telco_charges_stats['mean']:.1f}, σ={telco_charges_stats['std']:.1f}" if telco_charges_stats is not None else "")

    return df

if __name__ == "__main__":
    generate_enhanced_training_data(n_rows=12000, seed=111)


[INFO] Loaded loan dataset: (255347, 18)
[INFO] Loaded telco dataset: (7043, 21)
[TRAIN] ✅ Enhanced training set: credit_risk_output/training_data_aligned.csv shape=(12000, 43)
[TRAIN] Risk distribution: {'Medium Risk': 5297, 'High Risk': 3209, 'Low Risk': 2103, 'Very High Risk': 1391}
[TRAIN] PD range: 0.010 - 0.920
[TRAIN] Application amount range: ₹25,000 - ₹8,000,000

[INFO] Dataset Integration Summary:
  - Loan dataset: ✅ Loaded
  - Telco dataset: ✅ Loaded



  - Tenure sampling: μ=32.4, σ=24.6
  - Charges sampling: μ=64.8, σ=30.1


In [11]:
# -*- coding: utf-8 -*-
# Test data generator (preserves exact columns; writes test_data_aligned.csv)
# ALIGNED with enhanced training data generator

import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

np.random.seed(222)

OUT_DIR = "credit_risk_output"
os.makedirs(OUT_DIR, exist_ok=True)

# -------------------------------
#  NEW: Load external reference datasets (same as training)
# -------------------------------
LOAN_DATA_PATH = "Loan_default (1).csv"
TELCO_DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Safe loading with error handling
try:
    loan_df = pd.read_csv(LOAN_DATA_PATH)
    print(f"[INFO] Loaded loan dataset for test: {loan_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {LOAN_DATA_PATH} not found. Using fallback distributions.")
    loan_df = None

try:
    telco_df = pd.read_csv(TELCO_DATA_PATH)
    print(f"[INFO] Loaded telco dataset for test: {telco_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {TELCO_DATA_PATH} not found. Using fallback distributions.")
    telco_df = None

# Pre-calculate distributions from external datasets (same as training)
if loan_df is not None:
    loan_age_stats = loan_df["age"].dropna().describe() if "age" in loan_df.columns else None
    loan_income_stats = loan_df["income"].dropna().describe() if "income" in loan_df.columns else None
    loan_amount_stats = loan_df["loan_amount"].dropna().describe() if "loan_amount" in loan_df.columns else None
else:
    loan_age_stats = loan_income_stats = loan_amount_stats = None

if telco_df is not None:
    telco_tenure_stats = telco_df["tenure"].dropna().describe() if "tenure" in telco_df.columns else None
    telco_charges_stats = telco_df["MonthlyCharges"].dropna().describe() if "MonthlyCharges" in telco_df.columns else None
else:
    telco_tenure_stats = telco_charges_stats = None

# Test columns (NO targets, NO scores - these should be predicted) - UPDATED with loan application amount
TEST_COLUMNS = [
    "applicant_id", "application_date", "age", "gender", "education_level",
    "employment_type", "marital_status", "family_size", "number_of_dependents",
    "location_type", "monthly_income_inr", "spouse_income_inr", "monthly_expenses_inr",
    "monthly_savings_inr", "monthly_utility_bills_inr", "property_value_inr",
    "vehicle_value_inr", "total_investments_inr", "outstanding_loan_amount_inr",
    "loan_amount_applied_inr", "years_current_employment", "banking_relationship_years",
    "monthly_business_revenue_inr", "daily_mobile_hours", "monthly_digital_transactions",
    "avg_transaction_amount_inr", "social_media_accounts_count", "mobile_app_usage_intensity_score",
    "digital_payment_adoption_score", "data_completeness_pct", "consent_status",
    "explainability_support_flag", "city"
]

GUJARAT_CITIES = [
    "Ahmedabad", "Surat", "Vadodara", "Rajkot", "Bhavnagar", "Jamnagar", "Junagadh",
    "Gandhinagar", "Nadiad", "Morbi", "Anand", "Mehsana", "Navsari", "Bharuch",
    "Vapi", "Valsad", "Patan", "Godhra", "Porbandar", "Palanpur", "Veraval", "Surendranagar"
]
EDUCATION_LEVELS = ["High School", "Diploma", "Graduate", "Post Graduate", "Professional"]
EMPLOYMENT_TYPES = ["Salaried", "Self Employed", "Business Owner", "Professional"]
LOCATION_TYPES = ["Metro", "Tier1", "Tier2"]

# NEW: City to location type mapping as specified (same as training)
CITY_TO_LOCATION_TYPE = {
    "Ahmedabad": "Metro",
    "Surat": "Metro",
    "Vadodara": "Metro",
    "Rajkot": "Tier1",
    "Bhavnagar": "Tier1",
    "Jamnagar": "Tier1",
    "Junagadh": "Tier2",
    "Gandhinagar": "Metro",
    "Nadiad": "Tier2",
    "Morbi": "Tier2",
    "Anand": "Tier2",
    "Mehsana": "Tier2",
    "Navsari": "Tier2",
    "Bharuch": "Tier2",
    "Vapi": "Tier2",
    "Valsad": "Tier2",
    "Patan": "Tier2",
    "Godhra": "Tier2",
    "Porbandar": "Tier2",
    "Palanpur": "Tier2",
    "Veraval": "Tier2",
    "Surendranagar": "Tier1"
}

# -------------------------------
# NEW: Sampling functions (same as training)
# -------------------------------
def _sample_age():
    """Sample age from loan dataset or fallback to original distribution"""
    if loan_age_stats is not None:
        sampled_age = int(np.random.normal(loan_age_stats["mean"], loan_age_stats["std"]))
        return max(18, min(75, sampled_age))
    else:
        return int(np.random.normal(36, 12))

def _sample_income():
    """Sample income from loan dataset or fallback to original distribution"""
    if loan_income_stats is not None:
        sampled_income = int(np.random.normal(loan_income_stats["mean"], loan_income_stats["std"]))
        return max(15000, min(500000, sampled_income))
    else:
        return int(np.random.normal(55000, 25000))

def _sample_loan_amount():
    """Sample loan amount from loan dataset or fallback to original distribution"""
    if loan_amount_stats is not None:
        sampled_amount = int(np.random.normal(loan_amount_stats["mean"], loan_amount_stats["std"]))
        return max(15000, min(6000000, sampled_amount))
    else:
        return int(np.random.normal(250000, 80000))

def _sample_tenure():
    """Sample tenure/relationship years from telco dataset or fallback"""
    if telco_tenure_stats is not None:
        sampled_tenure = np.random.normal(telco_tenure_stats["mean"], telco_tenure_stats["std"])
        return max(0.5, min(35.0, sampled_tenure))
    else:
        return np.random.uniform(0.5, 15.0)

def _sample_digital_transactions():
    """Sample digital transaction count based on telco usage patterns"""
    if telco_charges_stats is not None:
        base_transactions = int(telco_charges_stats["mean"] / 2)
        noise = np.random.normal(0, telco_charges_stats["std"] / 4)
        return max(5, int(base_transactions + noise))
    else:
        base = int(np.random.normal(60, 20))
        return max(5, base)

def _sample_monthly_charges():
    """Sample monthly charges/utility bills from telco dataset"""
    if telco_charges_stats is not None:
        charges = np.random.normal(telco_charges_stats["mean"], telco_charges_stats["std"])
        return max(500, int(charges * 60))  # Convert USD to INR approximately
    else:
        return int(np.random.normal(2500, 800))

# -------------------------------
# NEW: Function to generate realistic loan application amount (same as training)
# -------------------------------
def _generate_loan_application_amount(income, age, profile_type, property_value, employment_type):
    """
    Generate realistic loan application amount based on customer profile
    """
    # Base application amount from dataset sampling
    base_amount = _sample_loan_amount()

    # Income-based adjustment (higher income = higher applications)
    income_multiplier = min(3.0, max(0.3, income / 50000))

    # Age-based loan purpose patterns
    if age < 30:
        purpose_multiplier = np.random.choice([0.6, 1.2, 2.0], p=[0.5, 0.3, 0.2])
    elif age < 45:
        purpose_multiplier = np.random.choice([1.0, 2.5, 4.0], p=[0.3, 0.4, 0.3])
    else:
        purpose_multiplier = np.random.choice([0.8, 1.8, 3.0], p=[0.4, 0.4, 0.2])

    # Property ownership influence
    if property_value > 0:
        property_multiplier = np.random.uniform(1.2, 2.0)
    else:
        property_multiplier = np.random.uniform(0.7, 1.3)

    # Employment type influence
    employment_multipliers = {
        "Salaried": np.random.uniform(0.8, 1.4),
        "Professional": np.random.uniform(1.0, 1.8),
        "Business Owner": np.random.uniform(1.2, 2.5),
        "Self Employed": np.random.uniform(0.6, 1.6)
    }
    emp_multiplier = employment_multipliers.get(employment_type, 1.0)

    # Profile-based adjustments
    profile_adjustments = {
        "high_earner_low_risk": np.random.uniform(2.0, 3.5),
        "stable_middle_class": np.random.uniform(1.2, 2.2),
        "young_professional": np.random.uniform(0.8, 1.8),
        "average_earner": np.random.uniform(0.6, 1.4),
        "financial_stress": np.random.uniform(0.3, 0.9),
        "outlier_case": np.random.uniform(1.5, 3.0)
    }
    profile_multiplier = profile_adjustments.get(profile_type, 1.0)

    # Calculate final application amount
    application_amount = int(base_amount * income_multiplier * purpose_multiplier *
                           property_multiplier * emp_multiplier * profile_multiplier)

    # Realistic bounds
    min_amount = 20000
    max_amount = min(2000000, int(income * 12 * 8))

    return max(min_amount, min(max_amount, application_amount))

# -------------------------------
# Utility functions
# -------------------------------
def _clip(v, lo, hi):
    return max(lo, min(hi, v))

def _generate_applicant_id():
    prefix = np.random.choice(["A", "B", "C","D", "E", "F"])
    number = np.random.randint(1000, 9999)
    return f"{prefix}{number:04d}"

def _generate_application_date():
    start_date = datetime(2024, 6, 1)
    end_date = datetime(2025, 7, 31)
    days_diff = (end_date - start_date).days
    random_days = np.random.randint(0, max(1, days_diff))
    return (start_date + timedelta(days=int(random_days))).strftime("%Y-%m-%d")

def _generate_test_profile(profile_type):
    """
    Generate test profiles using same logic as training but with external dataset sampling
    NO SCORES OR TARGETS - those should be predicted by the model
    """
    if profile_type == "high_earner_low_risk":
        age = max(32, min(55, _sample_age() + 5))
        base_income = _sample_income()
        income = int(max(120000, base_income * 1.8))
        exp_ratio = float(np.clip(np.random.normal(0.35, 0.08), 0.20, 0.55))
        education = np.random.choice(["Graduate", "Post Graduate", "Professional"], p=[0.2, 0.5, 0.3])
        employment = np.random.choice(["Salaried", "Professional", "Business Owner"], p=[0.3, 0.4, 0.3])

    elif profile_type == "stable_middle_class":
        age = max(28, min(50, _sample_age() + 2))
        base_income = _sample_income()
        income = int(max(60000, base_income * 1.3))
        exp_ratio = float(np.clip(np.random.normal(0.48, 0.09), 0.32, 0.68))
        education = np.random.choice(["Diploma", "Graduate", "Post Graduate"], p=[0.3, 0.5, 0.2])
        employment = np.random.choice(["Salaried", "Professional"], p=[0.7, 0.3])

    elif profile_type == "young_professional":
        age = max(22, min(32, _sample_age() - 5))
        base_income = _sample_income()
        income = int(max(45000, base_income * 1.1))
        exp_ratio = float(np.clip(np.random.normal(0.58, 0.12), 0.38, 0.75))
        education = np.random.choice(["Graduate", "Post Graduate"], p=[0.7, 0.3])
        employment = np.random.choice(["Salaried", "Professional"], p=[0.8, 0.2])

    elif profile_type == "average_earner":
        age = max(25, min(48, _sample_age()))
        base_income = _sample_income()
        income = int(max(35000, base_income * 0.9))
        exp_ratio = float(np.clip(np.random.normal(0.68, 0.12), 0.48, 0.85))
        education = np.random.choice(["High School", "Diploma", "Graduate"], p=[0.2, 0.5, 0.3])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.6, 0.4])

    elif profile_type == "financial_stress":
        age = max(22, min(45, _sample_age() - 2))
        base_income = _sample_income()
        income = int(max(22000, base_income * 0.7))
        exp_ratio = float(np.clip(np.random.normal(0.85, 0.08), 0.75, 0.95))
        education = np.random.choice(["High School", "Diploma", "Graduate"], p=[0.4, 0.4, 0.2])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.5, 0.5])

    else:  # outlier_case
        age = max(20, min(65, _sample_age() + np.random.randint(-10, 10)))
        base_income = _sample_income()
        income = int(max(18000, base_income * np.random.uniform(0.5, 2.0)))
        exp_ratio = float(np.clip(np.random.normal(0.65, 0.25), 0.25, 0.95))
        education = np.random.choice(EDUCATION_LEVELS)
        employment = np.random.choice(EMPLOYMENT_TYPES)

    # Demographics
    gender = np.random.choice(["Male", "Female"], p=[0.55, 0.45])
    marital_status = np.random.choice(["Single", "Married"], p=[0.3, 0.7]) if age >= 28 else np.random.choice(["Single", "Married"], p=[0.75, 0.25])

    if marital_status == "Married":
        family_size = np.random.randint(2, 5)
        dependents = max(0, family_size - 2)
        spouse_income = int(np.random.uniform(0.15, 0.75) * income) if np.random.rand() < 0.65 else 0
    else:
        family_size, dependents, spouse_income = 1, 0, 0

    # CHANGED: Use deterministic city-to-location mapping
    city = np.random.choice(GUJARAT_CITIES)
    location_type = CITY_TO_LOCATION_TYPE.get(city, "Tier2")

    # Financial calculations
    expenses = int(_clip(income * exp_ratio + np.random.randint(-2500, 2500), 1500, max(2000, income - 300)))
    savings = max(0, income - expenses - np.random.randint(0, 3500))

    # Assets
    if income > 90000 and age > 32:
        property_value = int(income * np.random.uniform(20, 45) * (age / 40))
        vehicle_value = int(income * np.random.uniform(1.5, 6))
        investments = int(income * np.random.uniform(6, 20) * ((age - 22) / 18))
    elif income > 45000:
        property_value = int(income * np.random.uniform(10, 25) * (age / 40)) if np.random.rand() < 0.35 else 0
        vehicle_value = int(income * np.random.uniform(0.8, 4)) if np.random.rand() < 0.55 else 0
        investments = int(income * np.random.uniform(2, 10)) if np.random.rand() < 0.45 else 0
    else:
        property_value = 0
        vehicle_value = int(income * np.random.uniform(0.3, 2.5)) if np.random.rand() < 0.25 else 0
        investments = int(income * np.random.uniform(0.2, 4)) if np.random.rand() < 0.15 else 0

    # Generate loan application amount FIRST
    loan_application_amount = _generate_loan_application_amount(
        income, age, profile_type, property_value, employment
    )

    # Outstanding loan calculation
    utilization_rate = np.clip(np.random.beta(2.5, 2), 0.25, 1.0)
    outstanding_loan = int(loan_application_amount * utilization_rate)

    # Keep reasonable bounds
    max_reasonable_loan = int(income * 12 * 3.5)  # Max 3.5x annual income
    outstanding_loan = min(outstanding_loan, max_reasonable_loan)
    outstanding_loan = max(10000, outstanding_loan)

    # Utility bills
    utility_bills = _sample_monthly_charges()
    utility_bills = int(utility_bills * family_size * np.random.uniform(0.7, 1.3))

    # Employment and banking tenure
    emp_years = round(_clip(_sample_tenure() * np.random.uniform(0.5, 1.0), 0.5, min(age - 18, 30)), 1)
    bank_years = round(_clip(_sample_tenure() - np.random.uniform(-1.5, 2.5), 0.5, max(0.5, age - 16)), 1)

    # Business revenue
    if employment in ["Business Owner", "Self Employed"]:
        business_revenue = int(income * np.random.uniform(1.1, 2.5))
    else:
        business_revenue = 0

    # Digital behavior based on age and education
    if age < 35 and education in ["Graduate", "Post Graduate", "Professional"]:
        mobile_hours = round(np.random.uniform(5.5, 11), 1)
        digital_transactions = _sample_digital_transactions() + np.random.randint(15, 35)
        social_media = np.random.randint(3, 7)
        app_usage_score = np.random.randint(60, 90)
        digital_payment_score = np.random.randint(65, 90)
    elif age < 50:
        mobile_hours = round(np.random.uniform(2.5, 7.5), 1)
        digital_transactions = _sample_digital_transactions()
        social_media = np.random.randint(2, 5)
        app_usage_score = np.random.randint(35, 70)
        digital_payment_score = np.random.randint(40, 75)
    else:
        mobile_hours = round(np.random.uniform(1, 4.5), 1)
        digital_transactions = max(8, _sample_digital_transactions() - 15)
        social_media = np.random.randint(1, 3)
        app_usage_score = np.random.randint(15, 50)
        digital_payment_score = np.random.randint(20, 55)

    avg_transaction = int((income + expenses) / max(1, digital_transactions) * np.random.uniform(0.4, 1.8))

    # Construct test row (NO SCORES OR TARGETS)
    row = {
        "applicant_id": _generate_applicant_id(),
        "application_date": _generate_application_date(),
        "age": age,
        "gender": gender,
        "education_level": education,
        "employment_type": employment,
        "marital_status": marital_status,
        "family_size": family_size,
        "number_of_dependents": dependents,
        "location_type": location_type,
        "monthly_income_inr": income,
        "spouse_income_inr": spouse_income,
        "monthly_expenses_inr": expenses,
        "monthly_savings_inr": savings,
        "monthly_utility_bills_inr": utility_bills,
        "property_value_inr": property_value,
        "vehicle_value_inr": vehicle_value,
        "total_investments_inr": investments,
        "outstanding_loan_amount_inr": outstanding_loan,
        "loan_amount_applied_inr": loan_application_amount,
        "years_current_employment": emp_years,
        "banking_relationship_years": bank_years,
        "monthly_business_revenue_inr": business_revenue,
        "daily_mobile_hours": mobile_hours,
        "monthly_digital_transactions": digital_transactions,
        "avg_transaction_amount_inr": avg_transaction,
        "social_media_accounts_count": social_media,
        "mobile_app_usage_intensity_score": app_usage_score,
        "digital_payment_adoption_score": digital_payment_score,
        "data_completeness_pct": np.random.randint(80, 100),
        "consent_status": "Full Consent",
        "explainability_support_flag": 1,
        "city": city,
    }

    return row

def generate_test_data(n_rows=3000, seed=222):
    """Generate test data with diverse profiles"""
    np.random.seed(seed)

    # Test profile distribution
    profiles = {
        "high_earner_low_risk": 0.20,
        "stable_middle_class": 0.25,
        "young_professional": 0.20,
        "average_earner": 0.20,
        "financial_stress": 0.10,
        "outlier_case": 0.05
    }

    counts = {k: int(v * n_rows) for k, v in profiles.items()}
    diff = n_rows - sum(counts.values())
    if diff != 0:
        counts["average_earner"] += diff

    rows = []
    for profile_type, count in counts.items():
        for _ in range(count):
            rows.append(_generate_test_profile(profile_type))

    df = pd.DataFrame(rows)
    df = df[TEST_COLUMNS]

    out_csv = os.path.join(OUT_DIR, "test_data_aligned.csv")
    df.to_csv(out_csv, index=False)

    print(f"[TEST] ✅ Enhanced test set: {out_csv} shape={df.shape}")
    print(f"[TEST] Profile distribution: {dict(zip(profiles.keys(), [counts[k] for k in profiles.keys()]))}")
    print(f"[TEST] Age range: {df['age'].min()} - {df['age'].max()}")
    print(f"[TEST] Income range: ₹{df['monthly_income_inr'].min():,.0f} - ₹{df['monthly_income_inr'].max():,.0f}")
    print(f"[TEST] Application amount range: ₹{df['loan_amount_applied_inr'].min():,.0f} - ₹{df['loan_amount_applied_inr'].max():,.0f}")

    # Print dataset integration summary
    print(f"\n[INFO] Test Dataset Integration Summary:")
    print(f"  - Loan dataset: {'✅ Loaded' if loan_df is not None else '❌ Not found'}")
    print(f"  - Telco dataset: {'✅ Loaded' if telco_df is not None else '❌ Not found'}")

    return df

if __name__ == "__main__":
    generate_test_data(n_rows=50, seed=222)


[INFO] Loaded loan dataset for test: (255347, 18)
[INFO] Loaded telco dataset for test: (7043, 21)
[TEST] ✅ Enhanced test set: credit_risk_output/test_data_aligned.csv shape=(50, 33)
[TEST] Profile distribution: {'high_earner_low_risk': 10, 'stable_middle_class': 12, 'young_professional': 10, 'average_earner': 11, 'financial_stress': 5, 'outlier_case': 2}
[TEST] Age range: 22 - 55
[TEST] Income range: ₹22,000 - ₹188,665
[TEST] Application amount range: ₹56,287 - ₹2,000,000

[INFO] Test Dataset Integration Summary:
  - Loan dataset: ✅ Loaded
  - Telco dataset: ✅ Loaded


In [12]:
# -*- coding: utf-8 -*-
# XGBoost-only model pipeline: trains, evaluates, saves one PKL, writes CSV+JSON, and generates PNGs
# ALIGNED with enhanced training and test data generators - WITH SCORE CALCULATION AND LOAN APPLICATION AMOUNT

import os
import json
import joblib
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, roc_auc_score, confusion_matrix, roc_curve, classification_report

from xgboost import XGBClassifier, XGBRegressor

OUT_DIR = "credit_risk_output"
os.makedirs(OUT_DIR, exist_ok=True)

# Core model features (UPDATED to include loan application amount)
MODEL_FEATURES = [
    "age", "monthly_income_inr", "monthly_expenses_inr", "monthly_savings_inr",
    "outstanding_loan_amount_inr", "loan_amount_applied_inr", "years_current_employment",
    "banking_relationship_years", "timeliness_score", "repayment_ability_score",
    "financial_health_score", "payment_reliability_score", "stability_index"
]

# Targets (aligned with data generators)
TARGET_REG = "probability_of_default"
TARGET_CLS = "risk_category"
RISK_LABELS = ["Low Risk", "Medium Risk", "High Risk", "Very High Risk"]

def convert_np_types(obj):
    """Convert numpy types to native Python types for JSON serialization"""
    if isinstance(obj, dict):
        return {k: convert_np_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_np_types(i) for i in obj]
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj

def _risk_to_num(labels):
    """Convert risk category labels to numeric (aligned with generators)"""
    mapping = {r: i for i, r in enumerate(RISK_LABELS)}
    return np.array([mapping[x] for x in labels], dtype=int)

def _risk_category_from_p(p):
    """Convert probability to risk category (aligned with generators)"""
    if p <= 0.18: return "Low Risk"
    elif p <= 0.42: return "Medium Risk"
    elif p <= 0.68: return "High Risk"
    else: return "Very High Risk"

def calculate_data_driven_scores(row):
    """
    Calculate scores based on actual financial data including loan application amount.
    (UPDATED FROM TRAINING GENERATOR TO ENSURE CONSISTENCY)
    """
    # Extract key financial metrics
    income = max(1.0, float(row["monthly_income_inr"]))
    expenses = float(row["monthly_expenses_inr"])
    savings = float(row["monthly_savings_inr"])
    loan_amount = float(row["outstanding_loan_amount_inr"])
    application_amount = float(row.get("loan_amount_applied_inr", loan_amount))  # NEW
    age = int(row["age"])
    emp_years = float(row["years_current_employment"])
    bank_years = float(row["banking_relationship_years"])
    property_value = float(row.get("property_value_inr", 0))
    investments = float(row.get("total_investments_inr", 0))

    # Calculate ratios including new application-based metrics
    dti_ratio = loan_amount / (12.0 * income) if income > 0 else 0
    application_to_income_ratio = application_amount / (12.0 * income) if income > 0 else 0
    loan_utilization_ratio = loan_amount / max(1.0, application_amount)
    expense_ratio = expenses / income if income > 0 else 1
    savings_ratio = savings / income if income > 0 else 0

    def _clip_local(value, min_val, max_val):
        return max(min_val, min(max_val, value))

    # 1. TIMELINESS SCORE (5-95) - Enhanced with application amount consideration
    timeliness_base = (
        min(emp_years * 8, 40) +
        min(bank_years * 6, 30) +
        min((age - 18) * 0.8, 20) +
        5
    )
    # Enhanced penalties including application behavior
    timeliness_penalty = (dti_ratio * 15 +
                         max(0, expense_ratio - 0.7) * 20 +
                         max(0, application_to_income_ratio - 1.5) * 10)
    timeliness_score = _clip_local(int(timeliness_base - timeliness_penalty + np.random.randint(-8, 9)), 5, 95)

    # 2. REPAYMENT ABILITY SCORE (5-90) - Enhanced with application amount factors
    repayment_base = (
        min(np.log(income/25000) * 15, 30) +
        max(0, savings_ratio * 40) +
        min(emp_years * 2, 20)
    )
    repayment_penalty = (dti_ratio * 25 +
                        max(0, expense_ratio - 0.8) * 15 +
                        max(0, application_to_income_ratio - 2.0) * 12 +
                        max(0, loan_utilization_ratio - 0.9) * 8)
    repayment_score = _clip_local(int(repayment_base - repayment_penalty + np.random.randint(-6, 7)), 5, 90)

    # 3. FINANCIAL HEALTH SCORE (10-95) - Enhanced with application amount
    asset_ratio = (property_value + investments) / max(income * 12, 1)
    financial_base = (
        min(np.log(income/20000) * 12, 25) +
        min(asset_ratio * 20, 30) +
        max(0, savings_ratio * 25) +
        min(bank_years * 1.5, 15)
    )
    financial_penalty = (dti_ratio * 20 +
                        max(0, expense_ratio - 0.75) * 18 +
                        max(0, application_to_income_ratio - 1.8) * 10)
    financial_score = _clip_local(int(financial_base - financial_penalty + np.random.randint(-10, 11)), 10, 95)

    # 4. PAYMENT RELIABILITY SCORE (10-95) - Enhanced with loan behavior
    reliability_base = (
        min(emp_years * 4, 35) +
        max(0, (1 - expense_ratio) * 30) +
        min(income/5000, 20) +
        10
    )
    reliability_penalty = (dti_ratio * 30 +
                          max(0, expense_ratio - 0.85) * 25 +
                          abs(loan_utilization_ratio - 0.7) * 8)
    reliability_score = _clip_local(int(reliability_base - reliability_penalty + np.random.randint(-7, 8)), 10, 95)

    # 5. STABILITY INDEX (5-90) - Include application amount pattern
    stability_base = (
        min(emp_years * 3, 25) +
        min(bank_years * 2, 15) +
        min((age - 20) * 0.6, 20) +
        min(asset_ratio * 15, 20) +
        10
    )
    stability_penalty = (dti_ratio * 18 +
                        max(0, expense_ratio - 0.8) * 12 +
                        max(0, application_to_income_ratio - 2.5) * 8)
    stability_score = _clip_local(int(stability_base - stability_penalty + np.random.randint(-12, 13)), 5, 90)

    return {
        "timeliness_score": timeliness_score,
        "repayment_ability_score": repayment_score,
        "financial_health_score": financial_score,
        "payment_reliability_score": reliability_score,
        "stability_index": stability_score
    }

def _ensure_scores_present(data, data_type="data"):
    """
    Ensure all required score columns are present in the dataset.
    Calculate them if missing using the same logic as training generator.
    """
    score_columns = [
        "timeliness_score", "repayment_ability_score", "financial_health_score",
        "payment_reliability_score", "stability_index"
    ]

    missing_scores = [col for col in score_columns if col not in data.columns]

    if missing_scores:
        print(f"[SCORES] Missing score columns in {data_type}: {missing_scores}")
        print(f"[SCORES] Calculating scores from financial data...")

        # Calculate scores for each row
        calculated_scores = []
        for idx, row in data.iterrows():
            scores = calculate_data_driven_scores(row)
            calculated_scores.append(scores)

        # Convert to DataFrame and add missing columns
        scores_df = pd.DataFrame(calculated_scores)
        for col in missing_scores:
            if col in scores_df.columns:
                data[col] = scores_df[col]

        print(f"[SCORES] ✅ Calculated and added {len(missing_scores)} score columns")
    else:
        print(f"[SCORES] ✅ All score columns present in {data_type}")

    return data

def _validate_loan_data(data, data_type="data"):
    """
    NEW: Validate loan application amount data
    """
    print(f"[VALIDATION] Validating loan data in {data_type}...")

    # Check for required columns
    required_cols = ["loan_amount_applied_inr", "outstanding_loan_amount_inr", "monthly_income_inr"]
    missing_cols = [col for col in required_cols if col not in data.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in {data_type}: {missing_cols}")

    # Check for negative values
    if (data["loan_amount_applied_inr"] < 0).any():
        raise ValueError(f"Negative loan application amounts found in {data_type}")

    # Logical validation: outstanding should generally not exceed applied (with tolerance for interest)
    over_applied = data[data["outstanding_loan_amount_inr"] > data["loan_amount_applied_inr"] * 1.15]
    if len(over_applied) > 0:
        print(f"[WARNING] {len(over_applied)} records in {data_type} have outstanding > 115% of applied amount")

    # Check for extremely high application amounts relative to income
    high_ratio = data[data["loan_amount_applied_inr"] > data["monthly_income_inr"] * 12 * 10]
    if len(high_ratio) > 0:
        print(f"[WARNING] {len(high_ratio)} records in {data_type} have application > 10x annual income")

    print(f"[VALIDATION] ✅ Loan data validation completed for {data_type}")

    # Print summary statistics
    print(f"[SUMMARY] Application amounts in {data_type}:")
    print(f"  - Min: ₹{data['loan_amount_applied_inr'].min():,.0f}")
    print(f"  - Max: ₹{data['loan_amount_applied_inr'].max():,.0f}")
    print(f"  - Mean: ₹{data['loan_amount_applied_inr'].mean():,.0f}")

    return data

def _add_derived_features(data):
    """
    NEW: Add derived features from loan application amount
    """
    print("[FEATURES] Adding derived features from loan application data...")

    # Application to income ratio
    data["application_to_income_ratio"] = data["loan_amount_applied_inr"] / (data["monthly_income_inr"] * 12)

    # Loan utilization ratio (how much of applied amount is outstanding)
    data["loan_utilization_ratio"] = data["outstanding_loan_amount_inr"] / data["loan_amount_applied_inr"]

    # Application amount category
    data["application_amount_category"] = pd.cut(
        data["loan_amount_applied_inr"],
        bins=[0, 100000, 500000, 1000000, float('inf')],
        labels=['Small', 'Medium', 'Large', 'Very Large']
    )

    print("[FEATURES] ✅ Added derived features:")
    print("  - application_to_income_ratio")
    print("  - loan_utilization_ratio")
    print("  - application_amount_category")

    return data

def _load_data():
    """Load training and test data with proper validation and score calculation"""
    train = pd.read_csv(os.path.join(OUT_DIR, "training_data_aligned.csv"))
    test = pd.read_csv(os.path.join(OUT_DIR, "test_data_aligned.csv"))

    print(f"[DATA] Loaded training  {train.shape}")
    print(f"[DATA] Loaded test  {test.shape}")

    # NEW: Validate loan application data
    train = _validate_loan_data(train, "training")
    test = _validate_loan_data(test, "test")

    # Ensure scores are present in both datasets
    train = _ensure_scores_present(train, "training")
    test = _ensure_scores_present(test, "test")

    # NEW: Add derived features (optional - can be commented out if not needed)
    # train = _add_derived_features(train)
    # test = _add_derived_features(test)

    # Validate training data has all required columns
    missing_train = [c for c in MODEL_FEATURES + [TARGET_REG, TARGET_CLS] if c not in train.columns]
    assert not missing_train, f"Training missing columns: {missing_train}"

    # Validate test data has model features (targets may or may not be present)
    missing_test = [c for c in MODEL_FEATURES if c not in test.columns]
    assert not missing_test, f"Test missing columns: {missing_test}"

    print(f"[DATA] ✅ Training data shape: {train.shape}")
    print(f"[DATA] ✅ Test data shape: {test.shape}")
    print(f"[DATA] Test has targets: {TARGET_REG in test.columns and TARGET_CLS in test.columns}")

    return train, test

def _fit_xgboost_models(X_train, y_reg_train, y_cls_train_num, eval_fraction=0.25, seed=42):
    """Enhanced XGBoost model fitting with improved hyperparameters (aligned with data complexity)"""
    # Stratified split to maintain class balance
    stratify_opt = y_cls_train_num if len(np.unique(y_cls_train_num)) > 1 else None
    X_tr, X_val, yreg_tr, yreg_val, ycls_tr, ycls_val = train_test_split(
        X_train, y_reg_train, y_cls_train_num, test_size=eval_fraction,
        random_state=seed, stratify=stratify_opt
    )

    # Enhanced XGBoost Regressor (tuned for financial data with loan application features)
    xgb_reg = XGBRegressor(
        n_estimators=900,            # More trees for better learning with additional features
        learning_rate=0.05,          # Slightly lower learning rate for stability
        max_depth=8,                 # Deeper trees for complex financial relationships
        subsample=0.85,              # Row sampling for regularization
        colsample_bytree=0.8,        # Feature sampling
        colsample_bylevel=0.8,       # Additional feature sampling per level
        reg_alpha=0.1,               # L1 regularization
        reg_lambda=2.0,              # Higher L2 regularization for more features
        min_child_weight=4,          # Prevent overfitting on small groups
        gamma=0.1,                   # Minimum split loss
        random_state=seed,
        n_jobs=-1
    )

    # Enhanced XGBoost Classifier (tuned for risk categories with loan features)
    xgb_cls = XGBClassifier(
        n_estimators=1000,           # More estimators for classification with additional features
        learning_rate=0.05,
        max_depth=8,
        subsample=0.85,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        reg_alpha=0.1,
        reg_lambda=2.0,
        min_child_weight=4,
        gamma=0.1,
        random_state=seed,
        eval_metric="mlogloss",
        use_label_encoder=False,
        n_jobs=-1
    )

    print("[MODEL] Training XGBoost models...")
    print(f"[MODEL] Training features: {X_train.shape[1]} (including loan_amount_applied_inr)")

    # Fit models on training split
    xgb_reg.fit(X_tr, yreg_tr)
    xgb_cls.fit(X_tr, ycls_tr)

    # Validate on hold-out set
    y_val_pred_reg = xgb_reg.predict(X_val)
    y_val_pred_cls_num = xgb_cls.predict(X_val)
    y_val_pred_cls_str = np.array([RISK_LABELS[int(v)] for v in y_val_pred_cls_num])

    # Calculate validation metrics
    y_val_cls_str = np.array([RISK_LABELS[int(v)] for v in ycls_val])
    acc = accuracy_score(y_val_cls_str, y_val_pred_cls_str)
    mae = mean_absolute_error(yreg_val, y_val_pred_reg)

    # Binary classification AUC (High/Very High vs others)
    y_bin = (ycls_val >= 2).astype(int)
    auc_bin = roc_auc_score(y_bin, y_val_pred_reg) if len(np.unique(y_bin)) > 1 else float("nan")

    metrics = {"accuracy": acc, "mae": mae, "auc_bin": auc_bin}
    print(f"[VALIDATION] Accuracy: {acc:.4f}, MAE: {mae:.4f}, AUC: {auc_bin:.4f}")

    # Refit on full training data for final models
    print("[MODEL] Refitting on full training data...")
    xgb_reg.fit(X_train, y_reg_train)
    xgb_cls.fit(X_train, y_cls_train_num)

    return xgb_reg, xgb_cls, (X_val, y_val_cls_str, y_val_pred_reg, y_val_pred_cls_str, yreg_val, metrics)

def _generate_test_predictions(xgb_reg, xgb_cls, test_data):
    """Generate predictions on test data with risk score calculation (aligned with generators)"""
    X_test = test_data[MODEL_FEATURES].copy()

    print("[PREDICTION] Generating test predictions...")
    print(f"[PREDICTION] Using features: {MODEL_FEATURES}")

    # Predict probability of default and risk category
    test_pred_reg = xgb_reg.predict(X_test)
    test_pred_cls_num = xgb_cls.predict(X_test)
    test_pred_cls_str = np.array([RISK_LABELS[int(v)] for v in test_pred_cls_num])

    # Create results dataframe with predictions
    test_results = test_data.copy()
    test_results[TARGET_REG] = test_pred_reg
    test_results[TARGET_CLS] = test_pred_cls_str

    # Calculate risk score (aligned with generator format)
    test_results["risk_score"] = (test_results[TARGET_REG] * 100).round(1)

    print(f"[PREDICTION] ✅ Test predictions completed. Shape: {test_results.shape}")
    print(f"[PREDICTION] Risk distribution: {pd.Series(test_pred_cls_str).value_counts().to_dict()}")
    print(f"[PREDICTION] PD range: {test_pred_reg.min():.3f} - {test_pred_reg.max():.3f}")
    print(f"[PREDICTION] Application amounts in test predictions: ₹{test_results['loan_amount_applied_inr'].min():,.0f} - ₹{test_results['loan_amount_applied_inr'].max():,.0f}")

    return test_results, test_pred_reg, test_pred_cls_str

def _save_pngs(xgb_reg, X_val, y_val_cls_str, y_val_pred_reg, y_val_pred_cls_str):
    """Generate all visualization plots"""
    print("[PLOTS] Generating visualizations...")

    # Confusion Matrix
    labels = RISK_LABELS
    cm = confusion_matrix(y_val_cls_str, y_val_pred_cls_str, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title('Confusion Matrix - XGBoost (with Loan Application Features)')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "confusion_matrix_best.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # ROC Curve (High/Very High vs others) using regressor scores
    y_true_num = _risk_to_num(y_val_cls_str)
    y_bin = (y_true_num >= 2).astype(int)
    if len(np.unique(y_bin)) > 1:
        fpr, tpr, _ = roc_curve(y_bin, y_val_pred_reg)
        auc_val = roc_auc_score(y_bin, y_val_pred_reg)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, lw=2, label=f"XGB Enhanced (AUC={auc_val:.3f})")
        plt.plot([0, 1], [0, 1], lw=2, linestyle="--", color="gray", label="Random")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC Curve (High/Very High vs others)")
        plt.legend(loc="lower right")
        plt.grid(True, linestyle="--", alpha=0.6)
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR, "roc_curve_best.png"), dpi=300, bbox_inches="tight")
        plt.close()

    # Feature Importance (regressor) - UPDATED to show loan application feature
    if hasattr(xgb_reg, "feature_importances_"):
        importances = np.array(xgb_reg.feature_importances_)
        idx = np.argsort(importances)[::-1]
        top_n = min(len(MODEL_FEATURES), len(importances))

        plt.figure(figsize=(12, 8))  # Larger figure for more features
        plt.barh(range(top_n), importances[idx[:top_n]][::-1])
        feature_names = [MODEL_FEATURES[i] for i in idx[:top_n]][::-1]

        # Highlight loan application feature if it's in top features
        colors = ['red' if 'loan_amount_applied' in name else 'steelblue' for name in feature_names]
        plt.barh(range(top_n), importances[idx[:top_n]][::-1], color=colors)

        plt.yticks(range(top_n), feature_names)
        plt.xlabel("Relative Importance")
        plt.title("Feature Importance - XGBoost Regressor (Enhanced with Loan Application)")
        plt.tight_layout()
        plt.savefig(os.path.join(OUT_DIR, "feature_importance_best.png"), dpi=300, bbox_inches="tight")
        plt.close()

        # Print feature importance for loan application amount
        loan_app_idx = MODEL_FEATURES.index("loan_amount_applied_inr")
        loan_app_importance = importances[loan_app_idx]
        print(f"[FEATURE IMPORTANCE] loan_amount_applied_inr importance: {loan_app_importance:.4f} (rank: {np.where(idx == loan_app_idx)[0][0] + 1}/{len(MODEL_FEATURES)})")

    # Risk Distribution Visualization
    risk_dist = pd.Series(y_val_cls_str).value_counts()
    pred_dist = pd.Series(y_val_pred_cls_str).value_counts()
    plt.figure(figsize=(12, 6))
    x_pos = np.arange(len(RISK_LABELS))
    width = 0.35
    actual_counts = [risk_dist.get(label, 0) for label in RISK_LABELS]
    pred_counts = [pred_dist.get(label, 0) for label in RISK_LABELS]
    plt.bar(x_pos - width/2, actual_counts, width, label='Actual', alpha=0.8)
    plt.bar(x_pos + width/2, pred_counts, width, label='Predicted', alpha=0.8)
    plt.xlabel('Risk Categories')
    plt.ylabel('Count')
    plt.title('Risk Distribution: Actual vs Predicted (Enhanced Model)')
    plt.xticks(x_pos, RISK_LABELS, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "risk_distribution.png"), dpi=300, bbox_inches="tight")
    plt.close()

def _update_metric_plots(acc, mae, y_reg_val, y_reg_pred):
    """Generate performance comparison plots"""
    # Accuracy Comparison with actual bars
    plt.figure(figsize=(8, 6))
    metrics_names = ["Accuracy", "1 - MAE"]
    plt.bar([0, 1], [acc, max(0.0, 1.0 - mae)], width=0.5, color=["#4c72b0", "#55a868"])
    for i, v in enumerate([acc, max(0.0, 1.0 - mae)]):
        plt.text(i, v + 0.01, f"{v:.3f}", ha="center", va="bottom")
    plt.xticks([0, 1], metrics_names)
    plt.title("Model Performance Comparison (XGBoost Enhanced)")
    plt.xlabel("Metric")
    plt.ylabel("Score")
    plt.ylim(0, 1.05)
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "accuracy_comparison.png"), dpi=300, bbox_inches="tight")
    plt.close()

    # Prediction Errors with real residuals
    plt.figure(figsize=(10, 6))
    errors = y_reg_val - y_reg_pred
    plt.scatter(y_reg_pred, errors, alpha=0.6)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted Probability of Default')
    plt.ylabel('Prediction Error')
    plt.title('Prediction Errors - XGBoost Regressor (Enhanced)')
    plt.grid(True, linestyle="--", alpha=0.6)
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "prediction_errors.png"), dpi=300, bbox_inches="tight")
    plt.close()

def main():
    """Main pipeline execution (aligned with data generator workflow)"""
    print("="*70)
    print("🚀 XGBoost Credit Risk Pipeline - Enhanced with Loan Application Amount")
    print("="*70)

    print("\n[STEP 1] Loading and validating data...")
    train, test = _load_data()

    # Prepare training data
    X_train = train[MODEL_FEATURES].copy()
    y_reg_train = train[TARGET_REG].astype(float).copy()
    y_cls_train_str = train[TARGET_CLS].astype(str).copy()
    y_cls_train_num = _risk_to_num(y_cls_train_str)

    print(f"[TRAIN] Training features shape: {X_train.shape}")
    print(f"[TRAIN] Model features include loan_amount_applied_inr: {'loan_amount_applied_inr' in MODEL_FEATURES}")
    print(f"[TRAIN] Risk distribution: {pd.Series(y_cls_train_str).value_counts().to_dict()}")

    print("\n[STEP 2] Training enhanced XGBoost models...")
    xgb_reg, xgb_cls, validation = _fit_xgboost_models(X_train, y_reg_train, y_cls_train_num)
    X_val, y_val_cls_str, y_val_pred_reg, y_val_pred_cls_str, y_reg_val, val_metrics = validation

    print("\n[STEP 3] Generating enhanced visualizations...")
    _save_pngs(xgb_reg, X_val, y_val_cls_str, y_val_pred_reg, y_val_pred_cls_str)
    _update_metric_plots(val_metrics["accuracy"], val_metrics["mae"], y_reg_val, y_val_pred_reg)

    print("\n[STEP 4] Predicting on test data...")
    test_results, test_pred_reg, test_pred_cls_str = _generate_test_predictions(xgb_reg, xgb_cls, test)

    print("\n[STEP 5] Saving results...")

    # Save test predictions CSV (aligned with generator format)
    test_results.to_csv(os.path.join(OUT_DIR, "test_predictions.csv"), index=False)
    print(f"[SAVE] ✅ Test predictions saved: test_predictions.csv")

    # Save complete model pipeline (single PKL file) - UPDATED
    full_pipeline = {
        "xgb_regressor": xgb_reg,
        "xgb_classifier": xgb_cls,
        "risk_labels": RISK_LABELS,
        "model_features": MODEL_FEATURES,  # Now includes loan_amount_applied_inr
        "metadata": {
            "train_shape": list(X_train.shape),
            "test_shape": list(test[MODEL_FEATURES].shape),
            "risk_mapping": {label: i for i, label in enumerate(RISK_LABELS)},
            "pd_thresholds": {"low": 0.18, "medium": 0.42, "high": 0.68},
            "enhanced_features": ["loan_amount_applied_inr"],  # NEW
            "model_version": "enhanced_with_loan_application"  # NEW
        }
    }
    joblib.dump(full_pipeline, os.path.join(OUT_DIR, "xgb_credit_risk_pipeline.pkl"))
    print(f"[SAVE] ✅ Enhanced model pipeline saved: xgb_credit_risk_pipeline.pkl")

    # Generate comprehensive JSON output (aligned with generator analysis) - UPDATED
    cm = confusion_matrix(y_val_cls_str, y_val_pred_cls_str, labels=RISK_LABELS).tolist()
    class_report = classification_report(y_val_cls_str, y_val_pred_cls_str, output_dict=True)

    # Feature importances from regressor
    feat_imps = []
    if hasattr(full_pipeline["xgb_regressor"], "feature_importances_"):
        for f, w in zip(MODEL_FEATURES, full_pipeline["xgb_regressor"].feature_importances_):
            feat_imps.append({
                "feature": f,
                "importance": float(w),
                "is_loan_application_feature": "loan_amount_applied" in f  # NEW
            })

    output_json = {
        "data": test_results.to_dict(orient="records"),
        "analysis": {
            "metrics": {
                "accuracy": float(val_metrics["accuracy"]),
                "mae": float(val_metrics["mae"]),
                "auc_bin": float(val_metrics["auc_bin"]) if np.isfinite(val_metrics["auc_bin"]) else None
            },
            "confusion_matrix": {
                "labels": RISK_LABELS,
                "matrix": cm
            },
            "classification_report": class_report,
            "class_distribution_validation": dict(pd.Series(y_val_cls_str).value_counts()),
            "class_distribution_test_predicted": dict(pd.Series(test_pred_cls_str).value_counts()),
            "feature_importance_regressor": feat_imps,
            "shapes": {
                "train": list(X_train.shape),
                "validation": list(X_val.shape),
                "test": list(test[MODEL_FEATURES].shape)
            },
            "model_info": {
                "algorithm": "XGBoost Enhanced",
                "version": "enhanced_with_loan_application",  # NEW
                "features_count": len(MODEL_FEATURES),  # NEW
                "has_loan_application_feature": True,  # NEW
                "regressor_params": xgb_reg.get_params(),
                "classifier_params": xgb_cls.get_params()
            },
            "loan_application_analysis": {  # NEW SECTION
                "application_amount_range": {
                    "min": float(test_results["loan_amount_applied_inr"].min()),
                    "max": float(test_results["loan_amount_applied_inr"].max()),
                    "mean": float(test_results["loan_amount_applied_inr"].mean())
                },
                "loan_utilization_stats": {
                    "mean": float((test_results["outstanding_loan_amount_inr"] / test_results["loan_amount_applied_inr"]).mean()),
                    "median": float((test_results["outstanding_loan_amount_inr"] / test_results["loan_amount_applied_inr"]).median())
                }
            }
        }
    }

    # Convert numpy types before JSON serialization
    json_safe_output = convert_np_types(output_json)
    with open(os.path.join(OUT_DIR, "model_output.json"), "w") as f:
        json.dump(json_safe_output, f, indent=2)
    print(f"[SAVE] ✅ Enhanced JSON analysis saved: model_output.json")

    print("\n" + "="*70)
    print("✅ Enhanced XGBoost Credit Risk Pipeline Completed!")
    print("="*70)
    print(f"📊 PERFORMANCE METRICS:")
    print(f"   ├── Regressor MAE: {val_metrics['mae']:.4f}")
    print(f"   ├── Classifier Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"   └── Binary AUC (High Risk): {val_metrics['auc_bin']:.4f}")
    print(f"\n🆕 ENHANCEMENT FEATURES:")
    print(f"   ├── Total Features: {len(MODEL_FEATURES)} (including loan_amount_applied_inr)")
    print(f"   ├── Loan Application Amount: ✅ Included")
    print(f"   ├── Enhanced Score Calculations: ✅ Updated")
    print(f"   └── Validation & Derived Features: ✅ Added")
    print(f"\n📁 OUTPUT FILES:")
    print(f"   ├── training_data_aligned.csv")
    print(f"   ├── test_data_aligned.csv")
    print(f"   ├── test_predictions.csv (with loan application data)")
    print(f"   ├── xgb_credit_risk_pipeline.pkl (enhanced model)")
    print(f"   ├── model_output.json (comprehensive analysis with loan features)")
    print(f"   └── 6 PNG visualizations (enhanced with loan application insights)")
    print("\n🎯 Model now fully utilizes loan application amount for improved predictions!")

if __name__ == "__main__":
    main()


🚀 XGBoost Credit Risk Pipeline - Enhanced with Loan Application Amount

[STEP 1] Loading and validating data...
[DATA] Loaded training  (12000, 43)
[DATA] Loaded test  (50, 33)
[VALIDATION] Validating loan data in training...
[VALIDATION] ✅ Loan data validation completed for training
[SUMMARY] Application amounts in training:
  - Min: ₹25,000
  - Max: ₹8,000,000
  - Mean: ₹1,595,464
[VALIDATION] Validating loan data in test...
[VALIDATION] ✅ Loan data validation completed for test
[SUMMARY] Application amounts in test:
  - Min: ₹56,287
  - Max: ₹2,000,000
  - Mean: ₹924,646
[SCORES] ✅ All score columns present in training
[SCORES] Missing score columns in test: ['timeliness_score', 'repayment_ability_score', 'financial_health_score', 'payment_reliability_score', 'stability_index']
[SCORES] Calculating scores from financial data...
[SCORES] ✅ Calculated and added 5 score columns
[DATA] ✅ Training data shape: (12000, 43)
[DATA] ✅ Test data shape: (50, 38)
[DATA] Test has targets: False
[