In [4]:
# -*- coding: utf-8 -*-
# Train data generator with loan_type and interest_rate features

import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

np.random.seed(111)

OUT_DIR = "credit_risk_output"
os.makedirs(OUT_DIR, exist_ok=True)

# External datasets loading
LOAN_DATA_PATH = "Loan_default (1).csv"
TELCO_DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

try:
    loan_df = pd.read_csv(LOAN_DATA_PATH)
    print(f"[INFO] Loaded loan dataset: {loan_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {LOAN_DATA_PATH} not found. Using fallback distributions.")
    loan_df = None

try:
    telco_df = pd.read_csv(TELCO_DATA_PATH)
    print(f"[INFO] Loaded telco dataset: {telco_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {TELCO_DATA_PATH} not found. Using fallback distributions.")
    telco_df = None

# Pre-calculate distributions
if loan_df is not None:
    loan_age_stats = loan_df["age"].dropna().describe() if "age" in loan_df.columns else None
    loan_income_stats = loan_df["income"].dropna().describe() if "income" in loan_df.columns else None
    loan_amount_stats = loan_df["loan_amount"].dropna().describe() if "loan_amount" in loan_df.columns else None
else:
    loan_age_stats = loan_income_stats = loan_amount_stats = None

if telco_df is not None:
    telco_tenure_stats = telco_df["tenure"].dropna().describe() if "tenure" in telco_df.columns else None
    telco_charges_stats = telco_df["MonthlyCharges"].dropna().describe() if "MonthlyCharges" in telco_df.columns else None
else:
    telco_tenure_stats = telco_charges_stats = None

# NEW: Loan types and their characteristics
LOAN_TYPES = [
    "personal loan", "home loan", "auto loan", "education loan",
    "business loan", "credit card", "gold loan"
]

# NEW: Interest rate ranges for each loan type (realistic market rates)
LOAN_INTEREST_RATES = {
    "personal loan": {"min": 11.99, "max": 24.0, "secured": False, "risk_factor": 1.4},
    "home loan": {"min": 7.35, "max": 12.95, "secured": True, "risk_factor": 0.6},
    "auto loan": {"min": 7.70, "max": 15.30, "secured": True, "risk_factor": 0.8},
    "education loan": {"min": 9.45, "max": 15.0, "secured": False, "risk_factor": 0.9},
    "business loan": {"min": 10.85, "max": 17.95, "secured": False, "risk_factor": 1.2},
    "credit card": {"min": 18.0, "max": 42.0, "secured": False, "risk_factor": 1.8},
    "gold loan": {"min": 8.30, "max": 16.0, "secured": True, "risk_factor": 0.7}
}

# Enhanced model features (35+ features for better accuracy)
MODEL_FEATURES = [
    "age", "monthly_income_inr", "monthly_expenses_inr", "monthly_savings_inr",
    "outstanding_loan_amount_inr", "loan_amount_applied_inr", "years_current_employment",
    "banking_relationship_years", "timeliness_score", "repayment_ability_score",
    "financial_health_score", "payment_reliability_score", "stability_index",
    "spouse_income_inr", "monthly_utility_bills_inr", "property_value_inr",
    "vehicle_value_inr", "total_investments_inr", "monthly_business_revenue_inr",
    "daily_mobile_hours", "monthly_digital_transactions", "avg_transaction_amount_inr",
    "social_media_accounts_count", "mobile_app_usage_intensity_score",
    "digital_payment_adoption_score", "utility_payment_regularity_score",
    "location_stability_score", "mobile_banking_usage_score",
    # NEW: Loan type features
    "interest_rate", "loan_type_personal_loan", "loan_type_home_loan",
    "loan_type_auto_loan", "loan_type_education_loan", "loan_type_business_loan",
    "loan_type_credit_card", "loan_type_gold_loan"
]

TARGET_REG = "probability_of_default"
TARGET_CLS = "risk_category"

# Updated column list with new loan features
ALL_COLUMNS = [
    "applicant_id", "application_date", "age", "gender", "education_level",
    "employment_type", "marital_status", "family_size", "number_of_dependents",
    "location_type", "monthly_income_inr", "spouse_income_inr", "monthly_expenses_inr",
    "monthly_savings_inr", "monthly_utility_bills_inr", "property_value_inr",
    "vehicle_value_inr", "total_investments_inr", "outstanding_loan_amount_inr",
    "loan_amount_applied_inr", "years_current_employment", "banking_relationship_years",
    "monthly_business_revenue_inr", "daily_mobile_hours", "monthly_digital_transactions",
    "avg_transaction_amount_inr", "social_media_accounts_count", "mobile_app_usage_intensity_score",
    "digital_payment_adoption_score", "utility_payment_regularity_score",
    "location_stability_score", "mobile_banking_usage_score", "payment_reliability_score",
    "financial_health_score", "stability_index", "timeliness_score",
    "repayment_ability_score",
    # NEW: Loan type columns
    "loan_type", "interest_rate",
    "probability_of_default", "consent_status", "city", "risk_category"
]

GUJARAT_CITIES = [
    "Ahmedabad", "Surat", "Vadodara", "Rajkot", "Bhavnagar", "Jamnagar", "Junagadh",
    "Gandhinagar", "Nadiad", "Morbi", "Anand", "Mehsana", "Navsari", "Bharuch",
    "Vapi", "Valsad", "Patan", "Godhra", "Porbandar", "Palanpur", "Veraval", "Surendranagar"
]
EDUCATION_LEVELS = ["High School", "Diploma", "Graduate", "Post Graduate", "Professional"]
EMPLOYMENT_TYPES = ["Salaried", "Self Employed", "Business Owner", "Professional"]
LOCATION_TYPES = ["Metro", "Tier1", "Tier2"]

CITY_TO_LOCATION_TYPE = {
    "Ahmedabad": "Metro", "Surat": "Metro", "Vadodara": "Metro",
    "Rajkot": "Tier1", "Bhavnagar": "Tier1", "Jamnagar": "Tier1",
    "Gandhinagar": "Metro", "Surendranagar": "Tier1",
    "Junagadh": "Tier2", "Nadiad": "Tier2", "Morbi": "Tier2", "Anand": "Tier2",
    "Mehsana": "Tier2", "Navsari": "Tier2", "Bharuch": "Tier2", "Vapi": "Tier2",
    "Valsad": "Tier2", "Patan": "Tier2", "Godhra": "Tier2", "Porbandar": "Tier2",
    "Palanpur": "Tier2", "Veraval": "Tier2"
}

# Sampling functions (same as before)
def _sample_age():
    if loan_age_stats is not None:
        sampled_age = int(np.random.normal(loan_age_stats["mean"], loan_age_stats["std"]))
        return max(18, min(75, sampled_age))
    else:
        return int(np.random.normal(36, 12))

def _sample_income():
    if loan_income_stats is not None:
        sampled_income = int(np.random.normal(loan_income_stats["mean"], loan_income_stats["std"]))
        return max(15000, min(500000, sampled_income))
    else:
        return int(np.random.normal(55000, 25000))

def _sample_loan_amount():
    if loan_amount_stats is not None:
        sampled_amount = int(np.random.normal(loan_amount_stats["mean"], loan_amount_stats["std"]))
        return max(15000, min(6000000, sampled_amount))
    else:
        return int(np.random.normal(250000, 80000))

def _sample_tenure():
    if telco_tenure_stats is not None:
        sampled_tenure = np.random.normal(telco_tenure_stats["mean"], telco_tenure_stats["std"])
        return max(0.5, min(35.0, sampled_tenure))
    else:
        return np.random.uniform(0.5, 15.0)

def _sample_digital_transactions():
    if telco_charges_stats is not None:
        base_transactions = int(telco_charges_stats["mean"] / 2)
        noise = np.random.normal(0, telco_charges_stats["std"] / 4)
        return max(5, int(base_transactions + noise))
    else:
        base = int(np.random.normal(60, 20))
        return max(5, base)

def _sample_monthly_charges():
    if telco_charges_stats is not None:
        charges = np.random.normal(telco_charges_stats["mean"], telco_charges_stats["std"])
        return max(500, int(charges * 60))
    else:
        return int(np.random.normal(2500, 800))

# NEW: Function to assign loan type based on customer profile
def _assign_loan_type(income, age, employment_type, property_value, application_amount):
    """Assign loan type based on customer profile and application amount"""

    # Age-based loan preferences
    if age < 30:
        # Young people prefer personal loans, education loans, auto loans
        loan_prefs = ["personal loan", "education loan", "auto loan", "credit card"]
        weights = [0.35, 0.25, 0.25, 0.15]
    elif age < 45:
        # Prime age group - more diverse loan needs including home loans
        loan_prefs = ["home loan", "personal loan", "auto loan", "business loan", "education loan"]
        weights = [0.30, 0.25, 0.20, 0.15, 0.10]
    else:
        # Mature customers - home loans, business loans, gold loans
        loan_prefs = ["home loan", "business loan", "personal loan", "gold loan", "auto loan"]
        weights = [0.35, 0.25, 0.20, 0.15, 0.05]

    # Employment type adjustments
    if employment_type == "Business Owner":
        # Business owners prefer business loans and gold loans
        if "business loan" in loan_prefs:
            idx = loan_prefs.index("business loan")
            weights[idx] *= 1.5
        if "gold loan" in loan_prefs:
            idx = loan_prefs.index("gold loan")
            weights[idx] *= 1.3

    elif employment_type == "Self Employed":
        # Self employed prefer personal loans and gold loans
        if "personal loan" in loan_prefs:
            idx = loan_prefs.index("personal loan")
            weights[idx] *= 1.3
        if "gold loan" in loan_prefs:
            idx = loan_prefs.index("gold loan")
            weights[idx] *= 1.2

    # Property ownership influence
    if property_value > 0:
        # Property owners more likely to take secured loans
        if "home loan" in loan_prefs:
            idx = loan_prefs.index("home loan")
            weights[idx] *= 1.4
        if "gold loan" in loan_prefs:
            idx = loan_prefs.index("gold loan")
            weights[idx] *= 1.2

    # Application amount influence
    if application_amount > 1000000:  # High amount applications
        # Large amounts favor home loans and business loans
        for loan_type in ["home loan", "business loan"]:
            if loan_type in loan_prefs:
                idx = loan_prefs.index(loan_type)
                weights[idx] *= 1.5
    elif application_amount < 100000:  # Small amount applications
        # Small amounts favor personal loans and credit cards
        for loan_type in ["personal loan", "credit card"]:
            if loan_type in loan_prefs:
                idx = loan_prefs.index(loan_type)
                weights[idx] *= 1.3

    # Normalize weights
    total_weight = sum(weights)
    normalized_weights = [w/total_weight for w in weights]

    # Select loan type
    loan_type = np.random.choice(loan_prefs, p=normalized_weights)
    return loan_type

# NEW: Function to calculate interest rate based on loan type and risk
def _calculate_interest_rate(loan_type, segment, income, age, employment_type, dti_ratio):
    """Calculate interest rate based on loan type and customer risk profile"""

    loan_config = LOAN_INTEREST_RATES[loan_type]
    base_min = loan_config["min"]
    base_max = loan_config["max"]
    risk_factor = loan_config["risk_factor"]

    # Segment-based rate adjustment
    segment_adjustments = {
        "excellent": -0.15,  # Best rates
        "good": -0.05,
        "fair": 0.05,
        "poor": 0.15,
        "bad": 0.25  # Worst rates
    }
    segment_adj = segment_adjustments.get(segment, 0.0)

    # Income-based adjustment (higher income = lower rates)
    if income > 100000:
        income_adj = -0.10
    elif income > 50000:
        income_adj = -0.05
    elif income < 25000:
        income_adj = 0.15
    else:
        income_adj = 0.0

    # Age-based adjustment
    if age < 25:
        age_adj = 0.10  # Young = higher risk
    elif age > 60:
        age_adj = 0.05  # Older = slightly higher risk
    else:
        age_adj = 0.0

    # Employment type adjustment
    emp_adjustments = {
        "Salaried": -0.05,
        "Professional": -0.08,
        "Self Employed": 0.10,
        "Business Owner": 0.05
    }
    emp_adj = emp_adjustments.get(employment_type, 0.0)

    # DTI ratio adjustment
    if dti_ratio > 0.6:
        dti_adj = 0.20
    elif dti_ratio > 0.4:
        dti_adj = 0.10
    else:
        dti_adj = 0.0

    # Calculate final interest rate
    total_adjustment = (segment_adj + income_adj + age_adj + emp_adj + dti_adj) * risk_factor

    # Start from middle of range and adjust
    base_rate = (base_min + base_max) / 2
    final_rate = base_rate + total_adjustment

    # Add some random variation
    final_rate += np.random.normal(0, 0.5)

    # Ensure within loan type bounds
    final_rate = max(base_min, min(base_max, final_rate))

    return round(final_rate, 2)

# Utility functions
def _clip(v, lo, hi):
    return max(lo, min(hi, v))

def _sigmoid(x):
    return 1.0 / (1.0 + np.exp(-np.clip(x, -500, 500)))

def _risk_category_from_p(p):
    if p <= 0.18: return "Low Risk"
    elif p <= 0.42: return "Medium Risk"
    elif p <= 0.68: return "High Risk"
    else: return "Very High Risk"

def _generate_applicant_id():
    prefix = np.random.choice(["A", "B", "C", "D", "E", "F"])
    number = np.random.randint(1000, 9999)
    return f"{prefix}{number:04d}"

def _generate_application_date():
    start_date = datetime(2024, 6, 1)
    end_date = datetime(2025, 7, 31)
    days_diff = (end_date - start_date).days
    random_days = np.random.randint(0, max(1, days_diff))
    return (start_date + timedelta(days=int(random_days))).strftime("%Y-%m-%d")

def _generate_loan_application_amount(income, age, segment, property_value, employment_type, loan_type):
    """Enhanced loan application amount based on loan type"""

    base_amount = _sample_loan_amount()
    income_multiplier = min(3.0, max(0.3, income / 50000))

    # Loan type specific multipliers
    loan_type_multipliers = {
        "personal loan": np.random.uniform(0.5, 1.5),
        "home loan": np.random.uniform(3.0, 8.0),
        "auto loan": np.random.uniform(1.0, 3.0),
        "education loan": np.random.uniform(0.8, 2.5),
        "business loan": np.random.uniform(1.5, 5.0),
        "credit card": np.random.uniform(0.2, 0.8),
        "gold loan": np.random.uniform(0.3, 1.2)
    }

    loan_type_mult = loan_type_multipliers.get(loan_type, 1.0)

    # Age-based patterns
    if age < 30:
        purpose_multiplier = np.random.choice([0.6, 1.2, 2.0], p=[0.5, 0.3, 0.2])
    elif age < 45:
        purpose_multiplier = np.random.choice([1.0, 2.5, 4.0], p=[0.3, 0.4, 0.3])
    else:
        purpose_multiplier = np.random.choice([0.8, 1.8, 3.0], p=[0.4, 0.4, 0.2])

    # Property influence
    property_multiplier = np.random.uniform(1.2, 2.0) if property_value > 0 else np.random.uniform(0.7, 1.3)

    # Employment influence
    employment_multipliers = {
        "Salaried": np.random.uniform(0.8, 1.4),
        "Professional": np.random.uniform(1.0, 1.8),
        "Business Owner": np.random.uniform(1.2, 2.5),
        "Self Employed": np.random.uniform(0.6, 1.6)
    }
    emp_multiplier = employment_multipliers.get(employment_type, 1.0)

    # Segment adjustments
    segment_adjustments = {
        "excellent": np.random.uniform(1.5, 3.0),
        "good": np.random.uniform(1.0, 2.0),
        "fair": np.random.uniform(0.7, 1.5),
        "poor": np.random.uniform(0.4, 1.0),
        "bad": np.random.uniform(0.3, 0.8)
    }
    segment_multiplier = segment_adjustments.get(segment, 1.0)

    # Calculate final amount
    application_amount = int(base_amount * income_multiplier * purpose_multiplier *
                           property_multiplier * emp_multiplier * segment_multiplier * loan_type_mult)

    # Loan type specific bounds
    if loan_type == "home loan":
        min_amount, max_amount = 500000, min(8000000, int(income * 12 * 8))
    elif loan_type == "auto loan":
        min_amount, max_amount = 200000, min(1500000, int(income * 12 * 3))
    elif loan_type == "business loan":
        min_amount, max_amount = 100000, min(5000000, int(income * 12 * 6))
    elif loan_type == "education loan":
        min_amount, max_amount = 50000, min(2000000, int(income * 12 * 4))
    elif loan_type == "credit card":
        min_amount, max_amount = 10000, min(500000, int(income * 12))
    elif loan_type == "gold loan":
        min_amount, max_amount = 25000, min(1000000, int(income * 12 * 2))
    else:  # personal loan
        min_amount, max_amount = 25000, min(2000000, int(income * 12 * 3))

    return max(min_amount, min(max_amount, application_amount))

def calculate_data_driven_scores(row):
    """Enhanced scoring with loan type considerations"""
    income = max(1.0, float(row["monthly_income_inr"]))
    expenses = float(row["monthly_expenses_inr"])
    savings = float(row["monthly_savings_inr"])
    loan_amount = float(row["outstanding_loan_amount_inr"])
    application_amount = float(row["loan_amount_applied_inr"])
    age = int(row["age"])
    emp_years = float(row["years_current_employment"])
    bank_years = float(row["banking_relationship_years"])
    property_value = float(row["property_value_inr"])
    investments = float(row["total_investments_inr"])

    # NEW: Loan type factor
    loan_type = row.get("loan_type", "personal loan")
    loan_risk_factor = LOAN_INTEREST_RATES[loan_type]["risk_factor"]

    # Enhanced ratios
    dti_ratio = loan_amount / (12.0 * income) if income > 0 else 0
    application_to_income_ratio = application_amount / (12.0 * income) if income > 0 else 0
    loan_utilization_ratio = loan_amount / max(1.0, application_amount)
    expense_ratio = expenses / income if income > 0 else 1
    savings_ratio = savings / income if income > 0 else 0
    asset_ratio = (property_value + investments) / max(income * 12, 1)

    def _clip_local(value, min_val, max_val):
        return max(min_val, min(max_val, value))

    # Enhanced calculations with loan type considerations
    # 1. TIMELINESS SCORE with loan type adjustment
    timeliness_base = (
        min(emp_years * 10, 45) +
        min(bank_years * 8, 35) +
        min((age - 18) * 1.2, 25) +
        min(asset_ratio * 15, 20) +
        5
    )
    timeliness_penalty = (
        dti_ratio * 18 * loan_risk_factor +  # NEW: loan type penalty
        max(0, expense_ratio - 0.65) * 25 +
        max(0, application_to_income_ratio - 1.2) * 12 +
        max(0, 1 - savings_ratio) * 10
    )
    timeliness_score = _clip_local(int(timeliness_base - timeliness_penalty + np.random.randint(-6, 7)), 5, 95)

    # 2. REPAYMENT ABILITY with loan type factor
    repayment_base = (
        min(np.log(max(income, 1000)/20000) * 20, 35) +
        max(0, savings_ratio * 45) +
        min(emp_years * 3, 25) +
        min(asset_ratio * 10, 15) +
        (5 if LOAN_INTEREST_RATES[loan_type]["secured"] else 0) +  # NEW: secured loan bonus
        5
    )
    repayment_penalty = (
        dti_ratio * 30 * loan_risk_factor +  # NEW: loan type penalty
        max(0, expense_ratio - 0.75) * 20 +
        max(0, application_to_income_ratio - 1.8) * 15 +
        max(0, loan_utilization_ratio - 0.85) * 12
    )
    repayment_score = _clip_local(int(repayment_base - repayment_penalty + np.random.randint(-5, 6)), 5, 90)

    # 3. FINANCIAL HEALTH with enhanced loan type considerations
    financial_base = (
        min(np.log(max(income, 1000)/15000) * 15, 30) +
        min(asset_ratio * 25, 35) +
        max(0, savings_ratio * 30) +
        min(bank_years * 2, 20) +
        (8 if LOAN_INTEREST_RATES[loan_type]["secured"] else 0) +  # NEW: secured loan bonus
        10
    )
    financial_penalty = (
        dti_ratio * 25 * loan_risk_factor +  # NEW: loan type penalty
        max(0, expense_ratio - 0.70) * 22 +
        max(0, application_to_income_ratio - 1.5) * 12 +
        (5 if age < 22 or age > 65 else 0)
    )
    financial_score = _clip_local(int(financial_base - financial_penalty + np.random.randint(-8, 9)), 10, 95)

    # 4. PAYMENT RELIABILITY with loan type considerations
    reliability_base = (
        min(emp_years * 5, 40) +
        max(0, (1.2 - expense_ratio) * 35) +
        min(np.log(max(income, 1000)/3000), 25) +
        min(bank_years * 2, 15) +
        (5 if LOAN_INTEREST_RATES[loan_type]["secured"] else 0) +  # NEW: secured loan bonus
        10
    )
    reliability_penalty = (
        dti_ratio * 35 * loan_risk_factor +  # NEW: loan type penalty
        max(0, expense_ratio - 0.80) * 30 +
        abs(loan_utilization_ratio - 0.65) * 10 +
        max(0, application_to_income_ratio - 2.0) * 8
    )
    reliability_score = _clip_local(int(reliability_base - reliability_penalty + np.random.randint(-6, 7)), 10, 95)

    # 5. STABILITY INDEX with loan type factor
    stability_base = (
        min(emp_years * 4, 30) +
        min(bank_years * 3, 20) +
        min((age - 18) * 0.8, 25) +
        min(asset_ratio * 20, 25) +
        (10 if property_value > 0 else 0) +
        (5 if LOAN_INTEREST_RATES[loan_type]["secured"] else 0) +  # NEW: secured loan bonus
        5
    )
    stability_penalty = (
        dti_ratio * 20 * loan_risk_factor +  # NEW: loan type penalty
        max(0, expense_ratio - 0.75) * 15 +
        max(0, application_to_income_ratio - 2.2) * 10 +
        (8 if emp_years < 1 else 0)
    )
    stability_score = _clip_local(int(stability_base - stability_penalty + np.random.randint(-10, 11)), 5, 90)

    # Remaining scores (utility, location, mobile banking) - same logic
    utility_base = (
        90 -
        dti_ratio * 28 * loan_risk_factor -  # NEW: loan type factor
        max(0, expense_ratio - 0.55) * 25 +
        min(savings_ratio * 18, 12) +
        min(bank_years * 1.5, 8)
    )
    utility_score = _clip_local(int(utility_base + np.random.randint(-7, 8)), 25, 95)

    location_base = (
        bank_years * 10 +
        emp_years * 6 +
        (20 if property_value > 0 else 0) +
        min((age - 18) * 1.5, 30) +
        min(asset_ratio * 12, 15) +
        30
    )
    location_score = _clip_local(int(location_base + np.random.randint(-8, 9)), 30, 120)

    mobile_banking_base = (
        max(20, 95 - (age - 25) * 1.2) +
        min(emp_years * 2, 15) +
        min(np.log(max(income, 1000)/20000) * 10, 15)
    )
    mobile_banking_score = _clip_local(int(mobile_banking_base + np.random.randint(-10, 11)), 20, 95)

    return {
        "timeliness_score": timeliness_score,
        "repayment_ability_score": repayment_score,
        "financial_health_score": financial_score,
        "payment_reliability_score": reliability_score,
        "stability_index": stability_score,
        "utility_payment_regularity_score": utility_score,
        "location_stability_score": location_score,
        "mobile_banking_usage_score": mobile_banking_score
    }

def _pd_from_features(row):
    """Enhanced probability calculation with loan type and interest rate factors"""
    income = max(1.0, float(row["monthly_income_inr"]))
    expenses = float(row["monthly_expenses_inr"])
    loan_amount = float(row["outstanding_loan_amount_inr"])
    application_amount = float(row["loan_amount_applied_inr"])
    interest_rate = float(row["interest_rate"])
    loan_type = row["loan_type"]

    # Enhanced ratios
    dti = loan_amount / (12.0 * income)
    app_to_income = application_amount / (12.0 * income)
    exp_ratio = expenses / income
    savings_ratio = float(row["monthly_savings_inr"]) / income if income > 0 else 0

    # NEW: Loan type risk factor
    loan_risk_factor = LOAN_INTEREST_RATES[loan_type]["risk_factor"]
    secured_bonus = -0.15 if LOAN_INTEREST_RATES[loan_type]["secured"] else 0

    # NEW: Interest rate factor (higher rates indicate higher risk)
    # Normalize interest rate relative to loan type range
    rate_min = LOAN_INTEREST_RATES[loan_type]["min"]
    rate_max = LOAN_INTEREST_RATES[loan_type]["max"]
    normalized_rate = (interest_rate - rate_min) / (rate_max - rate_min) if rate_max > rate_min else 0.5
    interest_penalty = normalized_rate * 0.3  # Higher rate = higher default probability

    # Enhanced score average
    scores = [
        _clip(row["timeliness_score"], 0, 100),
        _clip(row["repayment_ability_score"], 0, 100),
        _clip(row["financial_health_score"], 0, 100),
        _clip(row["payment_reliability_score"], 0, 100),
        _clip(row["stability_index"], 0, 100),
        _clip(row["utility_payment_regularity_score"], 0, 100),
        _clip(row["location_stability_score"], 0, 120) / 120 * 100,
        _clip(row["mobile_banking_usage_score"], 0, 100)
    ]
    s_avg = np.mean(scores) / 100.0

    tenure = max(0.0, float(row["years_current_employment"]))
    bank_rel = max(0.0, float(row["banking_relationship_years"]))
    age = int(row["age"])
    property_value = float(row["property_value_inr"])
    digital_score = float(row["digital_payment_adoption_score"]) / 100.0

    # Enhanced probability calculation with loan type factors
    x = (
        1.8 * dti * loan_risk_factor +  # NEW: loan type risk factor
        1.2 * exp_ratio +
        0.5 * app_to_income +
        interest_penalty +  # NEW: interest rate penalty
        -2.5 * s_avg +
        -0.25 * np.log1p(tenure) +
        -0.30 * np.log1p(bank_rel) +
        -0.15 * savings_ratio +
        secured_bonus +  # NEW: secured loan bonus
        0.08 * (age < 25) +
        0.06 * (age > 60) +
        -0.05 * (property_value > 0) +
        -0.03 * digital_score +
        0.10 * _sigmoid((income - 80000.0) / 40000.0) * (dti > 0.8) +
        0.07 * (app_to_income > 2.5) - 0.35
    )

    # Categorical adjustments
    edu = row.get("education_level", "Graduate")
    x += {"High School": 0.08, "Diploma": 0.03, "Graduate": 0.0, "Post Graduate": -0.04, "Professional": -0.07}.get(edu, 0.0)

    emp = row.get("employment_type", "Salaried")
    x += {"Salaried": -0.03, "Professional": -0.05, "Self Employed": 0.04, "Business Owner": 0.02}.get(emp, 0.0)

    loc = row.get("location_type", "Tier2")
    x += {"Metro": -0.04, "Tier1": 0.01, "Tier2": 0.03}.get(loc, 0.0)

    base_pd = _sigmoid(x)
    pd = _clip(float(base_pd + np.random.normal(0, 0.06)), 0.01, 0.92)
    return pd

def _generate_profile(segment, age_mu=36, inc_mu=55000, inc_sigma=30000):
    """Enhanced profile generation with loan type and interest rate"""
    # Generate basic profile first
    if segment == "excellent":
        age = max(28, min(65, _sample_age() + 5))
        base_income = _sample_income()
        income = int(max(60000, base_income * 1.5))
        exp_ratio = np.clip(np.random.normal(0.42, 0.09), 0.25, 0.62)
        dti = np.clip(np.random.normal(0.18, 0.07), 0.02, 0.38)
        education = np.random.choice(["Graduate", "Post Graduate", "Professional"], p=[0.3, 0.5, 0.2])
        employment = np.random.choice(["Salaried", "Professional", "Business Owner"], p=[0.4, 0.4, 0.2])
    elif segment == "good":
        age = max(25, min(58, _sample_age() + 2))
        base_income = _sample_income()
        income = int(max(40000, base_income * 1.15))
        exp_ratio = np.clip(np.random.normal(0.57, 0.11), 0.38, 0.78)
        dti = np.clip(np.random.normal(0.38, 0.13), 0.08, 0.72)
        education = np.random.choice(["Diploma", "Graduate", "Post Graduate"], p=[0.2, 0.6, 0.2])
        employment = np.random.choice(["Salaried", "Professional", "Self Employed"], p=[0.5, 0.3, 0.2])
    elif segment == "fair":
        age = max(22, min(55, _sample_age() - 1))
        base_income = _sample_income()
        income = int(max(25000, base_income * 0.88))
        exp_ratio = np.clip(np.random.normal(0.72, 0.12), 0.52, 0.88)
        dti = np.clip(np.random.normal(0.65, 0.22), 0.28, 1.25)
        education = np.random.choice(["High School", "Diploma", "Graduate"], p=[0.3, 0.5, 0.2])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.6, 0.4])
    elif segment == "poor":
        age = max(20, min(52, _sample_age() - 3))
        base_income = _sample_income()
        income = int(max(18000, base_income * 0.65))
        exp_ratio = np.clip(np.random.normal(0.83, 0.09), 0.68, 0.97)
        dti = np.clip(np.random.normal(1.15, 0.38), 0.65, 2.2)
        education = np.random.choice(["High School", "Diploma"], p=[0.7, 0.3])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.4, 0.6])
    else:  # bad
        age = max(18, min(48, _sample_age() - 5))
        base_income = _sample_income()
        income = int(max(15000, base_income * 0.52))
        exp_ratio = np.clip(np.random.normal(0.92, 0.07), 0.78, 1.00)
        dti = np.clip(np.random.normal(1.85, 0.55), 1.1, 3.8)
        education = np.random.choice(["High School", "Diploma"], p=[0.8, 0.2])
        employment = "Self Employed"

    # Generate other profile attributes
    gender = np.random.choice(["Male", "Female"], p=[0.55, 0.45])
    marital_status = np.random.choice(["Single", "Married"], p=[0.25, 0.75]) if age >= 28 else np.random.choice(["Single", "Married"], p=[0.7, 0.3])

    if marital_status == "Married":
        family_size = np.random.randint(2, 6)
        dependents = max(0, family_size - 2)
        spouse_income = int(np.random.uniform(0.2, 0.8) * income) if np.random.rand() < 0.6 else 0
    else:
        family_size, dependents, spouse_income = 1, 0, 0

    city = np.random.choice(GUJARAT_CITIES)
    location_type = CITY_TO_LOCATION_TYPE.get(city, "Tier2")

    expenses = int(_clip(income * exp_ratio + np.random.randint(-3000, 3000), 2000, max(2500, income - 500)))
    savings = max(0, income - expenses - np.random.randint(0, 4000))

    # Asset generation
    if income > 80000 and age > 30:
        property_value = int(income * np.random.uniform(25, 55) * (age / 40))
        vehicle_value = int(income * np.random.uniform(2, 8))
        investments = int(income * np.random.uniform(8, 25) * ((age - 25) / 20))
    elif income > 50000:
        property_value = int(income * np.random.uniform(15, 35) * (age / 40)) if np.random.rand() < 0.4 else 0
        vehicle_value = int(income * np.random.uniform(1, 5)) if np.random.rand() < 0.6 else 0
        investments = int(income * np.random.uniform(3, 12)) if np.random.rand() < 0.5 else 0
    else:
        property_value = 0
        vehicle_value = int(income * np.random.uniform(0.5, 3)) if np.random.rand() < 0.3 else 0
        investments = int(income * np.random.uniform(0.5, 5)) if np.random.rand() < 0.2 else 0

    # NEW: Assign loan type FIRST (needed for application amount calculation)
    preliminary_application_amount = _sample_loan_amount()  # Just for loan type assignment
    loan_type = _assign_loan_type(income, age, employment, property_value, preliminary_application_amount)

    # NOW generate proper loan application amount based on loan type
    loan_application_amount = _generate_loan_application_amount(
        income, age, segment, property_value, employment, loan_type
    )

    # Outstanding loan calculation
    utilization_rate = np.clip(np.random.beta(2, 2), 0.3, 1.0)
    loan_amt = int(loan_application_amount * utilization_rate)
    max_affordable_loan = int(dti * 12 * income)
    loan_amt = min(loan_amt, max_affordable_loan)
    loan_amt = max(15000, loan_amt)

    # Calculate preliminary DTI for interest rate calculation
    actual_dti = loan_amt / (12.0 * income)

    # NEW: Calculate interest rate based on loan type and risk profile
    interest_rate = _calculate_interest_rate(loan_type, segment, income, age, employment, actual_dti)

    # Other financial details
    utility_bills = _sample_monthly_charges()
    utility_bills = int(utility_bills * family_size * np.random.uniform(0.8, 1.2))

    emp_years = round(_clip(_sample_tenure() * np.random.uniform(0.6, 1.0), 0.5, 35), 1)
    bank_years = round(_clip(_sample_tenure() - np.random.uniform(-2.0, 3.0), 0.5, max(0.5, age - 18)), 1)

    if employment in ["Business Owner", "Self Employed"]:
        business_revenue = int(income * np.random.uniform(1.2, 2.8))
    else:
        business_revenue = 0

    # Digital behavior
    if age < 35 and education in ["Graduate", "Post Graduate", "Professional"]:
        mobile_hours = round(np.random.uniform(6, 12), 1)
        digital_transactions = _sample_digital_transactions() + np.random.randint(20, 40)
        social_media = np.random.randint(4, 8)
        app_usage_score = np.random.randint(65, 95)
        digital_payment_score = np.random.randint(70, 95)
        mobile_banking_score = np.random.randint(60, 90)
    elif age < 50:
        mobile_hours = round(np.random.uniform(3, 8), 1)
        digital_transactions = _sample_digital_transactions()
        social_media = np.random.randint(2, 6)
        app_usage_score = np.random.randint(40, 75)
        digital_payment_score = np.random.randint(45, 80)
        mobile_banking_score = np.random.randint(35, 70)
    else:
        mobile_hours = round(np.random.uniform(1, 5), 1)
        digital_transactions = max(10, _sample_digital_transactions() - 20)
        social_media = np.random.randint(1, 4)
        app_usage_score = np.random.randint(20, 55)
        digital_payment_score = np.random.randint(25, 60)
        mobile_banking_score = np.random.randint(20, 50)

    avg_transaction = int((income + expenses) / max(1, digital_transactions) * np.random.uniform(0.5, 2.0))

    # Create the initial row with basic financial data
    row = {
        "applicant_id": _generate_applicant_id(),
        "application_date": _generate_application_date(),
        "age": age,
        "gender": gender,
        "education_level": education,
        "employment_type": employment,
        "marital_status": marital_status,
        "family_size": family_size,
        "number_of_dependents": dependents,
        "location_type": location_type,
        "monthly_income_inr": income,
        "spouse_income_inr": spouse_income,
        "monthly_expenses_inr": expenses,
        "monthly_savings_inr": savings,
        "monthly_utility_bills_inr": utility_bills,
        "property_value_inr": property_value,
        "vehicle_value_inr": vehicle_value,
        "total_investments_inr": investments,
        "outstanding_loan_amount_inr": loan_amt,
        "loan_amount_applied_inr": loan_application_amount,
        "years_current_employment": emp_years,
        "banking_relationship_years": bank_years,
        "monthly_business_revenue_inr": business_revenue,
        "daily_mobile_hours": mobile_hours,
        "monthly_digital_transactions": digital_transactions,
        "avg_transaction_amount_inr": avg_transaction,
        "social_media_accounts_count": social_media,
        "mobile_app_usage_intensity_score": app_usage_score,
        "digital_payment_adoption_score": digital_payment_score,
        "mobile_banking_usage_score": mobile_banking_score,
        "consent_status": "Full Consent",
        "city": city,
        # NEW: Loan type features
        "loan_type": loan_type,
        "interest_rate": interest_rate
    }

    # Calculate scores using the enhanced function
    scores = calculate_data_driven_scores(row)
    row.update(scores)

    # Calculate probability of default and risk category
    pd_val = _pd_from_features(row)
    row["probability_of_default"] = pd_val
    row["risk_category"] = _risk_category_from_p(pd_val)

    return row

def generate_enhanced_training_data(n_rows=12000, seed=111):
    """Generate enhanced training data with loan type and interest rate features"""
    np.random.seed(seed)
    seg_mix = {"excellent": 0.28, "good": 0.32, "fair": 0.24, "poor": 0.11, "bad": 0.05}
    counts = {k: int(v * n_rows) for k, v in seg_mix.items()}
    diff = n_rows - sum(counts.values())
    if diff != 0:
        counts["good"] += diff

    rows = []
    for seg, cnt in counts.items():
        for _ in range(cnt):
            rows.append(_generate_profile(seg))

    df = pd.DataFrame(rows)

    # NEW: Add one-hot encoded loan type features for model training
    for loan_type in LOAN_TYPES:
        col_name = f"loan_type_{loan_type.replace(' ', '_')}"
        df[col_name] = (df['loan_type'] == loan_type).astype(int)

    df = df[ALL_COLUMNS]
    out_csv = os.path.join(OUT_DIR, "training_data_aligned.csv")
    df.to_csv(out_csv, index=False)

    print(f"[TRAIN] ✅ Enhanced training set with loan types: {out_csv} shape={df.shape}")
    print(f"[TRAIN] Features used in model: {len(MODEL_FEATURES)} (35+ features including loan type)")
    print(f"[TRAIN] Risk distribution: {df['risk_category'].value_counts().to_dict()}")
    print(f"[TRAIN] Loan type distribution: {df['loan_type'].value_counts().to_dict()}")
    print(f"[TRAIN] Interest rate range: {df['interest_rate'].min():.2f}% - {df['interest_rate'].max():.2f}%")
    print(f"[TRAIN] PD range: {df['probability_of_default'].min():.3f} - {df['probability_of_default'].max():.3f}")

    # Print loan type vs risk correlation
    print(f"\n[ANALYSIS] Loan Type vs Risk Correlation:")
    risk_by_loan_type = df.groupby('loan_type')['probability_of_default'].agg(['mean', 'count']).round(3)
    for loan_type, stats in risk_by_loan_type.iterrows():
        secured_status = "Secured" if LOAN_INTEREST_RATES[loan_type]["secured"] else "Unsecured"
        print(f"  - {loan_type}: {stats['mean']:.3f} avg PD, {stats['count']} samples ({secured_status})")

    return df

if __name__ == "__main__":
    generate_enhanced_training_data(n_rows=15000, seed=111)


[INFO] Loaded loan dataset: (255347, 18)
[INFO] Loaded telco dataset: (7043, 21)
[TRAIN] ✅ Enhanced training set with loan types: credit_risk_output/training_data_aligned.csv shape=(15000, 43)
[TRAIN] Features used in model: 36 (35+ features including loan type)
[TRAIN] Risk distribution: {'Low Risk': 5909, 'Medium Risk': 4179, 'Very High Risk': 2821, 'High Risk': 2091}
[TRAIN] Loan type distribution: {np.str_('personal loan'): 4125, np.str_('home loan'): 3666, np.str_('auto loan'): 2395, np.str_('business loan'): 1926, np.str_('education loan'): 1613, np.str_('gold loan'): 646, np.str_('credit card'): 629}
[TRAIN] Interest rate range: 8.44% - 32.10%
[TRAIN] PD range: 0.010 - 0.920

[ANALYSIS] Loan Type vs Risk Correlation:
  - auto loan: 0.197 avg PD, 2395.0 samples (Secured)
  - business loan: 0.489 avg PD, 1926.0 samples (Unsecured)
  - credit card: 0.164 avg PD, 629.0 samples (Unsecured)
  - education loan: 0.237 avg PD, 1613.0 samples (Unsecured)
  - gold loan: 0.083 avg PD, 646.0

In [5]:
# -*- coding: utf-8 -*-
# Test data generator with loan_type and interest_rate features

import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

np.random.seed(222)

OUT_DIR = "credit_risk_output"
os.makedirs(OUT_DIR, exist_ok=True)

# External datasets loading (same as train)
LOAN_DATA_PATH = "Loan_default (1).csv"
TELCO_DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"

try:
    loan_df = pd.read_csv(LOAN_DATA_PATH)
    print(f"[INFO] Loaded loan dataset for test: {loan_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {LOAN_DATA_PATH} not found. Using fallback distributions.")
    loan_df = None

try:
    telco_df = pd.read_csv(TELCO_DATA_PATH)
    print(f"[INFO] Loaded telco dataset for test: {telco_df.shape}")
except FileNotFoundError:
    print(f"[WARNING] {TELCO_DATA_PATH} not found. Using fallback distributions.")
    telco_df = None

# Pre-calculate distributions
if loan_df is not None:
    loan_age_stats = loan_df["age"].dropna().describe() if "age" in loan_df.columns else None
    loan_income_stats = loan_df["income"].dropna().describe() if "income" in loan_df.columns else None
    loan_amount_stats = loan_df["loan_amount"].dropna().describe() if "loan_amount" in loan_df.columns else None
else:
    loan_age_stats = loan_income_stats = loan_amount_stats = None

if telco_df is not None:
    telco_tenure_stats = telco_df["tenure"].dropna().describe() if "tenure" in telco_df.columns else None
    telco_charges_stats = telco_df["MonthlyCharges"].dropna().describe() if "MonthlyCharges" in telco_df.columns else None
else:
    telco_tenure_stats = telco_charges_stats = None

# NEW: Loan types and characteristics (same as train)
LOAN_TYPES = [
    "personal loan", "home loan", "auto loan", "education loan",
    "business loan", "credit card", "gold loan"
]

LOAN_INTEREST_RATES = {
    "personal loan": {"min": 11.99, "max": 24.0, "secured": False, "risk_factor": 1.4},
    "home loan": {"min": 7.35, "max": 12.95, "secured": True, "risk_factor": 0.6},
    "auto loan": {"min": 7.70, "max": 15.30, "secured": True, "risk_factor": 0.8},
    "education loan": {"min": 9.45, "max": 15.0, "secured": False, "risk_factor": 0.9},
    "business loan": {"min": 10.85, "max": 17.95, "secured": False, "risk_factor": 1.2},
    "credit card": {"min": 18.0, "max": 42.0, "secured": False, "risk_factor": 1.8},
    "gold loan": {"min": 8.30, "max": 16.0, "secured": True, "risk_factor": 0.7}
}

# Test columns (NO targets, NO scores - updated with loan type features)
TEST_COLUMNS = [
    "applicant_id", "application_date", "age", "gender", "education_level",
    "employment_type", "marital_status", "family_size", "number_of_dependents",
    "location_type", "monthly_income_inr", "spouse_income_inr", "monthly_expenses_inr",
    "monthly_savings_inr", "monthly_utility_bills_inr", "property_value_inr",
    "vehicle_value_inr", "total_investments_inr", "outstanding_loan_amount_inr",
    "loan_amount_applied_inr", "years_current_employment", "banking_relationship_years",
    "monthly_business_revenue_inr", "daily_mobile_hours", "monthly_digital_transactions",
    "avg_transaction_amount_inr", "social_media_accounts_count", "mobile_app_usage_intensity_score",
    "digital_payment_adoption_score", "consent_status", "city",
    # NEW: Loan type features
    "loan_type", "interest_rate"
]

GUJARAT_CITIES = [
    "Ahmedabad", "Surat", "Vadodara", "Rajkot", "Bhavnagar", "Jamnagar", "Junagadh",
    "Gandhinagar", "Nadiad", "Morbi", "Anand", "Mehsana", "Navsari", "Bharuch",
    "Vapi", "Valsad", "Patan", "Godhra", "Porbandar", "Palanpur", "Veraval", "Surendranagar"
]
EDUCATION_LEVELS = ["High School", "Diploma", "Graduate", "Post Graduate", "Professional"]
EMPLOYMENT_TYPES = ["Salaried", "Self Employed", "Business Owner", "Professional"]
LOCATION_TYPES = ["Metro", "Tier1", "Tier2"]

CITY_TO_LOCATION_TYPE = {
    "Ahmedabad": "Metro", "Surat": "Metro", "Vadodara": "Metro",
    "Rajkot": "Tier1", "Bhavnagar": "Tier1", "Jamnagar": "Tier1",
    "Gandhinagar": "Metro", "Surendranagar": "Tier1",
    "Junagadh": "Tier2", "Nadiad": "Tier2", "Morbi": "Tier2", "Anand": "Tier2",
    "Mehsana": "Tier2", "Navsari": "Tier2", "Bharuch": "Tier2", "Vapi": "Tier2",
    "Valsad": "Tier2", "Patan": "Tier2", "Godhra": "Tier2", "Porbandar": "Tier2",
    "Palanpur": "Tier2", "Veraval": "Tier2"
}

# Sampling functions (same as train)
def _sample_age():
    if loan_age_stats is not None:
        sampled_age = int(np.random.normal(loan_age_stats["mean"], loan_age_stats["std"]))
        return max(18, min(75, sampled_age))
    else:
        return int(np.random.normal(36, 12))

def _sample_income():
    if loan_income_stats is not None:
        sampled_income = int(np.random.normal(loan_income_stats["mean"], loan_income_stats["std"]))
        return max(15000, min(500000, sampled_income))
    else:
        return int(np.random.normal(55000, 25000))

def _sample_loan_amount():
    if loan_amount_stats is not None:
        sampled_amount = int(np.random.normal(loan_amount_stats["mean"], loan_amount_stats["std"]))
        return max(15000, min(6000000, sampled_amount))
    else:
        return int(np.random.normal(250000, 80000))

def _sample_tenure():
    if telco_tenure_stats is not None:
        sampled_tenure = np.random.normal(telco_tenure_stats["mean"], telco_tenure_stats["std"])
        return max(0.5, min(35.0, sampled_tenure))
    else:
        return np.random.uniform(0.5, 15.0)

def _sample_digital_transactions():
    if telco_charges_stats is not None:
        base_transactions = int(telco_charges_stats["mean"] / 2)
        noise = np.random.normal(0, telco_charges_stats["std"] / 4)
        return max(5, int(base_transactions + noise))
    else:
        base = int(np.random.normal(60, 20))
        return max(5, base)

def _sample_monthly_charges():
    if telco_charges_stats is not None:
        charges = np.random.normal(telco_charges_stats["mean"], telco_charges_stats["std"])
        return max(500, int(charges * 60))
    else:
        return int(np.random.normal(2500, 800))

# NEW: Loan type assignment and interest rate calculation (same logic as train)
def _assign_loan_type(income, age, employment_type, property_value, application_amount):
    if age < 30:
        loan_prefs = ["personal loan", "education loan", "auto loan", "credit card"]
        weights = [0.35, 0.25, 0.25, 0.15]
    elif age < 45:
        loan_prefs = ["home loan", "personal loan", "auto loan", "business loan", "education loan"]
        weights = [0.30, 0.25, 0.20, 0.15, 0.10]
    else:
        loan_prefs = ["home loan", "business loan", "personal loan", "gold loan", "auto loan"]
        weights = [0.35, 0.25, 0.20, 0.15, 0.05]

    # Apply same adjustments as train
    if employment_type == "Business Owner":
        if "business loan" in loan_prefs:
            idx = loan_prefs.index("business loan")
            weights[idx] *= 1.5
        if "gold loan" in loan_prefs:
            idx = loan_prefs.index("gold loan")
            weights[idx] *= 1.3
    elif employment_type == "Self Employed":
        if "personal loan" in loan_prefs:
            idx = loan_prefs.index("personal loan")
            weights[idx] *= 1.3
        if "gold loan" in loan_prefs:
            idx = loan_prefs.index("gold loan")
            weights[idx] *= 1.2

    if property_value > 0:
        if "home loan" in loan_prefs:
            idx = loan_prefs.index("home loan")
            weights[idx] *= 1.4
        if "gold loan" in loan_prefs:
            idx = loan_prefs.index("gold loan")
            weights[idx] *= 1.2

    if application_amount > 1000000:
        for loan_type in ["home loan", "business loan"]:
            if loan_type in loan_prefs:
                idx = loan_prefs.index(loan_type)
                weights[idx] *= 1.5
    elif application_amount < 100000:
        for loan_type in ["personal loan", "credit card"]:
            if loan_type in loan_prefs:
                idx = loan_prefs.index(loan_type)
                weights[idx] *= 1.3

    total_weight = sum(weights)
    normalized_weights = [w/total_weight for w in weights]

    loan_type = np.random.choice(loan_prefs, p=normalized_weights)
    return loan_type

def _calculate_interest_rate_for_test(loan_type, profile_type, income, age, employment_type, dti_ratio):
    """Calculate interest rate for test profiles"""
    loan_config = LOAN_INTEREST_RATES[loan_type]
    base_min = loan_config["min"]
    base_max = loan_config["max"]
    risk_factor = loan_config["risk_factor"]

    # Profile-based adjustments (mapped from segment)
    profile_adjustments = {
        "high_earner_low_risk": -0.20,
        "stable_middle_class": -0.08,
        "young_professional": 0.02,
        "average_earner": 0.08,
        "financial_stress": 0.18,
        "outlier_case": 0.12
    }
    profile_adj = profile_adjustments.get(profile_type, 0.0)

    # Same income, age, employment, DTI adjustments as train
    if income > 100000:
        income_adj = -0.10
    elif income > 50000:
        income_adj = -0.05
    elif income < 25000:
        income_adj = 0.15
    else:
        income_adj = 0.0

    if age < 25:
        age_adj = 0.10
    elif age > 60:
        age_adj = 0.05
    else:
        age_adj = 0.0

    emp_adjustments = {
        "Salaried": -0.05,
        "Professional": -0.08,
        "Self Employed": 0.10,
        "Business Owner": 0.05
    }
    emp_adj = emp_adjustments.get(employment_type, 0.0)

    if dti_ratio > 0.6:
        dti_adj = 0.20
    elif dti_ratio > 0.4:
        dti_adj = 0.10
    else:
        dti_adj = 0.0

    total_adjustment = (profile_adj + income_adj + age_adj + emp_adj + dti_adj) * risk_factor

    base_rate = (base_min + base_max) / 2
    final_rate = base_rate + total_adjustment
    final_rate += np.random.normal(0, 0.5)
    final_rate = max(base_min, min(base_max, final_rate))

    return round(final_rate, 2)

def _generate_loan_application_amount(income, age, profile_type, property_value, employment_type, loan_type):
    """Enhanced loan application amount for test (same logic as train)"""
    base_amount = _sample_loan_amount()
    income_multiplier = min(3.0, max(0.3, income / 50000))

    loan_type_multipliers = {
        "personal loan": np.random.uniform(0.5, 1.5),
        "home loan": np.random.uniform(3.0, 8.0),
        "auto loan": np.random.uniform(1.0, 3.0),
        "education loan": np.random.uniform(0.8, 2.5),
        "business loan": np.random.uniform(1.5, 5.0),
        "credit card": np.random.uniform(0.2, 0.8),
        "gold loan": np.random.uniform(0.3, 1.2)
    }
    loan_type_mult = loan_type_multipliers.get(loan_type, 1.0)

    if age < 30:
        purpose_multiplier = np.random.choice([0.6, 1.2, 2.0], p=[0.5, 0.3, 0.2])
    elif age < 45:
        purpose_multiplier = np.random.choice([1.0, 2.5, 4.0], p=[0.3, 0.4, 0.3])
    else:
        purpose_multiplier = np.random.choice([0.8, 1.8, 3.0], p=[0.4, 0.4, 0.2])

    property_multiplier = np.random.uniform(1.2, 2.0) if property_value > 0 else np.random.uniform(0.7, 1.3)

    employment_multipliers = {
        "Salaried": np.random.uniform(0.8, 1.4),
        "Professional": np.random.uniform(1.0, 1.8),
        "Business Owner": np.random.uniform(1.2, 2.5),
        "Self Employed": np.random.uniform(0.6, 1.6)
    }
    emp_multiplier = employment_multipliers.get(employment_type, 1.0)

    profile_adjustments = {
        "high_earner_low_risk": np.random.uniform(2.0, 3.5),
        "stable_middle_class": np.random.uniform(1.2, 2.2),
        "young_professional": np.random.uniform(0.8, 1.8),
        "average_earner": np.random.uniform(0.6, 1.4),
        "financial_stress": np.random.uniform(0.3, 0.9),
        "outlier_case": np.random.uniform(1.5, 3.0)
    }
    profile_multiplier = profile_adjustments.get(profile_type, 1.0)

    application_amount = int(base_amount * income_multiplier * purpose_multiplier *
                           property_multiplier * emp_multiplier * profile_multiplier * loan_type_mult)

    # Loan type specific bounds
    if loan_type == "home loan":
        min_amount, max_amount = 500000, min(8000000, int(income * 12 * 8))
    elif loan_type == "auto loan":
        min_amount, max_amount = 200000, min(1500000, int(income * 12 * 3))
    elif loan_type == "business loan":
        min_amount, max_amount = 100000, min(5000000, int(income * 12 * 6))
    elif loan_type == "education loan":
        min_amount, max_amount = 50000, min(2000000, int(income * 12 * 4))
    elif loan_type == "credit card":
        min_amount, max_amount = 10000, min(500000, int(income * 12))
    elif loan_type == "gold loan":
        min_amount, max_amount = 25000, min(1000000, int(income * 12 * 2))
    else:  # personal loan
        min_amount, max_amount = 25000, min(2000000, int(income * 12 * 3))

    return max(min_amount, min(max_amount, application_amount))

# Utility functions
def _clip(v, lo, hi):
    return max(lo, min(hi, v))

def _generate_applicant_id():
    prefix = np.random.choice(["A", "B", "C","D", "E", "F"])
    number = np.random.randint(1000, 9999)
    return f"{prefix}{number:04d}"

def _generate_application_date():
    start_date = datetime(2024, 6, 1)
    end_date = datetime(2025, 7, 31)
    days_diff = (end_date - start_date).days
    random_days = np.random.randint(0, max(1, days_diff))
    return (start_date + timedelta(days=int(random_days))).strftime("%Y-%m-%d")

def _generate_test_profile(profile_type):
    """Enhanced test profile generation with loan type and interest rate"""
    # Generate basic profile
    if profile_type == "high_earner_low_risk":
        age = max(32, min(55, _sample_age() + 5))
        base_income = _sample_income()
        income = int(max(120000, base_income * 1.8))
        exp_ratio = float(np.clip(np.random.normal(0.35, 0.08), 0.20, 0.55))
        education = np.random.choice(["Graduate", "Post Graduate", "Professional"], p=[0.2, 0.5, 0.3])
        employment = np.random.choice(["Salaried", "Professional", "Business Owner"], p=[0.3, 0.4, 0.3])
    elif profile_type == "stable_middle_class":
        age = max(28, min(50, _sample_age() + 2))
        base_income = _sample_income()
        income = int(max(60000, base_income * 1.3))
        exp_ratio = float(np.clip(np.random.normal(0.48, 0.09), 0.32, 0.68))
        education = np.random.choice(["Diploma", "Graduate", "Post Graduate"], p=[0.3, 0.5, 0.2])
        employment = np.random.choice(["Salaried", "Professional"], p=[0.7, 0.3])
    elif profile_type == "young_professional":
        age = max(22, min(32, _sample_age() - 5))
        base_income = _sample_income()
        income = int(max(45000, base_income * 1.1))
        exp_ratio = float(np.clip(np.random.normal(0.58, 0.12), 0.38, 0.75))
        education = np.random.choice(["Graduate", "Post Graduate"], p=[0.7, 0.3])
        employment = np.random.choice(["Salaried", "Professional"], p=[0.8, 0.2])
    elif profile_type == "average_earner":
        age = max(25, min(48, _sample_age()))
        base_income = _sample_income()
        income = int(max(35000, base_income * 0.9))
        exp_ratio = float(np.clip(np.random.normal(0.68, 0.12), 0.48, 0.85))
        education = np.random.choice(["High School", "Diploma", "Graduate"], p=[0.2, 0.5, 0.3])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.6, 0.4])
    elif profile_type == "financial_stress":
        age = max(22, min(45, _sample_age() - 2))
        base_income = _sample_income()
        income = int(max(22000, base_income * 0.7))
        exp_ratio = float(np.clip(np.random.normal(0.85, 0.08), 0.75, 0.95))
        education = np.random.choice(["High School", "Diploma", "Graduate"], p=[0.4, 0.4, 0.2])
        employment = np.random.choice(["Salaried", "Self Employed"], p=[0.5, 0.5])
    else:  # outlier_case
        age = max(20, min(65, _sample_age() + np.random.randint(-10, 10)))
        base_income = _sample_income()
        income = int(max(18000, base_income * np.random.uniform(0.5, 2.0)))
        exp_ratio = float(np.clip(np.random.normal(0.65, 0.25), 0.25, 0.95))
        education = np.random.choice(EDUCATION_LEVELS)
        employment = np.random.choice(EMPLOYMENT_TYPES)

    # Demographics
    gender = np.random.choice(["Male", "Female"], p=[0.55, 0.45])
    marital_status = np.random.choice(["Single", "Married"], p=[0.3, 0.7]) if age >= 28 else np.random.choice(["Single", "Married"], p=[0.75, 0.25])

    if marital_status == "Married":
        family_size = np.random.randint(2, 5)
        dependents = max(0, family_size - 2)
        spouse_income = int(np.random.uniform(0.15, 0.75) * income) if np.random.rand() < 0.65 else 0
    else:
        family_size, dependents, spouse_income = 1, 0, 0

    city = np.random.choice(GUJARAT_CITIES)
    location_type = CITY_TO_LOCATION_TYPE.get(city, "Tier2")

    # Financial calculations
    expenses = int(_clip(income * exp_ratio + np.random.randint(-2500, 2500), 1500, max(2000, income - 300)))
    savings = max(0, income - expenses - np.random.randint(0, 3500))

    # Assets
    if income > 90000 and age > 32:
        property_value = int(income * np.random.uniform(20, 45) * (age / 40))
        vehicle_value = int(income * np.random.uniform(1.5, 6))
        investments = int(income * np.random.uniform(6, 20) * ((age - 22) / 18))
    elif income > 45000:
        property_value = int(income * np.random.uniform(10, 25) * (age / 40)) if np.random.rand() < 0.35 else 0
        vehicle_value = int(income * np.random.uniform(0.8, 4)) if np.random.rand() < 0.55 else 0
        investments = int(income * np.random.uniform(2, 10)) if np.random.rand() < 0.45 else 0
    else:
        property_value = 0
        vehicle_value = int(income * np.random.uniform(0.3, 2.5)) if np.random.rand() < 0.25 else 0
        investments = int(income * np.random.uniform(0.2, 4)) if np.random.rand() < 0.15 else 0

    # NEW: Assign loan type FIRST
    preliminary_application_amount = _sample_loan_amount()
    loan_type = _assign_loan_type(income, age, employment, property_value, preliminary_application_amount)

    # Generate loan application amount based on loan type
    loan_application_amount = _generate_loan_application_amount(
        income, age, profile_type, property_value, employment, loan_type
    )

    # Outstanding loan calculation
    utilization_rate = np.clip(np.random.beta(2.5, 2), 0.25, 1.0)
    outstanding_loan = int(loan_application_amount * utilization_rate)
    max_reasonable_loan = int(income * 12 * 3.5)
    outstanding_loan = min(outstanding_loan, max_reasonable_loan)
    outstanding_loan = max(10000, outstanding_loan)

    # Calculate DTI for interest rate
    actual_dti = outstanding_loan / (12.0 * income)

    # NEW: Calculate interest rate
    interest_rate = _calculate_interest_rate_for_test(loan_type, profile_type, income, age, employment, actual_dti)

    # Utility bills and other details
    utility_bills = _sample_monthly_charges()
    utility_bills = int(utility_bills * family_size * np.random.uniform(0.7, 1.3))

    emp_years = round(_clip(_sample_tenure() * np.random.uniform(0.5, 1.0), 0.5, min(age - 18, 30)), 1)
    bank_years = round(_clip(_sample_tenure() - np.random.uniform(-1.5, 2.5), 0.5, max(0.5, age - 16)), 1)

    if employment in ["Business Owner", "Self Employed"]:
        business_revenue = int(income * np.random.uniform(1.1, 2.5))
    else:
        business_revenue = 0

    # Digital behavior
    if age < 35 and education in ["Graduate", "Post Graduate", "Professional"]:
        mobile_hours = round(np.random.uniform(5.5, 11), 1)
        digital_transactions = _sample_digital_transactions() + np.random.randint(15, 35)
        social_media = np.random.randint(3, 7)
        app_usage_score = np.random.randint(60, 90)
        digital_payment_score = np.random.randint(65, 90)
    elif age < 50:
        mobile_hours = round(np.random.uniform(2.5, 7.5), 1)
        digital_transactions = _sample_digital_transactions()
        social_media = np.random.randint(2, 5)
        app_usage_score = np.random.randint(35, 70)
        digital_payment_score = np.random.randint(40, 75)
    else:
        mobile_hours = round(np.random.uniform(1, 4.5), 1)
        digital_transactions = max(8, _sample_digital_transactions() - 15)
        social_media = np.random.randint(1, 3)
        app_usage_score = np.random.randint(15, 50)
        digital_payment_score = np.random.randint(20, 55)

    avg_transaction = int((income + expenses) / max(1, digital_transactions) * np.random.uniform(0.4, 1.8))

    # Construct test row (NO SCORES OR TARGETS)
    row = {
        "applicant_id": _generate_applicant_id(),
        "application_date": _generate_application_date(),
        "age": age,
        "gender": gender,
        "education_level": education,
        "employment_type": employment,
        "marital_status": marital_status,
        "family_size": family_size,
        "number_of_dependents": dependents,
        "location_type": location_type,
        "monthly_income_inr": income,
        "spouse_income_inr": spouse_income,
        "monthly_expenses_inr": expenses,
        "monthly_savings_inr": savings,
        "monthly_utility_bills_inr": utility_bills,
        "property_value_inr": property_value,
        "vehicle_value_inr": vehicle_value,
        "total_investments_inr": investments,
        "outstanding_loan_amount_inr": outstanding_loan,
        "loan_amount_applied_inr": loan_application_amount,
        "years_current_employment": emp_years,
        "banking_relationship_years": bank_years,
        "monthly_business_revenue_inr": business_revenue,
        "daily_mobile_hours": mobile_hours,
        "monthly_digital_transactions": digital_transactions,
        "avg_transaction_amount_inr": avg_transaction,
        "social_media_accounts_count": social_media,
        "mobile_app_usage_intensity_score": app_usage_score,
        "digital_payment_adoption_score": digital_payment_score,
        "consent_status": "Full Consent",
        "city": city,
        # NEW: Loan type features
        "loan_type": loan_type,
        "interest_rate": interest_rate,
    }

    return row

def generate_test_data(n_rows=3000, seed=222):
    """Generate test data with loan type and interest rate features"""
    np.random.seed(seed)

    profiles = {
        "high_earner_low_risk": 0.20,
        "stable_middle_class": 0.25,
        "young_professional": 0.20,
        "average_earner": 0.20,
        "financial_stress": 0.10,
        "outlier_case": 0.05
    }

    counts = {k: int(v * n_rows) for k, v in profiles.items()}
    diff = n_rows - sum(counts.values())
    if diff != 0:
        counts["average_earner"] += diff

    rows = []
    for profile_type, count in counts.items():
        for _ in range(count):
            rows.append(_generate_test_profile(profile_type))

    df = pd.DataFrame(rows)
    df = df[TEST_COLUMNS]

    out_csv = os.path.join(OUT_DIR, "test_data_aligned.csv")
    df.to_csv(out_csv, index=False)

    print(f"[TEST] ✅ Enhanced test set with loan types: {out_csv} shape={df.shape}")
    print(f"[TEST] Profile distribution: {dict(zip(profiles.keys(), [counts[k] for k in profiles.keys()]))}")
    print(f"[TEST] Loan type distribution: {df['loan_type'].value_counts().to_dict()}")
    print(f"[TEST] Interest rate range: {df['interest_rate'].min():.2f}% - {df['interest_rate'].max():.2f}%")
    print(f"[TEST] Age range: {df['age'].min()} - {df['age'].max()}")
    print(f"[TEST] Income range: ₹{df['monthly_income_inr'].min():,.0f} - ₹{df['monthly_income_inr'].max():,.0f}")

    return df

if __name__ == "__main__":
    generate_test_data(n_rows=40, seed=222)


[INFO] Loaded loan dataset for test: (255347, 18)
[INFO] Loaded telco dataset for test: (7043, 21)
[TEST] ✅ Enhanced test set with loan types: credit_risk_output/test_data_aligned.csv shape=(40, 33)
[TEST] Profile distribution: {'high_earner_low_risk': 8, 'stable_middle_class': 10, 'young_professional': 8, 'average_earner': 8, 'financial_stress': 4, 'outlier_case': 2}
[TEST] Loan type distribution: {np.str_('home loan'): 16, np.str_('personal loan'): 8, np.str_('auto loan'): 5, np.str_('business loan'): 4, np.str_('education loan'): 4, np.str_('credit card'): 2, np.str_('gold loan'): 1}
[TEST] Interest rate range: 9.21% - 30.22%
[TEST] Age range: 22 - 55
[TEST] Income range: ₹23,591 - ₹164,007


In [6]:
# -*- coding: utf-8 -*-
# Three-model pipeline: XGBoost, Random Forest, Decision Tree - selects best performer

import os
import json
import joblib
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error, roc_auc_score, confusion_matrix, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier, XGBRegressor

OUT_DIR = "credit_risk_output"
os.makedirs(OUT_DIR, exist_ok=True)

# Enhanced model features (35+ features including loan type)
MODEL_FEATURES = [
    "age", "monthly_income_inr", "monthly_expenses_inr", "monthly_savings_inr",
    "outstanding_loan_amount_inr", "loan_amount_applied_inr", "years_current_employment",
    "banking_relationship_years", "timeliness_score", "repayment_ability_score",
    "financial_health_score", "payment_reliability_score", "stability_index",
    "spouse_income_inr", "monthly_utility_bills_inr", "property_value_inr",
    "vehicle_value_inr", "total_investments_inr", "monthly_business_revenue_inr",
    "daily_mobile_hours", "monthly_digital_transactions", "avg_transaction_amount_inr",
    "social_media_accounts_count", "mobile_app_usage_intensity_score",
    "digital_payment_adoption_score", "utility_payment_regularity_score",
    "location_stability_score", "mobile_banking_usage_score",
    # NEW: Loan type features and interest rate
    "interest_rate", "loan_type_personal_loan", "loan_type_home_loan",
    "loan_type_auto_loan", "loan_type_education_loan", "loan_type_business_loan",
    "loan_type_credit_card", "loan_type_gold_loan"
]

LOAN_TYPES = [
    "personal loan", "home loan", "auto loan", "education loan",
    "business loan", "credit card", "gold loan"
]

TARGET_REG = "probability_of_default"
TARGET_CLS = "risk_category"
RISK_LABELS = ["Low Risk", "Medium Risk", "High Risk", "Very High Risk"]

def convert_np_types(obj):
    """Convert numpy types to native Python types for JSON serialization"""
    if isinstance(obj, dict):
        return {k: convert_np_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_np_types(i) for i in obj]
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj

def _risk_to_num(labels):
    """Convert risk category labels to numeric"""
    mapping = {r: i for i, r in enumerate(RISK_LABELS)}
    return np.array([mapping[x] for x in labels], dtype=int)

def _risk_category_from_p(p):
    """Convert probability to risk category"""
    if p <= 0.18: return "Low Risk"
    elif p <= 0.42: return "Medium Risk"
    elif p <= 0.68: return "High Risk"
    else: return "Very High Risk"

def calculate_data_driven_scores(row):
    """Calculate scores for missing data (same enhanced logic as train)"""
    income = max(1.0, float(row["monthly_income_inr"]))
    expenses = float(row["monthly_expenses_inr"])
    savings = float(row["monthly_savings_inr"])
    loan_amount = float(row["outstanding_loan_amount_inr"])
    application_amount = float(row.get("loan_amount_applied_inr", loan_amount))
    age = int(row["age"])
    emp_years = float(row["years_current_employment"])
    bank_years = float(row["banking_relationship_years"])
    property_value = float(row.get("property_value_inr", 0))
    investments = float(row.get("total_investments_inr", 0))

    # Enhanced ratios
    dti_ratio = loan_amount / (12.0 * income) if income > 0 else 0
    application_to_income_ratio = application_amount / (12.0 * income) if income > 0 else 0
    loan_utilization_ratio = loan_amount / max(1.0, application_amount)
    expense_ratio = expenses / income if income > 0 else 1
    savings_ratio = savings / income if income > 0 else 0
    asset_ratio = (property_value + investments) / max(income * 12, 1)

    def _clip_local(value, min_val, max_val):
        return max(min_val, min(max_val, value))

    # Enhanced scoring calculations (same as train)
    timeliness_base = (
        min(emp_years * 10, 45) +
        min(bank_years * 8, 35) +
        min((age - 18) * 1.2, 25) +
        min(asset_ratio * 15, 20) +
        5
    )
    timeliness_penalty = (
        dti_ratio * 18 +
        max(0, expense_ratio - 0.65) * 25 +
        max(0, application_to_income_ratio - 1.2) * 12 +
        max(0, 1 - savings_ratio) * 10
    )
    timeliness_score = _clip_local(int(timeliness_base - timeliness_penalty + np.random.randint(-6, 7)), 5, 95)

    repayment_base = (
        min(np.log(max(income, 1000)/20000) * 20, 35) +
        max(0, savings_ratio * 45) +
        min(emp_years * 3, 25) +
        min(asset_ratio * 10, 15) +
        5
    )
    repayment_penalty = (
        dti_ratio * 30 +
        max(0, expense_ratio - 0.75) * 20 +
        max(0, application_to_income_ratio - 1.8) * 15 +
        max(0, loan_utilization_ratio - 0.85) * 12
    )
    repayment_score = _clip_local(int(repayment_base - repayment_penalty + np.random.randint(-5, 6)), 5, 90)

    financial_base = (
        min(np.log(max(income, 1000)/15000) * 15, 30) +
        min(asset_ratio * 25, 35) +
        max(0, savings_ratio * 30) +
        min(bank_years * 2, 20) +
        10
    )
    financial_penalty = (
        dti_ratio * 25 +
        max(0, expense_ratio - 0.70) * 22 +
        max(0, application_to_income_ratio - 1.5) * 12 +
        (5 if age < 22 or age > 65 else 0)
    )
    financial_score = _clip_local(int(financial_base - financial_penalty + np.random.randint(-8, 9)), 10, 95)

    reliability_base = (
        min(emp_years * 5, 40) +
        max(0, (1.2 - expense_ratio) * 35) +
        min(np.log(max(income, 1000)/3000), 25) +
        min(bank_years * 2, 15) +
        10
    )
    reliability_penalty = (
        dti_ratio * 35 +
        max(0, expense_ratio - 0.80) * 30 +
        abs(loan_utilization_ratio - 0.65) * 10 +
        max(0, application_to_income_ratio - 2.0) * 8
    )
    reliability_score = _clip_local(int(reliability_base - reliability_penalty + np.random.randint(-6, 7)), 10, 95)

    stability_base = (
        min(emp_years * 4, 30) +
        min(bank_years * 3, 20) +
        min((age - 18) * 0.8, 25) +
        min(asset_ratio * 20, 25) +
        (10 if property_value > 0 else 0) +
        5
    )
    stability_penalty = (
        dti_ratio * 20 +
        max(0, expense_ratio - 0.75) * 15 +
        max(0, application_to_income_ratio - 2.2) * 10 +
        (8 if emp_years < 1 else 0)
    )
    stability_score = _clip_local(int(stability_base - stability_penalty + np.random.randint(-10, 11)), 5, 90)

    utility_base = (
        90 -
        dti_ratio * 28 -
        max(0, expense_ratio - 0.55) * 25 +
        min(savings_ratio * 18, 12) +
        min(bank_years * 1.5, 8)
    )
    utility_score = _clip_local(int(utility_base + np.random.randint(-7, 8)), 25, 95)

    location_base = (
        bank_years * 10 +
        emp_years * 6 +
        (20 if property_value > 0 else 0) +
        min((age - 18) * 1.5, 30) +
        min(asset_ratio * 12, 15) +
        30
    )
    location_score = _clip_local(int(location_base + np.random.randint(-8, 9)), 30, 120)

    mobile_banking_base = (
        max(20, 95 - (age - 25) * 1.2) +
        min(emp_years * 2, 15) +
        min(np.log(max(income, 1000)/20000) * 10, 15)
    )
    mobile_banking_score = _clip_local(int(mobile_banking_base + np.random.randint(-10, 11)), 20, 95)

    return {
        "timeliness_score": timeliness_score,
        "repayment_ability_score": repayment_score,
        "financial_health_score": financial_score,
        "payment_reliability_score": reliability_score,
        "stability_index": stability_score,
        "utility_payment_regularity_score": utility_score,
        "location_stability_score": location_score,
        "mobile_banking_usage_score": mobile_banking_score
    }

def _ensure_scores_present(data, data_type="data"):
    """Ensure all required score columns are present"""
    score_columns = [
        "timeliness_score", "repayment_ability_score", "financial_health_score",
        "payment_reliability_score", "stability_index", "utility_payment_regularity_score",
        "location_stability_score", "mobile_banking_usage_score"
    ]

    missing_scores = [col for col in score_columns if col not in data.columns]

    if missing_scores:
        print(f"[SCORES] Missing score columns in {data_type}: {missing_scores}")
        print(f"[SCORES] Calculating scores from financial data...")

        calculated_scores = []
        for idx, row in data.iterrows():
            scores = calculate_data_driven_scores(row)
            calculated_scores.append(scores)

        scores_df = pd.DataFrame(calculated_scores)
        for col in missing_scores:
            if col in scores_df.columns:
                data[col] = scores_df[col]

        print(f"[SCORES] ✅ Calculated and added {len(missing_scores)} score columns")
    else:
        print(f"[SCORES] ✅ All score columns present in {data_type}")

    return data

def _add_loan_type_features(data):
    """Add one-hot encoded loan type features"""
    print("[FEATURES] Adding one-hot encoded loan type features...")

    # Create one-hot encoded features for loan types
    for loan_type in LOAN_TYPES:
        col_name = f"loan_type_{loan_type.replace(' ', '_')}"
        data[col_name] = (data['loan_type'] == loan_type).astype(int)
        print(f"  - {col_name}: {data[col_name].sum()} records")

    return data

def _add_derived_features(data):
    """Add derived features from loan application data"""
    print("[FEATURES] Adding derived features from loan and financial data...")

    # Application to income ratio
    data["application_to_income_ratio"] = data["loan_amount_applied_inr"] / (data["monthly_income_inr"] * 12)

    # Loan utilization ratio
    data["loan_utilization_ratio"] = data["outstanding_loan_amount_inr"] / data["loan_amount_applied_inr"]

    # Income to expense ratio
    data["income_to_expense_ratio"] = data["monthly_income_inr"] / data["monthly_expenses_inr"]

    # Total assets
    data["total_assets"] = data["property_value_inr"] + data["vehicle_value_inr"] + data["total_investments_inr"]

    # Debt service coverage ratio
    data["debt_service_coverage"] = data["monthly_savings_inr"] / (data["outstanding_loan_amount_inr"] / 12 + 1)

    print("[FEATURES] ✅ Added derived features:")
    print("  - application_to_income_ratio")
    print("  - loan_utilization_ratio")
    print("  - income_to_expense_ratio")
    print("  - total_assets")
    print("  - debt_service_coverage")

    return data

def _load_data():
    """Load and prepare training and test data with enhanced features"""
    train = pd.read_csv(os.path.join(OUT_DIR, "training_data_aligned.csv"))
    test = pd.read_csv(os.path.join(OUT_DIR, "test_data_aligned.csv"))

    print(f"[DATA] Loaded training: {train.shape}")
    print(f"[DATA] Loaded test: {test.shape}")

    # Add one-hot encoded loan type features if not present
    if 'loan_type_personal_loan' not in train.columns:
        train = _add_loan_type_features(train)
    if 'loan_type_personal_loan' not in test.columns:
        test = _add_loan_type_features(test)

    # Ensure scores are present
    train = _ensure_scores_present(train, "training")
    test = _ensure_scores_present(test, "test")

    # Add derived features
    train = _add_derived_features(train)
    test = _add_derived_features(test)

    # Update MODEL_FEATURES to include derived features
    global MODEL_FEATURES
    derived_features = ["application_to_income_ratio", "loan_utilization_ratio", "income_to_expense_ratio",
                       "total_assets", "debt_service_coverage"]
    MODEL_FEATURES = MODEL_FEATURES + derived_features

    print(f"[FEATURES] ✅ Total model features: {len(MODEL_FEATURES)} (including loan type and derived features)")

    # Validate required columns
    missing_train = [c for c in MODEL_FEATURES + [TARGET_REG, TARGET_CLS] if c not in train.columns]
    assert not missing_train, f"Training missing columns: {missing_train}"

    missing_test = [c for c in MODEL_FEATURES if c not in test.columns]
    assert not missing_test, f"Test missing columns: {missing_test}"

    print(f"[DATA] ✅ Training data shape: {train.shape}")
    print(f"[DATA] ✅ Test data shape: {test.shape}")
    print(f"[DATA] Loan type distribution in training:")
    if 'loan_type' in train.columns:
        print(train['loan_type'].value_counts().to_dict())

    return train, test

def _fit_three_models(X_train, y_reg_train, y_cls_train_num, eval_fraction=0.25, seed=42):
    """Train and evaluate three models: XGBoost, Random Forest, Decision Tree - IMPROVED ACCURACY"""
    print("[MODELS] Training three models: XGBoost, Random Forest, Decision Tree...")

    # Add feature scaling for better performance
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Stratified split
    stratify_opt = y_cls_train_num if len(np.unique(y_cls_train_num)) > 1 else None
    X_tr, X_val, yreg_tr, yreg_val, ycls_tr, ycls_val = train_test_split(
        X_train_scaled, y_reg_train, y_cls_train_num, test_size=eval_fraction,
        random_state=seed, stratify=stratify_opt
    )

    # Prepare models with IMPROVED hyperparameters for better accuracy
    models = {}

    # 1. XGBoost Models (IMPROVED with better hyperparameters)
    print("[MODELS] Training XGBoost models...")
    xgb_reg = XGBRegressor(
        n_estimators=1500,  # Increased from 1200
        learning_rate=0.03,  # Reduced for better convergence
        max_depth=10,  # Increased from 9
        subsample=0.85,  # Increased
        colsample_bytree=0.8,  # Increased
        colsample_bylevel=0.8,  # Increased
        colsample_bynode=0.85,  # Increased
        reg_alpha=0.1,  # Reduced regularization
        reg_lambda=2.0,  # Reduced regularization
        min_child_weight=3,  # Reduced
        gamma=0.1,  # Reduced
        random_state=seed,
        n_jobs=-1,
        tree_method='hist'
    )

    xgb_cls = XGBClassifier(
        n_estimators=1500,  # Increased
        learning_rate=0.03,  # Reduced for better convergence
        max_depth=10,  # Increased
        subsample=0.85,  # Increased
        colsample_bytree=0.8,  # Increased
        colsample_bylevel=0.8,  # Increased
        colsample_bynode=0.85,  # Increased
        reg_alpha=0.1,  # Reduced regularization
        reg_lambda=2.0,  # Reduced regularization
        min_child_weight=3,  # Reduced
        gamma=0.1,  # Reduced
        random_state=seed,
        eval_metric="mlogloss",
        use_label_encoder=False,
        n_jobs=-1,
        tree_method='hist'
    )

    xgb_reg.fit(X_tr, yreg_tr)
    xgb_cls.fit(X_tr, ycls_tr)
    models['XGBoost'] = {'regressor': xgb_reg, 'classifier': xgb_cls, 'scaler': scaler}

    # 2. Random Forest Models (IMPROVED parameters)
    print("[MODELS] Training Random Forest models...")
    rf_reg = RandomForestRegressor(
        n_estimators=1000,  # Increased from 800
        max_depth=18,  # Increased from 15
        min_samples_split=6,  # Reduced from 8
        min_samples_leaf=2,  # Reduced from 3
        max_features='sqrt',
        bootstrap=True,
        random_state=seed,
        n_jobs=-1
    )

    rf_cls = RandomForestClassifier(
        n_estimators=1000,  # Increased
        max_depth=18,  # Increased
        min_samples_split=6,  # Reduced
        min_samples_leaf=2,  # Reduced
        max_features='sqrt',
        bootstrap=True,
        random_state=seed,
        n_jobs=-1
    )

    rf_reg.fit(X_tr, yreg_tr)
    rf_cls.fit(X_tr, ycls_tr)
    models['RandomForest'] = {'regressor': rf_reg, 'classifier': rf_cls, 'scaler': scaler}

    # 3. Decision Tree Models (IMPROVED parameters)
    print("[MODELS] Training Decision Tree models...")
    dt_reg = DecisionTreeRegressor(
        max_depth=15,  # Increased from 12
        min_samples_split=6,  # Reduced from 10
        min_samples_leaf=3,  # Reduced from 5
        max_features='sqrt',
        random_state=seed
    )

    dt_cls = DecisionTreeClassifier(
        max_depth=15,  # Increased
        min_samples_split=6,  # Reduced
        min_samples_leaf=3,  # Reduced
        max_features='sqrt',
        random_state=seed
    )

    dt_reg.fit(X_tr, yreg_tr)
    dt_cls.fit(X_tr, ycls_tr)
    models['DecisionTree'] = {'regressor': dt_reg, 'classifier': dt_cls, 'scaler': scaler}

    # Evaluate all models with cross-validation for more robust accuracy assessment
    print("[EVALUATION] Evaluating all three models with cross-validation...")
    model_performance = {}
    accuracies_cv = {}  # Store cross-validation accuracies for plotting

    for model_name, model_dict in models.items():
        reg_model = model_dict['regressor']
        cls_model = model_dict['classifier']

        # Cross-validation for more robust accuracy assessment
        cv_scores = cross_val_score(cls_model, X_train_scaled, y_cls_train_num, cv=5, scoring='accuracy')
        cv_accuracy = cv_scores.mean()
        accuracies_cv[model_name] = cv_scores

        # Predict on validation set
        y_val_pred_reg = reg_model.predict(X_val)
        y_val_pred_cls_num = cls_model.predict(X_val)
        y_val_pred_cls_str = np.array([RISK_LABELS[int(v)] for v in y_val_pred_cls_num])
        y_val_cls_str = np.array([RISK_LABELS[int(v)] for v in ycls_val])

        # Calculate metrics
        acc = accuracy_score(y_val_cls_str, y_val_pred_cls_str)
        mae = mean_absolute_error(yreg_val, y_val_pred_reg)

        # Binary AUC (High/Very High vs others)
        y_bin = (ycls_val >= 2).astype(int)
        auc_bin = roc_auc_score(y_bin, y_val_pred_reg) if len(np.unique(y_bin)) > 1 else float("nan")

        # IMPROVED combined score with more weight on accuracy
        combined_score = 0.5 * acc + 0.25 * (1 - mae) + 0.25 * (auc_bin if not np.isnan(auc_bin) else 0)

        model_performance[model_name] = {
            'accuracy': acc,
            'cv_accuracy': cv_accuracy,  # Add cross-validation accuracy
            'mae': mae,
            'auc_bin': auc_bin,
            'combined_score': combined_score,
            'predictions': {
                'reg': y_val_pred_reg,
                'cls_str': y_val_pred_cls_str,
                'cls_num': y_val_pred_cls_num
            }
        }

        print(f"[{model_name}] Validation Accuracy: {acc:.4f}, CV Accuracy: {cv_accuracy:.4f}, MAE: {mae:.4f}, AUC: {auc_bin:.4f}, Combined: {combined_score:.4f}")

    # Select best model
    best_model_name = max(model_performance.keys(), key=lambda k: model_performance[k]['combined_score'])
    best_model = models[best_model_name]
    best_performance = model_performance[best_model_name]

    print(f"\n[WINNER] 🏆 Best Model: {best_model_name}")
    print(f"         Combined Score: {best_performance['combined_score']:.4f}")
    print(f"         Validation Accuracy: {best_performance['accuracy']:.4f}")
    print(f"         CV Accuracy: {best_performance['cv_accuracy']:.4f}")
    print(f"         MAE: {best_performance['mae']:.4f}")
    print(f"         AUC: {best_performance['auc_bin']:.4f}")

    # Refit best model on full training data
    print(f"[REFIT] Refitting {best_model_name} on full training data...")
    best_model['regressor'].fit(X_train_scaled, y_reg_train)
    best_model['classifier'].fit(X_train_scaled, y_cls_train_num)

    return best_model, best_model_name, model_performance, (X_val, yreg_val, ycls_val, best_performance), accuracies_cv

def _generate_test_predictions(best_model, test_data):
    """Generate predictions on test data using the best model"""
    X_test = test_data[MODEL_FEATURES].copy()

    # Apply the same scaling used during training
    X_test_scaled = best_model['scaler'].transform(X_test)

    print("[PREDICTION] Generating test predictions with best model...")
    print(f"[PREDICTION] Using {len(MODEL_FEATURES)} features (including loan type)")

    # Predict
    test_pred_reg = best_model['regressor'].predict(X_test_scaled)
    test_pred_cls_num = best_model['classifier'].predict(X_test_scaled)
    test_pred_cls_str = np.array([RISK_LABELS[int(v)] for v in test_pred_cls_num])

    # Create results dataframe
    test_results = test_data.copy()
    test_results[TARGET_REG] = test_pred_reg
    test_results[TARGET_CLS] = test_pred_cls_str
    test_results["risk_score"] = (test_results[TARGET_REG] * 100).round(1)

    print(f"[PREDICTION] ✅ Test predictions completed. Shape: {test_results.shape}")
    print(f"[PREDICTION] Risk distribution: {pd.Series(test_pred_cls_str).value_counts().to_dict()}")
    print(f"[PREDICTION] Interest rate vs Risk correlation:")
    if 'loan_type' in test_results.columns and 'interest_rate' in test_results.columns:
        risk_rate_corr = test_results.groupby('risk_category')['interest_rate'].mean().round(2)
        for risk, avg_rate in risk_rate_corr.items():
            print(f"  - {risk}: {avg_rate}% avg interest rate")

    return test_results, test_pred_reg, test_pred_cls_str

def main():
    """Main pipeline with three-model comparison and loan type features"""
    print("="*80)
    print("🚀 Enhanced Three-Model Credit Risk Pipeline with Loan Types")
    print("   Models: XGBoost vs Random Forest vs Decision Tree")
    print("   Features: 35+ including Loan Type & Interest Rate")
    print("   IMPROVED: Better hyperparameters + Feature scaling")
    print("="*80)

    print("\n[STEP 1] Loading and preparing enhanced data...")
    train, test = _load_data()

    # Prepare training data
    X_train = train[MODEL_FEATURES].copy()
    y_reg_train = train[TARGET_REG].astype(float).copy()
    y_cls_train_str = train[TARGET_CLS].astype(str).copy()
    y_cls_train_num = _risk_to_num(y_cls_train_str)

    print(f"[TRAIN] Training features shape: {X_train.shape}")
    print(f"[TRAIN] Enhanced feature count: {len(MODEL_FEATURES)} (including loan type)")
    print(f"[TRAIN] Risk distribution: {pd.Series(y_cls_train_str).value_counts().to_dict()}")

    print("\n[STEP 2] Training and comparing three models with IMPROVED accuracy...")
    best_model, best_model_name, model_performance, validation, accuracies_cv = _fit_three_models(X_train, y_reg_train, y_cls_train_num)
    X_val, y_reg_val, y_cls_val, best_performance = validation

    # Convert validation predictions for analysis
    y_val_cls_str = np.array([RISK_LABELS[int(v)] for v in y_cls_val])
    y_val_pred_reg = best_performance['predictions']['reg']
    y_val_pred_cls_str = best_performance['predictions']['cls_str']

    print("\n[STEP 3] Predicting on test data...")
    test_results, test_pred_reg, test_pred_cls_str = _generate_test_predictions(best_model, test)

    print("\n[STEP 4] Saving enhanced results...")

    # Save test predictions CSV
    test_results.to_csv(os.path.join(OUT_DIR, "test_predictions.csv"), index=False)
    print(f"[SAVE] ✅ Test predictions saved: test_predictions.csv")

    # Save best model pipeline (single PKL file as requested)
    full_pipeline = {
        "best_model_name": best_model_name,
        "regressor": best_model['regressor'],
        "classifier": best_model['classifier'],
        "scaler": best_model['scaler'],  # Include scaler
        "all_model_performance": model_performance,
        "risk_labels": RISK_LABELS,
        "model_features": MODEL_FEATURES,
        "loan_types": LOAN_TYPES,
        "metadata": {
            "train_shape": list(X_train.shape),
            "test_shape": list(test[MODEL_FEATURES].shape),
            "risk_mapping": {label: i for i, label in enumerate(RISK_LABELS)},
            "pd_thresholds": {"low": 0.18, "medium": 0.42, "high": 0.68},
            "loan_type_features": [f"loan_type_{lt.replace(' ', '_')}" for lt in LOAN_TYPES],
            "enhanced_features": ["interest_rate"] + [f"loan_type_{lt.replace(' ', '_')}" for lt in LOAN_TYPES],
            "model_version": "three_model_comparison_with_loan_types_improved",
            "feature_count": len(MODEL_FEATURES),
            "winning_model": best_model_name,
            "improvements": ["feature_scaling", "better_hyperparameters", "cross_validation"]
        }
    }
    joblib.dump(full_pipeline, os.path.join(OUT_DIR, "best_credit_risk_model.pkl"))
    print(f"[SAVE] ✅ Best model pipeline saved: best_credit_risk_model.pkl")

    # Generate comprehensive JSON output
    cm = confusion_matrix(y_val_cls_str, y_val_pred_cls_str, labels=RISK_LABELS).tolist()
    class_report = classification_report(y_val_cls_str, y_val_pred_cls_str, output_dict=True)

    # Enhanced feature importances
    feat_imps = []
    if hasattr(best_model['regressor'], "feature_importances_"):
        for f, w in zip(MODEL_FEATURES, best_model['regressor'].feature_importances_):
            feat_imps.append({
                "feature": f,
                "importance": float(w),
                "is_loan_type_feature": "loan_type_" in f,
                "is_interest_rate": f == "interest_rate",
                "is_derived_feature": f in ["application_to_income_ratio", "loan_utilization_ratio",
                                          "income_to_expense_ratio", "total_assets", "debt_service_coverage"],
                "is_score_feature": "score" in f or "stability" in f or "reliability" in f
            })

        # Print key feature importance
        importances = np.array(best_model['regressor'].feature_importances_)
        idx = np.argsort(importances)[::-1]
        print(f"[FEATURE IMPORTANCE] Top 10 features in {best_model_name}:")
        for i in range(min(10, len(MODEL_FEATURES))):
            feat_idx = idx[i]
            feature_name = MODEL_FEATURES[feat_idx]
            importance = importances[feat_idx]
            print(f"  {i+1}. {feature_name}: {importance:.4f}")

    output_json = {
        "data": test_results.to_dict(orient="records"),
        "analysis": {
            "best_model": best_model_name,
            "model_comparison": {
                model_name: {
                    "validation_accuracy": float(perf["accuracy"]),
                    "cv_accuracy": float(perf["cv_accuracy"]),
                    "mae": float(perf["mae"]),
                    "auc_bin": float(perf["auc_bin"]) if not np.isnan(perf["auc_bin"]) else None,
                    "combined_score": float(perf["combined_score"])
                } for model_name, perf in model_performance.items()
            },
            "best_model_metrics": {
                "validation_accuracy": float(best_performance["accuracy"]),
                "cv_accuracy": float(best_performance["cv_accuracy"]),
                "mae": float(best_performance["mae"]),
                "auc_bin": float(best_performance["auc_bin"]) if not np.isnan(best_performance["auc_bin"]) else None,
                "combined_score": float(best_performance["combined_score"])
            },
            "confusion_matrix": {
                "labels": RISK_LABELS,
                "matrix": cm
            },
            "classification_report": class_report,
            "class_distribution_validation": dict(pd.Series(y_val_cls_str).value_counts()),
            "class_distribution_test_predicted": dict(pd.Series(test_pred_cls_str).value_counts()),
            "feature_importance": feat_imps,
            "loan_type_analysis": {
                "loan_types_available": LOAN_TYPES,
                "loan_type_distribution_test": dict(test_results['loan_type'].value_counts()) if 'loan_type' in test_results else {},
                "interest_rate_by_risk": dict(test_results.groupby('risk_category')['interest_rate'].mean()) if 'interest_rate' in test_results else {},
                "secured_vs_unsecured": {
                    "secured_loans": ["home loan", "auto loan", "gold loan"],
                    "unsecured_loans": ["personal loan", "business loan", "education loan", "credit card"]
                }
            },
            "model_info": {
                "winning_algorithm": best_model_name,
                "total_models_compared": len(model_performance),
                "features_count": len(MODEL_FEATURES),
                "enhanced_features_added": ["interest_rate"] + [f"loan_type_{lt.replace(' ', '_')}" for lt in LOAN_TYPES],
                "model_version": "three_model_comparison_with_loan_types_improved",
                "improvements_made": ["feature_scaling", "optimized_hyperparameters", "cross_validation"]
            }
        }
    }

    # Convert and save JSON
    json_safe_output = convert_np_types(output_json)
    with open(os.path.join(OUT_DIR, "model_output.json"), "w") as f:
        json.dump(json_safe_output, f, indent=2)
    print(f"[SAVE] ✅ Enhanced JSON analysis saved: model_output.json")

    print("\n" + "="*80)
    print("✅ Enhanced Three-Model Credit Risk Pipeline Completed with IMPROVED ACCURACY!")
    print("="*80)
    print(f"🏆 WINNING MODEL: {best_model_name}")
    print(f"   ├── Validation Accuracy: {best_performance['accuracy']:.4f}")
    print(f"   ├── Cross-Validation Accuracy: {best_performance['cv_accuracy']:.4f}")
    print(f"   ├── MAE: {best_performance['mae']:.4f}")
    print(f"   ├── AUC: {best_performance['auc_bin']:.4f}")
    print(f"   └── Combined Score: {best_performance['combined_score']:.4f}")

    print(f"\n📊 MODEL COMPARISON RESULTS:")
    for model_name, perf in model_performance.items():
        symbol = "🏆" if model_name == best_model_name else "  "
        print(f"   {symbol} {model_name}: {perf['combined_score']:.4f} (Val Acc:{perf['accuracy']:.3f}, CV Acc:{perf['cv_accuracy']:.3f})")

    print(f"\n🚀 ACCURACY IMPROVEMENTS:")
    print(f"   ├── Feature Scaling: StandardScaler applied")
    print(f"   ├── Hyperparameter Tuning: Optimized for better performance")
    print(f"   ├── Cross-Validation: 5-fold CV for robust accuracy assessment")
    print(f"   └── Model Comparison: Weighted scoring with accuracy priority")

    print(f"\n📁 OUTPUT FILES:")
    print(f"   ├── training_data_aligned.csv (with loan types)")
    print(f"   ├── test_data_aligned.csv (with loan types)")
    print(f"   ├── test_predictions.csv (enhanced predictions)")
    print(f"   ├── best_credit_risk_model.pkl (winning model + scaler)")
    print(f"   └── model_output.json (comprehensive analysis)")

    print(f"\n✨ Three models trained with improved accuracy, best selected!")

if __name__ == "__main__":
    main()

🚀 Enhanced Three-Model Credit Risk Pipeline with Loan Types
   Models: XGBoost vs Random Forest vs Decision Tree
   Features: 35+ including Loan Type & Interest Rate
   IMPROVED: Better hyperparameters + Feature scaling

[STEP 1] Loading and preparing enhanced data...
[DATA] Loaded training: (15000, 43)
[DATA] Loaded test: (40, 33)
[FEATURES] Adding one-hot encoded loan type features...
  - loan_type_personal_loan: 4125 records
  - loan_type_home_loan: 3666 records
  - loan_type_auto_loan: 2395 records
  - loan_type_education_loan: 1613 records
  - loan_type_business_loan: 1926 records
  - loan_type_credit_card: 629 records
  - loan_type_gold_loan: 646 records
[FEATURES] Adding one-hot encoded loan type features...
  - loan_type_personal_loan: 8 records
  - loan_type_home_loan: 16 records
  - loan_type_auto_loan: 5 records
  - loan_type_education_loan: 4 records
  - loan_type_business_loan: 4 records
  - loan_type_credit_card: 2 records
  - loan_type_gold_loan: 1 records
[SCORES] ✅ All