In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and echo the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join first, then print, so each file appears on its own line.
        input_path = os.path.join(dirname, filename)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
"""
=================================================================================
ENHANCED LOAN PAYBACK PREDICTION - TOP LEADERBOARD STRATEGY
Kaggle Playground Series S5E11
Target: 0.93+ ROC AUC
=================================================================================
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
import warnings
warnings.filterwarnings('ignore')

# Model imports
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
import gc

# Styling: matplotlib theme first, then the seaborn palette, then pandas float precision
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.precision', 5)

# Every banner in this section is framed by the same 85-character rule.
banner_rule = "=" * 85

print(banner_rule)
print(" üöÄ ENHANCED LOAN PAYBACK PREDICTION - TOP LEADERBOARD STRATEGY".center(85))
print(banner_rule)

# =============================================================================
# 1. DATA LOADING
# =============================================================================
print("\n" + banner_rule)
print(" üìÇ PHASE 1: DATA LOADING".center(85))
print(banner_rule)

train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')

# Report the shape of both frames (rows with thousands separators).
n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape
print(f"\n‚úì Train: {n_train_rows:,} rows √ó {n_train_cols} columns")
print(f"‚úì Test:  {n_test_rows:,} rows √ó {n_test_cols} columns")

# =============================================================================
# 2. ADVANCED FEATURE ENGINEERING
# =============================================================================
print("\n" + banner_rule)
print(" üîß PHASE 2: ADVANCED FEATURE ENGINEERING".center(85))
print(banner_rule)

def parse_grade_subgrade(df):
    """Split ``grade_subgrade`` into a letter, a number and an ordinal rank.

    Returns a copy of ``df`` with three extra columns:
    ``grade_letter`` (first character), ``grade_number`` (remaining characters
    cast to int) and ``grade_rank`` (letter offset + number, so A1=1 ... G5=35).
    The input frame is left untouched.
    """
    # Offsets per letter: A -> 0, B -> 5, ..., G -> 30
    letter_offsets = dict(zip('ABCDEFG', range(0, 35, 5)))

    parsed = df.copy()
    subgrade = parsed['grade_subgrade']
    letters = subgrade.str[0]
    numbers = subgrade.str[1:].astype(int)

    parsed['grade_letter'] = letters
    parsed['grade_number'] = numbers
    parsed['grade_rank'] = letters.map(letter_offsets) + numbers
    return parsed

def create_elite_features(df, is_train=True):
    """Create the engineered feature set used by the ensemble models.

    Works on a copy of ``df``; the frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``grade_subgrade``, ``annual_income``, ``loan_amount``,
        ``debt_to_income_ratio``, ``credit_score``, ``interest_rate``,
        ``gender``, ``marital_status``, ``education_level``,
        ``employment_status`` and ``loan_purpose``.
    is_train : bool, default True
        Accepted for call-site compatibility; it is not used by the function.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the grade, ratio, risk, payment, interaction,
        polynomial, frequency, flag and quantile-bin columns appended.

    Notes
    -----
    Category frequencies, the income median, the loan-amount quantile and the
    quantile-bin edges are all computed from ``df`` itself.
    NOTE(review): calling this separately on train and test therefore yields
    thresholds/bins derived from different distributions - confirm this is
    intended.
    """
    df = df.copy()

    # Parse grade subgrade (adds grade_letter, grade_number, grade_rank)
    df = parse_grade_subgrade(df)

    # Financial ratio features
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['loan_to_income_pct'] = (df['loan_amount'] / df['annual_income']) * 100
    df['available_income'] = df['annual_income'] * (1 - df['debt_to_income_ratio'])
    df['loan_burden'] = df['loan_amount'] / df['available_income']

    # Credit risk metrics (hand-weighted linear score)
    df['credit_risk_v1'] = (
        df['credit_score'] * 0.4 -
        df['debt_to_income_ratio'] * 1000 * 0.3 -
        df['interest_rate'] * 10 * 0.2 -
        df['grade_rank'] * 5 * 0.1
    )

    # Min-max scaling with fixed bounds 395 and 849
    df['credit_score_norm'] = (df['credit_score'] - 395) / (849 - 395)
    df['credit_interest_diff'] = df['credit_score_norm'] - (df['interest_rate'] / 20)

    # Payment capacity features
    df['monthly_income'] = df['annual_income'] / 12

    # Amortised payment over a fixed 60 instalments; interest_rate is divided by
    # 100 and by 12 to get a monthly rate.
    monthly_rate = df['interest_rate'] / 100 / 12
    n_payments = 60
    growth = (1 + monthly_rate) ** n_payments
    amortised_payment = (df['loan_amount'] * monthly_rate * growth) / (growth - 1)
    # A rate of exactly zero makes the formula 0/0 (NaN); the payment is then
    # simply principal / number of instalments. Missing rates stay NaN.
    df['estimated_monthly_payment'] = amortised_payment.where(
        monthly_rate != 0, df['loan_amount'] / n_payments
    )

    df['payment_to_income_ratio'] = df['estimated_monthly_payment'] / df['monthly_income']
    df['disposable_after_loan'] = df['available_income'] / 12 - df['estimated_monthly_payment']
    df['payment_stress'] = df['estimated_monthly_payment'] / (df['available_income'] / 12)

    # Interaction features
    df['credit_x_income'] = df['credit_score'] * np.log1p(df['annual_income'])
    df['debt_x_interest'] = df['debt_to_income_ratio'] * df['interest_rate']
    df['grade_x_credit'] = df['grade_rank'] * (850 - df['credit_score'])
    df['loan_x_interest'] = np.log1p(df['loan_amount']) * df['interest_rate']

    # Polynomial features
    df['credit_score_sq'] = df['credit_score'] ** 2
    df['debt_ratio_sq'] = df['debt_to_income_ratio'] ** 2
    df['interest_rate_sq'] = df['interest_rate'] ** 2

    # Categorical frequency encoding (relative frequency within this frame)
    categorical_cols = ['gender', 'marital_status', 'education_level',
                        'employment_status', 'loan_purpose', 'grade_letter']

    for col in categorical_cols:
        freq = df[col].value_counts(normalize=True)
        df[f'{col}_freq'] = df[col].map(freq)

    # Risk flags
    df['high_risk_flag'] = (
        (df['credit_score'] < 600) |
        (df['debt_to_income_ratio'] > 0.4) |
        (df['grade_rank'] > 20)
    ).astype(int)

    df['excellent_credit'] = (df['credit_score'] >= 750).astype(int)
    df['low_debt_burden'] = (df['debt_to_income_ratio'] <= 0.2).astype(int)
    df['employed_flag'] = (df['employment_status'] == 'Employed').astype(int)
    df['high_income'] = (df['annual_income'] >= df['annual_income'].median()).astype(int)
    df['small_loan'] = (df['loan_amount'] <= df['loan_amount'].quantile(0.3)).astype(int)

    df['composite_risk'] = (
        df['high_risk_flag'] * 3 -
        df['excellent_credit'] * 2 -
        df['low_debt_burden'] * 2 -
        df['employed_flag'] * 1
    )

    # Binning features: quantile bins with 5 and 10 buckets per numeric column
    numeric_for_binning = ['annual_income', 'debt_to_income_ratio', 'credit_score',
                           'loan_amount', 'interest_rate']

    for col in numeric_for_binning:
        for q in [5, 10]:
            try:
                df[f'{col}_bin{q}'] = pd.qcut(df[col], q=q, labels=False, duplicates='drop')
            except Exception as exc:
                # Best-effort fallback kept from the original, but no longer a
                # bare ``except`` (which also trapped KeyboardInterrupt and
                # SystemExit) and no longer silent.
                print(f"  ! qcut failed for {col} (q={q}): {exc}; using constant bin 0")
                df[f'{col}_bin{q}'] = 0

    return df

print("\n‚öôÔ∏è  Creating elite feature set...")
# Run the same feature builder over both frames; only the is_train flag differs.
train_fe, test_fe = (
    create_elite_features(frame, is_train=flag)
    for frame, flag in ((train, True), (test, False))
)
# Count the columns that exist after feature engineering but not in the raw train frame.
n_new_features = sum(col not in train.columns for col in train_fe.columns)
print(f"‚úì Created {n_new_features} new features")

# =============================================================================
# 3. TARGET ENCODING FOR CATEGORICAL VARIABLES
# =============================================================================
phase3_rule = "=" * 85
print("\n" + phase3_rule)
print(" üéØ PHASE 3: ADVANCED CATEGORICAL ENCODING".center(85))
print(phase3_rule)

# Columns that receive target encoding and label encoding below.
categorical_cols = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
    'grade_letter',
]

# Separate the identifier and the target from the model inputs.
test_ids = test_fe['id']
y = train_fe['loan_paid_back']
X_train = train_fe.drop(columns=['id', 'loan_paid_back'])
X_test = test_fe.drop(columns=['id'])

print("\n‚öôÔ∏è  Applying target encoding with cross-validation...")
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Fallback for categories that have no estimate; it does not depend on the
# column, so compute it once.
global_mean = y.mean()

for col in categorical_cols:
    if col not in X_train.columns:
        continue

    enc_col = f'{col}_target_enc'

    # Out-of-fold encoding: each validation row gets the target mean of its
    # category computed on the other folds only. The buffer is filled by
    # position because skf.split yields positional indices (using .loc with
    # them is only correct on a default RangeIndex).
    oof_enc = np.full(len(X_train), np.nan)
    for train_idx, val_idx in skf.split(X_train, y):
        target_mean = y.iloc[train_idx].groupby(X_train[col].iloc[train_idx]).mean()
        oof_enc[val_idx] = X_train[col].iloc[val_idx].map(target_mean).to_numpy()

    # Assign the filled result back explicitly. The previous chained
    # `X_train[enc_col].fillna(..., inplace=True)` operates on a temporary
    # under pandas Copy-on-Write and would leave the NaNs in place.
    X_train[enc_col] = pd.Series(oof_enc, index=X_train.index).fillna(global_mean)

    # Test rows use the category means from the full training set.
    target_mean_full = y.groupby(X_train[col]).mean()
    X_test[enc_col] = X_test[col].map(target_mean_full).fillna(global_mean)

    print(f"‚úì Target encoded: {col}")

# Label encoding
# Each encoder is fitted on the union of train and test values so that a
# category appearing only in the test set does not make `transform` raise.
# When test categories are a subset of train categories the resulting codes
# are identical to fitting on train alone.
label_encoders = {}
for col in categorical_cols:
    if col in X_train.columns:
        le = LabelEncoder()
        train_values = X_train[col].astype(str)
        test_values = X_test[col].astype(str)
        le.fit(pd.concat([train_values, test_values], ignore_index=True))
        X_train[col] = le.transform(train_values)
        X_test[col] = le.transform(test_values)
        label_encoders[col] = le

print(f"\n‚úì Final feature count: {X_train.shape[1]}")

# =============================================================================
# 4. ENSEMBLE MODEL TRAINING
# =============================================================================
phase4_rule = "=" * 85
print("\n" + phase4_rule)
print(" ü§ñ PHASE 4: MULTI-MODEL ENSEMBLE TRAINING".center(85))
print(phase4_rule)

# Ratio of negative to positive rows, passed to all three boosters below.
n_negative = (y == 0).sum()
n_positive = (y == 1).sum()
scale_pos_weight = n_negative / n_positive
print(f"\n‚öñÔ∏è  Class imbalance ratio: {scale_pos_weight:.4f}")

# Hyper-parameters for the three boosters. Written with dict(...) keyword
# syntax; the resulting keys, values and key order match the literal form.

# XGBoost: histogram tree method on a CUDA device.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
    random_state=42,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=80,
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bylevel=0.85,
    colsample_bynode=0.85,
    gamma=0.1,
    reg_alpha=2.0,
    reg_lambda=5.0,
    scale_pos_weight=scale_pos_weight,
    n_estimators=3000,
    early_stopping_rounds=100,
)

# LightGBM: early stopping is supplied via callbacks at fit time, not here.
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.01,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    max_depth=7,
    min_child_weight=70,
    reg_alpha=2.0,
    reg_lambda=5.0,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    verbose=-1,
    n_estimators=3000,
)

# CatBoost
cat_params = dict(
    iterations=3000,
    learning_rate=0.01,
    depth=7,
    l2_leaf_reg=5,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=42,
    verbose=False,
    early_stopping_rounds=100,
    scale_pos_weight=scale_pos_weight,
)

print("\nüîÑ Training ensemble with 5-Fold Stratified CV...")
print("‚îÄ" * 85)

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# One out-of-fold buffer per model (length = training rows) ...
oof_xgb, oof_lgb, oof_cat = (np.zeros(len(X_train)) for _ in range(3))

# ... and one accumulator per model for the test predictions.
test_xgb, test_lgb, test_cat = (np.zeros(len(X_test)) for _ in range(3))

feature_importance = []

# Per-fold AUCs and ROC curves (the latter kept for visualization), keyed by model.
tracked_models = ('xgb', 'lgb', 'cat', 'ensemble')
cv_scores = {model_key: [] for model_key in tracked_models}
roc_data = {model_key: [] for model_key in tracked_models}

# Main cross-validation loop. For every fold it fits XGBoost, LightGBM and
# CatBoost on the training part, stores each model's out-of-fold probabilities
# for the validation part, and adds that fold's test prediction divided by
# n_folds to the running test accumulators (so they end up as fold averages).
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y), 1):
    print(f"\n{'Fold ' + str(fold):^85}")
    print("‚îÄ" * 85)
    
    # Positional split of features and target for this fold
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # XGBoost
    # The validation fold doubles as the eval_set passed to fit().
    print("Training XGBoost...", end=" ")
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    oof_xgb[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    test_xgb += xgb_model.predict_proba(X_test)[:, 1] / n_folds
    xgb_score = roc_auc_score(y_val, oof_xgb[val_idx])
    cv_scores['xgb'].append(xgb_score)
    print(f"AUC: {xgb_score:.6f}")
    
    # Keep this fold's ROC curve for the later plot
    fpr, tpr, _ = roc_curve(y_val, oof_xgb[val_idx])
    roc_data['xgb'].append((fpr, tpr, xgb_score))
    
    # LightGBM
    # Early stopping (patience 100) and silent logging are supplied as callbacks.
    print("Training LightGBM...", end=" ")
    lgb_model = lgb.LGBMClassifier(**lgb_params)
    lgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
                  callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)])
    oof_lgb[val_idx] = lgb_model.predict_proba(X_val)[:, 1]
    test_lgb += lgb_model.predict_proba(X_test)[:, 1] / n_folds
    lgb_score = roc_auc_score(y_val, oof_lgb[val_idx])
    cv_scores['lgb'].append(lgb_score)
    print(f"AUC: {lgb_score:.6f}")
    
    fpr, tpr, _ = roc_curve(y_val, oof_lgb[val_idx])
    roc_data['lgb'].append((fpr, tpr, lgb_score))
    
    # CatBoost
    # Note: eval_set is passed as a single (X, y) tuple here, not a list.
    print("Training CatBoost...", end=" ")
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=False)
    oof_cat[val_idx] = cat_model.predict_proba(X_val)[:, 1]
    test_cat += cat_model.predict_proba(X_test)[:, 1] / n_folds
    cat_score = roc_auc_score(y_val, oof_cat[val_idx])
    cv_scores['cat'].append(cat_score)
    print(f"AUC: {cat_score:.6f}")
    
    fpr, tpr, _ = roc_curve(y_val, oof_cat[val_idx])
    roc_data['cat'].append((fpr, tpr, cat_score))
    
    # Ensemble
    # Fixed 0.40 / 0.35 / 0.25 blend of this fold's validation predictions;
    # oof_ensemble only covers the current validation rows.
    oof_ensemble = (oof_xgb[val_idx] * 0.4 + oof_lgb[val_idx] * 0.35 + oof_cat[val_idx] * 0.25)
    ensemble_score = roc_auc_score(y_val, oof_ensemble)
    cv_scores['ensemble'].append(ensemble_score)
    print(f"Ensemble AUC: {ensemble_score:.6f}")
    
    fpr, tpr, _ = roc_curve(y_val, oof_ensemble)
    roc_data['ensemble'].append((fpr, tpr, ensemble_score))
    
    # Feature importance
    # Only the XGBoost model's importances are recorded, one frame per fold.
    fold_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': xgb_model.feature_importances_,
        'fold': fold
    })
    feature_importance.append(fold_importance)
    
    # Explicit garbage-collection pass at the end of every fold
    gc.collect()

# Overall scores: AUC of the fixed-weight blend over all out-of-fold predictions
oof_ensemble_final = (oof_xgb * 0.4 + oof_lgb * 0.35 + oof_cat * 0.25)
overall_score = roc_auc_score(y, oof_ensemble_final)

# =============================================================================
# 5. RESULTS SUMMARY
# =============================================================================
results_rule = "=" * 85
print("\n" + results_rule)
print(" üìä CROSS-VALIDATION RESULTS".center(85))
print(results_rule)

# One "mean / std" row per base model, a separator row, then the ensemble rows.
results_table = [
    [f"{display_name} Mean",
     f"{np.mean(cv_scores[model_key]):.6f}",
     f"¬±{np.std(cv_scores[model_key]):.6f}"]
    for display_name, model_key in (('XGBoost', 'xgb'), ('LightGBM', 'lgb'), ('CatBoost', 'cat'))
]
results_table.append(['‚îÄ'*20, '‚îÄ'*15, '‚îÄ'*15])
results_table.append(
    ['Ensemble Mean', f"{np.mean(cv_scores['ensemble']):.6f}", f"¬±{np.std(cv_scores['ensemble']):.6f}"]
)
results_table.append(['Overall OOF', f"{overall_score:.6f}", ''])

results_headers = ['Model', 'ROC AUC', 'Std']
print("\n" + tabulate(results_table, headers=results_headers, tablefmt='fancy_grid'))

# =============================================================================
# 6. OPTIMIZED ENSEMBLE WEIGHTS
# =============================================================================
print("\n" + "="*85)
print(" ‚öñÔ∏è  PHASE 5: OPTIMIZING ENSEMBLE WEIGHTS".center(85))
print("="*85)

best_score = 0
best_weights = (0.4, 0.35, 0.25)

print("\nSearching for optimal weights...")
# Grid search in whole percentage points. Doing the arithmetic on integers
# keeps the boundary combinations (CatBoost weight exactly 0.20 or 0.40)
# reliably inside the grid; with float np.arange steps, `1 - w1 - w2` lands a
# rounding error above or below the bound and those points are kept or
# dropped arbitrarily.
for pct_xgb in range(30, 50, 5):          # XGBoost: 0.30 .. 0.45
    for pct_lgb in range(25, 45, 5):      # LightGBM: 0.25 .. 0.40
        pct_cat = 100 - pct_xgb - pct_lgb  # CatBoost takes the remainder
        if not 20 <= pct_cat <= 40:
            continue

        w1, w2, w3 = pct_xgb / 100, pct_lgb / 100, pct_cat / 100
        oof_weighted = oof_xgb * w1 + oof_lgb * w2 + oof_cat * w3
        score = roc_auc_score(y, oof_weighted)

        # Strict '>' keeps the first combination found on ties.
        if score > best_score:
            best_score = score
            best_weights = (w1, w2, w3)

print(f"\n‚úì Best weights: XGB={best_weights[0]:.2f}, LGB={best_weights[1]:.2f}, CAT={best_weights[2]:.2f}")
print(f"‚úì Best OOF Score: {best_score:.6f}")

# Blend the fold-averaged test predictions with the selected weights.
test_ensemble = test_xgb * best_weights[0] + test_lgb * best_weights[1] + test_cat * best_weights[2]

# =============================================================================
# 7. STUNNING VISUALIZATIONS
# =============================================================================
viz_rule = "=" * 85
print("\n" + viz_rule)
print(" üìä PHASE 6: GENERATING STUNNING VISUALIZATIONS".center(85))
print(viz_rule)

# Aggregate feature importance: mean over folds, 20 largest values
fi_df = pd.concat(feature_importance)
mean_importance_by_feature = fi_df.groupby('feature')['importance'].mean()
fi_agg = mean_importance_by_feature.sort_values(ascending=False).head(20)

# Create comprehensive visualization (3 x 4 grid of panels is filled in below)
fig = plt.figure(figsize=(24, 16))
fig.suptitle(
    'üöÄ Elite Loan Payback Prediction - Comprehensive Analysis Dashboard',
    fontsize=20,
    fontweight='bold',
    y=0.995,
)

# Color schemes: one colour per model/ensemble, plus five viridis samples
colors_models = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
colors_gradient = plt.cm.viridis(np.linspace(0, 1, 5))

# =================== ROW 1 ===================
# 1. Feature Importance (Top 20)
ax1 = plt.subplot(3, 4, 1)
n_top_features = len(fi_agg)
bar_positions = range(n_top_features)
plasma_shades = plt.cm.plasma(np.linspace(0.3, 0.9, n_top_features))
bars = ax1.barh(bar_positions, fi_agg.values, color=plasma_shades)
ax1.set_yticks(bar_positions)
ax1.set_yticklabels(fi_agg.index, fontsize=9)
ax1.set_xlabel('Importance Score', fontsize=11, fontweight='bold')
ax1.set_title('üèÜ Top 20 Feature Importance', fontsize=13, fontweight='bold', pad=10)
ax1.invert_yaxis()  # most important feature at the top
ax1.grid(axis='x', alpha=0.3, linestyle='--')
# Write each bar's value just past its right end.
for row_pos, importance in enumerate(fi_agg.values):
    ax1.text(importance, row_pos, f' {importance:.4f}', va='center', fontsize=8)

# 2. Model Performance Comparison
# Mean fold AUC per model with the fold standard deviation as error bars.
ax2 = plt.subplot(3, 4, 2)
models = ['XGBoost', 'LightGBM', 'CatBoost', 'Ensemble']
means = [np.mean(cv_scores['xgb']), np.mean(cv_scores['lgb']),
         np.mean(cv_scores['cat']), np.mean(cv_scores['ensemble'])]
stds = [np.std(cv_scores['xgb']), np.std(cv_scores['lgb']),
        np.std(cv_scores['cat']), np.std(cv_scores['ensemble'])]
x_pos = np.arange(len(models))
bars = ax2.bar(x_pos, means, yerr=stds, capsize=5, color=colors_models, alpha=0.8, edgecolor='black', linewidth=1.5)
ax2.set_xticks(x_pos)
ax2.set_xticklabels(models, fontsize=10, fontweight='bold')
ax2.set_ylabel('ROC AUC Score', fontsize=11, fontweight='bold')
ax2.set_title('üìä Model Performance Comparison', fontsize=13, fontweight='bold', pad=10)
# Zoom the y-axis around the observed scores. A fixed [0.910, 0.920] window
# hides the bar tops and pushes the value labels outside the axes as soon as
# the scores leave that range.
lower_edge = min(m - s for m, s in zip(means, stds))
upper_edge = max(m + s for m, s in zip(means, stds))
y_margin = max(0.002, 0.25 * (upper_edge - lower_edge))
ax2.set_ylim([lower_edge - y_margin, upper_edge + y_margin])
ax2.grid(axis='y', alpha=0.3, linestyle='--')
# Value label slightly above each error bar
for i, (m, s) in enumerate(zip(means, stds)):
    ax2.text(i, m + s + 0.0005, f'{m:.5f}', ha='center', fontsize=9, fontweight='bold')

# 3. ROC Curves - All Models
ax3 = plt.subplot(3, 4, 3)
model_names = ['XGBoost', 'LightGBM', 'CatBoost', 'Ensemble']
# Common FPR grid onto which every fold's curve is interpolated before averaging.
mean_fpr = np.linspace(0, 1, 100)
for model_key, model_name, color in zip(['xgb', 'lgb', 'cat', 'ensemble'], model_names, colors_models):
    tprs = [
        np.interp(mean_fpr, fold_fpr, fold_tpr)
        for fold_fpr, fold_tpr, _fold_auc in roc_data[model_key]
    ]
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = np.mean(cv_scores[model_key])
    ax3.plot(
        mean_fpr, mean_tpr,
        color=color, linewidth=2.5,
        label=f'{model_name} (AUC={mean_auc:.4f})', alpha=0.9,
    )

# Diagonal reference line
ax3.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='Random Classifier', alpha=0.5)
ax3.set_xlabel('False Positive Rate', fontsize=11, fontweight='bold')
ax3.set_ylabel('True Positive Rate', fontsize=11, fontweight='bold')
ax3.set_title('üìà ROC Curves - Model Comparison', fontsize=13, fontweight='bold', pad=10)
ax3.legend(loc='lower right', fontsize=9, framealpha=0.9)
ax3.grid(alpha=0.3, linestyle='--')

# 4. CV Fold Scores
# Grouped bars: one group per fold, one bar per model, plus the overall OOF line.
ax4 = plt.subplot(3, 4, 4)
# Derive the fold positions from the recorded scores rather than assuming five
# folds, so the chart still works if the fold count changes.
fold_nums = np.arange(1, len(cv_scores['xgb']) + 1)
width = 0.2
ax4.bar(fold_nums - 1.5*width, cv_scores['xgb'], width, label='XGBoost', color=colors_models[0], alpha=0.8)
ax4.bar(fold_nums - 0.5*width, cv_scores['lgb'], width, label='LightGBM', color=colors_models[1], alpha=0.8)
ax4.bar(fold_nums + 0.5*width, cv_scores['cat'], width, label='CatBoost', color=colors_models[2], alpha=0.8)
ax4.bar(fold_nums + 1.5*width, cv_scores['ensemble'], width, label='Ensemble', color=colors_models[3], alpha=0.8)
ax4.axhline(y=overall_score, color='red', linestyle='--', linewidth=2, label=f'Overall: {overall_score:.5f}')
ax4.set_xlabel('Fold Number', fontsize=11, fontweight='bold')
ax4.set_ylabel('ROC AUC Score', fontsize=11, fontweight='bold')
ax4.set_title('üîÑ Cross-Validation Fold Scores', fontsize=13, fontweight='bold', pad=10)
ax4.set_xticks(fold_nums)
ax4.legend(fontsize=8, loc='lower right', framealpha=0.9)
ax4.grid(axis='y', alpha=0.3, linestyle='--')

# =================== ROW 2 ===================
# 5. Target Distribution
ax5 = plt.subplot(3, 4, 5)
# value_counts() orders by frequency, so the first slice is whichever class is
# larger. Sort by class label (0 then 1) so that the fixed labels and colours
# below always line up with the right counts.
target_counts = train['loan_paid_back'].value_counts().sort_index()
colors_pie = ['#FF6B6B', '#51CF66']  # red = class 0 (not paid), green = class 1 (paid back)
wedges, texts, autotexts = ax5.pie(target_counts, labels=['Not Paid', 'Paid Back'],
                                     autopct='%1.1f%%', colors=colors_pie, startangle=90,
                                     textprops={'fontsize': 11, 'fontweight': 'bold'},
                                     explode=(0.05, 0.05), shadow=True)
# Percentage labels inside the wedges: white and slightly larger
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(12)
ax5.set_title('üéØ Target Distribution', fontsize=13, fontweight='bold', pad=10)

# 6. Credit Score Distribution by Target
ax6 = plt.subplot(3, 4, 6)
paid = train[train['loan_paid_back']==1]['credit_score']
not_paid = train[train['loan_paid_back']==0]['credit_score']
credit_groups = (
    {'values': paid, 'hist_label': 'Paid Back', 'mean_label': 'Paid', 'fill': '#51CF66', 'line': 'green'},
    {'values': not_paid, 'hist_label': 'Not Paid', 'mean_label': 'Not Paid', 'fill': '#FF6B6B', 'line': 'red'},
)
# Histograms first, then the mean markers, so the artists (and legend entries)
# are created in the order: both histograms, then both mean lines.
for group in credit_groups:
    ax6.hist(group['values'], bins=50, alpha=0.7, label=group['hist_label'],
             color=group['fill'], edgecolor='black', linewidth=0.5)
for group in credit_groups:
    group_mean = group['values'].mean()
    ax6.axvline(group_mean, color=group['line'], linestyle='--', linewidth=2,
                label=f"{group['mean_label']} Mean: {group_mean:.0f}")
ax6.set_xlabel('Credit Score', fontsize=11, fontweight='bold')
ax6.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax6.set_title('üí≥ Credit Score Distribution', fontsize=13, fontweight='bold', pad=10)
ax6.legend(fontsize=9, framealpha=0.9)
ax6.grid(alpha=0.3, linestyle='--')

# 7. Debt-to-Income Ratio Distribution
ax7 = plt.subplot(3, 4, 7)
paid_dti = train[train['loan_paid_back']==1]['debt_to_income_ratio']
not_paid_dti = train[train['loan_paid_back']==0]['debt_to_income_ratio']
dti_groups = (
    {'values': paid_dti, 'hist_label': 'Paid Back', 'mean_label': 'Paid', 'fill': '#51CF66', 'line': 'green'},
    {'values': not_paid_dti, 'hist_label': 'Not Paid', 'mean_label': 'Not Paid', 'fill': '#FF6B6B', 'line': 'red'},
)
# Both histograms are drawn before both mean lines to keep artist/legend order.
for group in dti_groups:
    ax7.hist(group['values'], bins=50, alpha=0.7, label=group['hist_label'],
             color=group['fill'], edgecolor='black', linewidth=0.5)
for group in dti_groups:
    group_mean = group['values'].mean()
    ax7.axvline(group_mean, color=group['line'], linestyle='--', linewidth=2,
                label=f"{group['mean_label']} Mean: {group_mean:.2f}")
ax7.set_xlabel('Debt-to-Income Ratio', fontsize=11, fontweight='bold')
ax7.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax7.set_title('üí∞ Debt-to-Income Distribution', fontsize=13, fontweight='bold', pad=10)
ax7.legend(fontsize=9, framealpha=0.9)
ax7.grid(alpha=0.3, linestyle='--')

# 8. Interest Rate vs Credit Score
ax8 = plt.subplot(3, 4, 8)
# At most 5000 rows per class are plotted, drawn with a fixed seed.
paid_rows = train[train['loan_paid_back']==1]
not_paid_rows = train[train['loan_paid_back']==0]
paid_sample = paid_rows.sample(n=min(5000, len(paid_rows)), random_state=42)
not_paid_sample = not_paid_rows.sample(n=min(5000, len(not_paid_rows)), random_state=42)
for class_sample, point_color, class_label in (
    (paid_sample, '#51CF66', 'Paid Back'),
    (not_paid_sample, '#FF6B6B', 'Not Paid'),
):
    ax8.scatter(class_sample['credit_score'], class_sample['interest_rate'],
                alpha=0.4, s=10, color=point_color, label=class_label)
ax8.set_xlabel('Credit Score', fontsize=11, fontweight='bold')
ax8.set_ylabel('Interest Rate (%)', fontsize=11, fontweight='bold')
ax8.set_title('üìâ Interest Rate vs Credit Score', fontsize=13, fontweight='bold', pad=10)
ax8.legend(fontsize=9, framealpha=0.9)
ax8.grid(alpha=0.3, linestyle='--')

# =================== ROW 3 ===================
# 9. Prediction Distribution
ax9 = plt.subplot(3, 4, 9)
ax9.hist(test_ensemble, bins=50, color='#4ECDC4', alpha=0.8, edgecolor='black', linewidth=1)
# Mark the mean (red) and the median (orange) of the blended test predictions.
prediction_mean = test_ensemble.mean()
prediction_median = np.median(test_ensemble)
ax9.axvline(prediction_mean, color='red', linestyle='--', linewidth=2,
            label=f'Mean: {prediction_mean:.4f}')
ax9.axvline(prediction_median, color='orange', linestyle='--', linewidth=2,
            label=f'Median: {prediction_median:.4f}')
ax9.set_xlabel('Predicted Probability', fontsize=11, fontweight='bold')
ax9.set_ylabel('Frequency', fontsize=11, fontweight='bold')
ax9.set_title('üé≤ Test Prediction Distribution', fontsize=13, fontweight='bold', pad=10)
ax9.legend(fontsize=9, framealpha=0.9)
ax9.grid(axis='y', alpha=0.3, linestyle='--')

# 10. Employment Status Impact
# Payback rate per employment status, sorted ascending, annotated with the
# rate and the number of rows in each group.
ax10 = plt.subplot(3, 4, 10)
emp_payback = train.groupby('employment_status')['loan_paid_back'].agg(['mean', 'count'])
emp_payback = emp_payback.sort_values('mean', ascending=True)
bars = ax10.barh(emp_payback.index, emp_payback['mean'],
                color=plt.cm.RdYlGn(emp_payback['mean']), alpha=0.8, edgecolor='black', linewidth=1)
ax10.set_xlabel('Payback Rate', fontsize=11, fontweight='bold')
ax10.set_title('üëî Employment Status Impact', fontsize=13, fontweight='bold', pad=10)
ax10.grid(axis='x', alpha=0.3, linestyle='--')
for i, (idx, row) in enumerate(emp_payback.iterrows()):
    # iterrows() upcasts the whole row to float, so the count must be cast
    # back to int; otherwise it is rendered as e.g. "n=1,234.0".
    ax10.text(row['mean'], i, f" {row['mean']:.2%} (n={int(row['count']):,})",
             va='center', fontsize=9, fontweight='bold')

# 11. Loan Purpose Impact
ax11 = plt.subplot(3, 4, 11)
purpose_payback = train.groupby('loan_purpose')['loan_paid_back'].agg(['mean', 'count'])
purpose_payback = purpose_payback.sort_values('mean', ascending=False)
n_purposes = len(purpose_payback)
purpose_positions = range(n_purposes)
purpose_colors = plt.cm.viridis(np.linspace(0.2, 0.9, n_purposes))
bars = ax11.bar(purpose_positions, purpose_payback['mean'],
                color=purpose_colors, alpha=0.8, edgecolor='black', linewidth=1)
ax11.set_xticks(purpose_positions)
ax11.set_xticklabels(purpose_payback.index, rotation=45, ha='right', fontsize=9)
ax11.set_ylabel('Payback Rate', fontsize=11, fontweight='bold')
ax11.set_title('üéØ Loan Purpose Impact', fontsize=13, fontweight='bold', pad=10)
ax11.grid(axis='y', alpha=0.3, linestyle='--')
# Percentage label a little above each bar
for bar_pos, payback_rate in enumerate(purpose_payback['mean']):
    ax11.text(bar_pos, payback_rate + 0.01, f"{payback_rate:.2%}", ha='center', fontsize=8, fontweight='bold')

# 12. Model Weights Visualization
ax12 = plt.subplot(3, 4, 12)
# Pair each model name with its selected blend weight (same order as best_weights).
weights_data = dict(zip(['XGBoost', 'LightGBM', 'CatBoost'], best_weights))
wedges, texts, autotexts = ax12.pie(
    weights_data.values(),
    labels=weights_data.keys(),
    autopct='%1.1f%%',
    colors=colors_models[:3],
    startangle=90,
    textprops={'fontsize': 11, 'fontweight': 'bold'},
    explode=(0.05, 0.05, 0.05),
    shadow=True,
)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(12)
ax12.set_title('‚öñÔ∏è Optimized Ensemble Weights', fontsize=13, fontweight='bold', pad=10)

# Lay out, write the dashboard to disk, then display it.
dashboard_path = 'elite_loan_prediction_dashboard.png'
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig(dashboard_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"\n‚úì Main dashboard saved: '{dashboard_path}'")
plt.show()

# Additional Visualization: Correlation Heatmap
print("\n‚öôÔ∏è  Generating correlation heatmap...")
fig2, ax = plt.subplots(figsize=(16, 14))
# Select key features for correlation (raw inputs, a few engineered ones, and the target)
key_features = [
    'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
    'interest_rate', 'income_to_loan_ratio', 'credit_risk_v1',
    'payment_to_income_ratio', 'grade_rank', 'loan_paid_back',
]
corr_data = train_fe[key_features].corr()
# Hide the upper triangle (including the diagonal); only the lower half is drawn.
mask = np.triu(np.ones_like(corr_data, dtype=bool))
heatmap_options = dict(
    mask=mask, annot=True, fmt='.2f', cmap='coolwarm',
    center=0, square=True, linewidths=1, cbar_kws={"shrink": 0.8},
    ax=ax, vmin=-1, vmax=1,
)
sns.heatmap(corr_data, **heatmap_options)
ax.set_title('üî• Feature Correlation Heatmap - Key Variables',
             fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
heatmap_path = 'correlation_heatmap.png'
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"‚úì Correlation heatmap saved: '{heatmap_path}'")
plt.show()

# Additional Visualization: Grade Distribution
print("\n‚öôÔ∏è  Generating grade distribution analysis...")
fig3, axes = plt.subplots(2, 2, figsize=(16, 12))
fig3.suptitle('üìä Credit Grade Analysis', fontsize=18, fontweight='bold', y=0.995)

# Grade letter distribution (row counts per first character of grade_subgrade)
ax_g1 = axes[0, 0]
grade_letters = train['grade_subgrade'].str[0]
grade_dist = grade_letters.value_counts().sort_index()
grade_dist_colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(grade_dist)))
bars = ax_g1.bar(grade_dist.index, grade_dist.values,
                 color=grade_dist_colors, alpha=0.8, edgecolor='black', linewidth=1.5)
ax_g1.set_xlabel('Grade Letter', fontsize=12, fontweight='bold')
ax_g1.set_ylabel('Count', fontsize=12, fontweight='bold')
ax_g1.set_title('Grade Letter Distribution', fontsize=13, fontweight='bold')
ax_g1.grid(axis='y', alpha=0.3, linestyle='--')
# Count label centred on top of each bar
for bar in bars:
    height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax_g1.text(bar_centre, height, f'{int(height):,}',
               ha='center', va='bottom', fontsize=10, fontweight='bold')

# Grade vs Payback Rate
# Bars show the payback rate per grade letter (left axis); the red line on the
# twin axis shows how many rows each grade has.
ax_g2 = axes[0, 1]
grade_payback = train_fe.groupby('grade_letter')['loan_paid_back'].agg(['mean', 'count'])
grade_payback = grade_payback.sort_index()
ax_g2_twin = ax_g2.twinx()
payback_rates = grade_payback['mean']
bars = ax_g2.bar(
    grade_payback.index, payback_rates,
    color=plt.cm.RdYlGn(payback_rates), alpha=0.8,
    edgecolor='black', linewidth=1.5, label='Payback Rate',
)
line = ax_g2_twin.plot(
    grade_payback.index, grade_payback['count'],
    color='red', marker='o', linewidth=2, markersize=8, label='Count',
)
ax_g2.set_xlabel('Grade Letter', fontsize=12, fontweight='bold')
ax_g2.set_ylabel('Payback Rate', fontsize=12, fontweight='bold', color='black')
ax_g2_twin.set_ylabel('Count', fontsize=12, fontweight='bold', color='red')
ax_g2.set_title('Grade vs Payback Rate', fontsize=13, fontweight='bold')
ax_g2.set_ylim([0.5, 1.0])
ax_g2.grid(axis='y', alpha=0.3, linestyle='--')
# Each axis carries its own legend, placed in opposite corners.
ax_g2.legend(loc='upper left', fontsize=9)
ax_g2_twin.legend(loc='upper right', fontsize=9)

# Interest Rate by Grade
ax_g3 = axes[1, 0]
grade_interest = train_fe.groupby('grade_letter')['interest_rate'].mean().sort_index()
interest_colors = plt.cm.plasma(np.linspace(0.2, 0.9, len(grade_interest)))
bars = ax_g3.bar(grade_interest.index, grade_interest.values,
                 color=interest_colors, alpha=0.8, edgecolor='black', linewidth=1.5)
ax_g3.set_xlabel('Grade Letter', fontsize=12, fontweight='bold')
ax_g3.set_ylabel('Average Interest Rate (%)', fontsize=12, fontweight='bold')
ax_g3.set_title('Interest Rate by Grade', fontsize=13, fontweight='bold')
ax_g3.grid(axis='y', alpha=0.3, linestyle='--')
# Mean rate printed on top of each bar
for bar, val in zip(bars, grade_interest.values):
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax_g3.text(bar_centre, val, f'{val:.2f}%',
               ha='center', va='bottom', fontsize=10, fontweight='bold')

# Credit Score by Grade
ax_g4 = axes[1, 1]
grade_credit = train_fe.groupby('grade_letter')['credit_score'].mean().sort_index()
credit_colors = plt.cm.viridis(np.linspace(0.2, 0.9, len(grade_credit)))
bars = ax_g4.bar(grade_credit.index, grade_credit.values,
                 color=credit_colors, alpha=0.8, edgecolor='black', linewidth=1.5)
ax_g4.set_xlabel('Grade Letter', fontsize=12, fontweight='bold')
ax_g4.set_ylabel('Average Credit Score', fontsize=12, fontweight='bold')
ax_g4.set_title('Credit Score by Grade', fontsize=13, fontweight='bold')
ax_g4.grid(axis='y', alpha=0.3, linestyle='--')
# Mean score (no decimals) printed on top of each bar
for bar, val in zip(bars, grade_credit.values):
    bar_centre = bar.get_x() + bar.get_width()/2.
    ax_g4.text(bar_centre, val, f'{val:.0f}',
               ha='center', va='bottom', fontsize=10, fontweight='bold')

# Lay out, write the grade figure to disk, then display it.
grade_figure_path = 'grade_analysis.png'
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.savefig(grade_figure_path, dpi=300, bbox_inches='tight', facecolor='white')
print(f"‚úì Grade analysis saved: '{grade_figure_path}'")
plt.show()

# =============================================================================
# 8. FEATURE IMPORTANCE
# =============================================================================
features_rule = "=" * 85
print("\n" + features_rule)
print(" ‚≠ê PHASE 7: TOP FEATURES".center(85))
print(features_rule)

print("\nüèÜ Top 20 Most Important Features:")
# One row per feature: its name and its fold-averaged importance to six decimals.
fi_table = [
    [feature_name, f'{importance:.6f}']
    for feature_name, importance in zip(fi_agg.index, fi_agg.values)
]
print(tabulate(fi_table, headers=['Feature', 'Importance'], tablefmt='fancy_grid'))

# =============================================================================
# 9. SUBMISSION
# =============================================================================
submission_rule = "=" * 85
print("\n" + submission_rule)
print(" üíæ PHASE 8: CREATING SUBMISSION".center(85))
print(submission_rule)

# Pair every test id with its blended probability and write the file without the index.
submission = pd.DataFrame({'id': test_ids, 'loan_paid_back': test_ensemble})
submission.to_csv('submission_elite.csv', index=False)

print("\n‚úì Submission created: submission_elite.csv")
print(f"‚úì Shape: {submission.shape}")
print("\nüìä Prediction Statistics:")
# Summary statistics of the submitted probabilities, each shown to six decimals.
prediction_summary = (
    ('Mean', test_ensemble.mean()),
    ('Median', np.median(test_ensemble)),
    ('Min', test_ensemble.min()),
    ('Max', test_ensemble.max()),
    ('Std', test_ensemble.std()),
)
pred_stats = [[stat_name, f"{stat_value:.6f}"] for stat_name, stat_value in prediction_summary]
print(tabulate(pred_stats, headers=['Statistic', 'Value'], tablefmt='grid'))

# Preview of the first ten submission rows
submission_preview = submission.head(10)
print("\n" + tabulate(submission_preview, headers='keys', tablefmt='grid', showindex=False))

# =============================================================================
# FINAL SUMMARY
# =============================================================================
print("\n" + "="*85)
print(" üéâ ELITE PIPELINE COMPLETE".center(85))
print("="*85)

# Headline numbers for the run. The last row compares the best OOF score with
# a fixed reference of 0.9152; the '+' format flag prints the correct sign, so
# a score below the reference shows as "-0.05%" rather than "+-0.05%".
summary = [
    ['Features Engineered', f"{X_train.shape[1]}"],
    ['Models in Ensemble', '3 (XGBoost + LightGBM + CatBoost)'],
    ['CV Strategy', '5-Fold Stratified'],
    ['Best OOF Score', f"{best_score:.6f}"],
    ['Expected LB', f"~{best_score - 0.001:.4f} to {best_score + 0.001:.4f}"],
    ['Improvement vs Baseline', f"{(best_score - 0.9152)*100:+.2f}%"]
]

print("\n" + tabulate(summary, headers=['Metric', 'Value'], tablefmt='fancy_grid'))
print("\n" + "="*85)
print(" üöÄ Ready for Kaggle submission!".center(85))
print("="*85)