# Predicting Loan Payback 101
> ### Playground Series S5E11


## Table of Contents 
1. [Introduction & Setup](#introduction)
2. [Data Loading & Overview](#data-loading)
3. [Exploratory Data Analysis (EDA)](#eda)
4. [Feature Engineering](#feature-engineering)
5. [Complete Feature Engineering](#feature-engineering)
6. [Model Training](#model-training)
   - 6.1 LightGBM
   - 6.2 XGBoost
   - 6.3 CatBoost
7. [Model Evaluation](#evaluation)
8. [Ensemble Methods](#ensemble)
9. [Submission Generation](#submission)
10. [Conclusion](#conclusion)


### Competition Goal

Predict the **probability** that a borrower will pay back their loan based on:
- Financial metrics (income, debt, credit score)
- Loan characteristics (amount, interest rate, purpose)
- Personal information (gender, marital status, education, employment)

**Evaluation:** Area Under the ROC Curve (AUC-ROC)

<a id='introduction'></a>
# 1. Introduction & Setup

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import skew, kurtosis
from scipy.stats import rankdata

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix

# Models
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

# Notebook-wide display and plotting defaults
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_style('whitegrid')

# Fix the NumPy seed so random draws repeat between runs
SEED = 42
np.random.seed(SEED)

In [None]:
# Central settings shared by the rest of the notebook
class Config:
    """Holds the cross-validation, seeding, target and ensemble settings."""
    TARGET = 'loan_paid_back'
    SEED = 42
    N_SPLITS = 5
    VERBOSE = True

    # Starting blend weight per model (meant to be tuned later)
    WEIGHTS = dict(lgb=0.33, xgb=0.33, cat=0.34)


config = Config()
print("\n".join([
    "Configuration loaded successfully!",
    f"   - Number of folds: {config.N_SPLITS}",
    f"   - Random seed: {config.SEED}",
    f"   - Target variable: {config.TARGET}",
]))

<a id='data-loading'></a>
# 2. Data Loading & Overview

In [None]:
# Read the three competition files from the Kaggle input directory
sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')

# Report the basic dimensions of each frame
n_train_rows, n_train_cols = train.shape
print(f" Train shape: {train.shape}")
print(f" Test shape: {test.shape}")
print(f" Sample submission shape: {sample_submission.shape}")
print(f"\n Total rows: {n_train_rows:,}")
print(f" Total features: {n_train_cols - 2} (excluding id and target)")
print(f" Test samples to predict: {test.shape[0]:,}")

In [None]:
# Show the first ten rows of each frame under its own heading
for heading, frame in (("TRAIN DATA PREVIEW", train), ("TEST DATA PREVIEW", test)):
    print(heading)
    display(frame.head(10))

In [None]:
# Print dtype / non-null / memory information for both frames
for heading, frame in (("DATA INFO - TRAIN", train), ("DATA INFO - TEST", test)):
    print("\n" + "=" * 80)
    print(heading)
    print("=" * 80)
    frame.info()

In [None]:
# Check for missing values

missing_train = train.isnull().sum()
missing_test = test.isnull().sum()

# Align the test counts to the train columns; a column that is absent from
# test is reported as 0 missing.
missing_test_aligned = missing_test.reindex(train.columns, fill_value=0)

missing_df = pd.DataFrame({
    'Feature': train.columns,
    'Train Missing': missing_train.values,
    'Train Missing %': (missing_train.values / len(train) * 100).round(2),
    'Test Missing': missing_test_aligned.values,
    # Rounded to two decimals so both percentage columns read the same way
    'Test Missing %': (missing_test_aligned.values / len(test) * 100).round(2),
})

# Keep only the features that have a gap in at least one of the two frames
missing_summary = missing_df[(missing_df['Train Missing'] > 0) | (missing_df['Test Missing'] > 0)]

if len(missing_summary) > 0:
    display(missing_summary.style.background_gradient(cmap='Reds'))
else:
    print(" No missing values found in train or test data!")
    print(" Data quality is excellent - ready for modeling!")

In [None]:
# Check for duplicates
# `id` is presumably a per-row identifier; leaving it in would make every row
# distinct and the check meaningless, so rows are compared on the remaining
# columns only.
train_duplicates = train.drop(columns=['id'], errors='ignore').duplicated().sum()
test_duplicates = test.drop(columns=['id'], errors='ignore').duplicated().sum()

print(f"Train duplicates: {train_duplicates}")
print(f"Test duplicates: {test_duplicates}")

if train_duplicates == 0 and test_duplicates == 0:
    print(" No duplicates found!")

In [None]:
# Summarise how the target classes are distributed
print("TARGET DISTRIBUTION ANALYSIS")
print("=" * 80)

target_values = train[config.TARGET]
target_counts = target_values.value_counts()
target_pct = target_values.value_counts(normalize=True) * 100

target_summary = pd.DataFrame({
    'Value': target_counts.index,
    'Count': target_counts.values,
    'Percentage': target_pct.values,
})

display(target_summary.style.background_gradient(cmap='Blues'))

print("\nTarget Statistics:")
print(f"   - Mean: {target_values.mean():.4f}")
print(f"   - Median: {target_values.median():.4f}")
print(f"   - Std: {target_values.std():.4f}")

# Ratio of the rarest class to the most common one; below 0.5 is reported as imbalanced
imbalance_ratio = target_counts.min() / target_counts.max()
print(f"\n  Class Balance Ratio: {imbalance_ratio:.3f}")
print(
    "  Dataset is imbalanced - consider using stratified sampling"
    if imbalance_ratio < 0.5
    else " Dataset is relatively balanced"
)

In [None]:
# Visualize target distribution
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# value_counts() orders classes by frequency, while the colours and pie labels
# below are written in class order (0 then 1). Sort by class label so each
# label/colour is attached to the class it names.
class_counts = train[config.TARGET].value_counts().sort_index()

# Count plot
colors = ['#FF6B6B', '#4ECDC4']
class_counts.plot(kind='bar', ax=axes[0], color=colors, edgecolor='black', linewidth=1.5)
axes[0].set_title('Target Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Loan Paid Back', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].tick_params(rotation=0)
axes[0].grid(alpha=0.3, axis='y')

# Add value labels on bars
for container in axes[0].containers:
    axes[0].bar_label(container, fmt='%d', fontsize=10, fontweight='bold')

# Pie chart
explode = (0.05, 0.05)
axes[1].pie(class_counts,
            labels=['Not Paid Back (0)', 'Paid Back (1)'],
            autopct='%1.2f%%',
            colors=colors,
            explode=explode,
            shadow=True,
            startangle=90,
            textprops={'fontsize': 11, 'fontweight': 'bold'})
axes[1].set_title('Target Distribution (Percentage)', fontsize=14, fontweight='bold')

# Distribution plot
axes[2].hist(train[config.TARGET], bins=20, color='steelblue', edgecolor='black', alpha=0.7)
axes[2].axvline(train[config.TARGET].mean(), color='red', linestyle='--', linewidth=2, label=f"Mean: {train[config.TARGET].mean():.3f}")
axes[2].set_title('Target Value Distribution', fontsize=14, fontweight='bold')
axes[2].set_xlabel('Loan Paid Back', fontsize=12)
axes[2].set_ylabel('Frequency', fontsize=12)
axes[2].legend(fontsize=10)
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.show()

<a id='eda'></a>
# 3. Exploratory Data Analysis (EDA)

## 3.1 Feature Type Identification

In [None]:
# Split the columns into numeric predictors and text (object) predictors
numerical_cols = list(train.select_dtypes(include=[np.number]).columns)
numerical_cols.remove('id')
if config.TARGET in numerical_cols:
    numerical_cols.remove(config.TARGET)

categorical_cols = list(train.select_dtypes(include=['object']).columns)

print("FEATURE TYPE SUMMARY")
print("=" * 80)
for kind, names in (("Numerical", numerical_cols), ("Categorical", categorical_cols)):
    print(f"\n {kind} features ({len(names)}):")
    for position, name in enumerate(names, start=1):
        print(f"   {position}. {name}")

print(f"\n Total predictive features: {len(numerical_cols) + len(categorical_cols)}")

## 3.2 Numerical Features Analysis

In [None]:
# describe() table for the numeric features, extended with gap and shape statistics
print("NUMERICAL FEATURES - STATISTICAL SUMMARY")
print("=" * 80)

numeric_frame = train[numerical_cols]
numerical_stats = numeric_frame.describe().T
numerical_stats['missing'] = numeric_frame.isnull().sum().to_numpy()
numerical_stats['skewness'] = numeric_frame.skew().to_numpy()
numerical_stats['kurtosis'] = numeric_frame.kurtosis().to_numpy()

highlighted = ['mean', 'std', 'skewness', 'kurtosis']
display(numerical_stats.style.background_gradient(cmap='coolwarm', subset=highlighted))

In [None]:
# Distribution of numerical features
# Size the grid from the number of features: a fixed 3x2 grid raises an
# IndexError with more than six features and leaves empty panels with fewer.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(16, 14 * n_grid_rows / 3))
axes = np.ravel(axes)

for idx, col in enumerate(numerical_cols):
    # Plot distribution with KDE
    axes[idx].hist(train[col], bins=50, alpha=0.6, color='steelblue', edgecolor='black', density=True, label='Histogram')

    # Add KDE
    train[col].plot(kind='kde', ax=axes[idx], color='red', linewidth=2, label='KDE')

    axes[idx].set_title(f'{col} Distribution', fontsize=13, fontweight='bold', pad=10)
    axes[idx].set_xlabel(col, fontsize=11)
    axes[idx].set_ylabel('Density', fontsize=11)
    axes[idx].grid(alpha=0.3, linestyle='--')

    # Mark the mean and median with vertical reference lines
    mean_val = train[col].mean()
    median_val = train[col].median()

    axes[idx].axvline(mean_val, color='green', linestyle='--', linewidth=2, alpha=0.7, label=f'Mean: {mean_val:.2f}')
    axes[idx].axvline(median_val, color='orange', linestyle='--', linewidth=2, alpha=0.7, label=f'Median: {median_val:.2f}')

    axes[idx].legend(fontsize=9, loc='upper right')

# Hide any panel that did not receive a feature
for spare_ax in axes[len(numerical_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Numerical Features Distribution Analysis', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric features and the target
print("CORRELATION ANALYSIS")
print("=" * 80)

correlation_matrix = train[numerical_cols + [config.TARGET]].corr()

# Hide the upper triangle (including the diagonal) so each pair appears once
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

plt.figure(figsize=(14, 11))
heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8, "label": "Correlation Coefficient"},
    annot_kws={"size": 9},
)
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Correlation Matrix - Numerical Features & Target', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Signed correlation of every feature with the target, largest first
target_corr = correlation_matrix[config.TARGET].drop(config.TARGET).sort_values(ascending=False)

print("\n Correlation with Target (sorted by absolute value):")
target_corr_abs = target_corr.abs().sort_values(ascending=False)
for feature in target_corr_abs.index:
    print(f"   {feature:30s}: {target_corr[feature]:7.4f} (abs: {target_corr_abs[feature]:.4f})")

In [None]:
# Horizontal bar chart of each feature's correlation with the target
target_corr_df = target_corr.sort_values()
bar_positions = range(len(target_corr_df))
# Red for negative correlations, teal otherwise
colors_list = ['#FF6B6B' if value < 0 else '#4ECDC4' for value in target_corr_df.values]

plt.figure(figsize=(12, 6))
plt.barh(bar_positions, target_corr_df.values, color=colors_list, edgecolor='black', linewidth=1.2)
plt.yticks(bar_positions, target_corr_df.index, fontsize=11)
plt.xlabel('Correlation Coefficient', fontsize=12, fontweight='bold')
plt.title('Feature Correlation with Target Variable', fontsize=14, fontweight='bold', pad=15)
plt.axvline(x=0, color='black', linestyle='-', linewidth=1)
plt.grid(alpha=0.3, axis='x')

# Write each coefficient next to the end of its bar
for position, value in zip(bar_positions, target_corr_df.values):
    plt.text(value, position, f' {value:.4f}', va='center', fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Box plots by target - showing distribution differences
# Size the grid from the number of features: a fixed 3x2 grid raises an
# IndexError with more than six features and leaves empty panels with fewer.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(16, 14 * n_grid_rows / 3))
axes = np.ravel(axes)

colors = ['#FF6B6B', '#4ECDC4']

# Split the rows by target once instead of re-filtering for every feature
not_paid_rows = train[train[config.TARGET] == 0]
paid_rows = train[train[config.TARGET] == 1]

for idx, col in enumerate(numerical_cols):
    not_paid = not_paid_rows[col]
    paid = paid_rows[col]

    # Create box plot
    bp = axes[idx].boxplot([not_paid.dropna(), paid.dropna()],
                           patch_artist=True,
                           showmeans=True,
                           meanline=True)
    # Label the boxes through the axis rather than boxplot's `labels=` keyword,
    # which newer matplotlib releases deprecate.
    axes[idx].set_xticklabels(['Not Paid (0)', 'Paid (1)'])

    # Color the boxes
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.6)

    axes[idx].set_title(f'{col} by Target', fontsize=13, fontweight='bold')
    axes[idx].set_ylabel(col, fontsize=11)
    axes[idx].grid(alpha=0.3, axis='y')

    # Add mean values as text (boxes sit at x positions 1 and 2)
    mean_0 = not_paid.mean()
    mean_1 = paid.mean()
    axes[idx].text(1, mean_0, f'μ={mean_0:.1f}', ha='center', va='bottom', fontsize=9, fontweight='bold')
    axes[idx].text(2, mean_1, f'μ={mean_1:.1f}', ha='center', va='bottom', fontsize=9, fontweight='bold')

# Hide any panel that did not receive a feature
for spare_ax in axes[len(numerical_cols):]:
    spare_ax.set_visible(False)

plt.suptitle('Numerical Features by Target - Box Plot Analysis', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## 3.3 Categorical Features Analysis

In [None]:
# Per-feature cardinality, mode and top-10 frequency table
print("CATEGORICAL FEATURES - DETAILED ANALYSIS")
print("=" * 80)

for col in categorical_cols:
    column = train[col]
    counts = column.value_counts()
    shares = column.value_counts(normalize=True) * 100

    print(f"Feature: {col.upper()}")
    print("=" * 80)
    print(f"Unique values: {column.nunique()}")
    print(f"Most common: {column.mode()[0]}")
    print("\nValue Counts:")

    value_counts_df = pd.DataFrame({
        'Value': counts.index,
        'Count': counts.values,
        'Percentage': shares.values,
    })
    top_values = value_counts_df.head(10)
    display(top_values.style.background_gradient(cmap='Blues', subset=['Count', 'Percentage']))

In [None]:
# One bar chart of category frequencies per categorical feature
fig, axes = plt.subplots(3, 2, figsize=(18, 14))
axes = axes.flatten()

for idx, col in enumerate(categorical_cols):
    ax = axes[idx]
    value_counts = train[col].value_counts()
    tick_positions = range(len(value_counts))

    bars = ax.bar(tick_positions, value_counts.values,
                  color='steelblue', edgecolor='black', linewidth=1.2, alpha=0.8)

    ax.set_title(f'{col} Distribution', fontsize=13, fontweight='bold', pad=10)
    ax.set_xlabel(col, fontsize=11)
    ax.set_ylabel('Count', fontsize=11)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
    ax.grid(alpha=0.3, axis='y', linestyle='--')

    # Write the count above the centre of every bar
    for bar in bars:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width()/2.
        ax.text(bar_centre, bar_top, f'{int(bar_top):,}',
                ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.suptitle('Categorical Features Distribution', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

In [None]:
# Mean target (payback rate) per category, compared with the overall rate
fig, axes = plt.subplots(3, 2, figsize=(18, 14))
axes = axes.flatten()

overall_mean = train[config.TARGET].mean()

for idx, col in enumerate(categorical_cols):
    ax = axes[idx]
    grouped = train.groupby(col)[config.TARGET].agg(['mean', 'count'])
    target_rate = grouped.sort_values('mean', ascending=False)
    tick_positions = range(len(target_rate))

    bars = ax.bar(tick_positions, target_rate['mean'].values,
                  color='coral', edgecolor='black', linewidth=1.2, alpha=0.8)

    ax.set_title(f'Payback Rate by {col}', fontsize=13, fontweight='bold', pad=10)
    ax.set_xlabel(col, fontsize=11)
    ax.set_ylabel('Payback Rate', fontsize=11)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(target_rate.index, rotation=45, ha='right')
    ax.axhline(y=overall_mean, color='red', linestyle='--',
               linewidth=2.5, alpha=0.7, label=f'Overall: {overall_mean:.3f}')
    ax.legend(fontsize=10, loc='best')
    ax.grid(alpha=0.3, axis='y', linestyle='--')
    # Leave headroom above the tallest bar and above the overall-rate line
    y_top = max(target_rate['mean'].max() * 1.1, overall_mean * 1.2)
    ax.set_ylim([0, y_top])

    # Annotate every bar with its rate and the number of rows behind it
    for bar, count in zip(bars, target_rate['count'].values):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}\n(n={count:,})',
                ha='center', va='bottom', fontsize=8, fontweight='bold')

plt.suptitle('Target Rate Analysis by Categorical Features', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

## 3.4 Advanced Statistical Analysis

In [None]:
# Shape statistics (skewness, kurtosis, range) for every numeric feature
print("SKEWNESS AND KURTOSIS ANALYSIS")
print("=" * 80)


def _describe_skew(value):
    """Label a skewness value using the +/-1 thresholds from the guide below."""
    if value > 1:
        return 'Right-skewed'
    if value < -1:
        return 'Left-skewed'
    return 'Symmetric'


skew_kurt_df = pd.DataFrame({
    'Feature': numerical_cols,
    'Skewness': [skew(train[name]) for name in numerical_cols],
    'Kurtosis': [kurtosis(train[name]) for name in numerical_cols],
    'Min': [train[name].min() for name in numerical_cols],
    'Max': [train[name].max() for name in numerical_cols],
    'Range': [train[name].max() - train[name].min() for name in numerical_cols],
})

skew_kurt_df['Skew_Interpretation'] = skew_kurt_df['Skewness'].apply(_describe_skew)

# Most strongly skewed features (in either direction) first
skew_kurt_df = skew_kurt_df.sort_values('Skewness', key=abs, ascending=False)

display(skew_kurt_df.style.background_gradient(cmap='coolwarm', subset=['Skewness', 'Kurtosis']))

for guide_line in (
    "\n Interpretation Guide:",
    "    Skewness:",
    "      - Close to 0: Symmetric distribution (Normal-like)",
    "      - > 1: Highly right-skewed (long tail on right)",
    "      - < -1: Highly left-skewed (long tail on left)",
    "\n   Kurtosis:",
    "      - Close to 0: Normal-like tails",
    "      - > 0: Heavy tails (more outliers)",
    "      - < 0: Light tails (fewer outliers)",
):
    print(guide_line)

In [None]:
# Count values outside the 1.5 * IQR fences for every numeric feature
print("OUTLIER DETECTION (IQR METHOD)")
print("=" * 80)

outlier_summary = []

for col in numerical_cols:
    values = train[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    beyond_fences = (values < lower_bound) | (values > upper_bound)
    outliers = values[beyond_fences]
    outlier_count = len(outliers)
    outlier_pct = (outlier_count / len(train)) * 100

    outlier_summary.append(dict(
        Feature=col,
        Q1=Q1,
        Q3=Q3,
        IQR=IQR,
        Lower_Bound=lower_bound,
        Upper_Bound=upper_bound,
        Outlier_Count=outlier_count,
        Outlier_Percentage=outlier_pct,
    ))

outlier_df = pd.DataFrame(outlier_summary).sort_values('Outlier_Percentage', ascending=False)
display(outlier_df.style.background_gradient(cmap='Reds', subset=['Outlier_Count', 'Outlier_Percentage']))

n_features_with_outliers = (outlier_df['Outlier_Count'] > 0).sum()
mean_outlier_pct = outlier_df['Outlier_Percentage'].mean()
print("\n Outlier Summary:")
print(f"   - Total features with outliers: {n_features_with_outliers}")
print(f"   - Average outlier percentage: {mean_outlier_pct:.2f}%")

<a id='feature-engineering'></a>
# 4. Feature Engineering

## 4.1 Advanced Feature Engineering Function

In [None]:
def advanced_feature_engineering(df, is_train=True):
    """
    Build the first four groups of engineered loan features.

    Works on a copy of ``df`` and appends, in this order: financial ratios,
    credit-score features, interest-rate features and composite risk scores.
    A progress line is printed for each group.

    Parameters:
    -----------
    df : pd.DataFrame
        Input frame; must contain ``loan_amount``, ``annual_income``,
        ``interest_rate``, ``debt_to_income_ratio`` and ``credit_score``.
    is_train : bool
        Accepted but not used by the current body.

    Returns:
    --------
    pd.DataFrame
        Copy of ``df`` with the engineered columns added.
    """
    out = df.copy()

    print("FEATURE ENGINEERING PIPELINE")
    print("=" * 80)
    print(f"Starting features: {out.shape[1]}")

    # Raw inputs reused throughout; none of these columns is overwritten below
    loan = out['loan_amount']
    income = out['annual_income']
    rate = out['interest_rate']
    dti = out['debt_to_income_ratio']
    score = out['credit_score']
    # The +1 keeps the shared denominator non-zero when income is 0
    income_plus_one = income + 1

    # ---------------- 1. Financial ratios ----------------
    print("\n[1/11]  Creating financial ratio features...")

    out['loan_to_income_ratio'] = loan / income_plus_one
    out['monthly_income'] = income / 12
    out['monthly_payment_estimate'] = (loan * rate) / 1200
    out['payment_to_income_ratio'] = out['monthly_payment_estimate'] / (out['monthly_income'] + 1)

    # Existing debt and the position after taking the new loan
    out['current_debt_amount'] = dti * income
    out['total_debt_with_loan'] = out['current_debt_amount'] + loan
    out['new_debt_to_income'] = out['total_debt_with_loan'] / income_plus_one
    out['debt_increase_ratio'] = out['new_debt_to_income'] / (dti + 0.01)
    out['debt_increase_amount'] = loan

    # Income left after existing debt
    out['disposable_income'] = income - out['current_debt_amount']
    out['disposable_income_ratio'] = out['disposable_income'] / income_plus_one
    out['loan_to_disposable_income'] = loan / (out['disposable_income'] + 1)
    out['monthly_disposable_income'] = out['disposable_income'] / 12

    # Estimated payment relative to what is available
    out['payment_to_disposable_ratio'] = (
        out['monthly_payment_estimate'] / (out['monthly_disposable_income'] + 1)
    )
    out['annual_payment_burden'] = out['monthly_payment_estimate'] * 12
    out['payment_burden_ratio'] = out['annual_payment_burden'] / income_plus_one

    print("✓ Created 16 financial ratio features")

    # ---------------- 2. Credit score ----------------
    print("\n[2/11]  Creating credit score features...")

    # Rescale by 850 and derive simple transforms
    out['credit_score_normalized'] = score / 850
    out['credit_risk_score'] = 1 - out['credit_score_normalized']
    out['credit_score_squared'] = score ** 2
    out['credit_score_log'] = np.log1p(score)

    # Named bands over fixed score edges
    band_edges = [0, 580, 670, 740, 800, 850]
    band_names = ['poor', 'fair', 'good', 'very_good', 'excellent']
    out['credit_category'] = pd.cut(score, bins=band_edges, labels=band_names)

    # Ten equal-width bins over the observed score range
    out['credit_bin'] = pd.cut(score, bins=10, labels=False)

    # Products of the score with other raw inputs
    out['credit_income_interaction'] = score * income
    out['credit_times_dti'] = score * dti
    out['credit_loan_interaction'] = score * loan

    print("✓ Created 9 credit score features")

    # ---------------- 3. Interest rate ----------------
    print("\n[3/11]  Creating interest rate features...")

    # Flags relative to this frame's own median and quartiles
    out['high_interest_flag'] = (rate > rate.median()).astype(int)
    out['very_high_interest'] = (rate > rate.quantile(0.75)).astype(int)
    out['low_interest_flag'] = (rate < rate.quantile(0.25)).astype(int)

    # Interest cost (rate applied once to the loan amount)
    out['total_interest_cost'] = loan * rate / 100
    out['interest_burden'] = out['total_interest_cost'] / income_plus_one
    out['monthly_interest_cost'] = out['total_interest_cost'] / 12

    # Rate relative to the credit score
    out['interest_credit_mismatch'] = rate * (1 - out['credit_score_normalized'])
    out['interest_credit_ratio'] = rate / (score / 100)

    # Non-linear transforms of the rate
    out['interest_rate_squared'] = rate ** 2
    out['interest_rate_log'] = np.log1p(rate)

    print("✓ Created 10 interest rate features")

    # ---------------- 4. Composite risk scores ----------------
    print("\n[4/11]   Creating composite risk scores...")

    # Fixed-weight blends of the ratios built above
    out['risk_score_v1'] = (
        dti * 0.25
        + out['loan_to_income_ratio'] * 0.25
        + out['credit_risk_score'] * 0.30
        + (rate / 100) * 0.20
    )

    out['risk_score_v2'] = (
        out['payment_to_income_ratio'] * 0.40
        + out['new_debt_to_income'] * 0.35
        + out['interest_burden'] * 0.25
    )

    out['risk_score_v3'] = (
        dti * 0.30
        + out['payment_burden_ratio'] * 0.30
        + out['credit_risk_score'] * 0.40
    )

    # Affordability: higher is better
    out['affordability_score'] = (
        out['credit_score_normalized'] * 0.40
        + (1 - dti) * 0.30
        + out['disposable_income_ratio'] * 0.30
    )

    # Affordability net of the first risk blend
    out['financial_health_score'] = (
        out['affordability_score'] * 0.60
        - out['risk_score_v1'] * 0.40
    )

    print("✓ Created 5 composite risk scores")

    # Only steps 1-4 of the announced 11 are implemented in this function
    return out

<a id='feature-engineering'></a>
# 5. Complete Feature Engineering

In [None]:
def complete_feature_engineering(df, reference=None):
    """
    Comprehensive feature engineering pipeline for loan prediction.

    Returns a copy of ``df`` extended with eleven groups of engineered
    columns (financial ratios, credit/interest features, risk scores, loan
    transforms, deciles, interactions, grade features, row-wise statistics,
    categorical combinations and anomaly flags). Progress is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to transform. Must contain ``loan_amount``, ``annual_income``,
        ``interest_rate``, ``debt_to_income_ratio``, ``credit_score``,
        ``grade_subgrade``, ``gender``, ``marital_status``,
        ``education_level``, ``employment_status`` and ``loan_purpose``.
    reference : pd.DataFrame, optional
        Frame whose raw columns supply every data-dependent cut point
        (median/quantile flags and decile edges). Defaults to ``df`` itself.
        Pass the training frame when transforming test data so both frames
        are cut at identical thresholds.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the engineered columns added.
    """
    df = df.copy()
    ref = df if reference is None else reference
    # Remember the starting width so the final report is right for any input
    # (train and test do not have the same number of columns).
    n_start = df.shape[1]

    def _decile(col):
        """Return the 0-based decile index of df[col] using edges from ref[col]."""
        if reference is None:
            return pd.qcut(df[col], q=10, labels=False, duplicates='drop')
        _, edges = pd.qcut(ref[col], q=10, labels=False, retbins=True, duplicates='drop')
        edges = np.asarray(edges, dtype=float).copy()
        # Open the outer edges so values outside the reference range still
        # land in the first/last bin instead of becoming NaN.
        edges[0], edges[-1] = -np.inf, np.inf
        return pd.cut(df[col], bins=edges, labels=False, include_lowest=True)

    print(" FEATURE ENGINEERING PIPELINE")
    print("="*80)
    print(f"Starting features: {df.shape[1]}")

    # 1. FINANCIAL RATIOS
    print("\n[1/11]  Financial ratio features...")
    df['loan_to_income_ratio'] = df['loan_amount'] / (df['annual_income'] + 1)
    df['monthly_income'] = df['annual_income'] / 12
    df['monthly_payment_estimate'] = (df['loan_amount'] * df['interest_rate']) / 1200
    df['payment_to_income_ratio'] = df['monthly_payment_estimate'] / (df['monthly_income'] + 1)
    df['current_debt_amount'] = df['debt_to_income_ratio'] * df['annual_income']
    df['total_debt_with_loan'] = df['current_debt_amount'] + df['loan_amount']
    df['new_debt_to_income'] = df['total_debt_with_loan'] / (df['annual_income'] + 1)
    df['debt_increase_ratio'] = df['new_debt_to_income'] / (df['debt_to_income_ratio'] + 0.01)
    df['disposable_income'] = df['annual_income'] - df['current_debt_amount']
    df['disposable_income_ratio'] = df['disposable_income'] / (df['annual_income'] + 1)
    df['loan_to_disposable_income'] = df['loan_amount'] / (df['disposable_income'] + 1)
    df['monthly_disposable_income'] = df['disposable_income'] / 12
    df['payment_to_disposable_ratio'] = df['monthly_payment_estimate'] / (df['monthly_disposable_income'] + 1)
    df['annual_payment_burden'] = df['monthly_payment_estimate'] * 12
    df['payment_burden_ratio'] = df['annual_payment_burden'] / (df['annual_income'] + 1)
    print(f"✓ Created 15 features")

    # 2. CREDIT SCORE FEATURES
    print("[2/11]  Credit score features...")
    df['credit_score_normalized'] = df['credit_score'] / 850
    df['credit_risk_score'] = 1 - df['credit_score_normalized']
    df['credit_score_squared'] = df['credit_score'] ** 2
    df['credit_score_log'] = np.log1p(df['credit_score'])
    df['credit_category'] = pd.cut(df['credit_score'], bins=[0, 580, 670, 740, 800, 850],
                                     labels=['poor', 'fair', 'good', 'very_good', 'excellent'])
    df['credit_income_interaction'] = df['credit_score'] * df['annual_income']
    df['credit_times_dti'] = df['credit_score'] * df['debt_to_income_ratio']
    df['credit_loan_interaction'] = df['credit_score'] * df['loan_amount']
    print(f"✓ Created 8 features")

    # 3. INTEREST RATE FEATURES
    print("[3/11]  Interest rate features...")
    # Thresholds come from the reference frame (df itself by default)
    df['high_interest_flag'] = (df['interest_rate'] > ref['interest_rate'].median()).astype(int)
    df['very_high_interest'] = (df['interest_rate'] > ref['interest_rate'].quantile(0.75)).astype(int)
    df['low_interest_flag'] = (df['interest_rate'] < ref['interest_rate'].quantile(0.25)).astype(int)
    df['total_interest_cost'] = df['loan_amount'] * df['interest_rate'] / 100
    df['interest_burden'] = df['total_interest_cost'] / (df['annual_income'] + 1)
    df['interest_credit_mismatch'] = df['interest_rate'] * (1 - df['credit_score_normalized'])
    df['interest_credit_ratio'] = df['interest_rate'] / (df['credit_score'] / 100)
    df['interest_rate_squared'] = df['interest_rate'] ** 2
    print(f"✓ Created 8 features")

    # 4. RISK SCORES
    print("[4/11]   Composite risk scores...")
    df['risk_score_v1'] = (df['debt_to_income_ratio'] * 0.25 + df['loan_to_income_ratio'] * 0.25 +
                           df['credit_risk_score'] * 0.30 + (df['interest_rate'] / 100) * 0.20)
    df['risk_score_v2'] = (df['payment_to_income_ratio'] * 0.40 + df['new_debt_to_income'] * 0.35 +
                           df['interest_burden'] * 0.25)
    df['affordability_score'] = (df['credit_score_normalized'] * 0.40 +
                                 (1 - df['debt_to_income_ratio']) * 0.30 +
                                 df['disposable_income_ratio'] * 0.30)
    df['financial_health_score'] = df['affordability_score'] * 0.60 - df['risk_score_v1'] * 0.40
    print(f"   ✓ Created 4 features")

    # 5. LOAN AMOUNT FEATURES
    print("[5/11]  Loan amount features...")
    df['loan_size'] = pd.cut(df['loan_amount'], bins=[0, 10000, 20000, 30000, np.inf],
                              labels=['small', 'medium', 'large', 'very_large'])
    df['loan_amount_squared'] = df['loan_amount'] ** 2
    df['loan_amount_log'] = np.log1p(df['loan_amount'])
    df['annual_income_log'] = np.log1p(df['annual_income'])
    df['loan_amount_sqrt'] = np.sqrt(df['loan_amount'])
    print(f"✓ Created 5 features")

    # 6. BINNING FEATURES
    print("[6/11]  Binned features...")
    df['income_decile'] = _decile('annual_income')
    df['credit_decile'] = _decile('credit_score')
    df['loan_decile'] = _decile('loan_amount')
    df['dti_decile'] = _decile('debt_to_income_ratio')
    df['interest_decile'] = _decile('interest_rate')
    print(f"✓ Created 5 features")

    # 7. INTERACTION FEATURES
    # NOTE: income_x_credit repeats credit_income_interaction and income_x_dti
    # repeats current_debt_amount; both are kept so the feature set is unchanged.
    print("[7/11]  Interaction features...")
    df['income_x_credit'] = df['annual_income'] * df['credit_score']
    df['dti_x_interest'] = df['debt_to_income_ratio'] * df['interest_rate']
    df['loan_x_interest'] = df['loan_amount'] * df['interest_rate']
    df['income_x_dti'] = df['annual_income'] * df['debt_to_income_ratio']
    df['income_credit_loan'] = df['annual_income'] * df['credit_score'] * df['loan_amount']
    df['dti_interest_credit'] = df['debt_to_income_ratio'] * df['interest_rate'] * df['credit_score']
    print(f"✓ Created 6 features")

    # 8. GRADE FEATURES
    print("[8/11]  Grade/subgrade features...")
    # First character is the grade letter; the remainder is parsed as an integer
    df['grade'] = df['grade_subgrade'].str[0]
    df['subgrade_num'] = df['grade_subgrade'].str[1:].astype(int)
    grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    df['grade_numeric'] = df['grade'].map(grade_map)
    df['full_grade_score'] = df['grade_numeric'] * 10 + df['subgrade_num']
    df['grade_credit_ratio'] = df['full_grade_score'] / (df['credit_score'] / 100)
    print(f"✓ Created 5 features")

    # 9. STATISTICAL AGGREGATIONS
    print("[9/11]  Statistical aggregations...")
    burden_cols = ['debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio']
    df['mean_financial_metrics'] = df[burden_cols].mean(axis=1)
    df['max_financial_burden'] = df[burden_cols].max(axis=1)
    df['min_financial_burden'] = df[burden_cols].min(axis=1)
    df['std_financial_metrics'] = df[burden_cols].std(axis=1)
    print(f"✓ Created 4 features")

    # 10. CATEGORICAL COMBINATIONS
    print("[10/11]  Categorical combinations...")
    df['gender_marital'] = df['gender'] + '_' + df['marital_status']
    df['education_employment'] = df['education_level'] + '_' + df['employment_status']
    df['gender_education'] = df['gender'] + '_' + df['education_level']
    df['marital_employment'] = df['marital_status'] + '_' + df['employment_status']
    df['purpose_grade'] = df['loan_purpose'] + '_' + df['grade']
    df['employment_purpose'] = df['employment_status'] + '_' + df['loan_purpose']
    print(f"✓ Created 6 features")

    # 11. ANOMALY FLAGS
    print("[11/11]  Anomaly detection flags...")
    # Quantile thresholds come from the reference frame (df itself by default)
    df['extreme_dti'] = (df['debt_to_income_ratio'] > ref['debt_to_income_ratio'].quantile(0.90)).astype(int)
    df['low_income'] = (df['annual_income'] < ref['annual_income'].quantile(0.25)).astype(int)
    df['large_loan'] = (df['loan_amount'] > ref['loan_amount'].quantile(0.75)).astype(int)
    df['risky_combo_1'] = ((df['debt_to_income_ratio'] > 0.4) & (df['credit_score'] < 650)).astype(int)
    df['risky_combo_2'] = ((df['loan_to_income_ratio'] > 0.5) & (df['interest_rate'] > 15)).astype(int)
    df['safe_combo'] = ((df['credit_score'] > 750) & (df['debt_to_income_ratio'] < 0.3)).astype(int)
    df['high_risk_all'] = (df['extreme_dti'] & df['risky_combo_1']).astype(int)
    print(f"✓ Created 7 features")

    print("\n" + "="*80)
    print(f" Feature Engineering Complete!")
    print(f"   Final features: {df.shape[1]}")
    print(f"   New features: {df.shape[1] - n_start}")
    print("="*80)

    return df

# Apply feature engineering
# NOTE(review): each call derives its median/quantile thresholds and decile
# edges from the frame it is given, so the train and test features are cut at
# different points - confirm this is intended.
train_fe = complete_feature_engineering(train)
test_fe = complete_feature_engineering(test)

In [None]:
# Encode categorical features
print("ENCODING CATEGORICAL FEATURES")
print("="*80)

categorical_features = train_fe.select_dtypes(include=['object', 'category']).columns.tolist()

# One fitted encoder per column, kept so the integer codes can be mapped back
le_dict = {}
for col in categorical_features:
    train_labels = train_fe[col].astype(str)
    test_labels = test_fe[col].astype(str)

    le = LabelEncoder()
    # Fit on the labels of both frames: an encoder fitted on train alone makes
    # transform() raise ValueError for any label that only occurs in test
    # (e.g. a rare category combination). When test adds no new labels the
    # resulting codes are the same as a train-only fit.
    le.fit(pd.concat([train_labels, test_labels], ignore_index=True))
    train_fe[col] = le.transform(train_labels)
    test_fe[col] = le.transform(test_labels)
    le_dict[col] = le
    print(f"✓ {col}: {len(le.classes_)} classes")

print(f"\n Encoded {len(categorical_features)} features")

In [None]:
# Assemble the model matrices: every column except the id and the target
non_feature_cols = ('id', config.TARGET)
feature_cols = [name for name in train_fe.columns if name not in non_feature_cols]

X, y = train_fe[feature_cols], train_fe[config.TARGET]
X_test, test_ids = test_fe[feature_cols], test_fe['id']

print("FINAL DATA READY")
print("=" * 80)
for label, size in (("X", X.shape), ("y", y.shape), ("X_test", X_test.shape)):
    print(f"{label}: {size}")
print(f"Features: {len(feature_cols)}")

<a id='model-training'></a>
# 6. Model Training

## 6.1 LightGBM

In [None]:
def train_lightgbm(X, y, X_test, n_splits=5):
    """Fit LightGBM under stratified K-fold CV and collect predictions.

    Args:
        X: Training feature frame (rows are selected positionally with ``.iloc``).
        y: Target series aligned with ``X``.
        X_test: Test feature frame scored by every fold model.
        n_splits: Number of stratified folds.

    Returns:
        ``(oof_preds, test_preds, feature_importance, overall_score)`` — the
        out-of-fold predictions, the fold-averaged test predictions, a long
        frame of per-fold gain importances and the ROC AUC of the OOF vector.
    """
    lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'max_depth': -1,
        'min_child_samples': 20,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': SEED,
        'verbose': -1,
        'n_jobs': -1
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    feature_importance = pd.DataFrame()
    fold_scores = []
    rule = '=' * 80

    for fold_no, (fit_idx, hold_idx) in enumerate(cv.split(X, y), start=1):
        print(f"\n{rule}")
        print(f"Fold {fold_no}/{n_splits}")
        print(f"{rule}")

        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_hold, y_hold = X.iloc[hold_idx], y.iloc[hold_idx]

        dtrain = lgb.Dataset(X_fit, label=y_fit)
        dvalid = lgb.Dataset(X_hold, label=y_hold, reference=dtrain)

        booster = lgb.train(
            lgb_params,
            dtrain,
            num_boost_round=2000,
            valid_sets=[dtrain, dvalid],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)],
        )
        best_iter = booster.best_iteration

        # Score the held-out fold and add this fold's share of the test average.
        hold_pred = booster.predict(X_hold, num_iteration=best_iter)
        oof_preds[hold_idx] = hold_pred
        test_preds += booster.predict(X_test, num_iteration=best_iter) / n_splits

        fold_auc = roc_auc_score(y_hold, hold_pred)
        fold_scores.append(fold_auc)
        print(f"Fold {fold_no} AUC: {fold_auc:.6f}")

        # Gain importances are stacked per fold; callers aggregate them later.
        this_fold = pd.DataFrame({
            'feature': X.columns,
            'importance': booster.feature_importance(importance_type='gain'),
            'fold': fold_no
        })
        feature_importance = pd.concat([feature_importance, this_fold])

    overall_score = roc_auc_score(y, oof_preds)
    print(f"\n{rule}")
    print(f"LightGBM OOF AUC: {overall_score:.6f}")
    print(f"Mean: {np.mean(fold_scores):.6f} (+/- {np.std(fold_scores):.6f})")
    print(f"{rule}")

    return oof_preds, test_preds, feature_importance, overall_score

# Run the LightGBM CV; keeps OOF/test predictions, per-fold importances and the OOF AUC
# as notebook-level variables for the evaluation and ensemble cells.
print("\n Training LightGBM...")
lgb_oof, lgb_test, lgb_importance, lgb_score = train_lightgbm(X, y, X_test, n_splits=config.N_SPLITS)

## 6.2 XGBoost

In [None]:
def train_xgboost(X, y, X_test, n_splits=5):
    """Fit XGBoost under stratified K-fold CV and collect predictions.

    Args:
        X: Training feature frame (rows are selected positionally with ``.iloc``).
        y: Target series aligned with ``X``.
        X_test: Test feature frame scored by every fold model.
        n_splits: Number of stratified folds.

    Returns:
        ``(oof_preds, test_preds, overall_score)`` — out-of-fold predictions,
        fold-averaged test predictions and the ROC AUC of the OOF vector.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': 6,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'random_state': SEED,
        'tree_method': 'hist',
        'n_jobs': -1
    }

    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"\nFold {fold + 1}/{n_splits}")

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # early_stopping_rounds is an estimator parameter: passing it to fit()
        # is no longer accepted by XGBoost 2.x, which the rest of this notebook
        # already targets (device=..., constructor-level early stopping).
        model = xgb.XGBClassifier(**params, n_estimators=2000, early_stopping_rounds=100)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=200
        )

        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits

        score = roc_auc_score(y_val, oof_preds[val_idx])
        fold_scores.append(score)
        print(f"Fold {fold + 1} AUC: {score:.6f}")

    overall_score = roc_auc_score(y, oof_preds)
    print(f"\nXGBoost OOF AUC: {overall_score:.6f}")
    print(f"Mean: {np.mean(fold_scores):.6f} (+/- {np.std(fold_scores):.6f})")

    return oof_preds, test_preds, overall_score

# Run the XGBoost CV; keeps OOF/test predictions and the OOF AUC as notebook-level variables.
print("\n Training XGBoost...")
xgb_oof, xgb_test, xgb_score = train_xgboost(X, y, X_test, n_splits=config.N_SPLITS)

## 6.3 CatBoost

In [None]:
def train_catboost(X, y, X_test, n_splits=5):
    """Fit CatBoost under stratified K-fold CV and collect predictions.

    Args:
        X: Training feature frame (rows are selected positionally with ``.iloc``).
        y: Target series aligned with ``X``.
        X_test: Test feature frame scored by every fold model.
        n_splits: Number of stratified folds.

    Returns:
        ``(oof_preds, test_preds, overall_score)`` — out-of-fold predictions,
        fold-averaged test predictions and the ROC AUC of the OOF vector.
    """
    cat_params = {
        'iterations': 2000,
        'learning_rate': 0.05,
        'depth': 6,
        'l2_leaf_reg': 3,
        'random_seed': SEED,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'early_stopping_rounds': 100,
        'verbose': 200,
        'task_type': 'CPU'
    }

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))
    fold_scores = []

    for fold_no, (fit_idx, hold_idx) in enumerate(cv.split(X, y), start=1):
        print(f"\nFold {fold_no}/{n_splits}")

        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_hold, y_hold = X.iloc[hold_idx], y.iloc[hold_idx]

        clf = CatBoostClassifier(**cat_params)
        clf.fit(X_fit, y_fit, eval_set=(X_hold, y_hold), use_best_model=True)

        # Positive-class probability for the held-out fold and the test set.
        hold_pred = clf.predict_proba(X_hold)[:, 1]
        oof_preds[hold_idx] = hold_pred
        test_preds += clf.predict_proba(X_test)[:, 1] / n_splits

        fold_auc = roc_auc_score(y_hold, hold_pred)
        fold_scores.append(fold_auc)
        print(f"Fold {fold_no} AUC: {fold_auc:.6f}")

    overall_score = roc_auc_score(y, oof_preds)
    print(f"\nCatBoost OOF AUC: {overall_score:.6f}")
    print(f"Mean: {np.mean(fold_scores):.6f} (+/- {np.std(fold_scores):.6f})")

    return oof_preds, test_preds, overall_score

# Run the CatBoost CV; keeps OOF/test predictions and the OOF AUC as notebook-level variables.
print("\n Training CatBoost...")
cat_oof, cat_test, cat_score = train_catboost(X, y, X_test, n_splits=config.N_SPLITS)

<a id='evaluation'></a>
# 7 Model Evaluation

In [None]:
# Compare models
# Table of OOF AUC per base model, best first.
model_scores = {'LightGBM': lgb_score, 'XGBoost': xgb_score, 'CatBoost': cat_score}
comparison = pd.DataFrame({
    'Model': list(model_scores),
    'OOF AUC': list(model_scores.values())
}).sort_values('OOF AUC', ascending=False)

print("\n" + "="*80)
print("MODEL COMPARISON")
print("="*80)
display(comparison.style.background_gradient(cmap='Greens'))

# Visualize: bar chart of the scores (left) and OOF ROC curves (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, roc_ax = axes

# Bar chart — colours follow the sorted order (first, second, third)
bar_ax.bar(comparison['Model'], comparison['OOF AUC'],
           color=['#FFD700', '#C0C0C0', '#CD7F32'], edgecolor='black', linewidth=2)
bar_ax.set_title('Model Performance', fontsize=14, fontweight='bold')
bar_ax.set_ylabel('OOF AUC', fontsize=12)
bar_ax.grid(alpha=0.3, axis='y')
for pos, auc in enumerate(comparison['OOF AUC']):
    bar_ax.text(pos, auc, f'{auc:.6f}', ha='center', va='bottom', fontweight='bold')

# ROC curves
oof_by_model = {'LightGBM': lgb_oof, 'XGBoost': xgb_oof, 'CatBoost': cat_oof}
for name, oof in oof_by_model.items():
    fpr, tpr, _ = roc_curve(y, oof)
    curve_label = f'{name} (AUC={roc_auc_score(y, oof):.4f})'
    roc_ax.plot(fpr, tpr, linewidth=2, label=curve_label)

roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random')
roc_ax.set_xlabel('FPR', fontsize=12)
roc_ax.set_ylabel('TPR', fontsize=12)
roc_ax.set_title('ROC Curves', fontsize=14, fontweight='bold')
roc_ax.legend(loc='lower right')
roc_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance from LightGBM
# Average the per-fold gain importances and rank features from most to least important.
importance_df = (
    lgb_importance
    .groupby('feature')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

top_n = 30
top_features = importance_df.head(top_n)

plt.figure(figsize=(12, 10))
sns.barplot(data=top_features, y='feature', x='importance', palette='viridis')
plt.title(f'Top {top_n} Important Features', fontsize=16, fontweight='bold')
plt.xlabel('Importance', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.tight_layout()
plt.show()

print("\nTop 15 Features:")
display(importance_df.head(15))

<a id='ensemble'></a>
# 8 Ensemble Methods

In [None]:
print("CREATING ENSEMBLE")
print("="*80)

# Simple average: equal weight for the three base models
simple_oof = (lgb_oof + xgb_oof + cat_oof) / 3
simple_test = (lgb_test + xgb_test + cat_test) / 3
simple_score = roc_auc_score(y, simple_oof)

# Weighted average: each model's weight is its share of the summed OOF AUCs
total = lgb_score + xgb_score + cat_score
w_lgb, w_xgb, w_cat = lgb_score / total, xgb_score / total, cat_score / total

weighted_oof = lgb_oof * w_lgb + xgb_oof * w_xgb + cat_oof * w_cat
weighted_test = lgb_test * w_lgb + xgb_test * w_xgb + cat_test * w_cat
weighted_score = roc_auc_score(y, weighted_oof)

# Rank average: predictions are replaced by their ranks, then averaged and scaled by 3 * n
rank_oof = (rankdata(lgb_oof) + rankdata(xgb_oof) + rankdata(cat_oof)) / (3 * len(y))
rank_test = (rankdata(lgb_test) + rankdata(xgb_test) + rankdata(cat_test)) / (3 * len(lgb_test))
rank_score = roc_auc_score(y, rank_oof)

ensemble_results = pd.DataFrame({
    'Ensemble': ['Simple Average', 'Weighted Average', 'Rank Average'],
    'OOF AUC': [simple_score, weighted_score, rank_score]
}).sort_values('OOF AUC', ascending=False)

print("\nEnsemble Results:")
display(ensemble_results.style.background_gradient(cmap='Greens'))

print(f"\nWeights: LGB={w_lgb:.3f}, XGB={w_xgb:.3f}, CAT={w_cat:.3f}")

# Choose best: the blend with the highest OOF AUC provides the final test predictions
best_idx = ensemble_results['OOF AUC'].idxmax()
best_name = ensemble_results.loc[best_idx, 'Ensemble']
best_score = ensemble_results.loc[best_idx, 'OOF AUC']

# Any name other than the two listed falls through to the rank blend.
named_test_preds = {'Simple Average': simple_test, 'Weighted Average': weighted_test}
final_preds = named_test_preds.get(best_name, rank_test)

print(f"\n Best: {best_name} (AUC: {best_score:.6f})")

<a id='submission'></a>
# 9 Submission Generation

In [None]:
# # Create submission
# submission = pd.DataFrame({
#     'id': test_ids,
#     config.TARGET: final_preds
# })

# submission.to_csv('submission.csv', index=False)

# print("SUBMISSION CREATED")
# print("="*80)
# print(f"File: submission.csv")
# print(f"Shape: {submission.shape}")
# print(f"\nPreview:")
# display(submission.head(10))

# print(f"\nStatistics:")
# print(submission[config.TARGET].describe())

# # Visualize
# fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# axes[0].hist(submission[config.TARGET], bins=50, color='steelblue', edgecolor='black', alpha=0.7)
# axes[0].axvline(submission[config.TARGET].mean(), color='red', linestyle='--', linewidth=2,
#                 label=f'Mean: {submission[config.TARGET].mean():.4f}')
# axes[0].set_title('Prediction Distribution', fontsize=14, fontweight='bold')
# axes[0].set_xlabel('Probability', fontsize=12)
# axes[0].set_ylabel('Frequency', fontsize=12)
# axes[0].legend()
# axes[0].grid(alpha=0.3)

# axes[1].boxplot(submission[config.TARGET], vert=True, patch_artist=True,
#                 boxprops=dict(facecolor='lightblue', alpha=0.7))
# axes[1].set_title('Box Plot', fontsize=14, fontweight='bold')
# axes[1].set_ylabel('Probability', fontsize=12)
# axes[1].grid(alpha=0.3, axis='y')

# plt.tight_layout()
# plt.show()

# print("\n READY FOR SUBMISSION!")

<a id='conclusion'></a>
# 10 Conclusion & Final Summary

In [None]:
# print(" FINAL SUMMARY")

# print("\n MODEL SCORES:")
# print(f"LightGBM:          {lgb_score:.6f}")
# print(f"XGBoost:           {xgb_score:.6f}")
# print(f"CatBoost:          {cat_score:.6f}")
# print(f"Simple Ensemble:   {simple_score:.6f}")
# print(f"Weighted Ensemble: {weighted_score:.6f}")
# print(f"Rank Ensemble:     {rank_score:.6f}")

# print(f"\n BEST MODEL: {best_name}")
# print(f" BEST AUC: {best_score:.6f}")

# print("\n KEY INSIGHTS:")
# print(f"\n1. Feature Engineering:")
# print(f"• Created {len(feature_cols)} features from 13 original")
# print(f"• Financial ratios were most important")
# print(f"• Risk scores captured complex patterns")

# print(f"\n2. Model Performance:")
# best_single = max(lgb_score, xgb_score, cat_score)
# improvement = ((best_score - best_single) / best_single) * 100
# print(f"• Best single: {best_single:.6f}")
# print(f"• Ensemble gain: +{improvement:.4f}%")
# print(f"• 5-fold CV for robustness")

# print(f"\n3. Top 5 Features:")
# for i, feat in enumerate(importance_df.head(5)['feature'], 1):
#     print(f"   {i}. {feat}")

# print("\n4. Improvements for Future:")
# print("• Hyperparameter optimization")
# print("• Add original dataset")
# print("• Stacking ensemble")
# print("• Neural network models")
# print("• Feature selection")

# # print("\n" + "="*80)
# # print(" COMPLETE - READY FOR KAGGLE SUBMISSION!")
# # print("="*80)
# # print(f"\n File: submission.csv")
# # print(f" Rows: {len(submission):,}")
# # print(f" Expected LB: ~{best_score:.4f}")
# # print(f"\n Good luck, {test.shape[0]:,} predictions ready!")

### Connect with Me  

Feel free to follow me on these platforms:  

[![GitHub](https://img.shields.io/badge/GitHub-181717?style=for-the-badge&logo=github&logoColor=white)](https://github.com/AdilShamim8)  
[![LinkedIn](https://img.shields.io/badge/LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/adilshamim8)  
[![Twitter](https://img.shields.io/badge/Twitter-1DA1F2?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/adil_shamim8)  

Reference: @yeoyunsianggeremie https://www.kaggle.com/code/yeoyunsianggeremie/ps5e11-agentic-ai-solution-single-xgb/notebook

In [None]:
import os
import sys
import time
import json
import logging
from pathlib import Path
import shutil

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import QuantileTransformer, PowerTransformer, KBinsDiscretizer

import xgboost as xgb
import optuna  # tuning used only in FULL mode

# -------------------------
# Setup logging early
# -------------------------
# Input directory of the competition data and output locations for this run.
BASE_DIR = Path("/kaggle/input/playground-series-s5e11")
OUTPUT_DIR = Path(".")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = OUTPUT_DIR / "code_8_1_v4.txt"
SUBMISSION_PATH = OUTPUT_DIR / "submission_4.csv"

# Root logger writes to both the log file (overwritten on each run) and stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
# print() does not interpolate logging-style "%s" arguments (it would emit the
# literal placeholder followed by the value), so the message is formatted here.
print(f"Log file initialized at {LOG_FILE}")

HF_TOKEN = os.environ.get("HF_TOKEN", "")
# Only report presence; the token value itself is never printed.
print(f"HF_TOKEN present: {'yes' if HF_TOKEN else 'no'}")

# -------------------------
# Device selection for XGBoost (purpose: choose CUDA if available)
# -------------------------
def detect_cuda_available() -> bool:
    """Return True when ``nvidia-smi -L`` succeeds and lists at least one device.

    The executable is located on PATH and invoked without a shell. A missing
    executable, a launch failure, a timeout, a non-zero exit status or empty
    output all yield False.
    """
    # Local import so this function does not depend on the notebook's import cell.
    import subprocess

    exe = shutil.which("nvidia-smi")
    if exe is None:
        return False
    try:
        result = subprocess.run(
            [exe, "-L"],
            capture_output=True,
            text=True,
            timeout=30,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return False
    # A failing nvidia-smi can still write a diagnostic to stdout; require a
    # clean exit so that message is not mistaken for a device listing.
    return result.returncode == 0 and bool(result.stdout.strip())

# Pick the XGBoost device once, at notebook level, from the CUDA probe.
CUDA_AVAILABLE = detect_cuda_available()
XGB_DEVICE = "cuda:0" if CUDA_AVAILABLE else "cpu"
if CUDA_AVAILABLE:
    # Formatted eagerly: print() would otherwise emit the literal "%s" placeholder.
    print(f"CUDA detected. Using device='{XGB_DEVICE}' with tree_method='hist'.")
else:
    print("CUDA not detected. Using CPU (device='cpu') with tree_method='hist'.")

# -------------------------
# Competition schema
# -------------------------
# Input CSV locations, all relative to BASE_DIR.
TRAIN_PATH = BASE_DIR / "train.csv"
TEST_PATH = BASE_DIR / "test.csv"
SAMPLE_SUB_PATH = BASE_DIR / "sample_submission.csv"

TARGET_COL = "loan_paid_back"   # binary classification; metric: ROC AUC
ID_COL = "id"
FOLD_COL = "fold"
# Columns that must never be treated as model features or encoder inputs.
META_COLS = {TARGET_COL, ID_COL, FOLD_COL}

# Optional numeric columns for special transforms (if present)
C_INCOME = "annual_income"
C_DTI = "debt_to_income_ratio"

# -------------------------
# Load data (purpose: read CSVs; inputs: train/test paths)
# -------------------------
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)
# Formatted eagerly: print() does not interpolate logging-style "%s" arguments.
print(f"Loaded data. Train shape: {train.shape} | Test shape: {test.shape}")

# Schema checks use explicit raises rather than assert, which is stripped when
# Python runs with -O.
if TARGET_COL not in train.columns:
    raise ValueError(f"Missing target column '{TARGET_COL}' in train.csv")
if ID_COL not in train.columns or ID_COL not in test.columns:
    raise ValueError("Missing id column in train/test.")

# -------------------------
# Typing helpers and encoders (purpose: reusable feature builders; inputs: DataFrames/Series)
# -------------------------
def get_cat_num_cols(df: pd.DataFrame, target_col: str, id_col: str, exclude: set):
    """Split the non-excluded columns of ``df`` into categorical and numeric lists.

    A column counts as categorical when its dtype is ``object`` or its dtype
    name starts with ``category``; every other non-excluded column is numeric.
    ``target_col`` and ``id_col`` are accepted but not consulted — only
    ``exclude`` filters columns.

    Returns:
        ``(cat_cols, num_cols)``, each in the frame's column order.
    """
    cat_cols, num_cols = [], []
    for name in df.columns:
        if name in exclude:
            continue
        dtype = df[name].dtype
        is_categorical = dtype == "object" or str(dtype).startswith("category")
        bucket = cat_cols if is_categorical else num_cols
        bucket.append(name)
    return cat_cols, num_cols

def pick_top_cats(cat_cols, df, k=6, exclude: set = None):
    """Choose up to ``k`` categorical columns, highest cardinality first.

    Columns with between 2 and 200 distinct non-null values are ranked by
    descending cardinality (ties broken by name). If that yields fewer than
    ``min(k, len(cat_cols))`` columns, the remaining non-excluded columns are
    appended in their original order.
    """
    skipped = exclude or set()

    cardinalities = [
        (col, df[col].nunique(dropna=True)) for col in cat_cols if col not in skipped
    ]
    ranked = sorted(
        (pair for pair in cardinalities if 2 <= pair[1] <= 200),
        key=lambda pair: (-pair[1], pair[0]),
    )
    chosen = [col for col, _ in ranked[:k]]

    if len(chosen) < min(k, len(cat_cols)):
        # Top up with columns that failed the cardinality filter.
        leftovers = [col for col in cat_cols if col not in chosen and col not in skipped]
        chosen.extend(leftovers[: k - len(chosen)])
    return chosen[:k]

def pick_top_nums(num_cols, df, k=5, exclude: set = None):
    """Choose up to ``k`` numeric columns with the largest variance.

    Excluded columns, columns whose dtype kind is not one of ``biufc`` and
    columns with at most two distinct non-null values are ignored. A variance
    that comes back as NaN is treated as 0.0.
    """
    skipped = exclude or set()
    scored = []
    for name in num_cols:
        if name in skipped:
            continue
        column = df[name]
        if column.dtype.kind not in "biufc" or column.nunique(dropna=True) <= 2:
            continue
        variance = column.var(skipna=True)
        scored.append((name, float(variance) if not pd.isna(variance) else 0.0))

    ranked = sorted(scored, key=lambda pair: -pair[1])
    return [name for name, _ in ranked[:k]]

def add_missing_indicators(df: pd.DataFrame, exclude_cols):
    """Append an int8 ``<col>__isna`` flag for each non-excluded column of ``df``.

    Only the columns present when the function is called are considered, and an
    indicator that already exists is left as is. ``df`` is modified in place
    and also returned.
    """
    for source in list(df.columns):
        if source in exclude_cols:
            continue
        flag = f"{source}__isna"
        if flag in df.columns:
            continue
        df[flag] = df[source].isna().astype(np.int8)
    return df

def frequency_encode(train_pool: pd.DataFrame, series: pd.Series):
    """Return a ``{value: count}`` dict for the column of ``train_pool`` named like ``series``.

    Only ``series.name`` is used to select the column; missing values are
    counted as their own key.
    """
    column = train_pool[series.name]
    return column.value_counts(dropna=False).to_dict()

def compute_te_map(x: pd.Series, y: pd.Series, m: float = 10.0):
    """Build a smoothed target-encoding map for the levels of ``x``.

    Each level maps to ``(level_mean * level_count + global_mean * m) / (level_count + m)``,
    i.e. the level mean shrunk towards the global mean of ``y`` with strength ``m``.

    Returns:
        ``(mapping, global_mean)`` where ``mapping`` is ``{level: encoded value}``.
    """
    prior = float(y.mean())
    per_level = pd.DataFrame({"x": x, "y": y}).groupby("x")["y"].agg(["mean", "count"])
    level_mean = per_level["mean"]
    level_count = per_level["count"]
    shrunk = (level_mean * level_count + prior * m) / (level_count + m)
    return shrunk.to_dict(), prior

def oof_target_encode(train_pool_df, y, col, folds, m=10.0):
    """Out-of-fold target encoding of ``col`` plus the map fitted on the whole pool.

    Args:
        train_pool_df: Frame holding ``col``; its index labels are used by ``folds``.
        y: Target series sharing the frame's index.
        col: Column to encode.
        folds: Mapping ``fold_id -> (fit_index, holdout_index)`` of index labels.
        m: Smoothing strength forwarded to ``compute_te_map``.

    Returns:
        ``(oof, full_map, full_gmean)`` — a float32 series where each holdout row
        is encoded by a map fitted without it (unseen levels get that fit's
        global mean), and the map and global mean fitted on all rows.
    """
    oof = pd.Series(index=train_pool_df.index, dtype="float32")
    for fit_idx, hold_idx in folds.values():
        level_map, prior = compute_te_map(train_pool_df.loc[fit_idx, col], y.loc[fit_idx], m)
        encoded = train_pool_df.loc[hold_idx, col].map(level_map).fillna(prior)
        oof.loc[hold_idx] = encoded.astype("float32")

    full_map, full_gmean = compute_te_map(train_pool_df[col], y, m)
    return oof, full_map, full_gmean

def compute_woe_map(x: pd.Series, y: pd.Series, eps: float = 0.5):
    """Compute a smoothed weight-of-evidence map and information value for ``x``.

    Per level, positives are the sum of ``y`` and negatives are the count minus
    that sum. Both class distributions are smoothed by adding ``eps`` to every
    level before taking the log-ratio.

    Returns:
        ``(mapping, iv)`` — ``{level: WOE}`` and the information value as a float.
    """
    grouped = pd.DataFrame({"x": x, "y": y}).groupby("x")["y"]
    positives = grouped.sum(min_count=1)
    negatives = grouped.count() - positives

    pos_total = float(positives.sum())
    neg_total = float(negatives.sum())
    pos_share = (positives + eps) / (pos_total + eps * len(positives))
    neg_share = (negatives + eps) / (neg_total + eps * len(negatives))

    woe = np.log(pos_share / neg_share)
    information_value = ((pos_share - neg_share) * woe).sum()
    return woe.to_dict(), float(information_value)

def oof_woe_encode(train_pool_df, y, col, folds, eps=0.5):
    """Out-of-fold weight-of-evidence encoding of ``col`` with values clipped to [-3, 3].

    Args:
        train_pool_df: Frame holding ``col``; its index labels are used by ``folds``.
        y: Target series sharing the frame's index.
        col: Column to encode.
        folds: Mapping ``fold_id -> (fit_index, holdout_index)`` of index labels.
        eps: Smoothing constant forwarded to ``compute_woe_map``.

    Returns:
        ``(oof, full_map, iv_full)`` — a float32 series where each holdout row is
        encoded by a map fitted without it (unseen levels get 0.0), the clipped
        map fitted on all rows, and the information value of that full fit.
    """
    def clip_map(raw_map):
        # Clip WOE values to stabilize
        return {level: float(np.clip(value, -3.0, 3.0)) for level, value in raw_map.items()}

    oof = pd.Series(index=train_pool_df.index, dtype="float32")
    for fit_idx, hold_idx in folds.values():
        fold_map, _ = compute_woe_map(train_pool_df.loc[fit_idx, col], y.loc[fit_idx], eps)
        fold_map = clip_map(fold_map)
        encoded = train_pool_df.loc[hold_idx, col].map(fold_map).fillna(0.0)
        oof.loc[hold_idx] = encoded.astype("float32")

    raw_full_map, iv_full = compute_woe_map(train_pool_df[col], y, eps)
    return oof, clip_map(raw_full_map), iv_full

def fit_kbins(train_pool_series, n_bins=10):
    """Fit an ordinal quantile ``KBinsDiscretizer`` on a median-imputed series.

    Returns:
        ``(encoder, median)`` — the fitted discretizer and the NaN-ignoring
        median used for imputation, to be reused at transform time.
    """
    fill_value = float(np.nanmedian(train_pool_series.values))
    column = train_pool_series.fillna(fill_value).values.reshape(-1, 1)
    binner = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="quantile")
    binner.fit(column)
    return binner, fill_value

def transform_kbins(enc, med, series):
    """Apply a fitted discretizer to ``series`` after filling NaNs with ``med``.

    Non-finite outputs of the encoder are replaced by -1.0. The result is a
    float32 series carrying the input's index.
    """
    column = series.fillna(med).values.reshape(-1, 1)
    codes = enc.transform(column).astype("float32").reshape(-1)
    cleaned = np.where(np.isfinite(codes), codes, -1.0)
    return pd.Series(cleaned, index=series.index, dtype="float32")

def fit_rank_gaussian(train_pool_series, random_state=2025):
    """Fit a normal-output ``QuantileTransformer`` on a median-imputed series.

    The number of quantiles is capped at 1000 or the number of rows, whichever
    is smaller.

    Returns:
        ``(transformer, median)`` — the fitted transformer and the
        NaN-ignoring median used for imputation.
    """
    fill_value = float(np.nanmedian(train_pool_series.values))
    column = train_pool_series.fillna(fill_value).values.reshape(-1, 1)
    transformer = QuantileTransformer(
        n_quantiles=min(1000, len(column)),
        output_distribution="normal",
        random_state=random_state,
    )
    transformer.fit(column)
    return transformer, fill_value

def transform_rank_gaussian(qt, med, series):
    """Apply a fitted quantile transformer to ``series`` after filling NaNs with ``med``.

    Returns a float32 series carrying the input's index.
    """
    column = series.fillna(med).values.reshape(-1, 1)
    transformed = qt.transform(column).astype("float32")
    return pd.Series(transformed.reshape(-1), index=series.index, dtype="float32")

def fit_yeojohnson(train_pool_series):
    """Fit a standardizing Yeo-Johnson ``PowerTransformer`` on a median-imputed series.

    Returns:
        ``(transformer, median)`` — the fitted transformer and the
        NaN-ignoring median used for imputation.
    """
    fill_value = float(np.nanmedian(train_pool_series.values))
    column = train_pool_series.fillna(fill_value).values.reshape(-1, 1)
    transformer = PowerTransformer(method="yeo-johnson", standardize=True)
    transformer.fit(column)
    return transformer, fill_value

def transform_yeojohnson(pt, med, series):
    """Apply a fitted power transformer to ``series`` after filling NaNs with ``med``.

    Returns a float32 series carrying the input's index.
    """
    column = series.fillna(med).values.reshape(-1, 1)
    transformed = pt.transform(column).astype("float32")
    return pd.Series(transformed.reshape(-1), index=series.index, dtype="float32")

def group_mean_deviation(train_pool_df, val_df, test_df, cat_cols, num_cols):
    """Add group-mean and deviation-from-group-mean features to all three frames.

    For each (categorical, numeric) pair the per-level mean of the numeric
    column is fitted on ``train_pool_df`` only, then mapped onto every frame as
    ``<num>__gm_<cat>`` (levels without a fitted mean get the training global
    mean) together with ``<num>__dev_<cat>`` = value minus that mean. Columns
    listed in ``META_COLS`` or absent from the training pool are skipped.
    The frames are modified in place and returned.
    """
    frames = (train_pool_df, val_df, test_df)
    for cat in cat_cols:
        if cat in META_COLS or cat not in train_pool_df.columns:
            continue
        for num in num_cols:
            if num in META_COLS or num not in train_pool_df.columns:
                continue
            mean_col = f"{num}__gm_{cat}"
            dev_col = f"{num}__dev_{cat}"

            # Statistics come from the training pool only.
            level_means = train_pool_df.groupby(cat, observed=True)[num].mean()
            fallback = float(train_pool_df[num].mean())

            for frame in frames:
                frame[mean_col] = frame[cat].map(level_means).fillna(fallback).astype("float32")
            for frame in frames:
                frame[dev_col] = (frame[num] - frame[mean_col]).astype("float32")
    return train_pool_df, val_df, test_df

def group_percentile_feature(train_pool_df, val_df, test_df, group_col, value_col, feature_name, q=100):
    """Add a within-group percentile-position feature fitted on the training pool.

    For every level of ``group_col`` with at least two non-null training rows,
    the ``q + 1`` quantile edges of ``value_col`` are computed on
    ``train_pool_df``. Each row of the three frames is then assigned
    ``bin / (number of edges - 2)``, where ``bin`` is the position of its value
    among the interior edges of its own group. Missing values are treated as the
    group's lowest edge; rows whose group has no fitted edges get 0.5.

    Args:
        train_pool_df: Frame used to fit the per-group edges.
        val_df: Validation frame to receive the feature.
        test_df: Test frame to receive the feature.
        group_col: Grouping column.
        value_col: Numeric column ranked within each group.
        feature_name: Name of the column to create.
        q: Number of quantile intervals.

    Returns:
        ``(train_pool_df, val_df, test_df)``, modified in place. When either
        column is missing from the training pool the frames are returned
        untouched after printing a notice.
    """
    if group_col not in train_pool_df.columns or value_col not in train_pool_df.columns:
        # Formatted eagerly: print() does not interpolate logging-style "%s"
        # arguments and would emit the placeholders literally.
        print(f"Percentile feature skipped (missing): {value_col} within {group_col}")
        return train_pool_df, val_df, test_df

    # The quantile grid is the same for every group, so build it once.
    probs = np.linspace(0.0, 1.0, q + 1)

    edges_dict = {}
    for g, sub in train_pool_df[[group_col, value_col]].dropna().groupby(group_col, observed=True):
        vals = sub[value_col].values
        if len(vals) < 2:
            continue
        edges = np.quantile(vals, probs)
        # Tied quantiles produce repeated edges; nudge each one to the next
        # representable float so the edges are strictly increasing.
        for i in range(1, len(edges)):
            if edges[i] <= edges[i - 1]:
                edges[i] = np.nextafter(edges[i - 1], float("inf"))
        edges_dict[g] = edges

    def apply_edges(df_in: pd.DataFrame):
        # Starts as all-NaN; rows not covered by any group stay NaN until the final fill.
        out = pd.Series(index=df_in.index, dtype="float32")
        for g, idx in df_in.groupby(group_col, observed=True).groups.items():
            e = edges_dict.get(g)
            if e is None:
                out.loc[idx] = 0.5
                continue
            v = df_in.loc[idx, value_col].fillna(e[0]).values
            bins = np.digitize(v, e[1:-1], right=True)
            denom = max(1, len(e) - 2)
            out.loc[idx] = bins.astype("float32") / float(denom)
        return out.fillna(0.5)

    train_pool_df[feature_name] = apply_edges(train_pool_df[[group_col, value_col]].copy())
    val_df[feature_name] = apply_edges(val_df[[group_col, value_col]].copy())
    test_df[feature_name] = apply_edges(test_df[[group_col, value_col]].copy())
    return train_pool_df, val_df, test_df

# -------------------------
# Global feature selections (purpose: choose candidate categorical and numeric columns)
# -------------------------
# Column typing and candidate selection are done once on the full training frame.
exclude_for_typing = {TARGET_COL, ID_COL, FOLD_COL}
all_cat, all_num = get_cat_num_cols(train, TARGET_COL, ID_COL, exclude=exclude_for_typing)
sel_cat = pick_top_cats(all_cat, train, k=6, exclude=META_COLS)
sel_num_for_deviation = pick_top_nums(all_num, train, k=5, exclude=META_COLS)
transform_targets = [c for c in [C_INCOME, C_DTI] if c in train.columns]
all_features_for_te = [c for c in (all_cat + all_num) if c not in META_COLS]
# Messages are formatted eagerly: print() does not interpolate logging-style
# "%s"/"%d" arguments and would emit the placeholders literally.
print(f"Selected categoricals (≤6): {sel_cat}")
print(f"Selected numeric for group-mean deviations (≤5): {sel_num_for_deviation}")
print(f"Numeric transform targets: {transform_targets}")
print(f"All features for target encoding ({len(all_features_for_te)}): {all_features_for_te}")

# -------------------------
# Preprocess for arbitrary held-out fold (purpose: per-fold encoders; inputs: train_df, test_df, held_out_fold)
# -------------------------
def preprocess_for_outer_fold(train_df, test_df, held_out_fold, sel_cat, sel_num_for_deviation, transform_targets, all_features_for_te):
    """Fit encoders/transforms on train_pool=all folds except held_out_fold; apply to its validation and test.

    Args:
        train_df: Training frame containing the target column and a fold column.
        test_df: Test frame; it is copied, not modified.
        held_out_fold: Fold id used as validation. ``-1`` means "no hold-out":
            the whole training frame is the pool and the validation frame is empty.
        sel_cat: Categorical columns for frequency, WOE and group-mean features.
        sel_num_for_deviation: Numeric columns for group-mean deviation features.
        transform_targets: Numeric columns that get quantile-bin, rank-gaussian
            and Yeo-Johnson variants.
        all_features_for_te: Columns that get out-of-fold target encoding.

    Returns:
        ``(X_tr, y_tr, X_va, y_va, X_te, feature_cols)``. When there is no
        validation data, ``X_va`` and ``y_va`` are both the same empty frame
        rather than a feature matrix / target pair.
    """
    if held_out_fold == -1:
        tr_pool = train_df.copy()
        va = train_df.iloc[0:0].copy()  # empty
    else:
        tr_pool = train_df.loc[train_df[FOLD_COL] != held_out_fold].copy()
        va = train_df.loc[train_df[FOLD_COL] == held_out_fold].copy()
    te = test_df.copy()
    y_pool = tr_pool[TARGET_COL].astype(int)

    # Inner folds based on existing assignment in tr_pool
    # Each entry maps fold id -> (index labels to fit on, index labels to encode).
    inner_fold_ids = sorted(int(f) for f in tr_pool[FOLD_COL].unique().tolist())
    inner_folds = {}
    for f in inner_fold_ids:
        inner_tr_idx = tr_pool.index[tr_pool[FOLD_COL] != f]
        inner_va_idx = tr_pool.index[tr_pool[FOLD_COL] == f]
        inner_folds[int(f)] = (inner_tr_idx, inner_va_idx)

    # Frequency encoding
    # Counts come from the training pool; values unseen there map to 0.
    for c in sel_cat:
        if c not in tr_pool.columns:
            continue
        mapping = frequency_encode(tr_pool, tr_pool[c])
        tr_pool[f"{c}__freq"] = tr_pool[c].map(mapping).fillna(0).astype("float32")
        if len(va) > 0:
            va[f"{c}__freq"] = va[c].map(mapping).fillna(0).astype("float32")
        te[f"{c}__freq"] = te[c].map(mapping).fillna(0).astype("float32")

    # OOF TE on ALL features (categorical + numerical)
    # The pool rows get out-of-fold values; validation and test use the map
    # fitted on the whole pool, with the pool's global mean for unseen values.
    for c in all_features_for_te:
        if c not in tr_pool.columns:
            continue
        oof_te, te_map, te_g = oof_target_encode(tr_pool, y_pool, c, inner_folds, m=10.0)
        tr_pool[f"{c}__te_m10"] = oof_te.astype("float32")
        if len(va) > 0:
            va[f"{c}__te_m10"] = va[c].map(te_map).fillna(te_g).astype("float32")
        te[f"{c}__te_m10"] = te[c].map(te_map).fillna(te_g).astype("float32")

    # OOF WOE (clip WOE) - only for categorical features
    for c in sel_cat:
        if c not in tr_pool.columns:
            continue
        oof_woe, woe_map, _iv = oof_woe_encode(tr_pool, y_pool, c, inner_folds, eps=0.5)
        tr_pool[f"{c}__woe"] = oof_woe.astype("float32")
        if len(va) > 0:
            va[f"{c}__woe"] = va[c].map(woe_map).fillna(0.0).astype("float32")
        te[f"{c}__woe"] = te[c].map(woe_map).fillna(0.0).astype("float32")

    # Numeric transforms on income & DTI
    # Each transformer is fitted on the pool column and reused for validation/test.
    for col in transform_targets:
        if col not in tr_pool.columns:
            continue
        enc, med = fit_kbins(tr_pool[col], n_bins=10)
        tr_pool[f"{col}__qbin10"] = transform_kbins(enc, med, tr_pool[col])
        if len(va) > 0:
            va[f"{col}__qbin10"] = transform_kbins(enc, med, va[col])
        te[f"{col}__qbin10"] = transform_kbins(enc, med, te[col])

        qt, med_q = fit_rank_gaussian(tr_pool[col])
        tr_pool[f"{col}__rgauss"] = transform_rank_gaussian(qt, med_q, tr_pool[col])
        if len(va) > 0:
            va[f"{col}__rgauss"] = transform_rank_gaussian(qt, med_q, va[col])
        te[f"{col}__rgauss"] = transform_rank_gaussian(qt, med_q, te[col])

        pt, med_p = fit_yeojohnson(tr_pool[col])
        tr_pool[f"{col}__yeoj"] = transform_yeojohnson(pt, med_p, tr_pool[col])
        if len(va) > 0:
            va[f"{col}__yeoj"] = transform_yeojohnson(pt, med_p, va[col])
        te[f"{col}__yeoj"] = transform_yeojohnson(pt, med_p, te[col])

    # Group mean deviations
    # Unlike the steps above, this call is not guarded by len(va) > 0.
    tr_pool, va, te = group_mean_deviation(tr_pool, va, te, sel_cat, sel_num_for_deviation)

    # Percentile features
    if "credit_score" in tr_pool.columns and "grade_subgrade" in tr_pool.columns:
        tr_pool, va, te = group_percentile_feature(tr_pool, va, te, "grade_subgrade", "credit_score", "credit_score__pctl_in_grade")
    if "credit_score" in tr_pool.columns and "education_level" in tr_pool.columns:
        tr_pool, va, te = group_percentile_feature(tr_pool, va, te, "education_level", "credit_score", "credit_score__pctl_in_edu")

    # Missingness indicators
    # Added last, so indicators are also created for the engineered columns above.
    tr_pool = add_missing_indicators(tr_pool, exclude_cols=META_COLS)
    if len(va) > 0:
        va = add_missing_indicators(va, exclude_cols=META_COLS)
    te = add_missing_indicators(te, exclude_cols={ID_COL})

    # Feature list: original numeric (excluding raw categoricals/meta) + engineered blocks
    excl = {TARGET_COL, ID_COL, FOLD_COL}
    raw_cat, raw_num = get_cat_num_cols(train_df, TARGET_COL, ID_COL, exclude=excl)
    raw_num_cols = [c for c in raw_num if c not in META_COLS]

    # A column is "engineered" if it was not in the input frame or carries one
    # of the known engineered suffixes/infixes.
    eng_cols = [c for c in tr_pool.columns if (
        c not in train_df.columns or
        c.endswith("__freq") or c.endswith("__te_m10") or c.endswith("__woe") or
        "__gm_" in c or "__dev_" in c or
        c.endswith("__qbin10") or c.endswith("__rgauss") or c.endswith("__yeoj") or
        c.endswith("__isna") or
        c.endswith("__pctl_in_grade") or c.endswith("__pctl_in_edu")
    )]
    # Sorted so the feature order is deterministic regardless of creation order.
    feature_cols = sorted(set(raw_num_cols + eng_cols))
    feature_cols = [c for c in feature_cols if (c not in META_COLS and not c.endswith("__iv"))]

    X_tr = tr_pool[feature_cols].copy()
    y_tr = tr_pool[TARGET_COL].astype(int).copy()
    if len(va) > 0:
        X_va = va[feature_cols].copy()
        y_va = va[TARGET_COL].astype(int).copy()
    else:
        # NOTE(review): both values are the empty frame itself, not restricted
        # to feature_cols — callers should not treat them as model inputs.
        X_va = va  # empty
        y_va = va  # empty
    X_te = te[feature_cols].copy()
    return X_tr, y_tr, X_va, y_va, X_te, feature_cols

# -------------------------
# XGBoost params and trainers
# -------------------------
def build_xgb_params(base_lr=0.05, n_estimators=1500, early_stopping_rounds=100):
    """Return the baseline keyword arguments for ``xgb.XGBClassifier``.

    Args:
        base_lr: Learning rate.
        n_estimators: Maximum number of boosting rounds.
        early_stopping_rounds: Early-stopping patience passed to the estimator.

    Returns:
        A new dict on every call; the device comes from the module-level
        ``XGB_DEVICE`` and every other entry is a fixed default.
    """
    return {
        "booster": "gbtree",
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "tree_method": "hist",
        "device": XGB_DEVICE,   # 'cuda:0' or 'cpu'
        "learning_rate": base_lr,
        "max_depth": 6,
        "min_child_weight": 8,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "colsample_bylevel": 0.8,
        "gamma": 0.0,
        "reg_lambda": 1.0,
        "reg_alpha": 0.0,
        "max_bin": 256,
        "grow_policy": "depthwise",
        "random_state": 2025,
        "n_estimators": n_estimators,
        "n_jobs": 0,
        "early_stopping_rounds": early_stopping_rounds,
        "verbosity": 1,
    }

def train_xgb_single(X_tr, y_tr, X_va, y_va, params, label="baseline"):
    """Fit one XGBoost classifier and score it on the validation split.

    Args:
        X_tr: Training features.
        y_tr: Training target.
        X_va: Validation features, also used as the evaluation set during fitting.
        y_va: Validation target.
        params: Keyword arguments for ``xgb.XGBClassifier``.
        label: Tag included in the printed summary line.

    Returns:
        ``(clf, val_auc, best_it, elapsed)`` — the fitted model, validation ROC
        AUC, the model's ``best_iteration`` (``None`` if the attribute is
        absent) and the wall-clock seconds spent fitting and scoring.
    """
    t0 = time.time()
    clf = xgb.XGBClassifier(**params)
    clf.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
    best_it = getattr(clf, "best_iteration", None)
    # Predict with trees up to and including the best iteration when one is recorded.
    iteration_range = (0, best_it + 1) if best_it is not None else None
    y_pred_va = clf.predict_proba(X_va, iteration_range=iteration_range)[:, 1]
    val_auc = roc_auc_score(y_va, y_pred_va)
    elapsed = time.time() - t0
    # Formatted eagerly: print() does not interpolate logging-style "%s"/"%.6f"
    # arguments and would emit the placeholders literally.
    print(f"XGB {label}: val AUC={val_auc:.6f} | best_iteration={best_it} | time={elapsed:.1f}s")
    return clf, val_auc, best_it, elapsed

def optuna_tune_xgb(X_tr, y_tr, X_va, y_va, base_params, time_budget_sec=300, n_trials=200):
    """Tune XGBoost hyper-parameters with Optuna on a single train/validation split.

    Args:
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels; used both as the
            ``eval_set`` during fitting and to compute the objective AUC.
        base_params: Baseline ``xgb.XGBClassifier`` keyword arguments. It is
            copied, never mutated; ``tree_method``, ``device`` and
            ``early_stopping_rounds`` are always taken from it.
        time_budget_sec: ``timeout`` (seconds) passed to ``study.optimize``.
        n_trials: Maximum number of trials passed to ``study.optimize``.

    Returns:
        Tuple ``(model, auc, best_it, tuned_params)``: a classifier refit with
        the best parameters on the same split, its validation ROC-AUC, its
        ``best_iteration`` (or ``None``), and ``base_params`` updated with the
        study's best parameters.
    """
    # print() does not interpolate logging-style arguments, so format explicitly.
    print("Optuna tuning start (budget=%ds)." % (time_budget_sec,))
    study = optuna.create_study(direction="maximize", study_name="xgb_ps_s5e11_v4")

    def _fit_and_score(p):
        """Fit a classifier with params ``p``; return (model, validation AUC, best_iteration)."""
        model = xgb.XGBClassifier(**p)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        best_it = getattr(model, "best_iteration", None)
        iteration_range = (0, best_it + 1) if best_it is not None else None
        y_pred = model.predict_proba(X_va, iteration_range=iteration_range)[:, 1]
        return model, roc_auc_score(y_va, y_pred), best_it

    def objective(trial: optuna.trial.Trial):
        p = base_params.copy()
        p.update({
            "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.12),
            "max_depth": trial.suggest_int("max_depth", 4, 9),
            "min_child_weight": trial.suggest_float("min_child_weight", 2.0, 12.0),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.6, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 2.0),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "max_bin": trial.suggest_categorical("max_bin", [128, 256, 512]),
            "n_estimators": trial.suggest_int("n_estimators", 600, 1500),
        })
        # Keep device and tree_method fixed
        p["tree_method"] = base_params["tree_method"]
        p["device"] = base_params["device"]
        p["random_state"] = 2025
        p["early_stopping_rounds"] = base_params["early_stopping_rounds"]

        _, auc, _ = _fit_and_score(p)
        return auc

    study.optimize(objective, n_trials=n_trials, timeout=time_budget_sec, gc_after_trial=True)
    best_params = study.best_params
    best_value = study.best_value
    print("Optuna best AUC=%.6f with params=%s" % (best_value, json.dumps(best_params)))

    tuned_params = base_params.copy()
    tuned_params.update(best_params)
    # Retrain on the same fold-0 split to verify
    t0 = time.time()
    model, auc, best_it = _fit_and_score(tuned_params)
    elapsed = time.time() - t0
    print("Tuned retrain: val AUC=%.6f | best_iteration=%s | retrain_time=%.1fs"
          % (auc, str(best_it), elapsed))
    return model, auc, best_it, tuned_params

# -------------------------
# CV trainer and final refit
# -------------------------
def assign_outer_folds(df: pd.DataFrame, n_splits=5, seed=2025):
    """Return a copy of ``df`` with a stratified fold id stored in ``FOLD_COL``.

    Args:
        df: Frame containing the ``TARGET_COL`` column used for stratification.
        n_splits: Number of folds for ``StratifiedKFold``.
        seed: ``random_state`` of the shuffled splitter.

    Returns:
        A copy of ``df`` whose ``FOLD_COL`` column holds, for each row, the
        number of the split in which that row is in the validation part.
        The input frame is left untouched.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    features = df.drop(columns=[TARGET_COL])
    labels = df[TARGET_COL].values

    # -1 marks "not yet assigned"; each split overwrites its validation positions.
    fold_ids = np.full(len(df), -1, dtype=int)
    for fold_no, (_, holdout_pos) in enumerate(splitter.split(features, labels)):
        fold_ids[holdout_pos] = fold_no

    result = df.copy()
    result[FOLD_COL] = fold_ids
    return result

def train_xgb_cv_and_predict(train_df, test_df, params, n_splits=5, debug=False):
    """Run K-fold CV with XGBoost and average the per-fold test predictions.

    Folds are assigned with ``assign_outer_folds`` (seed 2025) and each fold is
    preprocessed by ``preprocess_for_outer_fold`` with that fold held out. The
    preprocessing reads the module-level ``sel_cat``, ``sel_num_for_deviation``,
    ``transform_targets`` and ``all_features_for_te``.

    Args:
        train_df: Training frame containing ``TARGET_COL``.
        test_df: Test frame to predict.
        params: Keyword arguments for ``xgb.XGBClassifier`` (copied per fold).
        n_splits: Number of CV folds.
        debug: When true, each fold's training data is down-sampled to about
            1000 rows and ``n_estimators`` / ``early_stopping_rounds`` are
            capped at 200 / 20.

    Returns:
        Tuple ``(y_test_cv, oof_auc, median_best_it, feats)``: mean test
        prediction across folds, ROC-AUC of the out-of-fold predictions, the
        median best iteration across folds as ``int``, and the feature list
        returned by the last fold's preprocessing.
    """
    # print() does not interpolate logging-style arguments, so format explicitly.
    print("Starting %d-fold CV training with per-fold encoders." % (n_splits,))
    train_df = assign_outer_folds(train_df, n_splits=n_splits, seed=2025)
    oof = np.zeros(len(train_df), dtype=np.float32)
    test_preds = []
    fold_aucs = []
    best_its = []

    for f in range(n_splits):
        print("Fold %d/%d: preprocessing (fit on train folds only)." % (f + 1, n_splits))
        X_tr, y_tr, X_va, y_va, X_te, feats = preprocess_for_outer_fold(
            train_df, test_df, held_out_fold=f,
            sel_cat=sel_cat, sel_num_for_deviation=sel_num_for_deviation, transform_targets=transform_targets,
            all_features_for_te=all_features_for_te
        )
        params_use = params.copy()
        if debug:
            # Downsample training to 1000 rows in DEBUG to save time
            if len(X_tr) > 1000:
                X_tr, _, y_tr, _ = train_test_split(X_tr, y_tr, test_size=(1.0 - 1000/len(X_tr)), stratify=y_tr, random_state=2025)
            params_use["n_estimators"] = min(200, params_use.get("n_estimators", 1500))
            params_use["early_stopping_rounds"] = min(20, params_use.get("early_stopping_rounds", 100))

        print("Fold %d: training XGBoost." % (f + 1,))
        clf = xgb.XGBClassifier(**params_use)
        clf.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
        best_it = getattr(clf, "best_iteration", None)
        iteration_range = (0, best_it + 1) if best_it is not None else None
        y_va_pred = clf.predict_proba(X_va, iteration_range=iteration_range)[:, 1]
        fold_auc = roc_auc_score(y_va, y_va_pred)
        fold_aucs.append(fold_auc)
        # `oof` is a positional array, so address it by row position rather than
        # by index label; the two only coincide when the index is 0..n-1.
        # Assumes X_va keeps the held-out rows in train_df order, as the
        # original label-based assignment also did.
        va_pos = np.flatnonzero((train_df[FOLD_COL] == f).values)
        oof[va_pos] = y_va_pred
        y_te_pred = clf.predict_proba(X_te, iteration_range=iteration_range)[:, 1]
        test_preds.append(y_te_pred)
        # Fall back to the configured tree count when no best_iteration is exposed.
        best_its.append(best_it if best_it is not None else params_use.get("n_estimators", 1000))
        print("Fold %d: AUC=%.6f | best_iteration=%s" % (f + 1, fold_auc, str(best_it)))

    oof_auc = roc_auc_score(train_df[TARGET_COL].values, oof)
    y_test_cv = np.mean(np.vstack(test_preds), axis=0)
    median_best_it = int(np.median(best_its))
    print("CV complete. OOF AUC=%.6f | per-fold AUCs=%s | median best_it=%d"
          % (oof_auc, [round(float(a), 6) for a in fold_aucs], median_best_it))
    return y_test_cv, oof_auc, median_best_it, feats

def refit_full_and_predict(train_df, test_df, params, debug=False):
    """Refit XGBoost on the whole training set and predict the test set.

    Fold ids are assigned with ``assign_outer_folds`` (5 splits, seed 2025) and
    ``preprocess_for_outer_fold`` is called with ``held_out_fold=-1``
    (presumably so that no fold is held out; the validation outputs are
    ignored here). The preprocessing reads the module-level ``sel_cat``,
    ``sel_num_for_deviation``, ``transform_targets`` and
    ``all_features_for_te``.

    Args:
        train_df: Training frame containing ``TARGET_COL``.
        test_df: Test frame to predict.
        params: Keyword arguments for ``xgb.XGBClassifier`` (copied).
        debug: When true, training data is down-sampled to about 1000 rows and
            ``n_estimators`` / ``early_stopping_rounds`` are capped at 200 / 20.

    Returns:
        Tuple ``(y_test_full, best_it_full, feats_full)``: positive-class test
        probabilities, the model's ``best_iteration`` (or ``None``), and the
        feature list returned by preprocessing.
    """
    print("Refit on all training data with inner OOF encoders; no held-out validation.")
    # Assign inner folds (for encoders) deterministically
    train_df_full = assign_outer_folds(train_df, n_splits=5, seed=2025)
    X_tr_full, y_tr_full, X_va_dummy, y_va_dummy, X_te_full, feats_full = preprocess_for_outer_fold(
        train_df_full, test_df, held_out_fold=-1,
        sel_cat=sel_cat, sel_num_for_deviation=sel_num_for_deviation, transform_targets=transform_targets,
        all_features_for_te=all_features_for_te
    )
    params_use = params.copy()
    if debug:
        if len(X_tr_full) > 1000:
            X_tr_full, _, y_tr_full, _ = train_test_split(X_tr_full, y_tr_full, test_size=(1.0 - 1000/len(X_tr_full)), stratify=y_tr_full, random_state=2025)
        params_use["n_estimators"] = min(200, params_use.get("n_estimators", 1500))
        params_use["early_stopping_rounds"] = min(20, params_use.get("early_stopping_rounds", 100))

    # For full refit, use training data as eval_set just to track rounds; acceptable since encoders are fixed.
    # NOTE(review): the eval_set is the training data itself, so early stopping
    # is unlikely to trigger before n_estimators; consider passing the CV
    # median best iteration as n_estimators instead.
    clf_full = xgb.XGBClassifier(**params_use)
    clf_full.fit(X_tr_full, y_tr_full, eval_set=[(X_tr_full, y_tr_full)], verbose=False)
    best_it_full = getattr(clf_full, "best_iteration", None)
    iteration_range = (0, best_it_full + 1) if best_it_full is not None else None
    y_test_full = clf_full.predict_proba(X_te_full, iteration_range=iteration_range)[:, 1]
    # print() does not interpolate logging-style arguments, so format explicitly.
    print("Full refit complete. best_iteration=%s | n_features=%d"
          % (str(best_it_full), X_tr_full.shape[1]))
    return y_test_full, best_it_full, feats_full

# -------------------------
# Main pipeline runs twice: DEBUG then FULL
# -------------------------
def run_pipeline(DEBUG: bool):
    """Run baseline -> (tuning) -> CV -> full refit -> submission for one mode.

    Reads the module-level ``train`` and ``test`` frames plus ``ID_COL``,
    ``TARGET_COL``, ``SUBMISSION_PATH``, ``XGB_DEVICE`` and the feature
    selections consumed by ``preprocess_for_outer_fold``.

    Args:
        DEBUG: When true, uses a small baseline (150 trees, early stopping 20,
            about 1000 training rows), skips Optuna tuning, runs the CV in
            debug mode and returns before the full refit and submission.
            When false, tunes on fold 0, keeps whichever of the tuned or
            baseline parameters scored the higher fold-0 AUC, runs the CV and
            full refit, blends them 50/50 and writes the submission CSV.

    Returns:
        None.
    """
    mode = "DEBUG" if DEBUG else "FULL"
    # print() does not interpolate logging-style arguments, so every message
    # below is formatted explicitly with the % operator.
    print("===== Running in %s mode =====" % (mode,))

    # Create a single 5-fold assignment for baseline/tuning on fold 0
    train_folds = assign_outer_folds(train.copy(), n_splits=5, seed=2025)
    # Preprocess for fold 0 for baseline/tuning
    X_tr0, y_tr0, X_va0, y_va0, X_te0, feats0 = preprocess_for_outer_fold(
        train_folds, test.copy(), held_out_fold=0,
        sel_cat=sel_cat, sel_num_for_deviation=sel_num_for_deviation, transform_targets=transform_targets,
        all_features_for_te=all_features_for_te
    )

    # Baseline params (reduced trees in DEBUG)
    if DEBUG:
        base_params = build_xgb_params(base_lr=0.05, n_estimators=150, early_stopping_rounds=20)
        # Downsample training to 1000 rows for the initial fold-0 baseline
        if len(X_tr0) > 1000:
            X_tr0, _, y_tr0, _ = train_test_split(X_tr0, y_tr0, test_size=(1.0 - 1000/len(X_tr0)), stratify=y_tr0, random_state=2025)
    else:
        base_params = build_xgb_params(base_lr=0.05, n_estimators=1500, early_stopping_rounds=100)

    print("Baseline training on fold 0 (purpose: establish reference AUC).")
    model_base, auc_base, best_it_base, t_base = train_xgb_single(X_tr0, y_tr0, X_va0, y_va0, base_params, label="baseline-fold0")

    # Tuning only in FULL mode
    if not DEBUG:
        model_tuned, auc_tuned, best_it_tuned, tuned_params = optuna_tune_xgb(
            X_tr0, y_tr0, X_va0, y_va0, base_params, time_budget_sec=300
        )
        if auc_tuned >= auc_base:
            final_params = tuned_params
            print("Selected tuned params (AUC=%.6f >= baseline %.6f)." % (auc_tuned, auc_base))
        else:
            final_params = base_params
            print("Selected baseline params (AUC=%.6f > tuned %.6f)." % (auc_base, auc_tuned))
    else:
        final_params = base_params
        print("DEBUG mode: tuning skipped; using baseline params.")

    # CV training + predictions
    y_test_cv, oof_auc, median_best_it, feats_cv = train_xgb_cv_and_predict(
        train.copy(), test.copy(), final_params, n_splits=5, debug=DEBUG
    )

    if DEBUG:
        print("DEBUG mode: submission generation skipped per requirements.")
        return

    # Full refit + predictions
    y_test_full, best_it_full, feats_full = refit_full_and_predict(
        train.copy(), test.copy(), final_params, debug=False
    )

    # Blend CV ensemble with full-refit model (simple mean)
    y_test_final = 0.5 * y_test_cv + 0.5 * y_test_full
    # Keep probabilities strictly inside (0, 1).
    y_test_final = np.clip(y_test_final, 1e-9, 1 - 1e-9)

    # Write submission
    submission = pd.DataFrame({ID_COL: test[ID_COL].values, TARGET_COL: y_test_final})
    submission.to_csv(SUBMISSION_PATH, index=False)
    print("Submission written to %s" % (SUBMISSION_PATH,))

    # Log prediction distribution
    desc = pd.Series(y_test_final).describe(percentiles=[0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99])
    print("Prediction distribution summary:\n%s" % (desc.to_string(),))
    print("Run complete: OOF AUC=%.6f | median_best_it(CV)=%d | device=%s"
          % (oof_auc, median_best_it, XGB_DEVICE))

# -------------------------
# Execute: DEBUG then FULL
# -------------------------
# DEBUG pass first (returns before any submission is written), then the FULL
# pass, which writes the submission file to SUBMISSION_PATH.
for debug_flag in (True, False):
    run_pipeline(DEBUG=debug_flag)