# In [None]:
"""
Enhanced Loan Prediction Pipeline
Improvements:
1. Better feature engineering with interaction terms
2. Advanced categorical encoding strategies
3. Optimized hyperparameters
4. Cleaner code structure
5. Better memory management
6. Enhanced ensemble methods
"""

# ============================================================================
# IMPORTS
# ============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold
from xgboost import DMatrix, XGBClassifier
import xgboost as xgb
import lightgbm
from lightgbm import LGBMClassifier
from tqdm.notebook import tqdm
from colorama import Fore, Style
import warnings
import gc
import torch

# Suppress every warning category for the rest of the run (no filter on
# category or module is given, so this applies globally).
warnings.filterwarnings("ignore")

# ============================================================================
# CONFIGURATION
# ============================================================================
class Config:
    """Shared settings and raw data, exposed as class attributes.

    Everything here is evaluated once, when the class body executes: the four
    CSV files are read at that point, so defining this class requires the
    Kaggle input paths below to exist.
    """
    
    # Target column name and raw data frames
    target = 'loan_paid_back'
    # train/test use the 'id' column as their index; submission and orig keep
    # the default positional index.
    train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv', index_col='id')
    test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv', index_col='id')
    submission = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')
    orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')

    # Device setup: 'cuda' when torch reports an available GPU, otherwise 'cpu'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    # Model parameters
    state = 42          # random seed handed to the CV splitter and the models
    n_splits = 10       # number of cross-validation folds
    early_stop = 200    # early-stopping rounds used for LightGBM / XGBoost training
    # The five settings below are not referenced elsewhere in the visible file.
    metric = 'roc_auc'
    task_type = "binary"
    task_is_regression = False
    n_classes = 2
    labels = [0, 1]
    
    # Cross-validation: shuffled, stratified splitter seeded with `state`
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=state)
    
    # Feature engineering flags, checked by Preprocessing.fit_transform
    outliers = False    # run remove_outliers()
    log_trf = False     # run log_transformation() on the target
    missing = False     # run missing_values()


# ============================================================================
# PREPROCESSING
# ============================================================================
class Preprocessing(Config):
    """Turn the raw frames held on ``Config`` into model-ready features.

    Train and test rows are stacked and pushed through a single
    feature-engineering pass so both halves end up with identical columns,
    then split back apart by position. The original ``id`` index of both
    frames is preserved throughout.
    """

    def __init__(self):
        super().__init__()

    def fit_transform(self):
        """Run the full preprocessing pipeline.

        Returns:
            tuple: ``(X, y, test, cat_features, num_features)`` where ``X`` and
            ``test`` are the engineered feature frames, ``y`` is the target
            series, and the two lists name the categorical / numerical columns
            of the engineered frames.
        """
        self.prepare_data()

        if self.missing:
            self.missing_values()

        # Stack train and test so every derived column (bins, factor codes,
        # frequencies) is computed consistently across both.
        n_train = len(self.X)
        combine = pd.concat([self.X, self.test], axis=0)
        combine = self.feature_engineering(combine)

        # Split back by position; the first n_train rows are the training rows.
        self.X = combine.iloc[:n_train].copy()
        self.test = combine.iloc[n_train:].copy()

        # Recompute the feature lists on the engineered columns.
        cat_dtypes = ['object', 'bool', 'category']
        self.num_features = self.test.select_dtypes(exclude=cat_dtypes).columns.tolist()
        self.cat_features = self.test.select_dtypes(include=cat_dtypes).columns.tolist()

        if self.outliers:
            self.remove_outliers()
        if self.log_trf:
            self.log_transformation()

        print(f"✓ Final feature count: {len(self.num_features)} numerical, {len(self.cat_features)} categorical")
        return self.X, self.y, self.test, self.cat_features, self.num_features

    def prepare_data(self):
        """Separate the target from the training frame and list raw feature types."""
        self.train_raw = self.train.copy()
        self.y = self.train[self.target]
        self.X = self.train.drop(self.target, axis=1)

        self.num_features = self.X.select_dtypes(exclude=['object', 'bool']).columns.tolist()
        self.cat_features = self.X.select_dtypes(include=['object', 'bool']).columns.tolist()

    def feature_engineering(self, data):
        """Derive the engineered feature set from a stacked train+test frame.

        Args:
            data: frame containing the raw feature columns (no target).

        Returns:
            A new frame with the raw columns plus the derived ones; ``data``
            itself is not modified and its index is kept.
        """
        df = data.copy()

        # ========== Original dataset statistics ==========
        # Fallback values for rows whose feature value never occurs in `orig`.
        # NOTE(review): unseen values get count == len(orig) rather than 0;
        # kept as-is to leave the feature definition unchanged — confirm intent.
        global_stats = {
            'mean': self.orig[self.target].mean(),
            'count': len(self.orig)
        }

        # Per-value target mean / row count looked up from the original dataset.
        # Series.map is used instead of DataFrame.merge so the frame's index
        # survives (merge would replace it with a fresh RangeIndex).
        for c in self.num_features + self.cat_features:
            grouped_target = self.orig.groupby(c)[self.target]
            for agg_func in ['mean', 'count']:
                col_name = f'{c}_org_{agg_func}'
                lookup = grouped_target.agg(agg_func)
                df[col_name] = df[c].map(lookup).fillna(global_stats.get(agg_func, 0))

        # ========== Numerical transformations ==========
        for c in self.num_features:
            # Log transformations
            df[f"log_{c}"] = np.log1p(df[c])

            # Polynomial features
            df[f"{c}_sq"] = df[c] ** 2
            df[f"{c}_sqrt"] = np.sqrt(df[c])

        # ========== Domain-specific features ==========
        # "+ 1" in the denominators guards against division by zero.
        # Credit utilization ratio
        df['credit_utilization'] = df['debt_to_income_ratio'] * df['annual_income'] / (df['loan_amount'] + 1)

        # Loan burden
        df['loan_burden'] = df['loan_amount'] / (df['annual_income'] + 1)

        # Risk score (weighted combination of multiple factors)
        df['risk_score'] = (
            (850 - df['credit_score']) / 850 * 0.4 +
            df['debt_to_income_ratio'] * 0.3 +
            df['interest_rate'] / 21 * 0.3
        )

        # Income to loan ratio
        df['income_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)

        # Affordability index
        df['affordability'] = (df['annual_income'] * (1 - df['debt_to_income_ratio'])) / (df['loan_amount'] + 1)

        # ========== Grade features ==========
        # grade_subgrade is read as "<letter><digit>": first char is the grade
        # letter, second char the sub-grade number.
        df['grade_number'] = df['grade_subgrade'].str[1].astype(int)
        grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
        df['grade_rank'] = df['grade_subgrade'].str[0].map(grade_map)
        df['grade_combined'] = df['grade_rank'] * 10 + df['grade_number']

        # ========== Binning strategies ==========
        self.numtocat_features = []

        # Decile bins for selected numeric columns
        lowcard = ['credit_score', 'debt_to_income_ratio', 'interest_rate']
        for c in lowcard:
            if c in self.num_features:
                df[f"{c}_cat"] = pd.qcut(df[c], q=10, labels=False, duplicates='drop')
                df[f"{c}_cat"] = df[f"{c}_cat"].astype('category')
                self.numtocat_features.append(f"{c}_cat")

        # Rounded values of wide-range numeric columns, turned into category codes
        highcard = ['annual_income', 'loan_amount']
        for c in highcard:
            if c in self.num_features:
                # Rounded to whole units
                df[f'{c}_round'] = df[c].round(0)
                df[f"{c}_round"], _ = pd.factorize(df[f"{c}_round"])
                df[f"{c}_round"] = df[f"{c}_round"].astype('category')
                self.numtocat_features.append(f"{c}_round")

                # Rounded to the nearest thousand
                df[f'{c}_thousands'] = df[c].round(-3)
                df[f"{c}_thousands"], _ = pd.factorize(df[f"{c}_thousands"])
                df[f"{c}_thousands"] = df[f"{c}_thousands"].astype('category')
                self.numtocat_features.append(f"{c}_thousands")

        # ========== Frequency encoding ==========
        all_cats = self.numtocat_features + self.cat_features
        for c in all_cats:
            freqs = df[c].value_counts(normalize=True)
            # Mapping a category-dtype column can hand back another categorical
            # (when the mapped values are unique); force a plain float column so
            # the frequency is always treated as a numerical feature.
            df[f"{c}_freq"] = df[c].map(freqs).astype('float64')

        # ========== Interaction features ==========
        df['income_credit_interaction'] = df['annual_income'] * df['credit_score']
        df['loan_rate_interaction'] = df['loan_amount'] * df['interest_rate']
        df['debt_credit_interaction'] = df['debt_to_income_ratio'] * df['credit_score']

        # Convert the raw categorical columns to category dtype
        df[self.cat_features] = df[self.cat_features].astype('category')

        return df

    def log_transformation(self):
        """Replace the target with ``log1p(target)``."""
        self.y = np.log1p(self.y)

    def remove_outliers(self):
        """Drop rows whose target lies outside 1.5 * IQR of the quartiles.

        Both ``X`` and ``y`` are filtered and re-indexed from zero.
        """
        Q1 = self.y.quantile(0.25)
        Q3 = self.y.quantile(0.75)
        IQR = Q3 - Q1
        lower_limit = Q1 - 1.5 * IQR
        upper_limit = Q3 + 1.5 * IQR
        mask = (self.y >= lower_limit) & (self.y <= upper_limit)
        # Filter by position so the result never depends on X and y sharing
        # index labels.
        keep = mask.to_numpy()
        self.X = self.X[keep]
        self.y = self.y[keep]
        self.X.reset_index(drop=True, inplace=True)
        self.y.reset_index(drop=True, inplace=True)
        print(f"✓ Removed {(~mask).sum()} outliers")

    def missing_values(self):
        """Fill missing categorical values with the literal ``'Missing'``.

        New frames are bound to the instance so the class-level ``Config.test``
        frame shared by other users of ``Config`` is not modified in place.
        """
        if not self.cat_features:
            return
        fill_values = {c: 'Missing' for c in self.cat_features}
        self.X = self.X.fillna(fill_values)
        self.test = self.test.fillna(fill_values)


# ============================================================================
# MODELS
# ============================================================================
def get_models():
    """Build the base estimators used by the training stage.

    Returns:
        dict: estimator instances keyed by model name, in the order
        XGBoost, LightGBM, HistGradientBoosting.
    """
    seed = Config.state
    xgb_device = "cuda" if Config.device == 'cuda' else "cpu"

    xgb_params = dict(
        tree_method='hist',
        n_estimators=10000,
        objective='binary:logistic',
        random_state=seed,
        enable_categorical=True,
        verbosity=0,
        eval_metric='auc',
        booster='gbtree',
        n_jobs=-1,
        learning_rate=0.01,
        device=xgb_device,
        reg_lambda=0.31,
        reg_alpha=4.45,
        colsample_bytree=0.10,
        subsample=0.67,
        max_depth=8,
        min_child_weight=2,
        max_bin=512,
    )

    lgbm_params = dict(
        random_state=seed,
        verbose=-1,
        n_estimators=10000,
        metric='auc',
        objective='binary',
        learning_rate=0.01,
        max_depth=5,
        min_child_samples=162,
        subsample=0.44,
        colsample_bytree=0.23,
        num_leaves=332,
        reg_alpha=0.05,
        reg_lambda=7.07,
        max_bin=500,
    )

    hgb_params = dict(
        max_iter=10000,
        random_state=seed,
        early_stopping=True,
        categorical_features="from_dtype",
        learning_rate=0.01,
        loss='log_loss',
        l2_regularization=0.011,
        max_depth=4,
        max_leaf_nodes=85,
        min_samples_leaf=50,
    )

    # Insertion order is the order in which the trainer iterates the models.
    models = {}
    models['XGB_optimized'] = XGBClassifier(**xgb_params)
    models['LGBM_optimized'] = LGBMClassifier(**lgbm_params)
    models['HGB_optimized'] = HistGradientBoostingClassifier(**hgb_params)
    return models


# ============================================================================
# TRAINER
# ============================================================================
class Trainer(Config):
    """Cross-validated training of the base models plus a stacked ensemble.

    Args:
        X: engineered training features.
        y: training target, positionally aligned with ``X``.
        test: engineered test features.
        models: mapping of model name -> estimator. The name selects the
            training path ("LGBM", "XGB", "HGB" substrings; anything else uses
            plain ``fit`` / ``predict_proba``).
        num_features: names of the numerical feature columns.
        cat_features: names of the categorical feature columns (target-encoded
            inside every fold).
        training: when False, out-of-fold and test predictions are loaded from
            previously saved CSV files instead of being recomputed.
    """

    def __init__(self, X, y, test, models, num_features, cat_features, training=True):
        super().__init__()
        self.X = X
        self.test = test
        self.y = y
        self.models = models
        self.training = training
        self.num_features = num_features
        self.cat_features = cat_features

        # Results storage: one row per model in `scores`, one column per model
        # in the prediction frames.
        self.scores = pd.DataFrame(columns=['Score'], dtype=float)
        self.OOF_preds = pd.DataFrame(dtype=float)
        self.TEST_preds = pd.DataFrame(dtype=float)

    def score_metric(self, y_true, y_pred):
        """Return the ROC-AUC of ``y_pred`` against ``y_true``."""
        return roc_auc_score(y_true, y_pred)

    def train_model(self, model, X, y, test, model_name):
        """Train one model with cross-validation.

        Args:
            model: estimator whose ``get_params()`` (LGBM/XGB paths) or
                ``fit`` / ``predict_proba`` (other paths) is used.
            X: training features for this stage.
            y: training target.
            test: test features for this stage.
            model_name: row label in ``self.scores``; also selects the
                training path.

        Returns:
            tuple: ``(oof_pred, test_pred)`` — out-of-fold predictions for every
            row of ``X`` and the fold-averaged predictions for ``test``.
        """
        oof_pred = np.zeros(X.shape[0], dtype=float)
        test_pred = np.zeros(test.shape[0], dtype=float)

        print(f"\n{'='*60}")
        print(f"Training {model_name}")
        print(f"{'='*60}")

        params = model.get_params()

        # Encode only the categorical columns that exist in this stage's frame.
        # The stacking stage passes frames of model predictions, which contain
        # none of the feature columns listed in self.cat_features.
        cat_cols = [c for c in self.cat_features if c in X.columns]

        for n_fold, (train_idx, valid_idx) in enumerate(self.folds.split(X, y)):
            print(f"\nFold {n_fold + 1}/{self.n_splits}")

            # Split data
            X_train = X.iloc[train_idx].copy()
            y_train = y.iloc[train_idx]
            X_val = X.iloc[valid_idx].copy()
            y_val = y.iloc[valid_idx]
            X_test = test.copy()

            # Target encoding, fitted on the training part of the fold only
            if cat_cols:
                te = TargetEncoder(random_state=self.state, shuffle=True, cv=5, smooth=15)
                X_train[cat_cols] = te.fit_transform(X_train[cat_cols], y_train).astype('float32')
                X_val[cat_cols] = te.transform(X_val[cat_cols]).astype('float32')
                X_test[cat_cols] = te.transform(X_test[cat_cols]).astype('float32')

            # Train based on model type
            if "LGBM" in model_name:
                train_data = lightgbm.Dataset(X_train, label=y_train)
                val_data = lightgbm.Dataset(X_val, label=y_val, reference=train_data)

                booster = lightgbm.train(
                    params=params,
                    train_set=train_data,
                    valid_sets=[val_data],
                    callbacks=[
                        lightgbm.early_stopping(stopping_rounds=self.early_stop, verbose=False),
                        lightgbm.log_evaluation(period=0)
                    ]
                )
                y_pred_val = booster.predict(X_val)
                y_pred_test = booster.predict(X_test)

            elif "XGB" in model_name:
                dtrain = DMatrix(X_train, label=y_train, enable_categorical=True)
                dval = DMatrix(X_val, label=y_val, enable_categorical=True)
                dtest = DMatrix(X_test, enable_categorical=True)

                booster = xgb.train(
                    params=params,
                    dtrain=dtrain,
                    evals=[(dval, "valid")],
                    num_boost_round=100000,
                    early_stopping_rounds=self.early_stop,
                    verbose_eval=False
                )
                # xgb.train hands back the booster as of the last round, so
                # limit prediction to the best early-stopped iteration.
                best_range = (0, booster.best_iteration + 1)
                y_pred_val = booster.predict(dval, iteration_range=best_range)
                y_pred_test = booster.predict(dtest, iteration_range=best_range)

            elif "HGB" in model_name:
                model.fit(X_train, y_train, X_val=X_val, y_val=y_val)
                y_pred_val = model.predict_proba(X_val)[:, 1]
                y_pred_test = model.predict_proba(X_test)[:, 1]

            else:
                model.fit(X_train, y_train)
                y_pred_val = model.predict_proba(X_val)[:, 1]
                y_pred_test = model.predict_proba(X_test)[:, 1]

            # Store predictions
            oof_pred[valid_idx] = y_pred_val
            test_pred += y_pred_test / self.n_splits

            # Calculate score
            score = self.score_metric(y_val, y_pred_val)
            print(f"  Fold {n_fold + 1} ROC-AUC: {score:.6f}")
            self.scores.loc[model_name, f'Fold {n_fold + 1}'] = score

            # Memory cleanup (including the per-fold copy of the test frame)
            del X_train, y_train, X_val, y_val, X_test
            gc.collect()

        # Calculate average score
        fold_cols = [f'Fold {i+1}' for i in range(self.n_splits)]
        avg_score = self.scores.loc[model_name, fold_cols].mean()
        self.scores.loc[model_name, 'Score'] = avg_score
        print(f"\n{model_name} Average ROC-AUC: {avg_score:.6f}")

        return oof_pred, test_pred

    def run(self):
        """Train (or load) every base model, stack them, and plot the results.

        Returns:
            pandas.Series: test predictions of the ensemble when more than one
            model is configured, otherwise those of the single model.
        """
        for model_name, model in tqdm(self.models.items(), desc="Training models"):
            if self.training:
                oof_pred, test_pred = self.train_model(model, self.X, self.y, self.test, model_name)

                # Save predictions
                pd.DataFrame(oof_pred, columns=[model_name]).to_csv(f'{model_name}_oof.csv', index=False)
                pd.DataFrame(test_pred, columns=[model_name]).to_csv(f'{model_name}_test.csv', index=False)
            else:
                # Load predictions
                oof_pred = pd.read_csv(f'/kaggle/input/loan-models/{model_name}_oof.csv')[model_name].values
                test_pred = pd.read_csv(f'/kaggle/input/loan-models/{model_name}_test.csv')[model_name].values

                # No per-fold scores exist for loaded predictions; record the
                # overall out-of-fold score so plot_results finds a row for
                # this model.
                loaded_score = self.score_metric(self.y, oof_pred)
                self.scores.loc[model_name, 'Score'] = loaded_score
                print(f"{model_name} OOF ROC-AUC (loaded): {loaded_score:.6f}")

            self.OOF_preds[model_name] = oof_pred
            self.TEST_preds[model_name] = test_pred

        # Ensemble if multiple models
        if len(self.models) > 1:
            print(f"\n{'='*60}")
            print("Creating Ensemble")
            print(f"{'='*60}")

            # Stack: logistic regression trained on the base models' predictions
            meta_model = LogisticRegression(C=0.1, random_state=self.state, max_iter=1000)
            self.OOF_preds["Ensemble"], self.TEST_preds["Ensemble"] = self.train_model(
                meta_model, self.OOF_preds, self.y, self.TEST_preds, 'Ensemble'
            )

            self.plot_results()
            return self.TEST_preds["Ensemble"]
        else:
            model_name = list(self.models.keys())[0]
            self.plot_results()
            return self.TEST_preds[model_name]

    def plot_results(self):
        """Plot the score comparison, ROC curves and best-model confusion matrix."""
        # Score comparison
        plt.figure(figsize=(14, 6))
        scores_sorted = self.scores.sort_values('Score', ascending=True)
        colors = ['#3cb371' if idx != 'Ensemble' else 'red' for idx in scores_sorted.index]
        bars = plt.barh(scores_sorted.index, scores_sorted['Score'], color=colors, height=0.6)
        plt.bar_label(bars, fmt='%.6f', padding=5)
        plt.xlabel('ROC-AUC Score', fontsize=12)
        plt.ylabel('Model', fontsize=12)
        plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()

        # ROC curves and confusion matrix
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        # ROC curves, one per column of out-of-fold predictions
        for col in self.OOF_preds.columns:
            RocCurveDisplay.from_predictions(
                self.y, self.OOF_preds[col],
                name=f"{col} (AUC={self.scores.loc[col, 'Score']:.4f})",
                ax=axes[0]
            )
        axes[0].plot([0, 1], [0, 1], 'k--', lw=2, label='Random')
        axes[0].set_xlabel('False Positive Rate', fontsize=11)
        axes[0].set_ylabel('True Positive Rate', fontsize=11)
        axes[0].set_title('ROC Curves', fontsize=12, fontweight='bold')
        axes[0].legend(loc="lower right", fontsize=9)
        axes[0].grid(alpha=0.3)

        # Confusion matrix of the best-scoring model, thresholded at 0.5
        best_model = self.scores['Score'].idxmax()
        ConfusionMatrixDisplay.from_predictions(
            self.y,
            (self.OOF_preds[best_model] >= 0.5).astype(int),
            display_labels=['Not Paid', 'Paid'],
            cmap='Greens',
            ax=axes[1]
        )
        axes[1].set_title(f'Confusion Matrix - {best_model}', fontsize=12, fontweight='bold')

        plt.tight_layout()
        plt.show()


# ============================================================================
# MAIN EXECUTION
# ============================================================================
def main():
    """Run preprocessing, model training/loading and write ``submission.csv``.

    Returns:
        pandas.DataFrame: the submission frame that was written to disk.
    """
    print(f"{Style.BRIGHT}{Fore.GREEN}")
    print("="*60)
    print("ENHANCED LOAN PREDICTION PIPELINE")
    print("="*60)
    print(f"{Style.RESET_ALL}\n")

    # Preprocessing
    print(f"{Fore.CYAN}Step 1: Preprocessing{Style.RESET_ALL}")
    preprocessor = Preprocessing()
    X, y, test, cat_features, num_features = preprocessor.fit_transform()

    # Model training (training=False: reuse previously saved predictions)
    print(f"\n{Fore.CYAN}Step 2: Model Training{Style.RESET_ALL}")
    models = get_models()
    trainer = Trainer(X, y, test, models, num_features, cat_features, training=False)
    test_predictions = trainer.run()

    # Create submission
    print(f"\n{Fore.CYAN}Step 3: Creating Submission{Style.RESET_ALL}")
    # Work on a copy so the frame shared through Config is left untouched, and
    # assign by position: a Series would be aligned on index labels and could
    # silently produce NaN, whereas an array of the wrong length raises.
    submission = Config.submission.copy()
    submission[Config.target] = np.asarray(test_predictions)
    submission.to_csv("submission.csv", index=False)

    print(f"\n{Fore.GREEN}✓ Submission saved successfully!{Style.RESET_ALL}")
    print(f"Prediction range: [{test_predictions.min():.4f}, {test_predictions.max():.4f}]")
    print(f"Prediction mean: {test_predictions.mean():.4f}")

    # Visualize submission distribution
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.hist(test_predictions, bins=50, color='#3cb371', alpha=0.7, edgecolor='black')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Frequency')
    plt.title('Prediction Distribution')
    plt.grid(alpha=0.3)

    plt.subplot(1, 2, 2)
    sns.kdeplot(test_predictions, fill=True, color='#3cb371', linewidth=2)
    plt.xlabel('Predicted Probability')
    plt.ylabel('Density')
    plt.title('Prediction Density')
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

    return submission


# Run the pipeline only when this file is executed directly (not on import)
# and keep the returned submission frame in a module-level name.
if __name__ == "__main__":
    submission = main()