In [None]:
# Silence every warning category for the rest of the session so cell output stays clean.
# NOTE(review): this filter is unconditional, so deprecation/future warnings raised by
# the libraries imported below are hidden as well — consider narrowing it by category
# or module once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# Import core libraries
import pandas as pd
import numpy as np
import optuna

# Import modeling and evaluation tools
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# Import plotting libraries
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Define the data loading function
def load_data(train_path='/kaggle/input/playground-series-s5e11/train.csv',
              test_path='/kaggle/input/playground-series-s5e11/test.csv',
              orig_path='/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv',
              target='loan_paid_back'):
    """Read the three CSV inputs used by the pipeline.

    The defaults are the Kaggle input locations the notebook was written
    against, so ``load_data()`` behaves exactly as before; pass explicit
    paths to run the notebook outside of Kaggle.

    Args:
        train_path: CSV with the labelled competition training rows.
        test_path: CSV with the competition rows to predict.
        orig_path: CSV with the external dataset used for target encoding.
        target: Name of the label column.

    Returns:
        tuple: ``(train, test, orig, target)`` where the first three items
        are DataFrames and ``target`` is the label column name.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when a path does
            not exist.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    orig = pd.read_csv(orig_path)

    return train, test, orig, target

In [None]:
# Define preprocessing and feature engineering
def preprocess_data(train, test, orig, target):
    """Clip outliers, engineer features and target-encode against ``orig``.

    Steps, in order:
      1. IQR clipping of five numeric columns, with bounds computed on
         ``train`` only and applied to both frames.
      2. Derived columns: ``grade``, ``loan_to_income``, ``total_debt`` and
         ``risk_score``.
      3. For every base column that also exists in ``orig``, an
         ``orig_mean_<col>`` column holding the mean of ``target`` in
         ``orig`` for that exact value.
      4. Selection of the final feature columns (everything except ``id``,
         the target and the raw categorical columns).

    The caller's ``train`` and ``test`` frames are NOT modified: all work is
    done on copies, so the function can be re-run on the same inputs and
    always yields the same result.

    Args:
        train: Labelled training frame; must contain ``target``,
            ``grade_subgrade`` and the five numeric columns listed below.
        test: Frame to predict, with the same feature columns as ``train``.
        orig: External frame containing ``target``; used only as the source
            of the target-encoding means.
        target: Name of the label column.

    Returns:
        tuple: ``(X, y, X_test)`` — training features, training labels and
        test features, with ``X`` and ``X_test`` sharing the same columns.
    """
    # Work on private copies; writing into the caller's frames made repeated
    # calls compound (re-clipping already clipped data, stale extra columns).
    train = train.copy()
    test = test.copy()

    numerical_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score',
                      'loan_amount', 'interest_rate']

    # Bounds come from train only, then both frames are clipped with them
    for col in numerical_cols:
        q1 = train[col].quantile(0.25)
        q3 = train[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        train[col] = train[col].clip(lower, upper)
        test[col] = test[col].clip(lower, upper)

    # Identical feature engineering for both frames (was duplicated per frame)
    for frame in (train, test):
        # First character of grade_subgrade
        frame['grade'] = frame['grade_subgrade'].str[0]

        # +1 in the denominator guards against division by zero income
        frame['loan_to_income'] = frame['loan_amount'] / (frame['annual_income'] + 1)
        frame['total_debt'] = frame['debt_to_income_ratio'] * frame['annual_income']

        # Hand-weighted composite; credit score is scaled by 850
        # (presumably the maximum possible score — confirm against the data)
        frame['risk_score'] = (frame['debt_to_income_ratio'] * 40 +
                               (1 - frame['credit_score'] / 850) * 30 +
                               frame['interest_rate'] * 2)

    # Target-encode from the external dataset. The column list is frozen
    # before the loop so the new orig_mean_* columns are not re-encoded.
    base_cols = [col for col in train.columns if col not in ('id', target, 'grade')]
    for col in base_cols:
        if col in orig.columns:
            mean_map = orig.groupby(col)[target].mean()
            # Values absent from orig map to NaN
            train[f"orig_mean_{col}"] = train[col].map(mean_map)
            test[f"orig_mean_{col}"] = test[col].map(mean_map)

    # Raw categoricals are dropped; only their orig_mean_* encodings remain
    categorical_cols = ['gender', 'marital_status', 'education_level',
                        'employment_status', 'loan_purpose',
                        'grade_subgrade', 'grade']
    excluded = {'id', target, *categorical_cols}
    feature_cols = [col for col in train.columns if col not in excluded]

    X = train[feature_cols].copy()
    y = train[target].copy()
    X_test = test[feature_cols].copy()

    return X, y, X_test

In [None]:
# Define Optuna objective function for XGBoost
def objective(trial, X, y):
    """Optuna objective: fit one XGBoost model and return its hold-out AUC.

    A fixed stratified 80/20 split (``random_state=42``) is used for every
    trial, so trials are compared on the same validation rows. Note that the
    returned AUC is measured on the same hold-out that drives early
    stopping, so it is a slightly optimistic estimate.

    Args:
        trial: ``optuna.trial.Trial`` used to sample hyperparameters.
        X: Training feature matrix.
        y: Binary training labels aligned with ``X``.

    Returns:
        float: ROC AUC of the fitted model on the validation split.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.0025, 0.1, log=True),
        # Upper bound on boosting rounds; early stopping usually ends sooner
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 0.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 2.0),
        'random_state': 42,
        'n_jobs': -1,
        'device': 'cuda',
        'tree_method': 'hist',
        # Early stopping is configured on the estimator: passing it to fit()
        # was deprecated in XGBoost 1.6 and later removed, while the `device`
        # parameter above already requires XGBoost 2.x.
        'early_stopping_rounds': 100,
    }

    # Same split for every trial so scores are comparable
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42)

    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    preds = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y_val, preds)

In [None]:
# Define function to run Optuna optimization
def optimize_hyperparameters(X, y, n_trials=30, seed=42):
    """Search XGBoost hyperparameters with Optuna and report the best trial.

    Args:
        X: Training feature matrix forwarded to ``objective``.
        y: Binary training labels forwarded to ``objective``.
        n_trials: Number of Optuna trials to run.
        seed: Seed for the TPE sampler so repeated runs propose the same
            sequence of hyperparameters. Matches the ``random_state=42``
            used by the rest of the pipeline.

    Returns:
        dict: The sampled parameters of the best trial
        (``study.best_params``).
    """
    # TPE is Optuna's default sampler; it is created explicitly only to
    # pin its seed, since an unseeded study differs on every run.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(lambda trial: objective(trial, X, y), n_trials=n_trials)

    # Print best parameters and score
    print("=" * 80)
    print("ðŸŽ¯ Optuna Hyperparameter Optimization Complete")
    print(f"Best AUC Score: {study.best_value:.6f}")
    print("Best Parameters:")
    for key, val in study.best_params.items():
        print(f"{key}: {val}")
    print("=" * 80)

    return study.best_params

In [None]:
# Define XGBoost model training with optimized parameters
def train_xgboost(X, y, X_test, best_params, n_splits=7):
    """Train XGBoost with stratified K-fold CV and average test predictions.

    Args:
        X: Training feature matrix (indexed positionally via ``.iloc``).
        y: Binary training labels aligned with ``X``.
        X_test: Test feature matrix with the same columns as ``X``.
        best_params: Tuned hyperparameters, e.g. the dict returned by the
            Optuna search. It is not modified.
        n_splits: Number of stratified folds (default 7, the value that was
            previously hard-coded).

    Returns:
        tuple: ``(oof, test_pred, cv_auc, mean_auc, std_auc)`` where
        ``oof`` holds the out-of-fold probabilities for every training row,
        ``test_pred`` is the mean test probability across folds,
        ``cv_auc`` is the AUC over all out-of-fold predictions, and
        ``mean_auc``/``std_auc`` summarise the per-fold AUCs.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Fixed settings override anything with the same key in best_params
    params = best_params.copy()
    params.update({
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'random_state': 42,
        'n_jobs': -1,
        'device': 'cuda',
        'tree_method': 'hist',
        # Configured on the estimator: fit(early_stopping_rounds=...) was
        # deprecated in XGBoost 1.6 and later removed, while `device`
        # already requires XGBoost 2.x.
        'early_stopping_rounds': 100,
    })

    oof = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))
    scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

        oof[val_idx] = model.predict_proba(X_val)[:, 1]
        # Each fold contributes an equal share to the final test prediction
        test_pred += model.predict_proba(X_test)[:, 1] / n_splits
        scores.append(roc_auc_score(y_val, oof[val_idx]))

    cv_auc = roc_auc_score(y, oof)
    mean_auc = np.mean(scores)
    std_auc = np.std(scores)

    return oof, test_pred, cv_auc, mean_auc, std_auc

In [None]:
# Define model evaluation and visualization
def evaluate_model(oof, y):
    """Show a confusion matrix and ROC curve for out-of-fold predictions.

    Probabilities are turned into class labels with a 0.5 cut-off for the
    confusion matrix and accuracy; the ROC curve and AUC use the raw
    probabilities. The figure is displayed and nothing is returned.

    Args:
        oof: Array of predicted probabilities for the positive class.
        y: True binary labels aligned with ``oof``.
    """
    from sklearn.metrics import confusion_matrix, roc_curve, accuracy_score

    # Hard labels at the 0.5 threshold
    hard_labels = (oof > 0.5).astype(int)
    conf_mat = confusion_matrix(y, hard_labels)
    accuracy = accuracy_score(y, hard_labels)

    # Threshold-free metrics from the probabilities
    false_pos_rate, true_pos_rate, _ = roc_curve(y, oof)
    auc_value = roc_auc_score(y, oof)

    figure = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Confusion Matrix', 'ROC Curve'),
        specs=[[{'type': 'heatmap'}, {'type': 'scatter'}]],
    )

    heatmap_trace = go.Heatmap(
        z=conf_mat,
        x=['Pred Default', 'Pred Paid'],
        y=['Act Default', 'Act Paid'],
        showscale=False,
    )
    roc_trace = go.Scatter(
        x=false_pos_rate,
        y=true_pos_rate,
        mode='lines',
        name=f'XGBoost (AUC={auc_value:.4f})',
        line=dict(width=3),
    )
    # Diagonal reference line for a random classifier
    chance_trace = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        mode='lines',
        name='Random',
        line=dict(width=2, dash='dash'),
    )

    # Left panel: heatmap; right panel: ROC curve then the diagonal
    for trace, column in ((heatmap_trace, 1), (roc_trace, 2), (chance_trace, 2)):
        figure.add_trace(trace, row=1, col=column)

    figure.update_layout(
        title=f'XGBoost | AUC: {auc_value:.6f} | Accuracy: {accuracy:.4f}',
        height=500,
    )
    figure.show()

In [None]:
# Define the submission generator
def save_submission(test, target, test_pred):
    """Write ``submission.csv`` in the working directory and print a summary.

    Args:
        test: Frame providing the ``id`` column for the submission.
        target: Name of the prediction column in the output file.
        test_pred: Predicted values, one per row of ``test``.
    """
    banner = "=" * 80

    # Two-column frame: row identifier plus the prediction
    submission = pd.DataFrame({'id': test['id'], target: test_pred})
    submission.to_csv('submission.csv', index=False)

    row_count = len(submission)
    summary_lines = (
        banner,
        "ðŸ“ˆ XGBoost Model Summary",
        f"File Saved: submission.csv ({row_count} rows)",
        banner,
    )
    for line in summary_lines:
        print(line)

In [None]:
# Define the main function
def main():
    """Run the whole pipeline: load, preprocess, tune, train, evaluate, save."""
    banner = "=" * 80

    # Raw inputs plus the label column name
    train, test, orig, target = load_data()

    # Model-ready matrices
    features, labels, test_features = preprocess_data(train, test, orig, target)

    # Hyperparameter search followed by cross-validated training
    tuned_params = optimize_hyperparameters(features, labels, n_trials=768)
    cv_result = train_xgboost(features, labels, test_features, tuned_params)
    oof, test_pred, cv_auc, mean_auc, std_auc = cv_result

    # Diagnostic plots on the out-of-fold predictions
    evaluate_model(oof, labels)

    # Cross-validation report
    report = [
        banner,
        "âœ… Cross-Validation Results",
        f"OOF AUC: {cv_auc:.6f}",
        f"Mean Fold AUC: {mean_auc:.6f} Â± {std_auc:.6f}",
        banner,
    ]
    print("\n".join(report))

    # Submission file uses the ids from the loaded test frame
    save_submission(test, target, test_pred)

In [None]:
# Run the full pipeline when this file is executed as the entry point
# (script run or notebook cell execution); the guard keeps a plain import
# of the module from starting the tuning and training run.
if __name__ == "__main__":
    main()