# üöÄ Advanced Improvements for Loan Repayment Prediction

**Goal:** Push AUC-ROC score beyond 0.92178 using advanced techniques

This notebook implements:
1. Advanced Feature Engineering
2. Ensemble Methods (Stacking/Blending)
3. Class Imbalance Handling
4. Feature Selection
5. Advanced Hyperparameter Tuning


## üì¶ Import Libraries


In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("‚úÖ All libraries imported successfully!")


‚úÖ All libraries imported successfully!


## üì• Load Data


In [2]:
# Load datasets
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

print(f"üìä Training set shape: {train.shape}")
print(f"üìä Test set shape: {test.shape}")
print(f"\n‚úÖ Data loaded successfully!")


üìä Training set shape: (593994, 13)
üìä Test set shape: (254569, 12)

‚úÖ Data loaded successfully!


## üßπ Data Cleaning


In [3]:
# Create copies for cleaning
train_clean = train.copy()
test_clean = test.copy()

# Check for duplicates
train_duplicates = train_clean.duplicated().sum()
test_duplicates = test_clean.duplicated().sum()

if train_duplicates > 0:
    train_clean = train_clean.drop_duplicates()
    print(f"‚úÖ Removed {train_duplicates} duplicates from training set")
else:
    print("‚úÖ No duplicates found in training set")

if test_duplicates > 0:
    test_clean = test_clean.drop_duplicates()
    print(f"‚úÖ Removed {test_duplicates} duplicates from test set")
else:
    print("‚úÖ No duplicates found in test set")

print(f"\nFinal shapes:")
print(f"Train: {train_clean.shape}")
print(f"Test: {test_clean.shape}")


‚úÖ No duplicates found in training set
‚úÖ No duplicates found in test set

Final shapes:
Train: (593994, 13)
Test: (254569, 12)


## üîß Basic Feature Engineering


In [4]:
# Separate features and target
X_train = train_clean.drop(['id', 'loan_paid_back'], axis=1)
y_train = train_clean['loan_paid_back']
X_test = test_clean.drop('id', axis=1)

print(f"Training features shape: {X_train.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Test features shape: {X_test.shape}")


Training features shape: (593994, 11)
Training target shape: (593994,)
Test features shape: (254569, 11)


In [5]:
# Create basic features
print("=" * 60)
print("üîß CREATING BASIC FEATURES")
print("=" * 60)

def create_features(df):
    """Create new features from existing ones"""
    df = df.copy()
    
    # Income to loan ratio (handle division by zero and NaN)
    df['income_to_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['income_to_loan_ratio'] = df['income_to_loan_ratio'].replace([np.inf, -np.inf], 0).fillna(0)
    
    # Monthly payment estimate (simplified)
    df['monthly_payment_estimate'] = df['loan_amount'] * (df['interest_rate'] / 100 / 12)
    df['monthly_payment_estimate'] = df['monthly_payment_estimate'].fillna(0)
    
    # Payment to income ratio (handle division by zero and NaN)
    df['payment_to_income_ratio'] = df['monthly_payment_estimate'] / (df['annual_income'] / 12 + 1)
    df['payment_to_income_ratio'] = df['payment_to_income_ratio'].replace([np.inf, -np.inf], 0).fillna(0)
    
    # Credit score categories (handle edge cases)
    df['credit_score_category'] = pd.cut(
        df['credit_score'], 
        bins=[0, 580, 670, 740, float('inf')], 
        labels=['Poor', 'Fair', 'Good', 'Excellent'],
        include_lowest=True,
        ordered=True
    )
    df['credit_score_category'] = df['credit_score_category'].fillna('Fair')
    
    # Interest rate categories
    df['interest_rate_category'] = pd.cut(
        df['interest_rate'],
        bins=[0, 10, 13, 16, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True,
        ordered=True
    )
    df['interest_rate_category'] = df['interest_rate_category'].fillna('Medium')

    # Loan amount categories
    df['loan_amount_category'] = pd.cut(
        df['loan_amount'],
        bins=[0, 5000, 15000, 30000, float('inf')],
        labels=['Small', 'Medium', 'Large', 'Very Large'],
        include_lowest=True,
        ordered=True
    )
    df['loan_amount_category'] = df['loan_amount_category'].fillna('Medium')
    
    # DTI risk level
    df['dti_risk_level'] = pd.cut(
        df['debt_to_income_ratio'],
        bins=[0, 0.2, 0.4, 0.6, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
        include_lowest=True,
        ordered=True
    )
    df['dti_risk_level'] = df['dti_risk_level'].fillna('Medium')
    
    # Extract grade and subgrade from grade_subgrade
    df['grade'] = df['grade_subgrade'].str[0]
    df['subgrade'] = pd.to_numeric(df['grade_subgrade'].str[1:], errors='coerce').fillna(0).astype(int)
    if 'grade_subgrade' in df.columns:
        df = df.drop('grade_subgrade', axis=1)
    
    # Employment risk (Unemployed/Retired might be riskier)
    df['employment_risk'] = df['employment_status'].map({
        'Employed': 0,
        'Self-employed': 1,
        'Unemployed': 2,
        'Retired': 1,
        'Student': 1
    })
    df['employment_risk'] = df['employment_risk'].fillna(1)
    
    # Education level encoding (ordinal)
    education_order = {
        'High School': 1,
        "Bachelor's": 2,
        "Master's": 3,
        'PhD': 4,
        'Other': 0
    }
    df['education_encoded'] = df['education_level'].map(education_order)
    df['education_encoded'] = df['education_encoded'].fillna(0)
    
    # Final check: replace any remaining NaN, inf, or -inf values in numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], 0).fillna(0)
    
    return df

# Apply feature engineering
X_train_fe = create_features(X_train)
X_test_fe = create_features(X_test)

print("‚úÖ Basic features created!")
print(f"New feature shape: {X_train_fe.shape}")


üîß CREATING BASIC FEATURES
‚úÖ Basic features created!
New feature shape: (593994, 21)


---

## üöÄ Advanced Feature Engineering


In [6]:
# Advanced Feature Engineering - Part 2
print("=" * 60)
print("üîß ADVANCED FEATURE ENGINEERING")
print("=" * 60)

def create_advanced_features(df):
    """Create advanced interaction and polynomial features"""
    df = df.copy()
    
    # Interaction features between important variables
    df['credit_score_x_interest_rate'] = df['credit_score'] * df['interest_rate']
    df['credit_score_x_dti'] = df['credit_score'] * df['debt_to_income_ratio']
    df['income_x_credit_score'] = df['annual_income'] * df['credit_score']
    df['loan_x_interest'] = df['loan_amount'] * df['interest_rate']
    df['dti_x_interest'] = df['debt_to_income_ratio'] * df['interest_rate']
    
    # Ratio features
    df['credit_to_loan_ratio'] = df['credit_score'] / (df['loan_amount'] + 1)
    df['credit_to_income_ratio'] = df['credit_score'] / (df['annual_income'] + 1)
    
    # Polynomial features for key variables
    df['credit_score_squared'] = df['credit_score'] ** 2
    df['dti_squared'] = df['debt_to_income_ratio'] ** 2
    df['interest_rate_squared'] = df['interest_rate'] ** 2
    
    # Log transformations (handle zeros)
    df['log_annual_income'] = np.log1p(df['annual_income'])
    df['log_loan_amount'] = np.log1p(df['loan_amount'])
    df['log_credit_score'] = np.log1p(df['credit_score'])
    
    # Risk score combinations
    df['risk_score'] = (df['debt_to_income_ratio'] * 0.4 + 
                       (1 - df['credit_score'] / 850) * 0.3 + 
                       (df['interest_rate'] / 30) * 0.3)
    
    # Affordability metrics
    df['affordability_score'] = (df['annual_income'] / 12) / (df['monthly_payment_estimate'] + 1)
    df['debt_service_ratio'] = df['debt_to_income_ratio'] * df['interest_rate'] / 100
    
    # Replace inf and NaN
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], 0).fillna(0)
    
    return df

# Apply advanced feature engineering
X_train_advanced = create_advanced_features(X_train_fe)
X_test_advanced = create_advanced_features(X_test_fe)

print("‚úÖ Advanced features created:")
print(f"  - Interaction features (credit_score √ó interest_rate, etc.)")
print(f"  - Polynomial features (squared terms)")
print(f"  - Log transformations")
print(f"  - Risk and affordability scores")
print(f"\nNew feature shape: {X_train_advanced.shape}")


üîß ADVANCED FEATURE ENGINEERING
‚úÖ Advanced features created:
  - Interaction features (credit_score √ó interest_rate, etc.)
  - Polynomial features (squared terms)
  - Log transformations
  - Risk and affordability scores

New feature shape: (593994, 37)


## üè∑Ô∏è Categorical Encoding


In [7]:
# Re-encode categorical variables with new features
print("=" * 60)
print("üè∑Ô∏è  RE-ENCODING WITH ADVANCED FEATURES")
print("=" * 60)

# Define nominal variables for one-hot encoding
nominal_vars = ['gender', 'marital_status', 'education_level', 'employment_status', 
                'loan_purpose', 'grade', 'credit_score_category', 
                'interest_rate_category', 'loan_amount_category', 'dti_risk_level']

# One-hot encode again with new features
X_train_advanced_encoded = pd.get_dummies(X_train_advanced, columns=nominal_vars, 
                                          prefix=nominal_vars, drop_first=True)
X_test_advanced_encoded = pd.get_dummies(X_test_advanced, columns=nominal_vars, 
                                         prefix=nominal_vars, drop_first=True)

# Align columns
missing_cols = set(X_train_advanced_encoded.columns) - set(X_test_advanced_encoded.columns)
for col in missing_cols:
    X_test_advanced_encoded[col] = 0
X_test_advanced_encoded = X_test_advanced_encoded[X_train_advanced_encoded.columns]

# Scale numerical features
numerical_features_advanced = [col for col in X_train_advanced_encoded.columns 
                              if col not in [c for c in X_train_advanced_encoded.columns 
                                            if any(x in c for x in nominal_vars)]]
numerical_features_advanced = [col for col in numerical_features_advanced 
                               if X_train_advanced_encoded[col].dtype in ['int64', 'float64']]

scaler_advanced = StandardScaler()
X_train_advanced_scaled = X_train_advanced_encoded.copy()
X_test_advanced_scaled = X_test_advanced_encoded.copy()

X_train_advanced_scaled[numerical_features_advanced] = scaler_advanced.fit_transform(
    X_train_advanced_encoded[numerical_features_advanced])
X_test_advanced_scaled[numerical_features_advanced] = scaler_advanced.transform(
    X_test_advanced_encoded[numerical_features_advanced])

# Final cleanup
X_train_advanced_scaled = X_train_advanced_scaled.replace([np.inf, -np.inf], 0).fillna(0)
X_test_advanced_scaled = X_test_advanced_scaled.replace([np.inf, -np.inf], 0).fillna(0)

print(f"‚úÖ Advanced features encoded and scaled!")
print(f"Final shape: {X_train_advanced_scaled.shape}")


üè∑Ô∏è  RE-ENCODING WITH ADVANCED FEATURES
‚úÖ Advanced features encoded and scaled!
Final shape: (593994, 64)


## üéØ Feature Selection


In [8]:
# Feature Selection - Remove less important features
print("=" * 60)
print("üéØ FEATURE SELECTION")
print("=" * 60)

# Use a quick model to get feature importance
temp_model = xgb.XGBClassifier(n_estimators=50, random_state=42, eval_metric='logloss')
temp_model.fit(X_train_advanced_scaled, y_train)

feature_importance_df = pd.DataFrame({
    'feature': X_train_advanced_scaled.columns,
    'importance': temp_model.feature_importances_
}).sort_values('importance', ascending=False)

# Select top features (keep top 90%)
importance_threshold = feature_importance_df['importance'].quantile(0.1)  # Keep top 90%
selected_features = feature_importance_df[feature_importance_df['importance'] > importance_threshold]['feature'].tolist()

print(f"Original features: {len(X_train_advanced_scaled.columns)}")
print(f"Selected features: {len(selected_features)}")
print(f"Features removed: {len(X_train_advanced_scaled.columns) - len(selected_features)}")

X_train_selected = X_train_advanced_scaled[selected_features]
X_test_selected = X_test_advanced_scaled[selected_features]

print(f"\n‚úÖ Feature selection completed!")
print(f"Selected features shape: {X_train_selected.shape}")


üéØ FEATURE SELECTION
Original features: 64
Selected features: 48
Features removed: 16

‚úÖ Feature selection completed!
Selected features shape: (593994, 48)


## ü§ñ Ensemble Model Training


In [9]:
# Split data for ensemble training
X_train_ens, X_val_ens, y_train_ens, y_val_ens = train_test_split(
    X_train_selected, y_train, test_size=0.2, random_state=42, stratify=y_train
)

print(f"Ensemble training set: {X_train_ens.shape}")
print(f"Ensemble validation set: {X_val_ens.shape}")


Ensemble training set: (475195, 48)
Ensemble validation set: (118799, 48)


In [10]:
# Train multiple models for ensemble with class weights
print("=" * 60)
print("ü§ñ TRAINING ENSEMBLE MODELS")
print("=" * 60)

# Calculate class weights for imbalanced data
class_weights = compute_sample_weight('balanced', y_train_ens)

# Train base models with better hyperparameters
base_models = {}

# XGBoost with class weights
print("Training XGBoost...")
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=len(y_train_ens[y_train_ens==0]) / len(y_train_ens[y_train_ens==1]),
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1
)
xgb_model.fit(X_train_ens, y_train_ens, sample_weight=class_weights)
base_models['XGBoost'] = xgb_model
xgb_pred = xgb_model.predict_proba(X_val_ens)[:, 1]
print(f"  XGBoost Val AUC: {roc_auc_score(y_val_ens, xgb_pred):.4f}")

# LightGBM with class weights
print("Training LightGBM...")
lgb_model = lgb.LGBMClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    num_leaves=50,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_samples=20,
    reg_alpha=0.1,
    reg_lambda=1,
    scale_pos_weight=len(y_train_ens[y_train_ens==0]) / len(y_train_ens[y_train_ens==1]),
    random_state=42,
    verbose=-1,
    n_jobs=-1
)
lgb_model.fit(X_train_ens, y_train_ens, sample_weight=class_weights)
base_models['LightGBM'] = lgb_model
lgb_pred = lgb_model.predict_proba(X_val_ens)[:, 1]
print(f"  LightGBM Val AUC: {roc_auc_score(y_val_ens, lgb_pred):.4f}")

# CatBoost with class weights
print("Training CatBoost...")
cat_model = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=3,
    subsample=0.9,
    colsample_bylevel=0.9,
    scale_pos_weight=len(y_train_ens[y_train_ens==0]) / len(y_train_ens[y_train_ens==1]),
    random_state=42,
    verbose=False,
    thread_count=-1
)
cat_model.fit(X_train_ens, y_train_ens, sample_weight=class_weights)
base_models['CatBoost'] = cat_model
cat_pred = cat_model.predict_proba(X_val_ens)[:, 1]
print(f"  CatBoost Val AUC: {roc_auc_score(y_val_ens, cat_pred):.4f}")

print("\n‚úÖ Base models trained!")


ü§ñ TRAINING ENSEMBLE MODELS
Training XGBoost...
  XGBoost Val AUC: 0.9203
Training LightGBM...
  LightGBM Val AUC: 0.9202
Training CatBoost...
  CatBoost Val AUC: 0.9180

‚úÖ Base models trained!


## üìö Stacking Ensemble


In [11]:
# Create stacking ensemble using cross-validation
print("=" * 60)
print("üìö CREATING STACKING ENSEMBLE")
print("=" * 60)

# Generate out-of-fold predictions for stacking
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

stack_train = np.zeros((X_train_ens.shape[0], len(base_models)))
stack_val = np.zeros((X_val_ens.shape[0], len(base_models)))

for idx, (name, model) in enumerate(base_models.items()):
    print(f"Generating OOF predictions for {name}...")
    # Out-of-fold predictions for training set
    oof_preds = cross_val_predict(model, X_train_ens, y_train_ens, 
                                   cv=skf, method='predict_proba', n_jobs=-1)[:, 1]
    stack_train[:, idx] = oof_preds
    # Direct predictions for validation set
    stack_val[:, idx] = model.predict_proba(X_val_ens)[:, 1]

# Train meta-learner (Logistic Regression)
print("\nTraining meta-learner...")
meta_learner = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
meta_learner.fit(stack_train, y_train_ens)

# Evaluate stacking ensemble
stack_val_pred = meta_learner.predict_proba(stack_val)[:, 1]
stack_auc = roc_auc_score(y_val_ens, stack_val_pred)
print(f"‚úÖ Stacking Ensemble Val AUC: {stack_auc:.4f}")

# Also try simple averaging
avg_val_pred = np.mean(stack_val, axis=1)
avg_auc = roc_auc_score(y_val_ens, avg_val_pred)
print(f"‚úÖ Simple Average Val AUC: {avg_auc:.4f}")

# Use the better method
if stack_auc >= avg_auc:
    use_stacking = True
    print(f"\nüèÜ Using Stacking Ensemble (AUC: {stack_auc:.4f})")
else:
    use_stacking = False
    print(f"\nüèÜ Using Simple Average (AUC: {avg_auc:.4f})")


üìö CREATING STACKING ENSEMBLE
Generating OOF predictions for XGBoost...
Generating OOF predictions for LightGBM...
Generating OOF predictions for CatBoost...

Training meta-learner...
‚úÖ Stacking Ensemble Val AUC: 0.9204
‚úÖ Simple Average Val AUC: 0.9201

üèÜ Using Stacking Ensemble (AUC: 0.9204)


## üéØ Final Ensemble Training on Full Data


In [12]:
# Train final ensemble on full dataset
print("=" * 60)
print("üéØ TRAINING FINAL ENSEMBLE ON FULL DATA")
print("=" * 60)

# Retrain base models on full training data
final_base_models = {}
class_weights_full = compute_sample_weight('balanced', y_train)

print("Retraining base models on full dataset...")
for name, model_template in base_models.items():
    print(f"  Retraining {name}...")
    if name == 'XGBoost':
        final_model = xgb.XGBClassifier(
            n_estimators=500, max_depth=6, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9, min_child_weight=3,
            gamma=0.1, reg_alpha=0.1, reg_lambda=1,
            scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),
            random_state=42, eval_metric='logloss', n_jobs=-1
        )
    elif name == 'LightGBM':
        final_model = lgb.LGBMClassifier(
            n_estimators=500, max_depth=6, learning_rate=0.05,
            num_leaves=50, subsample=0.9, colsample_bytree=0.9,
            min_child_samples=20, reg_alpha=0.1, reg_lambda=1,
            scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),
            random_state=42, verbose=-1, n_jobs=-1
        )
    else:  # CatBoost
        final_model = CatBoostClassifier(
            iterations=500, depth=6, learning_rate=0.05,
            l2_leaf_reg=3, subsample=0.9, colsample_bylevel=0.9,
            scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),
            random_state=42, verbose=False, thread_count=-1
        )
    
    final_model.fit(X_train_selected, y_train, sample_weight=class_weights_full)
    final_base_models[name] = final_model

# Generate OOF predictions for meta-learner training
print("\nGenerating OOF predictions for meta-learner...")
stack_full_train = np.zeros((X_train_selected.shape[0], len(final_base_models)))

for idx, (name, model) in enumerate(final_base_models.items()):
    print(f"  {name}...")
    oof_preds = cross_val_predict(model, X_train_selected, y_train, 
                                   cv=skf, method='predict_proba', n_jobs=-1)[:, 1]
    stack_full_train[:, idx] = oof_preds

# Train final meta-learner
final_meta_learner = LogisticRegression(random_state=42, max_iter=1000, C=0.1)
final_meta_learner.fit(stack_full_train, y_train)

print("\n‚úÖ Final ensemble trained!")

# Cross-validation score
cv_scores_ensemble = cross_val_score(
    final_meta_learner, stack_full_train, y_train, 
    cv=skf, scoring='roc_auc', n_jobs=-1
)
print(f"‚úÖ Final Ensemble CV AUC: {cv_scores_ensemble.mean():.4f} (+/- {cv_scores_ensemble.std() * 2:.4f})")


üéØ TRAINING FINAL ENSEMBLE ON FULL DATA
Retraining base models on full dataset...
  Retraining XGBoost...
  Retraining LightGBM...
  Retraining CatBoost...

Generating OOF predictions for meta-learner...
  XGBoost...
  LightGBM...
  CatBoost...

‚úÖ Final ensemble trained!
‚úÖ Final Ensemble CV AUC: 0.9210 (+/- 0.0018)


## üîÆ Making Final Predictions


In [13]:
# Make final predictions
print("=" * 60)
print("üîÆ MAKING FINAL PREDICTIONS")
print("=" * 60)

# Generate predictions from base models
test_predictions_base = np.zeros((X_test_selected.shape[0], len(final_base_models)))

for idx, (name, model) in enumerate(final_base_models.items()):
    print(f"Generating {name} predictions...")
    test_predictions_base[:, idx] = model.predict_proba(X_test_selected)[:, 1]

# Combine using meta-learner or simple average
if use_stacking:
    final_test_predictions = final_meta_learner.predict_proba(test_predictions_base)[:, 1]
    print("Using stacking ensemble for final predictions")
else:
    final_test_predictions = np.mean(test_predictions_base, axis=1)
    print("Using simple average for final predictions")

print(f"\nPredictions shape: {final_test_predictions.shape}")
print(f"Prediction range: [{final_test_predictions.min():.4f}, {final_test_predictions.max():.4f}]")
print(f"Mean prediction: {final_test_predictions.mean():.4f}")

# Create submission file
submission_improved = pd.DataFrame({
    'id': test_clean['id'],
    'loan_paid_back': final_test_predictions
})

print(f"\n‚úÖ Improved submission file created!")
print(f"Submission shape: {submission_improved.shape}")
submission_improved.head(10)


üîÆ MAKING FINAL PREDICTIONS
Generating XGBoost predictions...
Generating LightGBM predictions...
Generating CatBoost predictions...
Using stacking ensemble for final predictions

Predictions shape: (254569,)
Prediction range: [0.1106, 0.9890]
Mean prediction: 0.6410

‚úÖ Improved submission file created!
Submission shape: (254569, 2)


Unnamed: 0,id,loan_paid_back
0,593994,0.671066
1,593995,0.954491
2,593996,0.154719
3,593997,0.660625
4,593998,0.858155
5,593999,0.93302
6,594000,0.971704
7,594001,0.91576
8,594002,0.797269
9,594003,0.110673


In [14]:
# Save improved submission
submission_improved.to_csv('submission.csv', index=False)
print("‚úÖ Improved submission saved to 'submission.csv'")
print("\nüìä Summary:")
print(f"  Final Ensemble CV AUC: {cv_scores_ensemble.mean():.4f} (+/- {cv_scores_ensemble.std() * 2:.4f})")
print(f"  Expected improvement over baseline: +{cv_scores_ensemble.mean() - 0.92178:.4f}")


‚úÖ Improved submission saved to 'submission.csv'

üìä Summary:
  Final Ensemble CV AUC: 0.9210 (+/- 0.0018)
  Expected improvement over baseline: +-0.0008
