In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from lightgbm import LGBMClassifier
import lightgbm as lgb
from cuml.preprocessing.TargetEncoder import TargetEncoder
import warnings
import gc
warnings.filterwarnings('ignore')

In [None]:
# Stage banner, printed as three lines.
print("=" * 80, "LOADING DATA", "=" * 80, sep="\n")

# Competition train/test files.
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
# Extra labelled rows from a separate dataset; later appended to every training fold.
orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')
# Submission template; its target column is overwritten in the final cell.
submission = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

print(f'Train: {train.shape}, Test: {test.shape}, Orig: {orig.shape}')

In [None]:
def downcasting(data, verbose=True):
    """Downcast every numeric column of ``data`` to a smaller dtype, in place.

    Integer columns go through ``pd.to_numeric(..., downcast="integer")`` and
    float columns through ``downcast="float"``; non-numeric columns are untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the memory footprint before and after, plus the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    def _megabytes(frame):
        # Total of the per-column (and index) byte counts, expressed in MiB.
        return frame.memory_usage().sum() / 1024**2

    mem_before = _megabytes(data)
    if verbose:
        print(f"Memory: {mem_before:.2f} MB", end=" → ")

    for col in data.select_dtypes(include=["number"]).columns:
        column = data[col]
        if pd.api.types.is_integer_dtype(column):
            shrink_to = "integer"
        elif pd.api.types.is_float_dtype(column):
            shrink_to = "float"
        else:
            # Numeric but neither integer nor float: leave as is.
            continue
        data[col] = pd.to_numeric(column, downcast=shrink_to)

    mem_after = _megabytes(data)
    if verbose:
        print(f"{mem_after:.2f} MB (↓{(100 * (mem_before - mem_after) / mem_before):.1f}%)")
    return data

# Shrink numeric dtypes of the three frames, in the order train, test, orig.
train, test, orig = (downcasting(frame) for frame in (train, test, orig))

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Label column the models are trained to predict.
target = 'loan_paid_back'

# Raw categorical input columns.
CATS = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]

# Raw numeric input columns.
NUMS = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
    'age',
]

In [None]:
print('\n' + '='*80)
print('HIGH-VALUE FEATURE ENGINEERING')
print('='*80)

# Test rows get a sentinel label so train, test and orig share one column layout.
test[target] = -1
combine = pd.concat([train, test, orig], axis=0, ignore_index=True)

# downcasting() may have shrunk integer columns to int8/int16. Squaring or multiplying
# such columns stays in the small dtype and wraps around silently (850**2 does not fit
# in int16), so the overflow-prone arithmetic below is done on float64 copies.
credit_score_f = combine['credit_score'].astype('float64')
age_f = combine['age'].astype('float64')

# --- ratio / burden features -------------------------------------------------
combine['financial_health'] = (combine['credit_score'] / 850) * (1 - combine['debt_to_income_ratio'])
# "+ 1" / "+ 0.1" in the denominators guard against division by zero.
combine['loan_burden'] = combine['loan_amount'] / (combine['annual_income'] + 1)
combine['monthly_burden'] = (combine['loan_amount'] * combine['interest_rate'] / 1200) / ((combine['annual_income'] / 12) + 1)
combine['credit_power'] = combine['credit_score'] / (combine['interest_rate'] + 0.1)
combine['income_efficiency'] = combine['annual_income'] * (1 - combine['debt_to_income_ratio'])

# --- binary risk flags -------------------------------------------------------
combine['high_risk'] = ((combine['debt_to_income_ratio'] > 0.4) & (combine['credit_score'] < 650)).astype(np.int8)
combine['low_risk'] = ((combine['debt_to_income_ratio'] < 0.3) & (combine['credit_score'] > 700)).astype(np.int8)

# --- grade decomposition: first character is the letter, second the digit ----
combine['grade_letter'] = combine['grade_subgrade'].str[0]
combine['grade_number'] = combine['grade_subgrade'].str[1].astype(int)
grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
combine['grade_rank'] = combine['grade_letter'].map(grade_map)
combine['grade_score'] = combine['grade_rank'] * 10 + combine['grade_number']

# --- log transforms ----------------------------------------------------------
combine['log_income'] = np.log1p(combine['annual_income'])
combine['log_loan'] = np.log1p(combine['loan_amount'])
combine['log_credit'] = np.log1p(combine['credit_score'])

# --- polynomial terms --------------------------------------------------------
combine['credit_squared'] = credit_score_f ** 2
combine['debt_squared'] = combine['debt_to_income_ratio'] ** 2

# --- pairwise numeric interactions --------------------------------------------
combine['income_credit'] = combine['log_income'] * combine['credit_score'] / 1000
combine['debt_loan'] = combine['debt_to_income_ratio'] * combine['log_loan']
combine['rate_burden'] = combine['interest_rate'] * combine['loan_burden']

combine['risk_score'] = (combine['debt_to_income_ratio'] * combine['interest_rate'] * 100) / (combine['credit_score'] + 1)
combine['affordability_score'] = (combine['annual_income'] / 12) / (combine['loan_amount'] * combine['interest_rate'] / 1200 + 1)

combine['age_income_ratio'] = combine['age'] / (combine['log_income'] + 1)
combine['credit_age_interaction'] = credit_score_f * age_f / 1000

NEW_NUM_FEATURES = ['financial_health', 'loan_burden', 'monthly_burden', 'credit_power',
                    'income_efficiency', 'log_income', 'log_loan', 'log_credit',
                    'credit_squared', 'debt_squared', 'income_credit', 'debt_loan', 
                    'rate_burden', 'grade_number', 'grade_rank', 'grade_score',
                    'high_risk', 'low_risk', 'risk_score', 'affordability_score',
                    'age_income_ratio', 'credit_age_interaction']

NEW_CAT_FEATURES = ['grade_letter']

print(f'Created {len(NEW_NUM_FEATURES)} high-value numeric features')

# Register the new categorical columns exactly once, so executing this statement
# again cannot leave duplicate entries in CATS (which would duplicate every
# downstream interaction built from it).
for new_cat in NEW_CAT_FEATURES:
    if new_cat not in CATS:
        CATS.append(new_cat)

In [None]:
print('\n' + '='*80)
print('FACTORIZATION')
print('='*80)

# CATS1: names of the integer-coded copies of the numeric columns.
# SIZES: number of distinct codes per coded column (max code + 1).
CATS1 = []
SIZES = {}

for source_col in NUMS + CATS:
    from_numeric = source_col in NUMS
    # Numeric columns get a separate "<name>_cat" column; categorical columns are
    # replaced by their integer codes under the same name.
    code_col = f"{source_col}_cat" if from_numeric else source_col
    if from_numeric:
        CATS1.append(code_col)

    combine[code_col] = combine[source_col].factorize()[0]
    SIZES[code_col] = combine[code_col].max() + 1
    combine[code_col] = combine[code_col].astype('int32')

print(f'Factorized {len(NUMS)} nums → {len(CATS1)} categorical versions')

In [None]:
print('\n' + '='*80)
print('2-WAY INTERACTIONS')
print('='*80)

# Every unordered pair of integer-coded columns becomes one combined key column.
pairs = list(combinations(CATS + CATS1, 2))
new_cols = {}
CATS2 = []

for c1, c2 in pairs:
    name = "_".join(sorted((c1, c2)))
    # key = code1 * (#levels of c2) + code2 is unique per (code1, code2) pair only while
    # it fits in the integer dtype. The codes are stored as int32, and depending on the
    # NumPy/pandas promotion rules in use the product can stay int32 and wrap around for
    # two high-cardinality columns, silently merging unrelated pairs. Widen to int64 first.
    left_codes = combine[c1].astype('int64')
    right_codes = combine[c2].astype('int64')
    new_cols[name] = left_codes * int(SIZES[c2]) + right_codes
    CATS2.append(name)

# Attach all pair columns with a single concat instead of one insert per column.
if new_cols:
    new_df = pd.DataFrame(new_cols)         
    combine = pd.concat([combine, new_df], axis=1) 
    del new_df
    gc.collect()

print(f'Created {len(CATS2)} 2-way interactions')

In [None]:
print('\n' + '='*80)
print('STRATEGIC 3-WAY INTERACTIONS')
print('='*80)

CATS3 = []
# Hand-picked column triples to combine into a single categorical key.
strategic_3way = [
    ('grade_subgrade', 'employment_status', 'loan_purpose'),
    ('grade_subgrade', 'education_level', 'loan_purpose'),
    ('employment_status', 'education_level', 'marital_status'),
]

for triple in strategic_3way:
    # Skip any triple that references a column not registered in CATS.
    if not all(col in CATS for col in triple):
        continue

    name = "_".join(triple)

    # Build the "a_b_c" string key left to right, then replace it by integer codes.
    key = combine[triple[0]].astype(str)
    for col in triple[1:]:
        key = key + '_' + combine[col].astype(str)

    combine[name] = key.factorize()[0]
    combine[name] = combine[name].astype('int32')
    CATS3.append(name)

print(f'Created {len(CATS3)} strategic 3-way interactions')

In [None]:
print('\n' + '='*80)
print('COUNT ENCODING')
print('='*80)

CE = []
CC = CATS + CATS1 + CATS2 + CATS3

# For each coded column, add "CE_<col>": the number of rows (with a non-null target)
# sharing that row's code. The columns are collected first and attached with ONE concat;
# merging the whole frame once per feature would copy `combine` len(CC) times.
count_cols = {}

for i, c in enumerate(CC):
    if i % 20 == 0:
        print(f'Progress: {i}/{len(CC)}', end='\r')
    counts = combine.groupby(c)[target].count().astype('int32')
    ce_name = f"CE_{c}"
    # Look up each row's code in the per-code counts (same result as a left merge on c).
    count_cols[ce_name] = combine[c].map(counts)
    CE.append(ce_name)

if count_cols:
    combine = pd.concat([combine, pd.DataFrame(count_cols)], axis=1)
    del count_cols
    gc.collect()

print(f'Created {len(CE)} count features                ')

In [None]:
# ============================================================================
# SPLIT DATA
# ============================================================================
# `combine` was stacked as [train, test, orig]; slice it back apart using the
# row counts of the frames it was built from.
n_train, n_test, n_orig = len(train), len(test), len(orig)

train = combine.iloc[:n_train].copy()
test = combine.iloc[n_train:n_train + n_test].copy()
orig = combine.iloc[-n_orig:].copy()

# The stacked frame is no longer needed; release it.
del combine
gc.collect()

print(f'\nTrain: {train.shape}, Test: {test.shape}, Orig: {orig.shape}')

In [None]:
# ============================================================================
# FEATURE SET
# ============================================================================
# Full model input: raw numerics, coded categoricals, coded numerics, 2-/3-way keys,
# count encodings and the engineered numeric features, in that order.
FEATURES = [*NUMS, *CATS, *CATS1, *CATS2, *CATS3, *CE, *NEW_NUM_FEATURES]
print(f'\nTotal Features: {len(FEATURES)}')

# Categorical columns that are NOT in any of the target-encoded groups.
CATS_FINAL = [
    col for col in CATS
    if col not in CATS1 and col not in CATS2 and col not in CATS3
]
print(f'Categorical features (non-TE): {len(CATS_FINAL)}')

In [None]:
# ============================================================================
# DATA LOADER FOR QUANTILE DMATRIX
# ============================================================================
class IterLoadForDMatrix(xgb.core.DataIter):
    """Batch iterator that hands a DataFrame to XGBoost in row slices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the label column.
    features : list of str
        Columns passed as ``data`` for each batch.
    target : str
        Column passed as ``label`` for each batch.
    batch_size : int, default 256*1024
        Maximum number of rows per batch.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Number of slices needed to cover every row (last one may be shorter).
        self.batches = int(np.ceil(len(df) / self.batch_size))
        # Index of the next batch to emit.
        self.it = 0
        super().__init__()

    def reset(self):
        """Rewind to the first batch."""
        self.it = 0

    def next(self, input_data):
        """Pass the next batch to ``input_data``; return 1 if one was sent, 0 when exhausted."""
        if self.it == self.batches:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = self.df.iloc[start:stop]
        input_data(data=chunk[self.features], label=chunk[self.target])
        self.it += 1
        return 1

In [None]:
print('\n' + '='*80)
print('STAGE 1: INITIAL TRAINING WITH STRATIFIED CV')
print('='*80)

FOLDS = 7
SEED = 42

params_xgb = {
    "objective": "binary:logistic",  
    "eval_metric": "auc",           
    "learning_rate": 0.0075,
    "max_depth": 0,
    "subsample": 0.76,
    "colsample_bytree": 0.66,
    "seed": SEED,
    "device": "cuda",
    "grow_policy": "lossguide", 
    "max_leaves": 34,          
    'scale_pos_weight': 0.86,
    "min_child_weight": 5,
    'lambda': 6.5, 
    'alpha': 3.2,
    'gamma': 0.55,
}

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof_preds1 = np.zeros(len(train))
test_preds1 = np.zeros(len(test))

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(skf.split(train, train[target])):
    print(f'\nFold {fold+1}/{FOLDS}')
    
    # Training rows of this fold plus all rows of the external dataset.
    Xy_train = train.iloc[train_idx][FEATURES + [target]].copy()
    Xy_more = orig[FEATURES + [target]].copy()
    Xy_train = pd.concat([Xy_train, Xy_more], axis=0, ignore_index=True)
    
    X_valid = train.iloc[val_idx][FEATURES].copy()
    y_valid = train.iloc[val_idx][target].copy()
    X_test = test[FEATURES].copy()
    
    CC_TE = CATS1 + CATS2 + CATS3
    print(f'Target encoding {len(CC_TE)} features...', end=' ')
    
    # (feature, error) for every column whose encoding failed in this fold.
    te_failures = []
    for c in CC_TE:
        try:
            TE = TargetEncoder(n_folds=10, smooth=12, split_method='random', stat='mean')
            
            # Encode all three frames BEFORE touching any of them, so a failure can never
            # leave train encoded while valid/test still hold raw integer codes.
            te_outputs = [
                TE.fit_transform(Xy_train[[c]], Xy_train[target]),
                TE.transform(X_valid[[c]]),
                TE.transform(X_test[[c]]),
            ]
            # The encoder may return an array or an object exposing `.values`.
            enc_train, enc_valid, enc_test = [
                (out.values if hasattr(out, 'values') else out).ravel().astype('float32')
                for out in te_outputs
            ]
            Xy_train[c] = enc_train
            X_valid[c] = enc_valid
            X_test[c] = enc_test
        except Exception as exc:
            # Best effort: keep the raw codes for this column, but record the failure.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
            te_failures.append((c, repr(exc)))
            continue
    
    print('Done')
    if te_failures:
        print(f'[WARN] target encoding failed for {len(te_failures)}/{len(CC_TE)} features '
              f'(left as raw codes); first failure: {te_failures[0][0]} -> {te_failures[0][1]}')
    
    for cat in CATS_FINAL:
        if cat in Xy_train.columns:
            # One shared dtype for the three frames: casting each frame on its own derives
            # the categories from that frame's values only, so a level missing from one
            # frame would shift its category codes relative to the others.
            levels = np.unique(np.concatenate([
                Xy_train[cat].to_numpy(), X_valid[cat].to_numpy(), X_test[cat].to_numpy()
            ]))
            shared_dtype = pd.CategoricalDtype(categories=levels)
            Xy_train[cat] = Xy_train[cat].astype(shared_dtype)
            X_valid[cat] = X_valid[cat].astype(shared_dtype)
            X_test[cat] = X_test[cat].astype(shared_dtype)
    
    Xy_train_iter = IterLoadForDMatrix(Xy_train, FEATURES, target)
    dtrain = xgb.QuantileDMatrix(Xy_train_iter, enable_categorical=True, max_bin=256)
    dval = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    dtest = xgb.DMatrix(X_test, enable_categorical=True)
    
    model = xgb.train(
        params=params_xgb,
        dtrain=dtrain,
        num_boost_round=18000,
        evals=[(dval, "valid")],
        early_stopping_rounds=450,
        verbose_eval=False
    )
    
    # Predict with the trees up to (and including) the best early-stopping iteration.
    oof_preds1[val_idx] = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
    test_preds1 += model.predict(dtest, iteration_range=(0, model.best_iteration + 1)) / FOLDS
    
    fold_auc = roc_auc_score(y_valid, oof_preds1[val_idx])
    print(f'Fold {fold+1} AUC: {fold_auc:.5f} (Best iteration: {model.best_iteration})')
    
    del Xy_train, Xy_more, X_valid, X_test, dtrain, dval, dtest, model
    gc.collect()

cv_auc1 = roc_auc_score(train[target], oof_preds1)
print(f'\n{"="*80}')
print(f'XGB CV: {cv_auc1:.5f}')
print('='*80)

In [None]:
print('\n' + '='*80)
print('STAGE 2: LGBM TRAINING')
print('='*80)

params_lgb = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.0075,
    'num_leaves': 34,
    'max_depth': -1,
    'min_child_samples': 23,
    'subsample': 0.76,
    'subsample_freq': 1,
    'colsample_bytree': 0.66,
    'reg_alpha': 3.2,
    'reg_lambda': 6.5,
    'min_split_gain': 0.55,
    'random_state': SEED,
    'n_jobs': -1,
    'device': 'gpu',
    'verbose': -1,
}

# Out-of-fold predictions for train and fold-averaged predictions for test.
oof_preds2 = np.zeros(len(train))
test_preds2 = np.zeros(len(test))

# Reuses `skf` from Stage 1, so the folds are the same as for the XGB model.
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train[target])):
    print(f'\nFold {fold+1}/{FOLDS}')
    
    # Training rows of this fold plus all rows of the external dataset.
    Xy_train = train.iloc[train_idx][FEATURES + [target]].copy()
    Xy_more = orig[FEATURES + [target]].copy()
    Xy_train = pd.concat([Xy_train, Xy_more], axis=0, ignore_index=True)
    
    X_valid = train.iloc[val_idx][FEATURES].copy()
    y_valid = train.iloc[val_idx][target].copy()
    X_test = test[FEATURES].copy()
    
    CC_TE = CATS1 + CATS2 + CATS3
    print(f'Target encoding {len(CC_TE)} features...', end=' ')
    
    # (feature, error) for every column whose encoding failed in this fold.
    te_failures = []
    for c in CC_TE:
        try:
            TE = TargetEncoder(n_folds=10, smooth=12, split_method='random', stat='mean')
            
            # Encode all three frames BEFORE touching any of them, so a failure can never
            # leave train encoded while valid/test still hold raw integer codes.
            te_outputs = [
                TE.fit_transform(Xy_train[[c]], Xy_train[target]),
                TE.transform(X_valid[[c]]),
                TE.transform(X_test[[c]]),
            ]
            # The encoder may return an array or an object exposing `.values`.
            enc_train, enc_valid, enc_test = [
                (out.values if hasattr(out, 'values') else out).ravel().astype('float32')
                for out in te_outputs
            ]
            Xy_train[c] = enc_train
            X_valid[c] = enc_valid
            X_test[c] = enc_test
        except Exception as exc:
            # Best effort: keep the raw codes for this column, but record the failure.
            # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
            te_failures.append((c, repr(exc)))
            continue
    
    print('Done')
    if te_failures:
        print(f'[WARN] target encoding failed for {len(te_failures)}/{len(CC_TE)} features '
              f'(left as raw codes); first failure: {te_failures[0][0]} -> {te_failures[0][1]}')
    
    model = LGBMClassifier(**params_lgb, n_estimators=18000)
    
    model.fit(
        Xy_train[FEATURES], Xy_train[target],
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=450, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )
    
    # Column 1 of predict_proba is the probability of the positive class.
    oof_preds2[val_idx] = model.predict_proba(X_valid)[:, 1]
    test_preds2 += model.predict_proba(X_test)[:, 1] / FOLDS
    
    fold_auc = roc_auc_score(y_valid, oof_preds2[val_idx])
    print(f'Fold {fold+1} AUC: {fold_auc:.5f}')
    
    del Xy_train, Xy_more, X_valid, X_test, model
    gc.collect()

cv_auc2 = roc_auc_score(train[target], oof_preds2)
print(f'\n{"="*80}')
print(f'LGBM CV: {cv_auc2:.5f}')
print('='*80)

In [None]:
print('\n' + '='*80)
print('STAGE 3: PSEUDO-LABELING ON HIGH-CONFIDENCE TEST DATA')
print('='*80)

# Equal-weight blend of the two stage models' test predictions.
initial_test_preds = (test_preds1 + test_preds2) / 2

confidence_threshold_high = 0.95
confidence_threshold_low = 0.05

# Test rows whose blended probability is at or beyond either threshold.
high_conf_idx = (initial_test_preds >= confidence_threshold_high) | (initial_test_preds <= confidence_threshold_low)
print(f'High-confidence test samples: {high_conf_idx.sum()} / {len(test)} ({100*high_conf_idx.sum()/len(test):.2f}%)')

if high_conf_idx.sum() > 1000:
    # Hard labels (0/1) for the confident test rows, used as extra training data.
    pseudo_test = test[high_conf_idx].copy()
    pseudo_test[target] = (initial_test_preds[high_conf_idx] > 0.5).astype(int)
    
    print(f'\nRetraining with {len(pseudo_test)} pseudo-labeled samples...')
    
    oof_preds1_pl = np.zeros(len(train))
    test_preds1_pl = np.zeros(len(test))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train, train[target])):
        print(f'\nFold {fold+1}/{FOLDS}')
        
        # Fold training rows + external dataset + pseudo-labeled test rows.
        Xy_train = train.iloc[train_idx][FEATURES + [target]].copy()
        Xy_more = orig[FEATURES + [target]].copy()
        Xy_pseudo = pseudo_test[FEATURES + [target]].copy()
        Xy_train = pd.concat([Xy_train, Xy_more, Xy_pseudo], axis=0, ignore_index=True)
        
        X_valid = train.iloc[val_idx][FEATURES].copy()
        y_valid = train.iloc[val_idx][target].copy()
        X_test = test[FEATURES].copy()
        
        CC_TE = CATS1 + CATS2 + CATS3
        print(f'Target encoding {len(CC_TE)} features...', end=' ')
        
        # (feature, error) for every column whose encoding failed in this fold.
        te_failures = []
        for c in CC_TE:
            try:
                TE = TargetEncoder(n_folds=10, smooth=12, split_method='random', stat='mean')
                
                # Encode all three frames BEFORE touching any of them, so a failure can
                # never leave train encoded while valid/test still hold raw integer codes.
                te_outputs = [
                    TE.fit_transform(Xy_train[[c]], Xy_train[target]),
                    TE.transform(X_valid[[c]]),
                    TE.transform(X_test[[c]]),
                ]
                # The encoder may return an array or an object exposing `.values`.
                enc_train, enc_valid, enc_test = [
                    (out.values if hasattr(out, 'values') else out).ravel().astype('float32')
                    for out in te_outputs
                ]
                Xy_train[c] = enc_train
                X_valid[c] = enc_valid
                X_test[c] = enc_test
            except Exception as exc:
                # Best effort: keep the raw codes for this column, but record the failure.
                # `Exception` (not a bare except) lets KeyboardInterrupt stop the run.
                te_failures.append((c, repr(exc)))
                continue
        
        print('Done')
        if te_failures:
            print(f'[WARN] target encoding failed for {len(te_failures)}/{len(CC_TE)} features '
                  f'(left as raw codes); first failure: {te_failures[0][0]} -> {te_failures[0][1]}')
        
        for cat in CATS_FINAL:
            if cat in Xy_train.columns:
                # One shared dtype for the three frames: casting each frame on its own
                # derives the categories from that frame's values only, so a level missing
                # from one frame would shift its category codes relative to the others.
                levels = np.unique(np.concatenate([
                    Xy_train[cat].to_numpy(), X_valid[cat].to_numpy(), X_test[cat].to_numpy()
                ]))
                shared_dtype = pd.CategoricalDtype(categories=levels)
                Xy_train[cat] = Xy_train[cat].astype(shared_dtype)
                X_valid[cat] = X_valid[cat].astype(shared_dtype)
                X_test[cat] = X_test[cat].astype(shared_dtype)
        
        Xy_train_iter = IterLoadForDMatrix(Xy_train, FEATURES, target)
        dtrain = xgb.QuantileDMatrix(Xy_train_iter, enable_categorical=True, max_bin=256)
        dval = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
        dtest = xgb.DMatrix(X_test, enable_categorical=True)
        
        model = xgb.train(
            params=params_xgb,
            dtrain=dtrain,
            num_boost_round=18000,
            evals=[(dval, "valid")],
            early_stopping_rounds=450,
            verbose_eval=False
        )
        
        # Predict with the trees up to (and including) the best early-stopping iteration.
        oof_preds1_pl[val_idx] = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        test_preds1_pl += model.predict(dtest, iteration_range=(0, model.best_iteration + 1)) / FOLDS
        
        fold_auc = roc_auc_score(y_valid, oof_preds1_pl[val_idx])
        print(f'Fold {fold+1} AUC: {fold_auc:.5f}')
        
        del Xy_train, Xy_more, Xy_pseudo, X_valid, X_test, dtrain, dval, dtest, model
        gc.collect()
    
    cv_auc1_pl = roc_auc_score(train[target], oof_preds1_pl)
    print(f'\nXGB with Pseudo-labeling CV: {cv_auc1_pl:.5f}')
    
    # Adopt the pseudo-labeled XGB predictions only if their OOF AUC is strictly better.
    if cv_auc1_pl > cv_auc1:
        print(f'Pseudo-labeling improved XGB: +{(cv_auc1_pl - cv_auc1):.5f}')
        oof_preds1 = oof_preds1_pl
        test_preds1 = test_preds1_pl
        cv_auc1 = cv_auc1_pl
    else:
        print(f'Pseudo-labeling did not improve, keeping original')
else:
    print('Not enough high-confidence samples for pseudo-labeling')

In [None]:
print('\n' + '='*80)
print('FINAL ENSEMBLE OPTIMIZATION')
print('='*80)

# Grid-search the XGB share of the blend; the first weight reaching the highest
# OOF AUC wins, and 0.5 is kept if no candidate scores above 0.
best_weight, best_score = 0.5, 0

for xgb_share in np.arange(0.35, 0.66, 0.01):
    blend_auc = roc_auc_score(
        train[target],
        xgb_share * oof_preds1 + (1 - xgb_share) * oof_preds2,
    )
    if not blend_auc > best_score:
        continue
    best_score, best_weight = blend_auc, xgb_share

print(f'Optimal weight: {best_weight:.3f} (XGB) / {1-best_weight:.3f} (LGBM)')

# Apply the chosen weight to both the OOF and the test predictions.
lgbm_share = 1 - best_weight
final_oof = best_weight * oof_preds1 + lgbm_share * oof_preds2
final_test = best_weight * test_preds1 + lgbm_share * test_preds2

final_cv = roc_auc_score(train[target], final_oof)

for label, value in (('XGB CV', cv_auc1), ('LGBM CV', cv_auc2), ('Optimized Ensemble CV', final_cv)):
    # The first label is preceded by a blank line.
    print(f"{chr(10) if label == 'XGB CV' else ''}{label}: {value:.5f}")
print(f'Expected LB: {final_cv + 0.00016:.5f} - {final_cv + 0.00022:.5f}')

In [None]:
# NOTE(review): this cell repeats the first three panels of the visualization cell that
# follows it, but stops before drawing the pie chart, saving or showing the figure.
# It looks like a leftover draft; consider removing it.
print('\n' + '='*80)
print('CREATING VISUALIZATIONS')
print('='*80)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_dist, ax_models = axes[0]
ax_scores = axes[1, 0]

# Panel 1: distribution of the blended OOF predictions, mean marked in red.
ax_dist.hist(final_oof, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
ax_dist.axvline(final_oof.mean(), color='red', linestyle='--', linewidth=2)
ax_dist.set(xlabel='Predicted Probability', ylabel='Frequency', title='OOF Predictions Distribution')
ax_dist.grid(True, alpha=0.3)

# Panel 2: OOF prediction histograms of the two models side by side.
ax_models.hist([oof_preds1, oof_preds2], bins=30, alpha=0.6, label=['XGB', 'LGBM'])
ax_models.set(xlabel='Predicted Probability', ylabel='Frequency', title='Model Predictions Comparison')
ax_models.legend()
ax_models.grid(True, alpha=0.3)

# Panel 3: CV AUC per model, each bar annotated with its value.
model_names = ['XGB', 'LGBM', 'Ensemble']
model_scores = [cv_auc1, cv_auc2, final_cv]
bars = ax_scores.bar(model_names, model_scores, color=['steelblue', 'coral', 'green'], alpha=0.7, edgecolor='black')
for bar, score in zip(bars, model_scores):
    height = bar.get_height()
    ax_scores.text(bar.get_x() + bar.get_width()/2., height,
                   f'{score:.5f}', ha='center', va='bottom', fontsize=11, fontweight='bold')
ax_scores.set(ylabel='CV AUC Score', title='Model Performance Comparison')
ax_scores.grid(True, alpha=0.3, axis='y')
# Zoom the y-axis to a narrow band around the scores so small differences are visible.
ax_scores.set_ylim([min(model_scores) - 0.0015, max(model_scores) + 0.0015])

weights = [best_weight, 1 - best_weight]
colors = ['steelblue', 'coral']

In [None]:

# ============================================================================
# VISUALIZATION
# ============================================================================
print('\n' + '='*80)
print('CREATING VISUALIZATIONS')
print('='*80)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_dist, ax_models = axes[0]
ax_scores, ax_weights = axes[1]

# Panel 1: distribution of the blended OOF predictions, mean marked in red.
ax_dist.hist(final_oof, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
ax_dist.axvline(final_oof.mean(), color='red', linestyle='--', linewidth=2)
ax_dist.set(xlabel='Predicted Probability', ylabel='Frequency', title='OOF Predictions Distribution')
ax_dist.grid(True, alpha=0.3)

# Panel 2: OOF prediction histograms of the two models side by side.
ax_models.hist([oof_preds1, oof_preds2], bins=30, alpha=0.6, label=['XGB', 'LGBM'])
ax_models.set(xlabel='Predicted Probability', ylabel='Frequency', title='Model Predictions Comparison')
ax_models.legend()
ax_models.grid(True, alpha=0.3)

# Panel 3: CV AUC per model, each bar annotated with its value.
model_names = ['XGB', 'LGBM', 'Ensemble']
model_scores = [cv_auc1, cv_auc2, final_cv]
bars = ax_scores.bar(model_names, model_scores, color=['steelblue', 'coral', 'green'], alpha=0.7, edgecolor='black')
for bar, score in zip(bars, model_scores):
    height = bar.get_height()
    ax_scores.text(bar.get_x() + bar.get_width()/2., height,
                   f'{score:.5f}', ha='center', va='bottom', fontsize=11, fontweight='bold')
ax_scores.set(ylabel='CV AUC Score', title='Model Performance Comparison')
ax_scores.grid(True, alpha=0.3, axis='y')
# Zoom the y-axis to a narrow band around the scores so small differences are visible.
ax_scores.set_ylim([min(model_scores) - 0.0015, max(model_scores) + 0.0015])

# Panel 4: share of each model in the final blend.
weights = [best_weight, 1 - best_weight]
colors = ['steelblue', 'coral']
ax_weights.pie(weights, labels=['XGB', 'LGBM'], autopct='%1.1f%%',
               colors=colors, startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
ax_weights.set_title('Ensemble Weight Distribution')

plt.tight_layout()
plt.savefig('model_performance.png', dpi=300, bbox_inches='tight')
print('Saved: model_performance.png')
plt.show()

In [None]:
# ============================================================================
# SAVE
# ============================================================================
# Write the blended test predictions into the submission template.
# NOTE(review): assumes the rows of `submission` are in the same order as `test` — confirm.
submission[target] = final_test
submission.to_csv('submission.csv', index=False)

# Keep the out-of-fold predictions too; the CV score is embedded in the file name.
oof_submission = pd.DataFrame({
    'id': train['id'],
    target: final_oof
})
oof_submission.to_csv(f'oof_predictions_{final_cv:.5f}.csv', index=False)

print(f'\n{"="*80}')
print('✅ SAVED: submission.csv')
print(f'✅ SAVED: oof_predictions_{final_cv:.5f}.csv')
print(f'✅ SAVED: model_performance.png')
print(f'\n{"="*80}')
print(f'FINAL CV SCORE: {final_cv:.5f}')
print(f'Expected LB: {final_cv + 0.00016:.5f} - {final_cv + 0.00022:.5f}')
print(f'Target: 0.928+')
print(f'Previous Best LB: 0.92677')
# The "+" format flag prints the real sign; a hard-coded "+" prefix rendered a negative
# difference as "+-0.00123".
print(f'Expected Improvement: {(final_cv + 0.00019 - 0.92677):+.5f}')
print('='*80)