# Home Credit Default Risk - LightGBM Pipeline

This notebook implements a complete ML pipeline with **two modes**:
- **TRAIN_MODE = True**: Full training + model saving
- **TRAIN_MODE = False**: Load saved models for inference only

## Pipeline Stages
1. **Data Loading** - Load the 7 tables from 8 CSV files (the application table is split into train and test files)
2. **Preprocessing** - Encoding, outlier handling, missing value marking
3. **Feature Engineering** - Derived features, aggregations, multi-table joins
4. **Model Training/Loading** - K-Fold CV with LightGBM or load saved models
5. **Output** - Predictions, feature importance, ROC curves, and training curves

In [None]:
# =============================================================================
# Stage 1: Configuration & Data Loading
# =============================================================================

import numpy as np
import pandas as pd
import gc
import time
import re
import os
import joblib
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# =============================================================================
# CONFIGURATION - Change these settings as needed
# =============================================================================
# Run-mode switches.
#   TRAIN_MODE: True trains new models, False loads saved models for inference.
#   DEBUG:      True limits the CSV reads to 10000 rows for quick testing.
TRAIN_MODE = False
DEBUG = False
if DEBUG:
    NUM_ROWS = 10000
else:
    NUM_ROWS = None  # passed as nrows= to pd.read_csv; None reads the whole file

# Filesystem locations (the trailing slashes matter: paths are built by
# plain string concatenation further down).
DATA_PATH = './home-credit-default-risk/'
MODEL_DIR = './saved_models/'
SUBMISSION_FILE = 'submission_kernel02.csv'

# The model directory is created up front so that both dumping and loading
# models can assume it exists.
os.makedirs(MODEL_DIR, exist_ok=True)

mode_label = 'TRAINING' if TRAIN_MODE else 'INFERENCE (using saved models)'
print(f"Mode: {mode_label}")
print(f"Debug: {DEBUG}")
print(f"Model directory: {MODEL_DIR}")

@contextmanager
def timer(title):
    """Print how long the enclosed ``with`` body took to run.

    Args:
        title: Label printed in front of the elapsed time.

    The message is printed only when the body completes without raising:
    there is no ``try``/``finally`` around the ``yield``, so an exception
    propagates before the ``print`` is reached.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f"{title} - done in {elapsed:.0f}s")

# Load all datasets
def _read_table(file_name):
    """Read one CSV file from DATA_PATH, limited to NUM_ROWS rows (None = all)."""
    return pd.read_csv(f'{DATA_PATH}{file_name}', nrows=NUM_ROWS)


with timer("Loading all datasets"):
    # Main application table, split into a train and a test file.
    application_train = _read_table('application_train.csv')
    application_test = _read_table('application_test.csv')
    # Auxiliary tables that are aggregated and joined onto it later.
    bureau = _read_table('bureau.csv')
    bureau_balance = _read_table('bureau_balance.csv')
    previous_application = _read_table('previous_application.csv')
    pos_cash_balance = _read_table('POS_CASH_balance.csv')
    installments_payments = _read_table('installments_payments.csv')
    credit_card_balance = _read_table('credit_card_balance.csv')

print(f"Train samples: {len(application_train)}, Test samples: {len(application_test)}")

In [None]:
# =============================================================================
# Stage 2: Data Preprocessing
# =============================================================================

def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column and sanitise all column names.

    Args:
        df: Input frame. It is not modified; a new frame is returned.
        nan_as_category: Forwarded to ``pd.get_dummies`` as ``dummy_na``; when
            True each encoded column also gets a ``<col>_nan`` indicator.

    Returns:
        A ``(encoded_df, new_columns)`` tuple. ``encoded_df`` has every run of
        characters outside ``[A-Za-z0-9_]`` in its column names replaced by a
        single ``_``. ``new_columns`` lists, in frame order, the sanitised
        names of the dummy columns created by the encoding.
    """
    original_columns = set(df.columns)
    categorical_columns = [col for col, dtype in df.dtypes.items() if dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    # Decide which columns are new BEFORE renaming: comparing sanitised names
    # against the raw originals would misreport a pre-existing column such as
    # "amt usd" (renamed to "amt_usd") as a freshly created dummy.
    is_new = [col not in original_columns for col in df.columns]
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in df.columns]
    new_columns = [col for col, created in zip(df.columns, is_new) if created]
    return df, new_columns

with timer("Preprocessing application data"):
    # Train and test are stacked so that every encoding step below is applied
    # to both; test rows are the ones whose TARGET is missing after the concat.
    df = pd.concat([application_train, application_test], ignore_index=True)
    # Drop rows with CODE_GENDER == 'XNA', but only labelled ones: removing an
    # unlabelled (test) row would leave that SK_ID_CURR without a prediction
    # in the submission.
    keep_mask = (df['CODE_GENDER'] != 'XNA') | df['TARGET'].isnull()
    df = df[keep_mask]
    # These columns are integer-coded with pd.factorize instead of one-hot encoded.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])
    df, cat_cols = one_hot_encoder(df, nan_as_category=False)
    # 365243 in DAYS_EMPLOYED is treated as a missing-value marker. Assign the
    # result back instead of calling replace(inplace=True) on df[...]: that
    # chained form mutates a temporary and is a no-op under pandas
    # Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

with timer("Preprocessing bureau and bureau_balance"):
    # The *_cat lists keep the names of the dummy columns created for each
    # table; the feature-engineering stage aggregates those columns by mean.
    bureau_balance, bb_cat = one_hot_encoder(bureau_balance, nan_as_category=True)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category=True)

with timer("Preprocessing previous applications"):
    previous_application, prev_cat = one_hot_encoder(previous_application, nan_as_category=True)
    # 365243 in these day-offset columns is treated as a missing-value marker.
    # Assign the result back rather than using replace(inplace=True) on a
    # column selection: the chained form mutates a temporary and is a no-op
    # under pandas Copy-on-Write.
    for col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        if col in previous_application.columns:
            previous_application[col] = previous_application[col].replace(365243, np.nan)

with timer("Preprocessing other tables"):
    pos_cash_balance, pos_cat = one_hot_encoder(pos_cash_balance, nan_as_category=True)
    installments_payments, ins_cat = one_hot_encoder(installments_payments, nan_as_category=True)
    credit_card_balance, cc_cat = one_hot_encoder(credit_card_balance, nan_as_category=True)

# The raw application frames now live on inside df; release them.
del application_train, application_test
gc.collect()
print(f"Preprocessed main df shape: {df.shape}")

In [None]:
# =============================================================================
# Stage 3: Feature Engineering
# =============================================================================

with timer("Creating application features"):
    # Each derived feature is a plain ratio of two existing columns:
    # (new column, numerator column, denominator column).
    ratio_features = (
        ('DAYS_EMPLOYED_PERC', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
        ('INCOME_CREDIT_PERC', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'),
        ('INCOME_PER_PERSON', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
        ('ANNUITY_INCOME_PERC', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
        ('PAYMENT_RATE', 'AMT_ANNUITY', 'AMT_CREDIT'),
    )
    for feature_name, numerator, denominator in ratio_features:
        df[feature_name] = df[numerator] / df[denominator]

with timer("Creating bureau features"):
    # --- Step 1: collapse bureau_balance to one row per SK_ID_BUREAU --------
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    bb_aggregations.update({col: ['mean'] for col in bb_cat if col in bureau_balance.columns})
    bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    # Flatten the (column, statistic) MultiIndex into COLUMN_STAT names.
    bb_agg.columns = pd.Index([name + "_" + stat.upper() for name, stat in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bureau_balance, bb_agg
    gc.collect()

    # --- Step 2: choose the aggregations that apply to the columns present --
    agg_mapping = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'], 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'], 'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'], 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'], 'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'], 'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'], 'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'], 'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    num_aggregations = {col: aggs for col, aggs in agg_mapping.items() if col in bureau.columns}
    # Dummy columns of bureau itself, then the per-bureau means of the
    # bureau_balance dummies that were joined in step 1.
    cat_aggregations = {cat: ['mean'] for cat in bureau_cat if cat in bureau.columns}
    cat_aggregations.update(
        {cat + "_MEAN": ['mean'] for cat in bb_cat if cat + "_MEAN" in bureau.columns}
    )

    # --- Step 3: aggregate to one row per SK_ID_CURR -------------------------
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + name + "_" + stat.upper() for name, stat in bureau_agg.columns.tolist()])

    # --- Step 4: numeric aggregates again, restricted to rows where the
    # given status dummy equals 1 (skipped when the dummy column is absent
    # or selects no rows).
    for status_flag, prefix in (('CREDIT_ACTIVE_Active', 'ACTIVE_'), ('CREDIT_ACTIVE_Closed', 'CLOSED_')):
        if status_flag not in bureau.columns:
            continue
        subset = bureau[bureau[status_flag] == 1]
        if len(subset) > 0:
            subset_agg = subset.groupby('SK_ID_CURR').agg(num_aggregations)
            subset_agg.columns = pd.Index([prefix + name + "_" + stat.upper() for name, stat in subset_agg.columns.tolist()])
            bureau_agg = bureau_agg.join(subset_agg, how='left', on='SK_ID_CURR')
            del subset_agg
        del subset

    # --- Step 5: attach to the main frame and free the intermediates --------
    del bureau
    gc.collect()
    df = df.join(bureau_agg, how='left', on='SK_ID_CURR')
    del bureau_agg
    gc.collect()
    print(f"After bureau features: {df.shape}")

with timer("Creating previous application features"):
    # Ratio of the amount applied for to the amount of credit.
    previous_application['APP_CREDIT_PERC'] = previous_application['AMT_APPLICATION'].div(previous_application['AMT_CREDIT'])

    candidate_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'], 'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'], 'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'], 'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'], 'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'], 'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Keep only the aggregations whose source column is actually present.
    prev_num_aggregations = {
        col: stats for col, stats in candidate_aggregations.items()
        if col in previous_application.columns
    }
    cat_aggregations = {cat: ['mean'] for cat in prev_cat if cat in previous_application.columns}

    # One row per SK_ID_CURR over all previous applications.
    prev_agg = previous_application.groupby('SK_ID_CURR').agg({**prev_num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + name + "_" + stat.upper() for name, stat in prev_agg.columns.tolist()])

    # Numeric aggregates again, restricted to rows where the given contract
    # status dummy equals 1 (skipped when the dummy column is absent or
    # selects no rows).
    for status_flag, prefix in (('NAME_CONTRACT_STATUS_Approved', 'APPROVED_'),
                                ('NAME_CONTRACT_STATUS_Refused', 'REFUSED_')):
        if status_flag not in previous_application.columns:
            continue
        subset = previous_application[previous_application[status_flag] == 1]
        if len(subset) > 0:
            subset_agg = subset.groupby('SK_ID_CURR').agg(prev_num_aggregations)
            subset_agg.columns = pd.Index([prefix + name + "_" + stat.upper() for name, stat in subset_agg.columns.tolist()])
            prev_agg = prev_agg.join(subset_agg, how='left', on='SK_ID_CURR')
            del subset_agg
        del subset

    del previous_application
    gc.collect()
    df = df.join(prev_agg, how='left', on='SK_ID_CURR')
    del prev_agg
    gc.collect()
    print(f"After previous application features: {df.shape}")

with timer("Creating POS cash features"):
    pos_numeric_aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean'],
    }
    # Numeric aggregations for the columns that exist, followed by the mean
    # of every dummy column created during preprocessing.
    aggregations = {
        col: stats for col, stats in pos_numeric_aggregations.items()
        if col in pos_cash_balance.columns
    }
    aggregations.update({cat: ['mean'] for cat in pos_cat if cat in pos_cash_balance.columns})

    pos_by_client = pos_cash_balance.groupby('SK_ID_CURR')
    pos_agg = pos_by_client.agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + name + "_" + stat.upper() for name, stat in pos_agg.columns.tolist()])
    # Number of POS cash rows per SK_ID_CURR.
    pos_agg['POS_COUNT'] = pos_by_client.size()

    # The groupby object references the table, so both are released together.
    del pos_cash_balance, pos_by_client
    gc.collect()
    df = df.join(pos_agg, how='left', on='SK_ID_CURR')
    del pos_agg
    gc.collect()
    print(f"After POS cash features: {df.shape}")

with timer("Creating installments features"):
    # Paid amount relative to, and short of, the instalment amount.
    installments_payments['PAYMENT_PERC'] = installments_payments['AMT_PAYMENT'] / installments_payments['AMT_INSTALMENT']
    installments_payments['PAYMENT_DIFF'] = installments_payments['AMT_INSTALMENT'] - installments_payments['AMT_PAYMENT']

    # DPD: DAYS_ENTRY_PAYMENT minus DAYS_INSTALMENT; DBD: the opposite
    # difference. Both are floored at 0, and a missing difference also becomes
    # 0 (the comparison is False for NaN). Series.where does the clamp in one
    # vectorised pass instead of calling a Python lambda for every row.
    days_past_due = installments_payments['DAYS_ENTRY_PAYMENT'] - installments_payments['DAYS_INSTALMENT']
    days_before_due = installments_payments['DAYS_INSTALMENT'] - installments_payments['DAYS_ENTRY_PAYMENT']
    installments_payments['DPD'] = days_past_due.where(days_past_due > 0, 0)
    installments_payments['DBD'] = days_before_due.where(days_before_due > 0, 0)
    del days_past_due, days_before_due

    ins_aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'], 'DPD': ['max', 'mean', 'sum'], 'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'], 'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'], 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    # Keep only aggregations whose column exists, then add the mean of every
    # dummy column created during preprocessing.
    ins_aggregations = {k: v for k, v in ins_aggregations.items() if k in installments_payments.columns}
    for cat in ins_cat:
        if cat in installments_payments.columns:
            ins_aggregations[cat] = ['mean']
    ins_agg = installments_payments.groupby('SK_ID_CURR').agg(ins_aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Number of instalment rows per SK_ID_CURR.
    ins_agg['INSTAL_COUNT'] = installments_payments.groupby('SK_ID_CURR').size()
    del installments_payments
    gc.collect()
    df = df.join(ins_agg, how='left', on='SK_ID_CURR')
    del ins_agg
    gc.collect()
    print(f"After installments features: {df.shape}")

with timer("Creating credit card features"):
    # SK_ID_PREV is an identifier, not a feature; drop it when present.
    credit_card_balance = credit_card_balance.drop(columns=['SK_ID_PREV'], errors='ignore')

    # Every numeric column except the grouping key gets the same five statistics.
    numeric_cols = [
        col for col in credit_card_balance.select_dtypes(include=[np.number]).columns.tolist()
        if col != 'SK_ID_CURR'
    ]
    cc_by_client = credit_card_balance.groupby('SK_ID_CURR')
    cc_agg = cc_by_client[numeric_cols].agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + name + "_" + stat.upper() for name, stat in cc_agg.columns.tolist()])
    # Number of credit card rows per SK_ID_CURR.
    cc_agg['CC_COUNT'] = cc_by_client.size()

    # The groupby object references the table, so both are released together.
    del credit_card_balance, cc_by_client
    gc.collect()
    df = df.join(cc_agg, how='left', on='SK_ID_CURR')
    del cc_agg
    gc.collect()

# Final pass: collapse every run of characters outside [A-Za-z0-9_] in the
# column names into a single underscore.
sanitised_names = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in df.columns]
df.columns = sanitised_names
print(f"Final df shape after all features: {df.shape}")

In [None]:
# =============================================================================
# Stage 4: Model Training OR Loading
# =============================================================================
import lightgbm as lgb

# Split data: rows with a TARGET value are training rows, the rest are test rows.
has_target = df['TARGET'].notnull()
train_df = df[has_target].copy()
test_df = df[~has_target].copy()

# Identifier and label columns are excluded from the model inputs.
non_feature_columns = {'TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index'}
feats = [col for col in train_df.columns if col not in non_feature_columns]
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
print(f"Number of features: {len(feats)}")

# The two copies above are independent of df, so it can be released.
del df
gc.collect()

# Fewer folds in DEBUG mode.
if DEBUG:
    num_folds = 3
else:
    num_folds = 10

if TRAIN_MODE:
    # ==================== TRAINING MODE ====================
    # Trains one LightGBM classifier per fold, saves each model to MODEL_DIR,
    # and builds out-of-fold predictions plus averaged test predictions.
    print("\n" + "="*60)
    print("TRAINING MODE: Training new models")
    print("="*60)

    folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    oof_preds = np.zeros(train_df.shape[0])  # out-of-fold predictions, filled one fold at a time
    sub_preds = np.zeros(test_df.shape[0])   # mean of the per-fold test predictions
    fold_importances = []                    # one importance frame per fold, concatenated after the loop
    training_history = []
    fold_results = []
    models = []  # Store trained models

    with timer(f"Training {num_folds}-Fold LightGBM"):
        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
            # The fold indices are positional, hence .iloc.
            train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
            valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

            clf = LGBMClassifier(
                n_jobs=4, n_estimators=10000, learning_rate=0.02, num_leaves=34,
                colsample_bytree=0.9497036, subsample=0.8715623, max_depth=8,
                reg_alpha=0.041545473, reg_lambda=0.0735294, min_split_gain=0.0222415,
                min_child_weight=39.3259775, verbose=-1,
            )

            # record_evaluation fills this dict with the per-iteration AUC of
            # both eval sets; it feeds the training-curve plots.
            eval_results = {}
            clf.fit(
                train_x, train_y,
                eval_set=[(train_x, train_y), (valid_x, valid_y)],
                eval_names=['train', 'valid'],
                eval_metric='auc',
                callbacks=[
                    lgb.early_stopping(stopping_rounds=200),
                    lgb.log_evaluation(period=200),
                    lgb.record_evaluation(eval_results)
                ]
            )

            # Save model
            model_path = os.path.join(MODEL_DIR, f'lgbm_fold_{n_fold}.pkl')
            joblib.dump(clf, model_path)
            models.append(clf)

            training_history.append({
                'fold': n_fold + 1,
                'train_auc': eval_results['train']['auc'],
                'valid_auc': eval_results['valid']['auc'],
                'best_iteration': clf.best_iteration_
            })

            valid_preds = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
            oof_preds[valid_idx] = valid_preds
            # Every fold is trained here, so dividing by num_folds yields the mean.
            sub_preds += clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1] / num_folds

            fold_results.append({
                'fold': n_fold + 1,
                'y_true': valid_y.values,
                'y_pred': valid_preds,
                'auc': roc_auc_score(valid_y, valid_preds)
            })

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = clf.feature_importances_
            fold_importance_df["fold"] = n_fold + 1
            fold_importances.append(fold_importance_df)

            print(f'Fold {n_fold + 1:2d} AUC: {fold_results[-1]["auc"]:.6f} | Model saved to: {model_path}')

            del train_x, train_y, valid_x, valid_y
            gc.collect()

    # A single concat avoids growing a frame that starts out as an empty DataFrame.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    # Save feature list for inference
    joblib.dump(feats, os.path.join(MODEL_DIR, 'feature_list.pkl'))

    full_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    print(f'\nFull OOF AUC: {full_auc:.6f}')
    print(f'Models saved to: {MODEL_DIR}')

    train_targets = train_df['TARGET'].values

else:
    # ==================== INFERENCE MODE ====================
    # Loads the saved fold models and averages their test predictions.
    print("\n" + "="*60)
    print("INFERENCE MODE: Loading saved models")
    print("="*60)

    # Load feature list
    # NOTE: joblib.load unpickles arbitrary objects; MODEL_DIR must only
    # contain files from a trusted source.
    saved_feats = joblib.load(os.path.join(MODEL_DIR, 'feature_list.pkl'))

    # Align features: columns seen at training time but absent now are added
    # as constant 0 so that the models receive the expected columns.
    missing_feats = [f for f in saved_feats if f not in test_df.columns]
    if missing_feats:
        print(f"Warning: {len(missing_feats)} features missing, filling with 0")
        for f in missing_feats:
            test_df[f] = 0

    sub_preds = np.zeros(test_df.shape[0])
    models = []

    with timer("Loading models and making predictions"):
        test_x = test_df[saved_feats]  # same input for every model; select once
        for n_fold in range(num_folds):
            model_path = os.path.join(MODEL_DIR, f'lgbm_fold_{n_fold}.pkl')
            if not os.path.exists(model_path):
                print(f'Warning: Model not found: {model_path}')
                continue
            clf = joblib.load(model_path)
            models.append(clf)
            sub_preds += clf.predict_proba(test_x)[:, 1]
            print(f'Loaded model from: {model_path}')
        del test_x

    if not models:
        # Without this check an all-zero submission would be written silently.
        raise FileNotFoundError(
            f"No saved fold models found in {MODEL_DIR}; run once with TRAIN_MODE = True first"
        )
    # Average over the models that were actually loaded. Dividing by num_folds
    # would scale the probabilities down whenever a fold file is missing or
    # num_folds differs from the value used at training time.
    sub_preds /= len(models)

    print(f'\nLoaded {len(models)} models for inference')

    # Placeholders for visualization (not available in inference mode)
    oof_preds = None
    train_targets = None
    training_history = None
    fold_results = None
    feature_importance_df = None

# Store predictions
test_df['TARGET'] = sub_preds
submission_df = test_df[['SK_ID_CURR', 'TARGET']]

In [None]:
# =============================================================================
# Stage 5: Output & Visualization
# =============================================================================

# Save submission: write the SK_ID_CURR / TARGET pairs to SUBMISSION_FILE
# and print a short sanity summary.
with timer("Saving submission file"):
    submission_df.to_csv(SUBMISSION_FILE, index=False)
    predicted = submission_df['TARGET']
    lowest, highest = predicted.min(), predicted.max()
    print(f"Submission saved to: {SUBMISSION_FILE}")
    print(f"Shape: {submission_df.shape}")
    print(f"Predictions range: [{lowest:.4f}, {highest:.4f}]")

# Visualizations only available in TRAIN_MODE
if TRAIN_MODE and feature_importance_df is not None:

    def _panel_grid(n_panels, panel_height):
        """Create a grid with at most five panels per row.

        Returns the figure and a flat array of its axes; the array can hold
        more axes than ``n_panels`` when the last row is only partly used.
        """
        n_cols = min(5, n_panels)
        n_rows, leftover = divmod(n_panels, n_cols)
        if leftover:
            n_rows += 1
        # squeeze=False always returns a 2-D array of axes, so the
        # single-panel case needs no special handling.
        figure, grid = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, panel_height * n_rows), squeeze=False)
        return figure, grid.ravel()

    # Feature Importance: the 40 features with the highest mean importance
    # across folds.
    with timer("Generating feature importance"):
        mean_importance = feature_importance_df[["feature", "importance"]].groupby("feature").mean()
        top_features = mean_importance.sort_values(by="importance", ascending=False).head(40).index
        best_features = feature_importance_df.loc[feature_importance_df["feature"].isin(top_features)]
        plt.figure(figsize=(10, 12))
        sns.barplot(x="importance", y="feature",
                    data=best_features.sort_values(by="importance", ascending=False), palette="viridis")
        plt.title('LightGBM Feature Importance')
        plt.tight_layout()
        plt.savefig('lgbm_importances01.png', dpi=150)
        plt.show()

    # Per-fold ROC curves: one panel per fold.
    with timer("Generating per-fold ROC curves"):
        n_folds = len(fold_results)
        fig, axes = _panel_grid(n_folds, 4)
        colors = plt.cm.viridis(np.linspace(0, 0.9, n_folds))
        for ax, color, fold_data in zip(axes, colors, fold_results):
            fpr, tpr, _ = roc_curve(fold_data['y_true'], fold_data['y_pred'])
            ax.plot(fpr, tpr, color=color, lw=2, label=f'AUC = {fold_data["auc"]:.4f}')
            ax.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.5)
            ax.set_title(f'Fold {fold_data["fold"]}')
            ax.legend(loc='lower right')
            ax.grid(alpha=0.3)
        # Hide the unused panels of a partly filled last row.
        for spare_ax in axes[n_folds:]:
            spare_ax.axis('off')
        plt.suptitle('ROC Curves for Each Fold', fontsize=14)
        plt.tight_layout()
        plt.savefig('roc_curves_per_fold.png', dpi=150, bbox_inches='tight')
        plt.show()

    # Combined ROC: faint per-fold curves plus the curve over all
    # out-of-fold predictions.
    with timer("Generating combined ROC curve"):
        plt.figure(figsize=(8, 6))
        for fold_data in fold_results:
            fold_fpr, fold_tpr, _ = roc_curve(fold_data['y_true'], fold_data['y_pred'])
            plt.plot(fold_fpr, fold_tpr, alpha=0.3, lw=1)
        oof_auc = roc_auc_score(train_targets, oof_preds)
        fpr, tpr, _ = roc_curve(train_targets, oof_preds)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mean OOF (AUC = {oof_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'navy', lw=2, linestyle='--')
        plt.title('Combined ROC Curve')
        plt.legend(loc='lower right')
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig('roc_curve.png', dpi=150)
        plt.show()

    # Training curves: per-iteration train/valid AUC of each fold, with the
    # best iteration marked by a vertical line.
    with timer("Generating training curves"):
        n_folds = len(training_history)
        fig, axes = _panel_grid(n_folds, 3)
        for ax, fold_history in zip(axes, training_history):
            train_curve = fold_history['train_auc']
            valid_curve = fold_history['valid_auc']
            best_iteration = fold_history['best_iteration']
            iterations = range(1, len(train_curve) + 1)
            ax.plot(iterations, train_curve, label='Train', color='blue', alpha=0.7)
            ax.plot(iterations, valid_curve, label='Valid', color='red', alpha=0.7)
            ax.axvline(x=best_iteration, color='green', linestyle='--', alpha=0.7)
            ax.set_title(f'Fold {fold_history["fold"]} (best: {best_iteration})')
            ax.legend(loc='lower right', fontsize=8)
            ax.grid(alpha=0.3)
        for spare_ax in axes[n_folds:]:
            spare_ax.axis('off')
        plt.suptitle('Training Curves', fontsize=14)
        plt.tight_layout()
        plt.savefig('training_curves.png', dpi=150, bbox_inches='tight')
        plt.show()

    # Summary
    fold_aucs = [fr['auc'] for fr in fold_results]
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)
    for fr in fold_results:
        print(f"  Fold {fr['fold']:2d}: AUC = {fr['auc']:.6f}")
    print(f"\n  Mean Fold AUC: {np.mean(fold_aucs):.6f}")
    print(f"  OOF AUC:       {oof_auc:.6f}")

else:
    print("\n" + "="*60)
    print("INFERENCE COMPLETE")
    print("="*60)
    print(f"Predictions generated using {len(models)} saved models")
    print(f"Visualizations skipped (only available in TRAIN_MODE)")

# List the files this run produced; the model and plot files exist only
# after a training run.
output_entries = [SUBMISSION_FILE]
if TRAIN_MODE:
    output_entries.extend([
        f"{MODEL_DIR}lgbm_fold_*.pkl (saved models)",
        "lgbm_importances01.png",
        "roc_curves_per_fold.png",
        "roc_curve.png",
        "training_curves.png",
    ])
print("\nOutput files:")
for entry in output_entries:
    print(f"  - {entry}")