# Home Credit Default Risk - LightGBM Pipeline

This notebook implements a complete ML pipeline with **two modes**:
- **TRAIN_MODE = True**: Full training + model saving
- **TRAIN_MODE = False**: Load saved models for inference only

## Pipeline Stages
1. **Data Loading** - Load 8 CSV files (7 tables; the application table is split into train and test files)
2. **Preprocessing** - Encoding, outlier handling, missing value marking
3. **Feature Engineering** - Derived features, aggregations, multi-table joins
4. **Model Training/Loading** - K-Fold CV with LightGBM or load saved models
5. **Output** - Predictions, feature importance, ROC curves, and training curves

In [None]:
# =============================================================================
# Environment Detection and Configuration
# =============================================================================
import os
import platform

def detect_environment():
    """Classify the host this notebook is running on.

    Returns:
        'kaggle' when a ``/kaggle`` path exists, 'mac' when the platform
        reports Darwin, and 'other' in every remaining case.
    """
    # The Kaggle check is evaluated first, so it wins regardless of platform.
    if os.path.exists('/kaggle'):
        return 'kaggle'
    return 'mac' if platform.system() == 'Darwin' else 'other'

# Resolve the environment once; every path below is derived from it.
ENVIRONMENT = detect_environment()
print(f"Detected environment: {ENVIRONMENT}")

# Kaggle gets absolute /kaggle paths; any other environment ('mac' or
# 'other') uses paths relative to the current working directory.
DATA_PATH, OUTPUT_PATH, MODEL_DIR, LOG_DIR = (
    (
        '/kaggle/input/home-credit-default-risk/',
        '/kaggle/working/',
        '/kaggle/working/saved_models/',
        '/kaggle/working/logs/',
    )
    if ENVIRONMENT == 'kaggle'
    else (
        './home-credit-default-risk/',
        './',
        './saved_models/',
        './logs/',
    )
)

print(f"Data path: {DATA_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print(f"Model directory: {MODEL_DIR}")
print(f"Log directory: {LOG_DIR}")

# Make sure the model and log directories exist (no error if they already do).
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

In [None]:
# =============================================================================
# GPU Detection for LightGBM
# =============================================================================
import subprocess

def detect_gpu():
    """Probe for an NVIDIA GPU by running ``nvidia-smi``.

    Returns:
        tuple[bool, str]: ``(True, "NVIDIA GPU detected")`` when the command
        exits with status 0, otherwise ``(False, "No GPU detected")``.
    """
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.TimeoutExpired):
        # Best-effort probe: a missing binary (FileNotFoundError), a
        # non-executable one (PermissionError) or any other launch failure
        # simply means "no usable GPU" rather than a crash.
        return False, "No GPU detected"
    if result.returncode == 0:
        return True, "NVIDIA GPU detected"
    return False, "No GPU detected"

GPU_AVAILABLE, GPU_INFO = detect_gpu()
print(f"GPU Detection: {GPU_INFO}")

# -----------------------------------------------------------------------------
# Why the GPU is not used even when one is detected
# -----------------------------------------------------------------------------
# LightGBM only trains on a GPU when both of these hold:
#   1. LightGBM itself was built with GPU support (the default pip wheel is
#      CPU-only).
#   2. OpenCL or CUDA libraries are available.
#
# On Kaggle the pre-installed LightGBM is the CPU build even when a GPU is
# attached; GPU acceleration pays off more for XGBoost or CatBoost there.
#
# This notebook therefore always trains on CPU, which is still fast for
# LightGBM.
# -----------------------------------------------------------------------------

# CPU is forced unconditionally for reliable execution.
USE_GPU = False
DEVICE_TYPE = 'cpu'

if not GPU_AVAILABLE:
    print("\n→ Training will use CPU")
else:
    print("\n⚠️ Note: GPU detected but LightGBM will use CPU.")
    print("   Reason: Kaggle's LightGBM is CPU-only build.")
    print("   This is normal - LightGBM is already very fast on CPU!")

print(f"\nDevice for training: {DEVICE_TYPE.upper()}")

In [None]:
# =============================================================================
# Stage 1: Configuration & Data Loading
# =============================================================================

import numpy as np
import pandas as pd
import gc
import time
import re
import os
import joblib
import logging
from datetime import datetime
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# =============================================================================
# CONFIGURATION
# =============================================================================
TRAIN_MODE = True   # True: train new models, False: load saved models
DEBUG = False       # True: use 10000 rows for quick testing
NUM_ROWS = 10000 if DEBUG else None

# Paths are set by environment detection cell above
SUBMISSION_FILE = os.path.join(OUTPUT_PATH, 'submission_kernel02.csv')

# =============================================================================
# Logging Setup - Detailed epoch logging to file
# =============================================================================
# Each run of this cell writes to its own timestamped log file.
log_filename = os.path.join(LOG_DIR, f'training_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')

train_logger = logging.getLogger('lightgbm_training')
train_logger.setLevel(logging.DEBUG)

# Re-running this cell would otherwise stack handlers and duplicate every
# record. Handlers are closed as well as detached so the log files opened by
# earlier runs are released instead of leaking open file descriptors.
for stale_handler in list(train_logger.handlers):
    train_logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_filename, encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter('%(asctime)s | %(levelname)s | %(message)s'))
train_logger.addHandler(file_handler)

print(f"Mode: {'TRAINING' if TRAIN_MODE else 'INFERENCE'}")
print(f"Debug: {DEBUG}")
print(f"Log file: {log_filename}")

@contextmanager
def timer(title):
    """Time the body of a ``with`` block and report the duration.

    On normal exit the elapsed wall-clock time (rounded to whole seconds) is
    printed to stdout and written to ``train_logger`` at INFO level. If the
    body raises, the exception propagates and nothing is reported.

    Args:
        title: Label used as the prefix of both messages.
    """
    started_at = time.time()
    yield
    seconds = time.time() - started_at
    print(f"{title} - done in {seconds:.0f}s")
    train_logger.info(f"{title} completed in {seconds:.0f}s")

# Load all datasets. Every file is read with the same row cap (NUM_ROWS is
# None outside DEBUG mode, i.e. the full file is read).
with timer("Loading all datasets"):
    (
        application_train,
        application_test,
        bureau,
        bureau_balance,
        previous_application,
        pos_cash_balance,
        installments_payments,
        credit_card_balance,
    ) = (
        pd.read_csv(f'{DATA_PATH}{csv_name}', nrows=NUM_ROWS)
        for csv_name in (
            'application_train.csv',
            'application_test.csv',
            'bureau.csv',
            'bureau_balance.csv',
            'previous_application.csv',
            'POS_CASH_balance.csv',
            'installments_payments.csv',
            'credit_card_balance.csv',
        )
    )

n_train_rows, n_test_rows = len(application_train), len(application_test)
print(f"Train samples: {n_train_rows}, Test samples: {n_test_rows}")
train_logger.info(f"Loaded data - Train: {n_train_rows}, Test: {n_test_rows}")

In [None]:
# =============================================================================
# Stage 2: Data Preprocessing
# =============================================================================

def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode the text columns of ``df`` and sanitize all column names.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        nan_as_category: Passed to ``pd.get_dummies`` as ``dummy_na``; when
            True each encoded column also gets a dummy for missing values.

    Returns:
        tuple: ``(encoded_df, new_columns)`` where ``new_columns`` lists the
        column names of ``encoded_df`` that were not column names of the
        input.
    """
    original_columns = set(df.columns)
    # Text columns are ``object`` in older pandas but may carry a
    # ``StringDtype`` in newer versions; both must be encoded, otherwise raw
    # strings would be passed through untouched.
    categorical_columns = [
        col for col in df.columns
        if df[col].dtype == 'object' or isinstance(df[col].dtype, pd.StringDtype)
    ]
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    # Collapse every run of characters outside [A-Za-z0-9_] into one underscore.
    df.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in df.columns]
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

with timer("Preprocessing application data"):
    # Train and test are stacked so both receive identical encodings; test
    # rows are the ones whose TARGET is missing.
    df = pd.concat([application_train, application_test], ignore_index=True)
    df = df[df['CODE_GENDER'] != 'XNA']
    # These three columns are label-encoded instead of one-hot encoded.
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])
    df, cat_cols = one_hot_encoder(df, nan_as_category=False)
    # 365243 is replaced by NaN. The result is assigned back explicitly:
    # `df[col].replace(..., inplace=True)` acts on a temporary column object
    # and does not update `df` under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

with timer("Preprocessing bureau and bureau_balance"):
    bureau_balance, bb_cat = one_hot_encoder(bureau_balance, nan_as_category=True)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category=True)

with timer("Preprocessing previous applications"):
    previous_application, prev_cat = one_hot_encoder(previous_application, nan_as_category=True)
    # Same 365243 -> NaN replacement for the day-offset columns, again
    # assigned back so the frame is really updated.
    for col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
        if col in previous_application.columns:
            previous_application[col] = previous_application[col].replace(365243, np.nan)

with timer("Preprocessing other tables"):
    pos_cash_balance, pos_cat = one_hot_encoder(pos_cash_balance, nan_as_category=True)
    installments_payments, ins_cat = one_hot_encoder(installments_payments, nan_as_category=True)
    credit_card_balance, cc_cat = one_hot_encoder(credit_card_balance, nan_as_category=True)

# The raw application frames are fully contained in `df` now.
del application_train, application_test
gc.collect()
print(f"Preprocessed main df shape: {df.shape}")

In [None]:
# =============================================================================
# Stage 3: Feature Engineering
# =============================================================================

with timer("Creating application features"):
    # Each derived feature is a plain ratio of two existing columns:
    # (new column, numerator column, denominator column).
    ratio_specs = (
        ('DAYS_EMPLOYED_PERC', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
        ('INCOME_CREDIT_PERC', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'),
        ('INCOME_PER_PERSON', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
        ('ANNUITY_INCOME_PERC', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
        ('PAYMENT_RATE', 'AMT_ANNUITY', 'AMT_CREDIT'),
    )
    for ratio_name, numerator, denominator in ratio_specs:
        df[ratio_name] = df[numerator] / df[denominator]

# Bureau features: summarize bureau_balance per bureau record, attach that to
# bureau, then summarize bureau per applicant (SK_ID_CURR) and join the
# result onto the main frame.
with timer("Creating bureau features"):
    # --- Step 1: bureau_balance -> one row per SK_ID_BUREAU -----------------
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    # One-hot columns created during preprocessing are averaged.
    for col in bb_cat:
        if col in bureau_balance.columns:
            bb_aggregations[col] = ['mean']
    bb_agg = bureau_balance.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    # Flatten the (column, statistic) MultiIndex into COLUMN_STAT names.
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    # The bureau key is dropped once the balance summary has been attached.
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bureau_balance, bb_agg
    gc.collect()
    
    # --- Step 2: choose the aggregations for bureau -------------------------
    num_aggregations = {}
    # Requested statistics per numeric column; the MONTHS_BALANCE_* entries
    # are the columns produced by the bureau_balance summary above.
    agg_mapping = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'], 'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'], 'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'], 'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'], 'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'], 'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'], 'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'], 'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Keep only the columns that are actually present in bureau.
    for col, aggs in agg_mapping.items():
        if col in bureau.columns:
            num_aggregations[col] = aggs
    cat_aggregations = {}
    # One-hot columns of bureau itself ...
    for cat in bureau_cat:
        if cat in bureau.columns:
            cat_aggregations[cat] = ['mean']
    # ... and the averaged one-hot columns inherited from bureau_balance.
    for cat in bb_cat:
        col_name = cat + "_MEAN"
        if col_name in bureau.columns:
            cat_aggregations[col_name] = ['mean']
    # --- Step 3: bureau -> one row per SK_ID_CURR ---------------------------
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Numeric aggregations repeated on the rows flagged CREDIT_ACTIVE_Active.
    if 'CREDIT_ACTIVE_Active' in bureau.columns:
        active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
        if len(active) > 0:
            active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
            active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
            bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
            del active_agg
        del active
    # Numeric aggregations repeated on the rows flagged CREDIT_ACTIVE_Closed.
    if 'CREDIT_ACTIVE_Closed' in bureau.columns:
        closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
        if len(closed) > 0:
            closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
            closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
            bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
            del closed_agg
        del closed
    # --- Step 4: attach to the main frame -----------------------------------
    # Intermediate frames are deleted before the join to lower peak memory.
    del bureau
    gc.collect()
    df = df.join(bureau_agg, how='left', on='SK_ID_CURR')
    del bureau_agg
    gc.collect()
    print(f"After bureau features: {df.shape}")

with timer("Creating previous application features"):
    # Ratio of the amount applied for to the amount of credit.
    previous_application['APP_CREDIT_PERC'] = (
        previous_application['AMT_APPLICATION'] / previous_application['AMT_CREDIT']
    )

    # Requested statistics per numeric column; entries whose column is absent
    # from the table are filtered out right below.
    requested_prev_aggs = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    prev_num_aggregations = {
        col: stats
        for col, stats in requested_prev_aggs.items()
        if col in previous_application.columns
    }
    # One-hot columns are averaged.
    cat_aggregations = {}
    for cat in prev_cat:
        if cat in previous_application.columns:
            cat_aggregations[cat] = ['mean']

    # One row per applicant over all previous applications.
    prev_agg = previous_application.groupby('SK_ID_CURR').agg(
        {**prev_num_aggregations, **cat_aggregations}
    )
    prev_agg.columns = pd.Index(
        [f'PREV_{col}_{stat.upper()}' for col, stat in prev_agg.columns.tolist()]
    )

    # The numeric aggregations are repeated on the approved and on the
    # refused applications, in that order, each with its own column prefix.
    for status_col, prefix in (
        ('NAME_CONTRACT_STATUS_Approved', 'APPROVED_'),
        ('NAME_CONTRACT_STATUS_Refused', 'REFUSED_'),
    ):
        if status_col not in previous_application.columns:
            continue
        status_rows = previous_application[previous_application[status_col] == 1]
        if len(status_rows) > 0:
            status_agg = status_rows.groupby('SK_ID_CURR').agg(prev_num_aggregations)
            status_agg.columns = pd.Index(
                [f'{prefix}{col}_{stat.upper()}' for col, stat in status_agg.columns.tolist()]
            )
            prev_agg = prev_agg.join(status_agg, how='left', on='SK_ID_CURR')
            del status_agg
        del status_rows

    # Release the raw table before joining to keep peak memory down.
    del previous_application
    gc.collect()
    df = df.join(prev_agg, how='left', on='SK_ID_CURR')
    del prev_agg
    gc.collect()
    print(f"After previous application features: {df.shape}")

with timer("Creating POS cash features"):
    pos_columns = pos_cash_balance.columns
    # Requested numeric statistics, limited to columns that are present.
    requested_pos_aggs = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean'],
    }
    aggregations = {
        col: stats for col, stats in requested_pos_aggs.items() if col in pos_columns
    }
    # One-hot columns are averaged.
    aggregations.update({cat: ['mean'] for cat in pos_cat if cat in pos_columns})

    pos_by_client = pos_cash_balance.groupby('SK_ID_CURR')
    pos_agg = pos_by_client.agg(aggregations)
    pos_agg.columns = pd.Index(
        [f'POS_{col}_{stat.upper()}' for col, stat in pos_agg.columns.tolist()]
    )
    # Number of POS cash rows per applicant.
    pos_agg['POS_COUNT'] = pos_by_client.size()

    # The groupby object references the raw table, so both are released.
    del pos_by_client, pos_columns, pos_cash_balance
    gc.collect()
    df = df.join(pos_agg, how='left', on='SK_ID_CURR')
    del pos_agg
    gc.collect()
    print(f"After POS cash features: {df.shape}")

with timer("Creating installments features"):
    # Share of the instalment that was paid, and the amount left unpaid.
    installments_payments['PAYMENT_PERC'] = installments_payments['AMT_PAYMENT'] / installments_payments['AMT_INSTALMENT']
    installments_payments['PAYMENT_DIFF'] = installments_payments['AMT_INSTALMENT'] - installments_payments['AMT_PAYMENT']

    # DPD = payment day minus instalment day, DBD = the opposite difference.
    # Anything that is not strictly positive (including NaN) becomes 0.
    # `Series.where` does this in one vectorized pass instead of calling a
    # Python lambda once per row.
    days_late = installments_payments['DAYS_ENTRY_PAYMENT'] - installments_payments['DAYS_INSTALMENT']
    days_early = installments_payments['DAYS_INSTALMENT'] - installments_payments['DAYS_ENTRY_PAYMENT']
    installments_payments['DPD'] = days_late.where(days_late > 0, 0)
    installments_payments['DBD'] = days_early.where(days_early > 0, 0)
    del days_late, days_early

    ins_aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'], 'DPD': ['max', 'mean', 'sum'], 'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'], 'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'], 'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    # Keep only aggregations whose column exists, then average one-hot columns.
    ins_aggregations = {k: v for k, v in ins_aggregations.items() if k in installments_payments.columns}
    for cat in ins_cat:
        if cat in installments_payments.columns:
            ins_aggregations[cat] = ['mean']

    ins_agg = installments_payments.groupby('SK_ID_CURR').agg(ins_aggregations)
    # Flatten the (column, statistic) MultiIndex into INSTAL_COLUMN_STAT names.
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Number of instalment rows per applicant.
    ins_agg['INSTAL_COUNT'] = installments_payments.groupby('SK_ID_CURR').size()

    # Release the raw table before joining to keep peak memory down.
    del installments_payments
    gc.collect()
    df = df.join(ins_agg, how='left', on='SK_ID_CURR')
    del ins_agg
    gc.collect()
    print(f"After installments features: {df.shape}")

with timer("Creating credit card features"):
    # SK_ID_PREV is removed (when present) before aggregating.
    if 'SK_ID_PREV' in credit_card_balance.columns:
        credit_card_balance = credit_card_balance.drop(columns=['SK_ID_PREV'])

    # Every numeric column except the grouping key gets the same statistics.
    numeric_cols = [
        c
        for c in credit_card_balance.select_dtypes(include=[np.number]).columns.tolist()
        if c != 'SK_ID_CURR'
    ]
    cc_stats = ['min', 'max', 'mean', 'sum', 'var']
    cc_agg = credit_card_balance.groupby('SK_ID_CURR')[numeric_cols].agg(cc_stats)
    cc_agg.columns = pd.Index(
        [f'CC_{col}_{stat.upper()}' for col, stat in cc_agg.columns.tolist()]
    )
    # Number of credit card rows per applicant.
    cc_agg['CC_COUNT'] = credit_card_balance.groupby('SK_ID_CURR').size()

    del credit_card_balance
    gc.collect()
    df = df.join(cc_agg, how='left', on='SK_ID_CURR')
    del cc_agg
    gc.collect()

# Final pass: collapse any run of characters outside [A-Za-z0-9_] in the
# column names into a single underscore.
df.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', name) for name in df.columns]
print(f"Final df shape after all features: {df.shape}")

In [None]:
# =============================================================================
# Stage 4: Model Training OR Loading
# =============================================================================
import lightgbm as lgb

# =============================================================================
# Custom Callback: Log every epoch to file, print every 50 to console
# =============================================================================
class TrainingLogger:
    """LightGBM callback: log every round to a logger, echo every N-th to stdout.

    Args:
        logger: Object with a ``debug(message)`` method; it receives one
            message per boosting round.
        fold_num: Fold number included in every message.
        console_period: A console line is printed whenever the 1-based
            iteration number is a multiple of this value.
    """
    def __init__(self, logger, fold_num, console_period=50):
        self.logger = logger
        self.fold_num = fold_num
        self.console_period = console_period
        self.start_time = None  # set on the first call, i.e. the first round

    @staticmethod
    def _format_auc(value):
        """Format an AUC with six decimals, or 'N/A' when it was not reported."""
        return "N/A" if value is None else f"{value:.6f}"

    def __call__(self, env):
        """Record the metrics of one boosting round.

        Args:
            env: Callback environment; ``env.iteration`` (0-based) and
                ``env.evaluation_result_list`` are read. Entries of the
                latter start with ``(data_name, eval_name, result)``.
        """
        if self.start_time is None:
            self.start_time = time.time()

        iteration = env.iteration + 1
        train_auc = valid_auc = None

        # Only the first three fields are needed; extra fields are ignored.
        for data_name, eval_name, result, *_ in env.evaluation_result_list:
            if eval_name != 'auc':
                continue
            if data_name == 'train':
                train_auc = result
            elif data_name == 'valid':
                valid_auc = result

        # A missing entry must not abort training with a formatting error.
        train_text = self._format_auc(train_auc)
        valid_text = self._format_auc(valid_auc)

        # Log EVERY round through the logger passed to the constructor
        # (not through a module-level global).
        elapsed = time.time() - self.start_time
        self.logger.debug(f"Fold {self.fold_num} | Epoch {iteration:5d} | Train AUC: {train_text} | Valid AUC: {valid_text} | Time: {elapsed:.1f}s")

        # Print every `console_period` rounds to the console
        if iteration % self.console_period == 0:
            print(f"  [Fold {self.fold_num}] Epoch {iteration:5d}: Train={train_text}, Valid={valid_text}")

# Split data: rows with a TARGET value form the training set, rows without
# one form the test set.
has_target = df['TARGET'].notnull()
train_df = df[has_target].copy()
test_df = df[~has_target].copy()

# Identifier columns, the label and a stray 'index' column are not features.
non_feature_cols = ('TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index')
feats = [name for name in train_df.columns if name not in non_feature_cols]
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
print(f"Number of features: {len(feats)}")

del df, has_target
gc.collect()

# Fewer folds in DEBUG mode.
num_folds = 3 if DEBUG else 10

# =============================================================================
# LightGBM Parameters (native API)
# =============================================================================
lgb_params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    num_leaves=34,
    learning_rate=0.02,
    feature_fraction=0.9497036,
    bagging_fraction=0.8715623,
    bagging_freq=1,
    max_depth=8,
    lambda_l1=0.041545473,
    lambda_l2=0.0735294,
    min_split_gain=0.0222415,
    min_child_weight=39.3259775,
    verbose=-1,
    num_threads=4,
)

# Device-specific parameters
if not USE_GPU:
    lgb_params['device_type'] = 'cpu'
    print("\n→ Training on CPU")
    train_logger.info("Training on CPU")
else:
    # On GPU the thread count is lowered to 1 in addition to selecting the
    # first platform and device.
    lgb_params.update(
        device_type='gpu',
        gpu_platform_id=0,
        gpu_device_id=0,
        num_threads=1,
    )
    print("\n✓ GPU acceleration ENABLED (using lgb.train native API)")
    train_logger.info("GPU acceleration enabled with native API")

if TRAIN_MODE:
    print("\n" + "="*60)
    print("TRAINING MODE: Training new models")
    print("="*60)
    train_logger.info("="*60)
    train_logger.info(f"TRAINING MODE | Folds: {num_folds} | Device: {lgb_params['device_type'].upper()}")
    train_logger.info("="*60)

    folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    oof_preds = np.zeros(train_df.shape[0])   # out-of-fold predictions for the training rows
    sub_preds = np.zeros(test_df.shape[0])    # fold-averaged predictions for the test rows
    feature_importance_df = pd.DataFrame()
    training_history = []
    fold_results = []
    models = []

    with timer(f"Training {num_folds}-Fold LightGBM"):
        for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
            fold_start = time.time()
            print(f"\n--- Fold {n_fold + 1}/{num_folds} ---")
            train_logger.info(f"\n{'='*40}")
            train_logger.info(f"Fold {n_fold + 1} | Train: {len(train_idx)}, Valid: {len(valid_idx)}")

            train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
            valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

            # Create LightGBM Dataset objects (native API)
            lgb_train = lgb.Dataset(train_x, label=train_y)
            lgb_valid = lgb.Dataset(valid_x, label=valid_y, reference=lgb_train)

            # Custom training logger callback
            training_callback = TrainingLogger(train_logger, n_fold + 1, console_period=50)

            # Filled by lgb.record_evaluation with the per-round metrics
            evals_result = {}

            # Train using native API
            model = lgb.train(
                lgb_params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=[lgb_train, lgb_valid],
                valid_names=['train', 'valid'],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=200),
                    lgb.record_evaluation(evals_result),
                    training_callback
                ]
            )

            # Save model; inference mode looks for exactly this file name
            model_path = os.path.join(MODEL_DIR, f'lgbm_fold_{n_fold}.txt')
            model.save_model(model_path)
            models.append(model)

            training_history.append({
                'fold': n_fold + 1,
                'train_auc': evals_result['train']['auc'],
                'valid_auc': evals_result['valid']['auc'],
                'best_iteration': model.best_iteration
            })

            # Predictions
            valid_preds = model.predict(valid_x, num_iteration=model.best_iteration)
            oof_preds[valid_idx] = valid_preds
            sub_preds += model.predict(test_df[feats], num_iteration=model.best_iteration) / num_folds

            fold_auc = roc_auc_score(valid_y, valid_preds)
            fold_results.append({
                'fold': n_fold + 1,
                'y_true': valid_y.values,
                'y_pred': valid_preds,
                'auc': fold_auc
            })

            # Feature importance
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = model.feature_importance(importance_type='gain')
            fold_importance_df["fold"] = n_fold + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

            fold_time = time.time() - fold_start
            print(f'Fold {n_fold + 1:2d} AUC: {fold_auc:.6f} | Best iter: {model.best_iteration} | Time: {fold_time:.0f}s')
            train_logger.info(f"Fold {n_fold + 1} DONE: AUC={fold_auc:.6f}, Best={model.best_iteration}, Time={fold_time:.0f}s")

            del train_x, train_y, valid_x, valid_y, lgb_train, lgb_valid
            gc.collect()

    # Save feature list for inference
    joblib.dump(feats, os.path.join(MODEL_DIR, 'feature_list.pkl'))

    full_auc = roc_auc_score(train_df['TARGET'], oof_preds)
    print(f'\nFull OOF AUC: {full_auc:.6f}')
    print(f'Models saved to: {MODEL_DIR}')
    train_logger.info(f"\nFINAL OOF AUC: {full_auc:.6f}")
    train_logger.info(f"Models saved to: {MODEL_DIR}")

    train_targets = train_df['TARGET'].values

else:
    print("\n" + "="*60)
    print("INFERENCE MODE: Loading saved models")
    print("="*60)

    # Feature list written by a previous training run
    saved_feats = joblib.load(os.path.join(MODEL_DIR, 'feature_list.pkl'))

    # Columns the models expect but the current frame lacks are added as 0
    missing_feats = [f for f in saved_feats if f not in test_df.columns]
    if missing_feats:
        print(f"Warning: {len(missing_feats)} features missing, filling with 0")
        for f in missing_feats:
            test_df[f] = 0

    sub_preds = np.zeros(test_df.shape[0])
    models = []

    with timer("Loading models and making predictions"):
        for n_fold in range(num_folds):
            model_path = os.path.join(MODEL_DIR, f'lgbm_fold_{n_fold}.txt')
            if os.path.exists(model_path):
                model = lgb.Booster(model_file=model_path)
                models.append(model)
                # Accumulate raw sums here; the average is taken below over
                # the models that were actually loaded.
                sub_preds += model.predict(test_df[saved_feats])
                print(f'Loaded: {model_path}')
            else:
                print(f'Warning: Not found: {model_path}')

    if not models:
        # Without any model the submission would silently be all zeros.
        raise FileNotFoundError(
            f"No saved models found in {MODEL_DIR}; run once with TRAIN_MODE = True"
        )
    # Dividing by num_folds would shrink the predictions whenever a fold
    # file is missing, so the count of loaded models is used instead.
    sub_preds /= len(models)

    print(f'\nLoaded {len(models)} models')

    # Training-only artefacts do not exist in inference mode
    oof_preds = None
    train_targets = None
    training_history = None
    fold_results = None
    feature_importance_df = None

# Build the submission frame from the averaged test predictions
test_df['TARGET'] = sub_preds
submission_df = test_df[['SK_ID_CURR', 'TARGET']]

In [None]:
# =============================================================================
# Stage 5: Output & Visualization
# =============================================================================

# Write the submission CSV and print a short sanity summary of it.
with timer("Saving submission file"):
    submission_df.to_csv(SUBMISSION_FILE, index=False)
    predicted = submission_df['TARGET']
    print(f"Submission saved to: {SUBMISSION_FILE}")
    print(f"Shape: {submission_df.shape}")
    print(f"Predictions range: [{predicted.min():.4f}, {predicted.max():.4f}]")

# Visualizations only available in TRAIN_MODE
# (inference mode sets feature_importance_df, fold_results, training_history,
# oof_preds and train_targets to None, so there is nothing to plot).
if TRAIN_MODE and feature_importance_df is not None:
    
    # Feature Importance
    with timer("Generating feature importance"):
        # The 40 features with the highest importance averaged over folds.
        cols = (feature_importance_df[["feature", "importance"]]
                .groupby("feature").mean()
                .sort_values(by="importance", ascending=False)[:40].index)
        # Keep the per-fold rows of those features for the bar plot.
        best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
        plt.figure(figsize=(10, 12))
        sns.barplot(x="importance", y="feature", 
                    data=best_features.sort_values(by="importance", ascending=False), palette="viridis")
        plt.title('LightGBM Feature Importance')
        plt.tight_layout()
        plt.savefig('lgbm_importances01.png', dpi=150)
        plt.show()
    
    # Per-fold ROC curves
    with timer("Generating per-fold ROC curves"):
        n_folds = len(fold_results)
        # Grid with at most 5 panels per row; rows is the ceiling of n_folds / cols.
        cols = min(5, n_folds)
        rows = (n_folds + cols - 1) // cols
        fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 4*rows))
        # With a single panel plt.subplots returns one Axes, not an array.
        axes = axes.flatten() if n_folds > 1 else [axes]
        colors = plt.cm.viridis(np.linspace(0, 0.9, n_folds))
        for idx, fold_data in enumerate(fold_results):
            ax = axes[idx]
            fpr, tpr, _ = roc_curve(fold_data['y_true'], fold_data['y_pred'])
            ax.plot(fpr, tpr, color=colors[idx], lw=2, label=f'AUC = {fold_data["auc"]:.4f}')
            # Diagonal reference line.
            ax.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.5)
            ax.set_title(f'Fold {fold_data["fold"]}')
            ax.legend(loc='lower right')
            ax.grid(alpha=0.3)
        # Hide the unused panels of the grid.
        for idx in range(n_folds, len(axes)):
            axes[idx].axis('off')
        plt.suptitle('ROC Curves for Each Fold', fontsize=14)
        plt.tight_layout()
        plt.savefig('roc_curves_per_fold.png', dpi=150, bbox_inches='tight')
        plt.show()
    
    # Combined ROC
    with timer("Generating combined ROC curve"):
        plt.figure(figsize=(8, 6))
        # Faint curve per fold ...
        for fold_data in fold_results:
            fpr, tpr, _ = roc_curve(fold_data['y_true'], fold_data['y_pred'])
            plt.plot(fpr, tpr, alpha=0.3, lw=1)
        # ... and one highlighted curve over all out-of-fold predictions.
        fpr, tpr, _ = roc_curve(train_targets, oof_preds)
        # oof_auc is reused by the summary printed at the end of this branch.
        oof_auc = roc_auc_score(train_targets, oof_preds)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'Mean OOF (AUC = {oof_auc:.4f})')
        plt.plot([0, 1], [0, 1], 'navy', lw=2, linestyle='--')
        plt.title('Combined ROC Curve')
        plt.legend(loc='lower right')
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig('roc_curve.png', dpi=150)
        plt.show()
    
    # Training curves
    with timer("Generating training curves"):
        n_folds = len(training_history)
        # Same grid layout as the per-fold ROC figure.
        cols = min(5, n_folds)
        rows = (n_folds + cols - 1) // cols
        fig, axes = plt.subplots(rows, cols, figsize=(4*cols, 3*rows))
        axes = axes.flatten() if n_folds > 1 else [axes]
        for idx, fold_history in enumerate(training_history):
            ax = axes[idx]
            # One x position per recorded boosting round, starting at 1.
            iterations = range(1, len(fold_history['train_auc']) + 1)
            ax.plot(iterations, fold_history['train_auc'], label='Train', color='blue', alpha=0.7)
            ax.plot(iterations, fold_history['valid_auc'], label='Valid', color='red', alpha=0.7)
            # Vertical marker at the fold's best iteration.
            ax.axvline(x=fold_history['best_iteration'], color='green', linestyle='--', alpha=0.7)
            ax.set_title(f'Fold {fold_history["fold"]} (best: {fold_history["best_iteration"]})')
            ax.legend(loc='lower right', fontsize=8)
            ax.grid(alpha=0.3)
        # Hide the unused panels of the grid.
        for idx in range(n_folds, len(axes)):
            axes[idx].axis('off')
        plt.suptitle('Training Curves', fontsize=14)
        plt.tight_layout()
        plt.savefig('training_curves.png', dpi=150, bbox_inches='tight')
        plt.show()
    
    # Summary
    print("\n" + "="*60)
    print("TRAINING SUMMARY")
    print("="*60)
    for fr in fold_results:
        print(f"  Fold {fr['fold']:2d}: AUC = {fr['auc']:.6f}")
    print(f"\n  Mean Fold AUC: {np.mean([fr['auc'] for fr in fold_results]):.6f}")
    print(f"  OOF AUC:       {oof_auc:.6f}")

else:
    # Inference mode: report how many models produced the predictions.
    print("\n" + "="*60)
    print("INFERENCE COMPLETE")
    print("="*60)
    print(f"Predictions generated using {len(models)} saved models")
    print(f"Visualizations skipped (only available in TRAIN_MODE)")

# List the artefacts produced by this run.
print(f"\nOutput files:")
print(f"  - {SUBMISSION_FILE}")
if TRAIN_MODE:
    # Models are written with Booster.save_model as 'lgbm_fold_<n>.txt', so
    # the listing must show the .txt extension, not .pkl.
    print(f"  - {MODEL_DIR}lgbm_fold_*.txt (saved models)")
    print(f"  - lgbm_importances01.png")
    print(f"  - roc_curves_per_fold.png")
    print(f"  - roc_curve.png")
    print(f"  - training_curves.png")