In [1]:
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# pip install tqdm
# pip install ipywidgets
# pip install "numpy<2.0"
# pip install --upgrade scipy scikit-learn
# pip install pyarrow
# pip install lightgbm
# pip install openpyxl
# pip install optuna
# pip install \
#     --extra-index-url=https://pypi.nvidia.com \
#     "cudf-cu12==25.6." "dask-cudf-cu12==25.6." "cuml-cu12==25.6.*" \
#     "cugraph-cu12==25.6." "nx-cugraph-cu12==25.6." "cuxfilter-cu12==25.6.*" \
#     "cucim-cu12==25.6." "pylibraft-cu12==25.6." "raft-dask-cu12==25.6.*" \
#     "cuvs-cu12==25.6." "nx-cugraph-cu12==25.6."
# curl -L https://lambdalabs-guest-agent.s3.us-west-2.amazonaws.com/scripts/install.sh | sudo bash

In [1]:
import pandas as pd
import warnings
import numpy as np
import os
import gc
import shutil
import uuid
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import json
import matplotlib
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize_scalar
import torch.optim as optim
import optuna
from scipy.fft import rfft
from scipy.stats import entropy
import scipy.stats
import traceback

# Force Matplotlib to use a non-GUI backend
matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# --- THE DEFINITIVE FIX for the "ArrowDtype" error ---
if 'dask' in sys.modules:
    del sys.modules['dask']

def run_stage_0_advanced_feature_engineering():
    print("--- Starting Advanced Feature Engineering Pipeline (v3) ---")
    start_time = time.time()

    # --- 1. Load All Data ---
    print("1/7: Loading all raw data files...")
    try:
        train_df = pd.read_parquet('train_data.parquet')
        test_df = pd.read_parquet('test_data_r3.parquet')
        add_trans = pd.read_parquet('add_trans.parquet')
        add_event = pd.read_parquet('add_event.parquet')
        offer_meta = pd.read_parquet('offer_metadata.parquet')
    except FileNotFoundError as e:
        print(f"❌ ERROR: Could not find a required data file. Make sure all .parquet files are in the same directory. Details: {e}")
        exit()

    # Combine train and test for consistent processing
    train_len = len(train_df)
    all_data = pd.concat([train_df, test_df], ignore_index=True)
    del train_df, test_df
    gc.collect()

    # --- 2. Preprocessing & Timestamp Conversion ---
    print("2/7: Processing timestamps...")
    all_data['id4'] = pd.to_datetime(all_data['id4'])
    offer_meta['id12'] = pd.to_datetime(offer_meta['id12'])
    offer_meta['id13'] = pd.to_datetime(offer_meta['id13'])
    add_trans['f370'] = pd.to_datetime(add_trans['f370'])
    add_event['id4'] = pd.to_datetime(add_event['id4'])
    add_event['id7'] = pd.to_datetime(add_event['id7'], errors='coerce') # Click timestamp can be NaT

    # --- 3. Engineer Static Offer Features (from offer_metadata) ---
    print("3/7: Engineering static offer features (including NLP)...")
    offer_meta['offer_duration_days'] = (offer_meta['id13'] - offer_meta['id12']).dt.days
    offer_meta['offer_start_dayofweek'] = offer_meta['id12'].dt.dayofweek
    offer_meta['brand_id'], _ = pd.factorize(offer_meta['id11'])
    offer_meta['industry_id'], _ = pd.factorize(offer_meta['id10'])
    offer_meta.rename(columns={'f375': 'redemption_frequency', 'f376': 'discount_rate', 'id8': 'member_industry_code', 'id9': 'offer_name', 'f378': 'offer_body'}, inplace=True)
    offer_meta['is_industry_match'] = (offer_meta['industry_id'] == offer_meta['member_industry_code']).astype(int)

    # --- NEW v3 FEATURE: Basic NLP on Offer Text ---
    print("  -> Generating NLP features from offer text...")
    offer_meta['offer_name'] = offer_meta['offer_name'].astype(str).str.lower()
    offer_meta['offer_body'] = offer_meta['offer_body'].astype(str).str.lower()

    offer_meta['offer_name_len'] = offer_meta['offer_name'].str.len()
    offer_meta['offer_body_words'] = offer_meta['offer_body'].str.split().str.len()

    # Keyword matching
    offer_meta['has_keyword_cashback'] = offer_meta['offer_body'].str.contains('cash back|statement credit|refund', regex=True).astype(int)
    offer_meta['has_keyword_points'] = offer_meta['offer_body'].str.contains('points|miles|rewards', regex=True).astype(int)
    offer_meta['has_keyword_discount'] = offer_meta['offer_body'].str.contains('% off|discount|save', regex=True).astype(int)
    offer_meta['has_keyword_spend_x'] = offer_meta['offer_body'].str.contains('spend', regex=False).astype(int)

    # Select columns to merge
    offer_features_to_merge = [
        'id3', 'offer_duration_days', 'offer_start_dayofweek', 'brand_id', 'industry_id',
        'redemption_frequency', 'discount_rate', 'is_industry_match',
        'offer_name_len', 'offer_body_words', 'has_keyword_cashback', 
        'has_keyword_points', 'has_keyword_discount', 'has_keyword_spend_x',
        'id12', 'id13'
    ]
    
    # --- FIX for dtype mismatch error ---
    # Ensure the merge key 'id3' is the same type in both dataframes before merging.
    all_data['id3'] = all_data['id3'].astype('int64')
    offer_meta['id3'] = offer_meta['id3'].astype('int64')

    all_data = all_data.merge(offer_meta[offer_features_to_merge], on='id3', how='left')

    # Create time-based interaction features
    all_data['days_since_offer_start'] = (all_data['id4'] - all_data['id12']).dt.days
    all_data['days_until_offer_end'] = (all_data['id13'] - all_data['id4']).dt.days
    all_data.drop(columns=['id12', 'id13'], inplace=True)
    gc.collect()

    # --- 4. Engineer Historical Customer-Offer Interaction Features (from add_event) ---
    print("4/7: Engineering customer-offer historical interaction features (impressions, clicks & timing)...")
    
    # --- FIX for dtype mismatch error ---
    # Ensure the merge key 'id3' is the same type before merging.
    add_event['id3'] = add_event['id3'].astype('int64')

    add_event = add_event.merge(offer_meta[['id3', 'brand_id', 'industry_id']], on='id3', how='left')
    add_event.sort_values(by=['id2', 'id4'], inplace=True)

    # Create click features
    add_event['clicked'] = add_event['id7'].notna().astype(int)

    # --- NEW v3 FEATURE: Advanced Temporal Features ---
    print("  -> Generating advanced temporal (time-since-last-event) features...")
    add_event['time_since_last_impression_seconds'] = add_event.groupby('id2')['id4'].diff().dt.total_seconds()
    # Create a column with the timestamp only if a click occurred
    add_event['click_timestamp'] = add_event['id4'].where(add_event['clicked'] == 1)
    # Forward-fill the last click time for each customer
    add_event['last_click_timestamp'] = add_event.groupby('id2')['click_timestamp'].ffill()
    add_event['time_since_last_click_seconds'] = (add_event['id4'] - add_event['last_click_timestamp']).dt.total_seconds()

    # Calculate lagged cumulative counts for impressions and clicks
    # The shift(1) is critical to prevent data leakage from the current event
    add_event['customer_total_impressions_before'] = add_event.groupby('id2').cumcount()
    add_event['customer_brand_impressions_before'] = add_event.groupby(['id2', 'brand_id']).cumcount()
    add_event['customer_total_clicks_before'] = add_event.groupby('id2')['clicked'].cumsum().shift(1).fillna(0)
    add_event['customer_brand_clicks_before'] = add_event.groupby(['id2', 'brand_id'])['clicked'].cumsum().shift(1).fillna(0)

    # Calculate historical Click-Through-Rates (CTR)
    epsilon = 1e-6
    add_event['customer_ctr_before'] = add_event['customer_total_clicks_before'] / (add_event['customer_total_impressions_before'] + epsilon)
    add_event['customer_brand_ctr_before'] = add_event['customer_brand_clicks_before'] / (add_event['customer_brand_impressions_before'] + epsilon)

    # Create a unique event key to merge these features back to the main dataframe
    event_key = ['id2', 'id3', 'id4']
    event_features_to_merge = [
        'time_since_last_impression_seconds', 'time_since_last_click_seconds',
        'customer_total_impressions_before', 'customer_brand_impressions_before',
        'customer_total_clicks_before', 'customer_brand_clicks_before',
        'customer_ctr_before', 'customer_brand_ctr_before'
    ]
    
    # --- FIX for composite key dtype mismatch ---
    # Ensure all columns in the merge key have the same dtype before merging.
    for col in event_key:
        all_data[col] = all_data[col].astype(add_event[col].dtype)

    all_data = all_data.merge(add_event[event_key + event_features_to_merge], on=event_key, how='left')
    del add_event
    gc.collect()


    # --- 5. Engineer Customer Transaction Features (from add_trans) --- LEAKAGE FIXED ---
    print("5/7: Engineering POINT-IN-TIME customer transaction features to prevent data leakage...")
    
    # Ensure add_trans has a proper timestamp and is sorted for time-based operations
    # --- FIX for ValueError on timedelta conversion ---
    # Force f371 to numeric, coercing any non-numeric values (like strings) to NaT.
    # This is a robust way to handle potentially dirty data in the time column.
    time_as_seconds = pd.to_numeric(add_trans['f371'], errors='coerce')
    add_trans['transaction_timestamp'] = add_trans['f370'] + pd.to_timedelta(time_as_seconds, unit='s')
    
    add_trans.sort_values(by=['id2', 'transaction_timestamp'], inplace=True)

    # --- Create a unified transaction history dataframe ---
    # Calculate cumulative features directly on the sorted transaction data
    print("  -> Calculating point-in-time cumulative aggregates...")
    
    # Use a temporary dataframe to build aggregates
    trans_history = add_trans[['id2', 'transaction_timestamp', 'f367', 'f368', 'f369']].copy()
    
    # Debits and Credits
    is_debit = trans_history['f369'] == 'D'
    is_credit = trans_history['f369'] == 'C'
    debit_amount = trans_history['f367'].where(is_debit, 0)
    credit_amount = trans_history['f367'].where(is_credit, 0)

    # Cumulative calculations
    grouped = trans_history.groupby('id2')
    trans_history['customer_total_spend'] = grouped['f367'].transform(lambda x: x[is_debit].cumsum())
    trans_history['customer_num_transactions'] = grouped.cumcount() + 1
    
    trans_history['customer_num_debits'] = grouped['f369'].transform(lambda x: (x == 'D').cumsum())
    trans_history['customer_num_refunds'] = grouped['f369'].transform(lambda x: (x == 'C').cumsum())
    
    trans_history['customer_total_refund_amount'] = grouped['f367'].transform(lambda x: x[is_credit].cumsum())

    # Fill NaNs that result from cumsum on filtered series
    for col in ['customer_total_spend', 'customer_total_refund_amount', 'customer_num_debits', 'customer_num_refunds']:
        trans_history[col] = grouped[col].ffill().fillna(0)

    # Point-in-time ratios and averages
    epsilon = 1e-6
    trans_history['customer_avg_trans_amount'] = trans_history['customer_total_spend'] / (trans_history['customer_num_debits'] + epsilon)
    trans_history['customer_refund_rate'] = trans_history['customer_num_refunds'] / (trans_history['customer_num_debits'] + epsilon)
    
    # --- FIX for DataError: No numeric types to aggregate ---
    # The default .expanding() tries to cast to numeric. For object types like Product ID,
    # we need a method that is safe for strings. This custom apply is robust and avoids
    # the internal numeric casting of the expanding window function.
    def expanding_nunique_for_objects(series):
        seen = set()
        result = []
        for x in series:
            seen.add(x)
            result.append(len(seen))
        return pd.Series(result, index=series.index)
    
    trans_history['customer_num_unique_products'] = grouped['f368'].transform(expanding_nunique_for_objects)

    # --- Merge point-in-time features back to the main dataset ---
    print("  -> Merging point-in-time transaction features into main dataset...")
    
    # --- FIX for ValueError: left keys must be sorted ---
    # merge_asof requires the 'on' key (the timestamp) to be sorted in the left dataframe.
    all_data.sort_values(by='id4', inplace=True)

    features_to_merge = [
        'id2', 'transaction_timestamp', 'customer_total_spend', 'customer_num_transactions',
        'customer_avg_trans_amount', 'customer_num_unique_products', 'customer_total_refund_amount',
        'customer_num_refunds', 'customer_refund_rate'
    ]
    
    # --- FIX for ValueError: Merge keys contain nulls ---
    # merge_asof cannot handle nulls in the 'on' or 'by' keys of the right dataframe.
    # We drop any rows from our history where the timestamp is invalid.
    trans_history.dropna(subset=['id2', 'transaction_timestamp'], inplace=True)
    
    # Use merge_asof to get the LATEST transaction state for each event
    all_data = pd.merge_asof(
        all_data,
        trans_history[features_to_merge],
        left_on='id4',
        right_on='transaction_timestamp',
        by='id2',
        direction='backward' # This is crucial for point-in-time correctness
    )
    
    # Also calculate the days since the last transaction
    all_data['days_since_last_transaction'] = (all_data['id4'] - all_data['transaction_timestamp']).dt.days
    all_data.drop(columns=['transaction_timestamp'], inplace=True)

    del add_trans, trans_history
    gc.collect()


    # --- 6. Finalize and Save ---
    print("6/7: Finalizing datasets and saving to disk...")
    # Split back into train and test
    train_enriched = all_data.iloc[:train_len].copy()
    test_enriched = all_data.iloc[train_len:].copy()

    # A final check for any NaNs introduced by merges, filling with a neutral value like -1 or 0.
    print("  -> Filling NaN values introduced during merges...")
    new_numeric_cols = [
        'offer_duration_days', 'offer_start_dayofweek', 'brand_id', 'industry_id',
        'redemption_frequency', 'discount_rate', 'is_industry_match', 'days_since_offer_start', 
        'days_until_offer_end', 'offer_name_len', 'offer_body_words', 'has_keyword_cashback',
        'has_keyword_points', 'has_keyword_discount', 'has_keyword_spend_x',
        'time_since_last_impression_seconds', 'time_since_last_click_seconds',
        'customer_total_impressions_before', 'customer_brand_impressions_before',
        'customer_total_clicks_before', 'customer_brand_clicks_before', 'customer_ctr_before', 
        'customer_brand_ctr_before', 'customer_avg_trans_amount', 'customer_total_spend',
        'customer_num_transactions', 'customer_num_unique_products', 'days_since_last_transaction',
        'customer_total_refund_amount', 'customer_num_refunds', 'customer_refund_rate'
    ]
    for col in new_numeric_cols:
        if col in train_enriched.columns:
            # For time-based features, a large negative number might be better than 0 or -1
            if 'seconds' in col or 'days' in col:
                train_enriched[col] = train_enriched[col].fillna(-999)
                test_enriched[col] = test_enriched[col].fillna(-999)
            # For rates and counts, 0 is a more natural fill value
            elif 'rate' in col or 'num_' in col or 'total_' in col or 'avg' in col or 'before' in col:
                train_enriched[col] = train_enriched[col].fillna(0)
                test_enriched[col] = test_enriched[col].fillna(0)
            else: # For IDs and other general features
                train_enriched[col] = train_enriched[col].fillna(-1)
                test_enriched[col] = test_enriched[col].fillna(-1)


    # --- 7. Save Final Datasets ---
    os.makedirs('inter', exist_ok=True)
    train_enriched.to_parquet('inter/train_enriched.parquet', index=False)
    test_enriched.to_parquet('inter/test_enriched.parquet', index=False)
    del all_data, train_enriched, test_enriched
    gc.collect()

    end_time = time.time()
    print(f"\n--- Advanced Feature Engineering (v3) Complete! ---")
    print(f"✅ Saved 'inter/train_enriched.parquet'")
    print(f"✅ Saved 'inter/test_enriched.parquet'")
    print(f"Total time taken: {((end_time - start_time) / 60):.2f} minutes")

if __name__ == "__main__":
    # Stage 0: Create rich features from raw data
    run_stage_0_advanced_feature_engineering()

--- Starting Advanced Feature Engineering Pipeline (v3) ---
1/7: Loading all raw data files...
2/7: Processing timestamps...
3/7: Engineering static offer features (including NLP)...
  -> Generating NLP features from offer text...
4/7: Engineering customer-offer historical interaction features (impressions, clicks & timing)...
  -> Generating advanced temporal (time-since-last-event) features...
5/7: Engineering POINT-IN-TIME customer transaction features to prevent data leakage...
  -> Calculating point-in-time cumulative aggregates...
  -> Merging point-in-time transaction features into main dataset...
6/7: Finalizing datasets and saving to disk...
  -> Filling NaN values introduced during merges...

--- Advanced Feature Engineering (v3) Complete! ---
✅ Saved 'inter/train_enriched.parquet'
✅ Saved 'inter/test_enriched.parquet'
Total time taken: 7.94 minutes


In [2]:
def run_stage_1_train_val_split():
    TRAIN_INPUT = "inter/train_enriched.parquet"
    TEST_INPUT = "inter/test_enriched.parquet"
    INTER_DIR = "inter"
    BATCH_SIZE = 100_000
    VAL_RATIO = 0.15

    print("\n**Script 0: Train-Val Split (Customer-Aware & Time-Based)**")
    os.makedirs(INTER_DIR, exist_ok=True)

    parquet_file = pq.ParquetFile(TRAIN_INPUT)

    # 1. Load IDs and timestamps to determine the split
    print("1/5: Loading IDs and timestamps to determine validation customers...")
    id_chunks = []
    # CRITICAL FIX: Load customer ID 'id2' to ensure customer-based split
    for batch in parquet_file.iter_batches(batch_size=BATCH_SIZE, columns=["id2", "id4"]):
        chunk_df = batch.to_pandas()
        id_chunks.append(chunk_df)

    id_df = pd.concat(id_chunks, ignore_index=True)
    print(f"1/5 ID index DataFrame created with shape: {id_df.shape}")
    del id_chunks
    gc.collect()

    id_df["id4"] = pd.to_datetime(id_df["id4"])
    id_df.sort_values(by="id4", inplace=True)

    # 2. Identify validation customers based on time
    # This ensures that the validation set is from a later time period than the training set.
    n_rows = len(id_df)
    split_idx = int(n_rows * (1 - VAL_RATIO))
    valid_customer_ids = set(id_df["id2"].iloc[split_idx:])
    print(f"2/5: Identified {len(valid_customer_ids)} customers for the time-based validation set.")
    del id_df
    gc.collect()

    # 3. Stream data and split into train/valid based on the identified customer IDs
    print("3/5: Splitting full dataset based on validation customer IDs...")
    train_rows = []
    valid_rows = []

    for batch in parquet_file.iter_batches(batch_size=BATCH_SIZE):
        batch_df = batch.to_pandas()
        is_valid_customer = batch_df["id2"].isin(valid_customer_ids)
        valid_rows.append(batch_df[is_valid_customer])
        train_rows.append(batch_df[~is_valid_customer])
    del parquet_file, valid_customer_ids

    # 4. Save the split datasets
    train_split = pd.concat(train_rows, ignore_index=True)
    train_split.to_parquet(f"{INTER_DIR}/train_0.parquet", index=False)
    print(f"3/5 Saved train_0.parquet with shape: {train_split.shape}")
    del train_split, train_rows
    gc.collect()

    valid_split = pd.concat(valid_rows, ignore_index=True)
    valid_split.to_parquet(f"{INTER_DIR}/valid_0.parquet", index=False)
    print(f"4/5 Saved valid_0.parquet with shape: {valid_split.shape}")
    del valid_split, valid_rows
    gc.collect()

    # 5. Copy test data for consistency
    test_df = pd.read_parquet(TEST_INPUT)
    test_df.to_parquet(f"{INTER_DIR}/test_0.parquet", index=False)
    print(f"5/5 Saved test_0.parquet with shape: {test_df.shape}")
    del test_df
    gc.collect()

if __name__ == "__main__":
    # Stage 1: Split enriched training data into train/validation sets
    run_stage_1_train_val_split()


**Script 0: Train-Val Split (Customer-Aware & Time-Based)**
1/5: Loading IDs and timestamps to determine validation customers...
1/5 ID index DataFrame created with shape: (770164, 2)
2/5: Identified 7991 customers for the time-based validation set.
3/5: Splitting full dataset based on validation customer IDs...
3/5 Saved train_0.parquet with shape: (589815, 403)
4/5 Saved valid_0.parquet with shape: (180349, 403)
5/5 Saved test_0.parquet with shape: (337714, 403)


In [3]:
def preprocess_data(train_df, valid_df, test_df, data_dict_path):
    """
    A comprehensive function to clean and typecast datasets.
    1. Removes 100% empty columns based on the training set.
    2. Applies robust type casting for numerical and OHE features.
    3. Applies consistent Label Encoding for all categorical features.
    """
    print("\n--- Step 1: Removing 100% Empty Features ---")

    # Identify columns that are completely null in the training data
    uniques_per_col = train_df.nunique(dropna=True)
    empty_cols = uniques_per_col[uniques_per_col == 0].index.tolist()

    if empty_cols:
        print(f"Found {len(empty_cols)} empty columns to drop.")
        # Drop from all datasets for consistency
        train_df.drop(columns=empty_cols, inplace=True)
        valid_df.drop(columns=empty_cols, inplace=True)
        test_df.drop(columns=empty_cols, inplace=True)
    else:
        print("No completely empty columns found in the training set.")

    print("\n--- Step 2: Applying Type Conversions and Encoding ---")
    data_dict = pd.read_csv(data_dict_path)
    type_map = data_dict.set_index("masked_column")["Type"].str.strip().to_dict()

    # Identify all feature types from the dictionary
    all_cols = train_df.columns
    numerical_cols = [
        col
        for col, dtype in type_map.items()
        if dtype == "Numerical" and col in all_cols
    ]
    numerical_cols.append("f218")

    ohe_cols = [
        col
        for col, dtype in type_map.items()
        if dtype == "One hot encoded" and col in all_cols
    ]

    categorical_cols = [
        col
        for col, dtype in type_map.items()
        if dtype == "Categorical" and col in all_cols
    ]
    if "id3" in categorical_cols:
        categorical_cols.remove("id3")

    datetime_cols = ["id4", "id5"]
    label_col = "y"

    # --- FIX: Explicitly add new categorical features for proper encoding ---
    new_categorical_features = ['brand_id', 'industry_id']
    for new_cat in new_categorical_features:
        if new_cat not in categorical_cols:
            categorical_cols.append(new_cat)

    # Process all three dataframes
    for df in [train_df, valid_df, test_df]:
        # Numerical conversion
        for col in numerical_cols:
            if col not in datetime_cols and col in df.columns:
                df[col] = df[col].astype(np.float32)

        # One-Hot Encoded conversion
        for col in ohe_cols:
            if col in df.columns:
                df[col] = df[col].astype(np.float32).fillna(0).astype(np.int8)

        # Datetime conversion
        if "id4" in df.columns:
            df["id4"] = pd.to_datetime(df["id4"])
        if "id5" in df.columns:
            df["id5"] = pd.to_datetime(df["id5"])

    print("Numerical, OHE, and Datetime conversions complete.")

    # --- FIX: Explicitly handle new numeric features for memory efficiency ---
    new_numeric_cols = [
        'offer_duration_days', 'offer_start_dayofweek', 'redemption_frequency', 'discount_rate', 'is_industry_match', 'days_since_offer_start',
        'days_until_offer_end', 'offer_name_len', 'offer_body_words', 'has_keyword_cashback',
        'has_keyword_points', 'has_keyword_discount', 'has_keyword_spend_x',
        'time_since_last_impression_seconds', 'time_since_last_click_seconds',
        'customer_total_impressions_before', 'customer_brand_impressions_before',
        'customer_total_clicks_before', 'customer_brand_clicks_before', 'customer_ctr_before',
        'customer_brand_ctr_before', 'customer_avg_trans_amount', 'customer_total_spend',
        'customer_num_transactions', 'customer_num_unique_products', 'days_since_last_transaction',
        'customer_total_refund_amount', 'customer_num_refunds', 'customer_refund_rate'
    ]
    print(f"Applying type conversion to {len(new_numeric_cols)} newly engineered numeric features...")
    for df in [train_df, valid_df, test_df]:
        for col in new_numeric_cols:
            if col in df.columns:
                df[col] = df[col].astype(np.float32)

    print(f"Applying Label Encoding to {len(categorical_cols)} categorical features...")
    for col in categorical_cols:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna("Missing")
            valid_df[col] = valid_df[col].fillna("Missing")
            test_df[col] = test_df[col].fillna("Missing")

            learned_categories = train_df[col].astype("category").dtype

            train_df[col] = train_df[col].astype(learned_categories)
            valid_df[col] = valid_df[col].astype(learned_categories)
            test_df[col] = test_df[col].astype(learned_categories)

            train_df[col] = train_df[col].cat.codes
            valid_df[col] = valid_df[col].cat.codes
            test_df[col] = test_df[col].cat.codes

    for df in [train_df, valid_df]:
        if label_col in df.columns:
            df[label_col] = df[label_col].astype(np.int8)

    print("All preprocessing is complete.")
    return train_df, valid_df, test_df


def run_stage_2_preprocessing():
    DATA_DICT_PATH = "data_dictionary.csv"
    INTER_DIR = "inter"
    print("\n**Script 1: Unified Cleaning and Preprocessing\n")

    train_df = pd.read_parquet(f"{INTER_DIR}/train_0.parquet")
    test_df = pd.read_parquet(f"{INTER_DIR}/test_0.parquet")
    valid_df = pd.read_parquet(f"{INTER_DIR}/valid_0.parquet")

    processed_train, processed_valid, processed_test = preprocess_data(
        train_df, valid_df, test_df, DATA_DICT_PATH
    )

    processed_train.to_parquet(f"{INTER_DIR}/train_1.parquet")
    processed_test.to_parquet(f"{INTER_DIR}/test_1.parquet")
    processed_valid.to_parquet(f"{INTER_DIR}/valid_1.parquet")
    print("All files saved successfully.")

    del train_df, test_df, valid_df, processed_train, processed_valid, processed_test
    gc.collect()

if __name__ == "__main__":
    # Stage 2: Clean and preprocess the split datasets
    run_stage_2_preprocessing()


**Script 1: Unified Cleaning and Preprocessing


--- Step 1: Removing 100% Empty Features ---
Found 4 empty columns to drop.

--- Step 2: Applying Type Conversions and Encoding ---
Numerical, OHE, and Datetime conversions complete.
Applying type conversion to 29 newly engineered numeric features...
Applying Label Encoding to 13 categorical features...
All preprocessing is complete.
All files saved successfully.


In [4]:
import pandas as pd
import warnings
import numpy as np
import os
import gc
import shutil
import uuid
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import json
import matplotlib
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize_scalar
import torch.optim as optim
import optuna
from scipy.fft import rfft
from scipy.stats import entropy
import scipy.stats

# Force Matplotlib to use a non-GUI backend
matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

# --- Configuration ---
SKIP_TRAINING = False
DATA_DICT_PATH = "data_dictionary.csv"
INPUT_DIR = "inter"
OUTPUT_DIR = "inter"
TEMP_SPLIT_DIR = os.path.join(OUTPUT_DIR, "temp_splits")
TEMP_PARTS_DIR = os.path.join(OUTPUT_DIR, "temp_parts")
TEMP_COMBINED_DIR = os.path.join(OUTPUT_DIR, "temp_combined")
N_CORES = cpu_count()

# --- Helper Functions ---
def downcast_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcasts float64 columns to the more memory-efficient float32 type.
    """
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype(np.float32)
    return df

# --- Worker for Phase 1 - Parallel Splitting ---
def split_customer_data_worker(customer_id_chunk, main_df, output_dir):
    """
    Takes a chunk of customer IDs, filters the main DataFrame for those customers,
    and saves each customer's data to a separate file.
    """
    try:
        sub_df = main_df[main_df['id2'].isin(customer_id_chunk)]
        for customer_id, group_df in sub_df.groupby('id2'):
            output_file = os.path.join(output_dir, f"customer_{customer_id}.parquet")
            group_df.to_parquet(output_file)
        return True
    except Exception as e:
        print(f"Error in splitting worker for chunk starting with {customer_id_chunk[0]}: {e}")
        return False

# --- Worker for Phase 2 - Parallel Feature Engineering ---
def process_customer_file(customer_id, input_dir, output_dir, numerical_cols):
    """
    Processes a SINGLE customer's data by reading it from a pre-split file.
    """
    try:
        input_path = os.path.join(input_dir, f"customer_{customer_id}.parquet")
        group_df = pd.read_parquet(input_path)
        
        # --- DEFINITIVE FIX V3: Robust Cleaning & Defensive Programming ---
        # 1. Defensively clean any object columns that might contain lists or mixed types.
        def clean_mixed_type(x):
            if isinstance(x, (list, tuple, np.ndarray)):
                return x[0] if len(x) > 0 else np.nan
            return x

        for col in group_df.columns:
            if group_df[col].dtype == 'object':
                # This .apply is only used on object columns to avoid performance hits
                group_df[col] = group_df[col].apply(clean_mixed_type)
        
        # 2. Force all feature columns to numeric, coercing any post-cleaning errors.
        f_cols = [c for c in group_df.columns if c.startswith('f')]
        if f_cols:
            group_df[f_cols] = group_df[f_cols].apply(pd.to_numeric, errors='coerce')
        
        # 3. Create a clean copy to work with, preventing SettingWithCopyWarning
        group_df = group_df.copy()

        # The previous, less effective cleaning blocks are now removed.

        # --- Helper functions ---
        def wma(series, period):
            # The series is now guaranteed to be numeric, so direct operations are safe.
            return series.ewm(alpha=2 / (period + 1), adjust=False).mean()
        
        def hma(series, period):
            if period <= 1: return series
            period_sqrt = int(np.sqrt(period)); period_sqrt = 1 if period_sqrt < 1 else period_sqrt
            wma1 = 2 * wma(series, period // 2); wma2 = wma(series, period)
            return wma(wma1 - wma2, period_sqrt)
        
        def safe_shannon_entropy(series):
            if len(series) < 15: return np.nan
            
            # The series is now numeric, but may have NaNs from rolling windows.
            # Interpolate and then fill any remaining NaNs at the edges
            interpolated_series = series.interpolate(method='linear').ffill().bfill()
            
            # Final check for validity before FFT
            if interpolated_series.isnull().any() or interpolated_series.nunique() < 2: return np.nan
            
            psd = np.abs(rfft(interpolated_series.values))**2
            if psd.sum() == 0: return np.nan
            psd_prob = psd / psd.sum()
            return scipy.stats.entropy(psd_prob)

        # --- Prerequisite variables with additional safety ---
        # Ensure id4 is properly datetime before calculations
        if group_df['id4'].dtype != 'datetime64[ns]':
            group_df['id4'] = pd.to_datetime(group_df['id4'], errors='coerce')
        
        days_since_last_event = (group_df['id4'].diff().dt.total_seconds() / (3600 * 24)) + 1e-6
        # Additional safety: ensure days_since_last_event is numeric
        days_since_last_event = pd.to_numeric(days_since_last_event, errors='coerce').fillna(1e-6)
        
        is_click = (group_df['f29'].diff() > 0) if 'f29' in group_df.columns else pd.Series([False] * len(group_df), index=group_df.index)
        group_df['is_click_int'] = is_click.astype(int)

        # --- Feature Engineering Blocks ---
        group_df['offer_interaction_count'] = group_df.groupby('id3').cumcount()
        if 'offer_category' in group_df.columns:
            group_df['category_interaction_count'] = group_df.groupby('offer_category').cumcount()
            category_clicks = group_df.groupby('offer_category')['is_click_int'].cumsum().shift(1).fillna(0)
            category_views = group_df.groupby('offer_category').cumcount()
            group_df['customer_category_ctr'] = category_clicks / (category_views + 1)
        if 'session_id' in group_df.columns:
            session_gb = group_df.groupby('session_id')
            group_df['session_offer_count'] = session_gb.cumcount()
            group_df['session_clicks'] = session_gb['is_click_int'].cumsum().shift(1).fillna(0)
            group_df['time_since_session_start_mins'] = (group_df['id4'] - session_gb['id4'].transform('min')).dt.total_seconds() / 60
        
        if 'offer_age_days' in group_df.columns:
            rolling_avg_age = group_df['offer_age_days'].rolling(10, min_periods=1).mean()
            group_df['age_vs_recent_avg'] = group_df['offer_age_days'] - rolling_avg_age
        group_df['time_since_last_event_hours'] = days_since_last_event * 24
        if 'f29' in group_df.columns:
            click_timestamps = group_df['id4'].where(is_click).ffill()
            group_df['time_since_last_click'] = (group_df['id4'] - click_timestamps).dt.total_seconds() / 3600
        
        # === DEFINITIVE FIX 1/3: Add data cleaning to Velocity loop ===
        key_velocity_cols = {'f43': 'balance', 'f77': 'engagement_ratio', 'f59': 'time_spent'}
        for col, name in key_velocity_cols.items():
            if col in group_df.columns:
                # The robust cleaning at the start of the function ensures s_numeric is valid
                s_numeric = group_df[col]
                if s_numeric.isnull().all():
                    group_df[f'{name}_velocity'] = np.nan
                    continue
                
                change = s_numeric.diff()
                velocity = change / days_since_last_event
                
                # --- CORRECTED CODE (per user recommendation) ---
                # Compute 1st and 99th percentiles separately (as scalars, not as a list)
                lower_q = velocity.expanding(min_periods=5).quantile(0.01).ffill()
                upper_q = velocity.expanding(min_periods=5).quantile(0.99).ffill()

                # Clip velocity values to the [lower_q, upper_q] range.
                # .clip() correctly handles NaN bounds by not clipping those values.
                group_df[f'{name}_velocity'] = velocity.clip(lower_q, upper_q)

        # === DEFINITIVE FIX 2/3: Add data cleaning to Rolling Features loop ===
        for col in numerical_cols:
            if col in group_df.columns:
                # EXTRA SAFETY: Double-check for list-like values before any operations
                if group_df[col].dtype == 'object':
                    group_df[col] = group_df[col].apply(
                        lambda x: x[0] if isinstance(x, (list, tuple, np.ndarray)) and len(x) > 0 else (np.nan if isinstance(x, (list, tuple, np.ndarray)) else x)
                    )
                
                s_numeric = pd.to_numeric(group_df[col], errors='coerce')
                if s_numeric.isnull().all():
                    group_df[f'{col}_diff1'] = np.nan
                    for k in [5, 10, 20]:
                        group_df[f'{col}hma{k}'] = np.nan
                    for k in [10, 20]:
                        group_df[f'{col}roll_std{k}'] = np.nan
                    continue

                group_df[f'{col}_diff1'] = s_numeric.diff(1)
                for k in [5, 10, 20]:
                    group_df[f'{col}hma{k}'] = hma(s_numeric, k)
                for k in [10, 20]:
                    group_df[f'{col}roll_std{k}'] = s_numeric.rolling(k, min_periods=3).std()

        # === DEFINITIVE FIX 3/3: Add data cleaning to Time-Series Features loop ===
        key_ts_cols = ['f218', 'f219', 'f220', 'f102']
        for col in key_ts_cols:
            if col in group_df.columns:
                # EXTRA SAFETY: Double-check for list-like values before any operations
                if group_df[col].dtype == 'object':
                    group_df[col] = group_df[col].apply(
                        lambda x: x[0] if isinstance(x, (list, tuple, np.ndarray)) and len(x) > 0 else (np.nan if isinstance(x, (list, tuple, np.ndarray)) else x)
                    )
                
                s_numeric = pd.to_numeric(group_df[col], errors='coerce')
                if s_numeric.isnull().all():
                    group_df[f'{col}_stability_5'] = np.nan
                    group_df[f'{col}_lumpiness_5'] = np.nan
                    group_df[f'{col}_rolling_shannon_entropy'] = np.nan
                    continue

                rm = s_numeric.rolling(5, min_periods=1).mean()
                rv = s_numeric.rolling(5, min_periods=1).var()
                group_df[f'{col}_stability_5'] = rm.rolling(5, min_periods=1).var()
                group_df[f'{col}_lumpiness_5'] = rv.rolling(5, min_periods=1).var()
                group_df[f'{col}_rolling_shannon_entropy'] = s_numeric.rolling(20, min_periods=15).apply(safe_shannon_entropy, raw=False)

        group_df['time_since_last_seen_this_offer'] = group_df.groupby('id3')['id4'].diff().dt.total_seconds() / 3600
        if 'offer_category' in group_df.columns: group_df['time_since_last_seen_this_category'] = group_df.groupby('offer_category')['id4'].diff().dt.total_seconds() / 3600
        
        group_df.drop(columns=['is_click_int'], inplace=True, errors='ignore')
        group_df = downcast_dtypes(group_df)
        output_path = os.path.join(output_dir, f"part_{customer_id}.parquet")
        group_df.to_parquet(output_path)
        return True
    except Exception as e:
        # --- ENHANCED ERROR LOGGING (As per user request) ---
        print(f"\n--- CRITICAL ERROR: Processing failed for customer ID: {customer_id} ---")
        error_type = type(e).__name__
        print(f"    Error Type: {error_type}")
        print(f"    Error Message: {e}")
        # Try to load the data again just for logging purposes
        try:
            failing_df = pd.read_parquet(os.path.join(input_dir, f"customer_{customer_id}.parquet"))
            print(f"    Data shape for this customer: {failing_df.shape}")
            print("    Sample of failing customer data:")
            print(failing_df.head(5).to_string())
            print("    Data types of failing customer data:")
            print(failing_df.info())
        except Exception as log_e:
            print(f"    Could not reload data for logging: {log_e}")
        print("    Full Traceback:")
        traceback.print_exc()
        print(f"--- END ERROR LOG FOR CUSTOMER {customer_id} ---\n")
        return False

# --- Worker for Phase 3 - Parallel Assembly ---
def combine_parts_worker(file_chunk, master_schema, output_dir):
    """
    Reads a chunk of part files, concatenates them, and saves a single larger,
    schema-consistent file by adding missing columns and casting to the master schema.
    """
    try:
        df_list = [pd.read_parquet(f) for f in file_chunk]
        chunk_df = pd.concat(df_list, ignore_index=True)
        
        current_cols = set(chunk_df.columns)
        for field in master_schema:
            if field.name not in current_cols:
                chunk_df[field.name] = pd.Series(dtype=field.type.to_pandas_dtype())

        chunk_df = chunk_df[master_schema.names]
        
        table = pa.Table.from_pandas(chunk_df, schema=master_schema, preserve_index=False)
        
        output_path = os.path.join(output_dir, f"combined_{uuid.uuid4()}.parquet")
        pq.write_table(table, output_path)
        return True
    except Exception as e:
        print(f"Error in combining worker: {e}")
        return False

# --- Main Feature Engineering Function ---
def engineer_features_final(
    df: pd.DataFrame, 
    output_path: str,
    data_dict_path: str, 
    global_start_time=None,
    offer_profile_features=None, # Removed customer_profile_features
    schema_dict=None,
    is_test_run=False
):
    print(f"\nProcessing DataFrame with shape: {df.shape} to create file: {output_path}")
    print(f"Using {N_CORES} cores.")
    
    for temp_dir in [TEMP_SPLIT_DIR, TEMP_PARTS_DIR, TEMP_COMBINED_DIR]:
        if os.path.exists(temp_dir): shutil.rmtree(temp_dir)
        os.makedirs(temp_dir)
        print(f"  -> Created temporary directory: {temp_dir}")

    data_dict = pd.read_csv(data_dict_path)
    type_map = {row["masked_column"]: row["Type"].strip() for _, row in data_dict.iterrows()}
    numerical_cols = [col for col in df.columns if col.startswith("f") and type_map.get(col) == "Numerical"]
    if 'f218' not in numerical_cols: numerical_cols.append('f218')
    
    print("Step 1/5: Engineering fast, vectorized features...")
    df["id4"] = pd.to_datetime(df["id4"])
    df = df.sort_values(by=['id2', 'id4'])
    time_diff_mins = df.groupby('id2')['id4'].diff().dt.total_seconds().div(60)
    session_break = (time_diff_mins > 30).cumsum()
    df['session_id'] = df['id2'].astype(str) + '_' + session_break.astype(str)
    df.rename(columns={'f223': 'offer_age_days', 'f224': 'offer_time_to_expiry_days'}, inplace=True)
    df['customer_account_age_days'] = (df['id4'] - df.groupby('id2')['id4'].transform('min')).dt.total_seconds() / (3600 * 24)
    if global_start_time is None: global_start_time = df['id4'].min()
    df['time_since_dataset_start_days'] = (df['id4'] - global_start_time).dt.total_seconds() / (3600 * 24)
    df['day_sin'] = np.sin(2 * np.pi * df['id4'].dt.day / 31); df['day_cos'] = np.cos(2 * np.pi * df['id4'].dt.day / 31)
    category_cols = [f'f{i}' for i in range(226, 233)]; existing_cat_cols = [c for c in category_cols if c in df.columns]
    if existing_cat_cols: df['offer_category'] = df[existing_cat_cols].idxmax(axis=1)
    for col in numerical_cols:
        if col in df.columns: df[col] = np.floor(df[col] * 100) / 100
    df['time_of_day_bin'] = pd.cut(df['id4'].dt.hour, bins=[-1, 5, 11, 17, 23], labels=[0, 1, 2, 3]).astype(np.int8)
    df['is_weekend'] = (df['id4'].dt.weekday >= 5).astype(np.int8)
    df['month_sin'] = np.sin(2 * np.pi * df['id4'].dt.month / 12); df['month_cos'] = np.cos(2 * np.pi * df['id4'].dt.month / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['id4'].dt.dayofweek / 7); df['dayofweek_cos'] = np.cos(2 * np.pi * df['id4'].dt.dayofweek / 7)
    df['hour_sin'] = np.sin(2 * np.pi * df['id4'].dt.hour / 24); df['hour_cos'] = np.cos(2 * np.pi * df['id4'].dt.hour / 24)
    is_holiday = (df['id4'].dt.is_month_end & (df['id4'].dt.month == 12)) | (df['id4'].dt.is_month_start & (df['id4'].dt.month == 1))
    df['is_holiday_week'] = is_holiday.astype(np.int8)
    df['is_payday_week'] = ((df['id4'].dt.day >= 25) | (df['id4'].dt.day <= 5)).astype(np.int8)

    # === BLOCK REMOVED: The leaky global customer profiles were calculated here ===
    # This entire block was removed to prevent lookahead bias.
    # The new dynamic profiles are calculated below.
    
    # --- Offer Profiles (Calculated once on training data, then reused) ---
    if offer_profile_features is None:
        print("Step 2/5: Computing offer profiles...")
        base_offer_profiles = df.groupby('id3').agg(
            offer_popularity=('id1', 'count'), 
            offer_customer_reach=('id2', 'nunique')
        )
        monthly_counts = df.groupby([df['id4'].dt.to_period('M'), 'id3'])['id1'].count().unstack('id3').fillna(0)
        if len(monthly_counts) >= 6:
            growth_rate = (monthly_counts.iloc[-3:].mean() - monthly_counts.iloc[-6:-3].mean()) / (monthly_counts.iloc[-6:-3].mean() + 1e-6)
            base_offer_profiles['offer_growth_rate'] = growth_rate
        else:
            base_offer_profiles['offer_growth_rate'] = 0
        base_offer_profiles['offer_lifecycle_stage'] = pd.cut(base_offer_profiles['offer_growth_rate'], bins=[-np.inf, 0, 0.2, np.inf], labels=[0, 1, 2]).astype(np.int8)
        offer_profile_features = base_offer_profiles

    # --- Leakage-Free Historical & Dynamic Features ---
    print("Step 2.5/5: Engineering leakage-free historical features...")
    
    # Map the non-leaky offer profiles
    for col in tqdm(offer_profile_features.columns, desc="  -> Mapping offer profiles"): 
        df[col] = df['id3'].map(offer_profile_features[col])
    
    # --- FIX for cumsum TypeError on 'y' column ---
    # The 'y' column in the test set is a placeholder, so this calculation is not meaningful for it.
    # We also add a defensive type conversion to prevent dtype errors.
    if 'y' in df.columns:
        df['y'] = pd.to_numeric(df['y'], errors='coerce')

    if not is_test_run:
        # Calculate historical CTR (leakage-free) for train/valid sets
        offer_group = df.groupby('id3')
        historical_clicks = offer_group['y'].cumsum().shift(1)
        historical_views = offer_group.cumcount()
        df['offer_historical_ctr_fixed'] = (historical_clicks / (historical_views + 1)).fillna(0)
    else:
        # For the test set, this feature doesn't make sense, so create a placeholder.
        df['offer_historical_ctr_fixed'] = 0
    
    # Calculate popularity vs customer's historical norm (leakage-free)
    cust_popularity_avg = df.groupby('id2')['offer_popularity'].expanding().mean().shift(1)
    cust_popularity_avg = cust_popularity_avg.reset_index(level=0, drop=True)
    df['popularity_vs_customer_norm'] = df['offer_popularity'] - cust_popularity_avg.fillna(df['offer_popularity'])
    
    # === NEW LEAKAGE-FREE BLOCK: Dynamic Customer Profiles ===
    print("Step 2.7/5: Engineering dynamic customer profiles for all features...")
    
    # Get all numerical and categorical columns for dynamic profiling
    all_profile_cols = numerical_cols + [col for col in df.columns if col.startswith('cust_profile_') and 'nunique' in col]

    for col in tqdm(all_profile_cols, desc="  -> Calculating dynamic profiles"):
        if col in df.columns:
            customer_group = df.groupby('id2')[col]
            
            # Calculate expanding aggregates, shifting to prevent leakage from the current event
            expanding_mean = customer_group.expanding().mean().shift(1)
            expanding_std = customer_group.expanding().std().shift(1)
            expanding_max = customer_group.expanding().max().shift(1)
            expanding_min = customer_group.expanding().min().shift(1)
            
            # Reset index to align with the main dataframe
            df[f'dynamic_{col}_mean'] = expanding_mean.reset_index(level=0, drop=True)
            df[f'dynamic_{col}_std'] = expanding_std.reset_index(level=0, drop=True)
            df[f'dynamic_{col}_max'] = expanding_max.reset_index(level=0, drop=True)
            df[f'dynamic_{col}_min'] = expanding_min.reset_index(level=0, drop=True)

    # Fill NaNs that result from the shift (i.e., the first event for each customer)
    fill_cols = [c for c in df.columns if c.startswith('dynamic_')]
    df[fill_cols] = df[fill_cols].fillna(0)
    # === END OF NEW BLOCK ===

    # --- PHASE 1: PARALLEL SPLITTING ---
    print(f"\nStep 3/5: Pre-splitting data for {df['id2'].nunique()} customers in parallel...")
    customer_ids = df['id2'].unique()
    id_chunks = np.array_split(customer_ids, N_CORES * 4)
    split_func = partial(split_customer_data_worker, main_df=df, output_dir=TEMP_SPLIT_DIR)
    with Pool(N_CORES) as p:
        for _ in tqdm(p.imap_unordered(split_func, id_chunks), total=len(id_chunks), desc=" -> Parallel Splitting Progress"):
            pass
    del df; gc.collect()

    # --- PHASE 2: PARALLEL FEATURE ENGINEERING ---
    # --- DEFINITIVE FIX for FileNotFoundError: Process only the files that were actually created. ---
    # This prevents race conditions or silent errors in the splitting phase from crashing the processing phase.
    print(f"Step 3.5/5: Discovering customer files from disk to process...")
    try:
        customer_files = [f for f in os.listdir(TEMP_SPLIT_DIR) if f.startswith('customer_') and f.endswith('.parquet')]
        customer_ids_from_files = [f.split('_')[1].split('.')[0] for f in customer_files]
        print(f"   -> Found {len(customer_ids_from_files)} customer files to process.")
    except FileNotFoundError:
        print(f"   -> ⚠️  Warning: Temporary split directory '{TEMP_SPLIT_DIR}' not found. Skipping feature engineering.")
        customer_ids_from_files = []

    if customer_ids_from_files:
        print(f"Processing all {len(customer_ids_from_files)} customer files in parallel...")
        processing_func = partial(process_customer_file, input_dir=TEMP_SPLIT_DIR, output_dir=TEMP_PARTS_DIR, numerical_cols=numerical_cols)
        with Pool(N_CORES) as p:
            # Pass the list of IDs extracted from the actual files on disk
            for _ in tqdm(p.imap_unordered(processing_func, customer_ids_from_files), total=len(customer_ids_from_files), desc=" -> Feature Engineering Progress"):
                pass

    if os.path.exists(TEMP_SPLIT_DIR):
        shutil.rmtree(TEMP_SPLIT_DIR)

    # --- Step 4/5: Robust, Parallel Assembly ---
    print("\nStep 4/5: Combining processed parts from disk...")
    part_files = [os.path.join(TEMP_PARTS_DIR, f) for f in os.listdir(TEMP_PARTS_DIR) if f.endswith('.parquet')]
    if not part_files:
        print("Warning: No temporary part files were created.")
        shutil.rmtree(TEMP_PARTS_DIR)
        return None, offer_profile_features.reset_index(), {}, global_start_time
    
    if schema_dict is None:
        print("  -> No master schema found. Creating a robust union schema from a sample of part files...")
        sample_files = part_files[:min(200, len(part_files))]
        all_schemas = [pq.read_schema(f) for f in sample_files]
        all_fields = {}
        for schema in all_schemas:
            for field in schema:
                if field.name not in all_fields or pa.types.is_null(all_fields[field.name].type):
                     all_fields[field.name] = field
        master_schema_pa = pa.schema(list(all_fields.values()))
        print("  -> Master schema created successfully.")
    else:
        master_schema_pa = pa.schema(schema_dict)
    
    # --- PHASE 3: PARALLEL ASSEMBLY ---
    print(f"Step 4.5/5: Combining {len(part_files)} parts in parallel...")
    file_chunks = np.array_split(part_files, N_CORES * 4)
    combine_func = partial(combine_parts_worker, master_schema=master_schema_pa, output_dir=TEMP_COMBINED_DIR)
    with Pool(N_CORES) as p:
        for _ in tqdm(p.imap_unordered(combine_func, file_chunks), total=len(file_chunks), desc=" -> Combining Chunks Progress"):
            pass
    shutil.rmtree(TEMP_PARTS_DIR)

    # --- Final, fast combination of large chunks ---
    print("\nStep 5/5: Writing final output file...")
    combined_files = [os.path.join(TEMP_COMBINED_DIR, f) for f in os.listdir(TEMP_COMBINED_DIR)]
    write_schema = master_schema_pa
    if is_test_run and 'y' in write_schema.names:
        print(" -> Test run detected. Removing 'y' column from final output schema.")
        write_schema = write_schema.remove(write_schema.get_field_index('y'))
    with pq.ParquetWriter(output_path, schema=write_schema) as writer:
        for part_file in tqdm(combined_files, desc=" -> Writing Final File"):
            table = pq.read_table(part_file)
            if is_test_run and 'y' in table.column_names:
                table = table.drop(['y'])
            writer.write_table(table)
            
    shutil.rmtree(TEMP_COMBINED_DIR)
    
    print(f"Finished creating file: {output_path}")
    final_schema_dict = {field.name: field.type for field in master_schema_pa}
    # Return None for customer profiles as they are now part of the main df
    return None, offer_profile_features.reset_index(), final_schema_dict, global_start_time

# --- Main Execution ---
if __name__ == "__main__":
    print("🚀 ***Script 2: Final Production-Grade Parallel Feature Engineering (Leakage Fixed) ***")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if not SKIP_TRAINING:
        print("\n--- Processing Training Data ---")
        train_df = pd.read_parquet(f"{INPUT_DIR}/train_1.parquet")
        
        _, offer_profile, master_schema, global_start_time = engineer_features_final(
            train_df, output_path=f"{OUTPUT_DIR}/train_2_final.parquet", data_dict_path=DATA_DICT_PATH
        )
        # customer_profile is no longer created or saved
        offer_profile.to_parquet(f"{OUTPUT_DIR}/offer_profile_aggs_final.parquet")
        del train_df; gc.collect()
    else:
        print("\n--- Skipping Training Phase: Loading artifacts from disk... ---")
        offer_profile = pd.read_parquet(f"{OUTPUT_DIR}/offer_profile_aggs_final.parquet")
        train_final_schema = pq.read_schema(f"{OUTPUT_DIR}/train_2_final.parquet")
        master_schema = {field.name: field.type for field in train_final_schema}
        train_pq_file = pq.ParquetFile(f"{OUTPUT_DIR}/train_2_final.parquet")
        if 'id4' in train_pq_file.schema.names:
            id4_col = train_pq_file.read(columns=['id4'])
            global_start_time = id4_col['id4'].min().as_py()
        else:
            global_start_time = pd.Timestamp.now()
        print(" -> Artifacts loaded successfully.")
    
    print("\n--- Processing Validation Data ---")
    valid_df = pd.read_parquet(f"{INPUT_DIR}/valid_1.parquet")
    
    valid_df.reset_index(drop=True, inplace=True)
    if 'y' in valid_df.columns:
        valid_df['y'] = pd.to_numeric(valid_df['y'])
    _, _, _, _ = engineer_features_final(
        valid_df, output_path=f"{OUTPUT_DIR}/valid_2_final.parquet", data_dict_path=DATA_DICT_PATH, 
        global_start_time=global_start_time,
        offer_profile_features=offer_profile.set_index('id3'), 
        schema_dict=master_schema
    )
    del valid_df; gc.collect()
    
    print("\n--- Processing Test Data ---")
    test_file_path = f"{INPUT_DIR}/test_1.parquet"
    if os.path.exists(test_file_path):
        test_df = pd.read_parquet(test_file_path)
        test_df.reset_index(drop=True, inplace=True)
        if 'y' not in test_df.columns: test_df['y'] = -1
        _, _, _, _ = engineer_features_final(
            test_df, output_path=f"{OUTPUT_DIR}/test_2_final.parquet", data_dict_path=DATA_DICT_PATH, 
            global_start_time=global_start_time, 
            offer_profile_features=offer_profile.set_index('id3'), 
            schema_dict=master_schema,
            is_test_run=True
        )
        del test_df, offer_profile
    else:
        print("\n'test_1.parquet' not found. Skipping test set processing.")
    
    print("\n--- All datasets have been feature-engineered and saved successfully. ---")

🚀 ***Script 2: Final Production-Grade Parallel Feature Engineering (Leakage Fixed) ***

--- Processing Training Data ---

Processing DataFrame with shape: (589815, 399) to create file: inter/train_2_final.parquet
Using 30 cores.
  -> Created temporary directory: inter/temp_splits
  -> Created temporary directory: inter/temp_parts
  -> Created temporary directory: inter/temp_combined
Step 1/5: Engineering fast, vectorized features...
Step 2/5: Computing offer profiles...
Step 2.5/5: Engineering leakage-free historical features...


  -> Mapping offer profiles: 100%|██████████| 4/4 [00:00<00:00, 100.08it/s]


Step 2.7/5: Engineering dynamic customer profiles for all features...


  -> Calculating dynamic profiles: 100%|██████████| 267/267 [09:31<00:00,  2.14s/it]



Step 3/5: Pre-splitting data for 38559 customers in parallel...


 -> Parallel Splitting Progress: 100%|██████████| 120/120 [45:43<00:00, 22.86s/it] 


Step 3.5/5: Discovering customer files from disk to process...
   -> Found 38559 customer files to process.
Processing all 38559 customer files in parallel...


 -> Feature Engineering Progress: 100%|██████████| 38559/38559 [1:11:23<00:00,  9.00it/s]



Step 4/5: Combining processed parts from disk...
  -> No master schema found. Creating a robust union schema from a sample of part files...
  -> Master schema created successfully.
Step 4.5/5: Combining 38559 parts in parallel...


 -> Combining Chunks Progress: 100%|██████████| 120/120 [09:56<00:00,  4.97s/it] 



Step 5/5: Writing final output file...


 -> Writing Final File: 100%|██████████| 120/120 [02:57<00:00,  1.48s/it]


Finished creating file: inter/train_2_final.parquet

--- Processing Validation Data ---

Processing DataFrame with shape: (180349, 399) to create file: inter/valid_2_final.parquet
Using 30 cores.
  -> Created temporary directory: inter/temp_splits
  -> Created temporary directory: inter/temp_parts
  -> Created temporary directory: inter/temp_combined
Step 1/5: Engineering fast, vectorized features...
Step 2.5/5: Engineering leakage-free historical features...


  -> Mapping offer profiles: 100%|██████████| 4/4 [00:00<00:00, 273.04it/s]


Step 2.7/5: Engineering dynamic customer profiles for all features...


  -> Calculating dynamic profiles: 100%|██████████| 267/267 [02:07<00:00,  2.10it/s]



Step 3/5: Pre-splitting data for 7991 customers in parallel...


 -> Parallel Splitting Progress: 100%|██████████| 120/120 [12:42<00:00,  6.36s/it]


Step 3.5/5: Discovering customer files from disk to process...
   -> Found 7991 customer files to process.
Processing all 7991 customer files in parallel...


 -> Feature Engineering Progress: 100%|██████████| 7991/7991 [14:45<00:00,  9.03it/s]



Step 4/5: Combining processed parts from disk...
Step 4.5/5: Combining 7991 parts in parallel...


 -> Combining Chunks Progress: 100%|██████████| 120/120 [01:48<00:00,  1.10it/s]



Step 5/5: Writing final output file...


 -> Writing Final File: 100%|██████████| 120/120 [02:04<00:00,  1.04s/it]


Finished creating file: inter/valid_2_final.parquet

--- Processing Test Data ---

Processing DataFrame with shape: (337714, 399) to create file: inter/test_2_final.parquet
Using 30 cores.
  -> Created temporary directory: inter/temp_splits
  -> Created temporary directory: inter/temp_parts
  -> Created temporary directory: inter/temp_combined
Step 1/5: Engineering fast, vectorized features...
Step 2.5/5: Engineering leakage-free historical features...


  -> Mapping offer profiles: 100%|██████████| 4/4 [00:00<00:00, 158.86it/s]


Step 2.7/5: Engineering dynamic customer profiles for all features...


  -> Calculating dynamic profiles: 100%|██████████| 267/267 [04:45<00:00,  1.07s/it]



Step 3/5: Pre-splitting data for 18956 customers in parallel...


 -> Parallel Splitting Progress: 100%|██████████| 120/120 [26:02<00:00, 13.02s/it]


Step 3.5/5: Discovering customer files from disk to process...
   -> Found 18956 customer files to process.
Processing all 18956 customer files in parallel...


 -> Feature Engineering Progress: 100%|██████████| 18956/18956 [35:12<00:00,  8.97it/s] 



Step 4/5: Combining processed parts from disk...
Step 4.5/5: Combining 18956 parts in parallel...


 -> Combining Chunks Progress: 100%|██████████| 120/120 [04:20<00:00,  2.17s/it]



Step 5/5: Writing final output file...
 -> Test run detected. Removing 'y' column from final output schema.


 -> Writing Final File: 100%|██████████| 120/120 [02:25<00:00,  1.21s/it]


Finished creating file: inter/test_2_final.parquet

--- All datasets have been feature-engineered and saved successfully. ---


In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedGroupKFold
import os
import gc
from tqdm import tqdm

# --- Configuration ---
INPUT_DIR = "inter"
OUTPUT_DIR = "inter"
DATA_DICT_PATH = "data_dictionary.csv"
N_SPLITS = 5
N_FEATURES_TO_SELECT = 100  # Select top 100 features based on gain
SAMPLE_FRAC = 0.25  # Use 25% of customers for feature selection
RANDOM_STATE = 42

# --- Main Execution ---
if __name__ == "__main__":
    print("🚀 *** Script 3 (Final): OOF Generation with LambdaMART ***")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # 1. Load Data
    # =================================================================
    print("\n--- Step 1: Loading feature-engineered data ---")
    try:
        train_df = pd.read_parquet(f"{INPUT_DIR}/train_2_final.parquet")
        valid_df = pd.read_parquet(f"{INPUT_DIR}/valid_2_final.parquet")
        test_df = pd.read_parquet(f"{INPUT_DIR}/test_2_final.parquet")
        print(f"   -> Train shape: {train_df.shape}")
    except FileNotFoundError as e:
        print(f"❌ ERROR: Could not find input files. Details: {e}")
        exit()

    # 2. Preliminary Feature Selection (using 'gain' importance)
    # =================================================================
    print(
        f"\n--- Step 2: Performing feature selection on {SAMPLE_FRAC * 100}% of customers ---"
    )

    target_col = "y"
    id_cols = ["id1", "id2", "id3", "id4", "id5"]

    data_dict = pd.read_csv(DATA_DICT_PATH)
    all_cats = data_dict[data_dict["Type"].str.strip() == "Categorical"][
        "masked_column"
    ].tolist()
    new_cat_features = [
        "offer_category",
        "session_id",
        "time_of_day_bin",
        "is_weekend",
        "is_holiday_week",
        "is_payday_week",
        "offer_lifecycle_stage",
    ]

    all_feature_cols = [
        col for col in train_df.columns if col not in id_cols + [target_col]
    ]
    categorical_features = [
        col for col in all_feature_cols if col in all_cats + new_cat_features
    ]

    customer_ids = train_df["id2"].unique()
    sample_customer_ids = (
        pd.Series(customer_ids)
        .sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
        .values
    )
    train_sample_df = train_df[train_df["id2"].isin(sample_customer_ids)].copy()

    for col in tqdm(categorical_features, desc="   -> Converting categorical features to category dtype"):
        if col in train_sample_df.columns:
            train_sample_df[col] = train_sample_df[col].astype("category")

    fs_model = lgb.LGBMClassifier(
        objective="binary", random_state=RANDOM_STATE, n_jobs=-1
    )
    fs_model.fit(
        train_sample_df[all_feature_cols],
        train_sample_df[target_col],
        categorical_feature=categorical_features,
    )

    # CRITICAL FIX: Use 'gain' for feature importance, not the default 'split'
    importances = pd.DataFrame(
        {
            "feature": all_feature_cols,
            "importance_gain": fs_model.booster_.feature_importance(
                importance_type="gain"
            ),
        }
    ).sort_values("importance_gain", ascending=False)

    top_features = importances.head(N_FEATURES_TO_SELECT)["feature"].tolist()
    top_categorical_features = [f for f in top_features if f in categorical_features]

    print(f"   -> Selected {len(top_features)} features based on gain importance.")
    del fs_model, train_sample_df, importances
    gc.collect()

    # 3. Prepare DataFrames for LambdaMART
    # =================================================================
    print("\n--- Step 3: Preparing data with selected features for ranking ---")

    # Sort all dataframes by customer and time to correctly calculate group sizes
    train_df = train_df.sort_values(by=["id2", "id5", "id4"])
    valid_df = valid_df.sort_values(by=["id2", "id5", "id4"])
    test_df = test_df.sort_values(by=["id2", "id5", "id4"])

    X_train = train_df[top_features].copy()
    y_train = train_df[target_col].copy()
    groups = train_df["id2"].copy()

    X_valid = valid_df[top_features].copy()
    X_test = test_df[top_features].copy()

    for col in tqdm(
        top_categorical_features, desc="   -> Converting categorical dtypes"
    ):
        X_train[col] = X_train[col].astype("category")
        X_valid[col] = X_valid[col].astype("category")
        X_test[col] = X_test[col].astype("category")

    # Calculate group sizes for the ranking objective
    train_group_sizes = train_df.groupby(["id2", "id5"]).size().to_numpy()

    # 4. OOF Generation with LambdaMART and StratifiedGroupKFold
    # =================================================================
    print("\n--- Step 4: Generating OOF predictions with LambdaMART ---")

    lgbm_params = {
        "objective": "lambdarank",  # <-- Use the ranking objective
        "metric": "map",  # <-- Use the built-in, fast MAP metric
        "eval_at": [7],  # <-- Tell the MAP metric to evaluate at k=7
        "boosting_type": "gbdt",
        "n_estimators": 2000,
        "learning_rate": 0.02,
        "num_leaves": 40,
        "max_depth": 7,
        "seed": RANDOM_STATE,
        "n_jobs": -1,
        "colsample_bytree": 0.7,
        "subsample": 0.7,
        "reg_alpha": 0.1,
        "reg_lambda": 0.1,
        "verbose": -1,
    }

    sgkf = StratifiedGroupKFold(
        n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE
    )

    oof_preds = np.zeros(len(train_df))
    valid_preds_ensemble = np.zeros(len(valid_df))
    test_preds_ensemble = np.zeros(len(test_df))

    for fold, (train_idx, val_idx) in enumerate(
        sgkf.split(X_train, y_train, groups=groups)
    ):
        print(f"\n  -> Processing Fold {fold + 1}/{N_SPLITS}...")
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]

        # Get group sizes for the current fold's train and validation sets
        train_fold_groups = (
            train_df.iloc[train_idx].groupby(["id2", "id5"]).size().to_numpy()
        )
        val_fold_groups = (
            train_df.iloc[val_idx].groupby(["id2", "id5"]).size().to_numpy()
        )

        model = lgb.LGBMRanker(**lgbm_params)
        model.fit(
            X_train_fold,
            y_train_fold,
            group=train_fold_groups,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_group=[val_fold_groups],
            eval_metric="map",
            callbacks=[
                lgb.log_evaluation(period=200),
                lgb.early_stopping(100, verbose=True),
            ],
        )

        # Generate OOF predictions (it's a ranking score, not a probability)
        fold_preds = model.predict(X_val_fold)
        oof_preds[val_idx] = fold_preds

        # Add to ensemble predictions for validation and test sets
        valid_preds_ensemble += model.predict(X_valid) / N_SPLITS
        test_preds_ensemble += model.predict(X_test) / N_SPLITS

    train_df["oof_lgbm_prediction"] = oof_preds
    valid_df["oof_lgbm_prediction"] = valid_preds_ensemble
    test_df["oof_lgbm_prediction"] = test_preds_ensemble

    print("\n   -> OOF and Ensemble predictions created successfully.")

    # 5. Save the final datasets
    # =================================================================
    print("\n--- Step 5: Saving new datasets with OOF/Ensemble features ---")
    train_df.to_parquet(f"{OUTPUT_DIR}/train_3_oof.parquet", index=False)
    valid_df.to_parquet(f"{OUTPUT_DIR}/valid_3_oof.parquet", index=False)
    test_df.to_parquet(f"{OUTPUT_DIR}/test_3_oof.parquet", index=False)

    print(f"   -> Saved 'train_3_oof.parquet' with shape {train_df.shape}")
    print(f"   -> Saved 'valid_3_oof.parquet' with shape {valid_df.shape}")
    print(f"   -> Saved 'test_3_oof.parquet' with shape {test_df.shape}")

    del train_df, valid_df, test_df, X_train, y_train, X_valid, X_test, groups
    gc.collect()

    print("\n✅ --- Final OOF Feature Generation Complete --- ✅")

🚀 *** Script 3 (Final): OOF Generation with LambdaMART ***

--- Step 1: Loading feature-engineered data ---
   -> Train shape: (589815, 3098)

--- Step 2: Performing feature selection on 25.0% of customers ---


   -> Converting categorical features to category dtype: 100%|██████████| 18/18 [00:00<00:00, 386.10it/s]


[LightGBM] [Info] Number of positive: 6866, number of negative: 138538
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.733138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 413003
[LightGBM] [Info] Number of data points in the train set: 145404, number of used features: 2835
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047220 -> initscore=-3.004563
[LightGBM] [Info] Start training from score -3.004563
   -> Selected 100 features based on gain importance.

--- Step 3: Preparing data with selected features for ranking ---


   -> Converting categorical dtypes: 100%|██████████| 1/1 [00:00<00:00,  8.21it/s]



--- Step 4: Generating OOF predictions with LambdaMART ---

  -> Processing Fold 1/5...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's map@7: 0.962044
[400]	valid_0's map@7: 0.963286
Early stopping, best iteration is:
[412]	valid_0's map@7: 0.963344





  -> Processing Fold 2/5...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's map@7: 0.959497
[400]	valid_0's map@7: 0.960044
[600]	valid_0's map@7: 0.960701
Early stopping, best iteration is:
[657]	valid_0's map@7: 0.961051





  -> Processing Fold 3/5...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's map@7: 0.964553
[400]	valid_0's map@7: 0.965164
Early stopping, best iteration is:
[378]	valid_0's map@7: 0.965318





  -> Processing Fold 4/5...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's map@7: 0.961928
[400]	valid_0's map@7: 0.962811
[600]	valid_0's map@7: 0.963221
[800]	valid_0's map@7: 0.963531
Early stopping, best iteration is:
[738]	valid_0's map@7: 0.963667





  -> Processing Fold 5/5...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's map@7: 0.962848
[400]	valid_0's map@7: 0.96378
[600]	valid_0's map@7: 0.964203
[800]	valid_0's map@7: 0.964267
[1000]	valid_0's map@7: 0.964374
Early stopping, best iteration is:
[1005]	valid_0's map@7: 0.964377





   -> OOF and Ensemble predictions created successfully.

--- Step 5: Saving new datasets with OOF/Ensemble features ---
   -> Saved 'train_3_oof.parquet' with shape (589815, 3099)
   -> Saved 'valid_3_oof.parquet' with shape (180349, 3099)
   -> Saved 'test_3_oof.parquet' with shape (337714, 3098)

✅ --- Final OOF Feature Generation Complete --- ✅


In [1]:
import pandas as pd
import warnings
import numpy as np
import os
import gc
import shutil
import uuid
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import pyarrow.parquet as pq
import pyarrow as pa
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import json
import matplotlib
import sys
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from scipy.optimize import minimize_scalar
import torch.optim as optim
import optuna
from scipy.fft import rfft
from scipy.stats import entropy
import scipy.stats

# Force Matplotlib to use a non-GUI backend
matplotlib.use("Agg")
import matplotlib.pyplot as plt
# --- SCRIPT 4: FEATURE SELECTION WITH PERMUTATION IMPORTANCE ---
# This script trains a preliminary model to find the most important features.
# It uses the GPU-accelerated permutation importance method for a robust measure
# of feature impact and saves the top 1000 features for downstream models.

# --- Helper Function for GPU-Accelerated Permutation Importance ---
def calculate_permutation_importance_gpu(model_path, valid_df_pd, feature_cols, categorical_features, target_col, n_repeats=5):
    """
    Calculates and plots feature importance using GPU-accelerated permutation importance.
    Returns a DataFrame of features and their importance scores.
    """
    print("\n" + "="*60)
    print("🚀 *** Calculating GPU-Accelerated Permutation Importance *** 🚀")
    print("="*60)

    # 1. Import RAPIDS libraries
    try:
        import cudf
        import cupy as cp
        import treelite
        from cuml import ForestInference
        from sklearn.metrics import roc_auc_score
    except ImportError as e:
        print(f"❌ Error: Missing RAPIDS libraries. Please ensure cuml, cudf, cupy, and treelite are installed. Details: {e}")
        return pd.DataFrame()

    # 2. Load model and data to GPU
    print("\n1/5: Loading model and validation data to GPU...")
    tl_model = treelite.Model.load(model_path, model_format='lightgbm')
    
    # --- FIX for object dtype error ---
    # Create a purely numerical dataframe for GPU processing by converting categories to codes.
    X_valid_pd_numeric = valid_df_pd[feature_cols].copy()
    print("   -> Converting categorical features to integer codes for GPU compatibility...")
    for col in categorical_features:
        if col in X_valid_pd_numeric.columns:
            # The dtype is already 'category' from the main script block
            X_valid_pd_numeric[col] = X_valid_pd_numeric[col].cat.codes
            
    X_valid_gdf = cudf.from_pandas(X_valid_pd_numeric)
    # --- END FIX ---
    
    y_valid_gseries = cudf.from_pandas(valid_df_pd[target_col])
    print("   -> Filling NaNs with 0 for GPU inference compatibility...")
    X_valid_gdf = X_valid_gdf.fillna(0) # FIL requires non-null data

    # 3. Create FIL model for accelerated inference
    print("\n2/5: Creating Forest Inference (FIL) model...")
    fil_model = ForestInference.load_from_treelite_model(tl_model, is_classifier=True)

    # 4. Calculate baseline score
    print("\n3/5: Calculating baseline validation AUC score...")
    baseline_preds = fil_model.predict_proba(X_valid_gdf)
    baseline_score = roc_auc_score(y_valid_gseries.to_numpy(), baseline_preds.to_numpy())
    print(f"   -> Baseline AUC: {baseline_score:.6f}")

    # 5. Calculate permutation importance
    print(f"\n4/5: Calculating importances for {len(feature_cols)} features...")
    importances = {}
    for col in tqdm(feature_cols, desc="Permuting Features"):
        permuted_scores = []
        original_col = X_valid_gdf[col].copy()
        for _ in range(n_repeats):
            shuffled_values = cp.random.permutation(original_col.to_cupy())
            X_valid_gdf[col] = shuffled_values
            permuted_preds = fil_model.predict_proba(X_valid_gdf)
            permuted_score = roc_auc_score(y_valid_gseries.to_numpy(), permuted_preds.to_numpy())
            permuted_scores.append(permuted_score)
        X_valid_gdf[col] = original_col # Restore original column
        importance = baseline_score - np.mean(permuted_scores)
        importances[col] = importance

    # 6. Process results
    print("\n5/5: Processing results...")
    importance_df = pd.DataFrame.from_dict(importances, orient='index', columns=['importance'])
    importance_df = importance_df.sort_values(by='importance', ascending=False)
    
    print("✅ Permutation importance calculation complete.")
    return importance_df

# --- Main Execution for Feature Selection ---
if __name__ == "__main__":
    print("🚀 *** Script 4: Feature Selection using Permutation Importance ***")
    
    # --- Configuration ---
    INPUT_DIR = "inter"
    MODEL_DIR = "model"
    DATA_DICT_PATH = "data_dictionary.csv"
    TOP_N_FEATURES = 100
    
    os.makedirs(MODEL_DIR, exist_ok=True)
    
    # 1. Load data
    print("\n--- Step 1: Loading data ---")
    train_df = pd.read_parquet(f"{INPUT_DIR}/train_3_oof.parquet")
    valid_df = pd.read_parquet(f"{INPUT_DIR}/valid_3_oof.parquet")

    # 2. Define features and target
    print("\n--- Step 2: Defining features ---")
    target_col = "y"
    id_cols = ["id1", "id2", "id3", "id4", "id5"]
    feature_cols = [col for col in train_df.columns if col not in id_cols + [target_col]]
    
    data_dict = pd.read_csv(DATA_DICT_PATH)
    all_cats = data_dict[data_dict["Type"].str.strip() == "Categorical"]["masked_column"].tolist()
    new_cat_features = ['offer_category','session_id','time_of_day_bin','is_weekend','is_holiday_week','is_payday_week','offer_lifecycle_stage','brand_id','industry_id']
    categorical_features = [col for col in feature_cols if col in all_cats + new_cat_features]

    for col in tqdm(categorical_features):
        train_df[col] = train_df[col].astype('category')
        valid_df[col] = valid_df[col].astype('category')
        
    X_train = train_df[feature_cols]
    y_train = train_df[target_col]
    X_valid = valid_df[feature_cols]
    y_valid = valid_df[target_col]
    
    # 3. Train a temporary model
    print("\n--- Step 3: Training temporary LightGBM model ---")
    temp_params = {
        'objective': 'binary', 'metric': 'auc', 'boosting_type': 'gbdt',
        'n_estimators': 1000, 'learning_rate': 0.03, 'num_leaves': 40,
        'max_depth': 6, 'seed': 42, 'n_jobs': -1, 'is_unbalance': True,
        'colsample_bytree': 0.7, 'subsample': 0.7, 'verbose': -1
    }
    temp_model = lgb.LGBMClassifier(**temp_params)
    temp_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                   callbacks=[lgb.early_stopping(50, verbose=False)])
    
    temp_model_path = f"{MODEL_DIR}/temp_model_for_importance.txt"
    temp_model.booster_.save_model(temp_model_path)
    print(f"   -> Temporary model saved to '{temp_model_path}'")
    
    del X_train, y_train, X_valid, y_valid, train_df; gc.collect()

    # 4. Calculate permutation importance
    importance_df = calculate_permutation_importance_gpu(
        model_path=temp_model_path,
        valid_df_pd=valid_df, # Pass the full validation dataframe
        feature_cols=feature_cols,
        categorical_features=categorical_features,
        target_col=target_col,
        n_repeats=3 # Use 3-5 repeats for a good balance of stability and speed
    )

    # 5. Select top features and save the list
    if not importance_df.empty:
        # Keep only features with positive importance
        positive_importance_df = importance_df[importance_df['importance'] > 0]
        top_features = positive_importance_df.head(TOP_N_FEATURES).index.tolist()
        
        # The OOF feature is critical, so we ensure it's included
        if 'oof_lgbm_prediction' not in top_features:
            top_features.append('oof_lgbm_prediction')
            
        print(f"\n--- Step 5: Selected top {len(top_features)} features with positive importance ---")
        print(top_features[:15]) # Print a sample
        
        selected_features_path = os.path.join(MODEL_DIR, "top_100_features.json")
        with open(selected_features_path, 'w') as f:
            json.dump(top_features, f, indent=4)
        print(f"\n✅ Top feature list saved to '{selected_features_path}'")
    else:
        print("\n⚠️ Warning: Permutation importance calculation failed. Skipping feature selection.")

    print("\n--- Feature Selection Script Finished ---")

🚀 *** Script 4: Feature Selection using Permutation Importance ***

--- Step 1: Loading data ---

--- Step 2: Defining features ---


100%|██████████| 20/20 [00:00<00:00, 73.78it/s]



--- Step 3: Training temporary LightGBM model ---
   -> Temporary model saved to 'model/temp_model_for_importance.txt'

🚀 *** Calculating GPU-Accelerated Permutation Importance *** 🚀

1/5: Loading model and validation data to GPU...




   -> Converting categorical features to integer codes for GPU compatibility...
   -> Filling NaNs with 0 for GPU inference compatibility...

2/5: Creating Forest Inference (FIL) model...

3/5: Calculating baseline validation AUC score...
   -> Baseline AUC: 0.968592

4/5: Calculating importances for 3093 features...


Permuting Features: 100%|██████████| 3093/3093 [4:18:06<00:00,  5.01s/it]  


5/5: Processing results...
✅ Permutation importance calculation complete.

--- Step 5: Selected top 100 features with positive importance ---
['oof_lgbm_prediction', 'time_since_last_event_hours', 'time_since_last_seen_this_category', 'dynamic_f210_mean', 'time_since_session_start_mins', 'f125hma5', 'f132', 'f366hma20', 'f132hma20', 'f210hma5', 'f366hma10', 'f125hma20', 'f365hma20', 'f366hma5', 'f210hma20']

✅ Top feature list saved to 'model/top_100_features.json'

--- Feature Selection Script Finished ---





In [2]:
# --- THE DEFINITIVE FIX for the "ArrowDtype" error ---
# This block MUST be at the very top of the script, before any other imports.
import sys
if 'dask' in sys.modules:
    del sys.modules['dask']
# ---------------------------------------------------------

import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import os
import gc
import json
import matplotlib

# Force Matplotlib to use a non-GUI backend to prevent errors in certain environments
matplotlib.use("Agg")
import matplotlib.pyplot as plt


# --- Configuration ---
INTER_DIR = "inter"
MODEL_DIR = "model"
DATA_DICT_PATH = "data_dictionary.csv"
# We use the final, most feature-rich datasets for tuning
TRAIN_FILE = "inter/train_3_oof.parquet"
VALID_FILE = "inter/valid_3_oof.parquet"
# Number of optimization trials to run. More is better, but takes longer.
N_TRIALS = 50
RANDOM_STATE = 42 

# --- Helper Function for Metric ---
# We use AUC as the optimization metric because it's faster than MAP@7 and highly correlated.
def calculate_map_at_7(y_true, y_pred_probs, ids_df):
    """Calculates Mean Average Precision at 7 for the Amex competition."""
    
    def ap_at_k(group, k=7):
        """Calculates Average Precision at k for a single group."""
        group = group.sort_values('pred', ascending=False)
        y_true_sorted = group['y'].values
        
        # Truncate at k
        y_true_sorted = y_true_sorted[:k]
        
        relevance = (y_true_sorted == 1)
        if np.sum(relevance) == 0:
            return 0.0
            
        precision_at_i = np.cumsum(relevance) / (np.arange(len(relevance)) + 1)
        ap = np.sum(precision_at_i * relevance) / np.sum(relevance)
        return ap

    # Create a temporary DataFrame for calculation
    calc_df = ids_df[['id2', 'id5']].copy()
    calc_df['y'] = y_true
    calc_df['pred'] = y_pred_probs

    # --- SCORING SCRIPT FIX ---
    # Convert id5 to date to match official scoring logic
    calc_df['id5'] = pd.to_datetime(calc_df['id5']).dt.date
    
    # Calculate AP for each customer-session group
    ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)
    
    # Return the mean of the AP scores
    return ap_scores.mean()

# --- Optuna Objective Function for LambdaMART ---
def objective(trial, X_train, y_train, train_groups, X_valid, y_valid, valid_groups, categorical_features):
    """
    Optuna objective function to tune hyperparameters for a LightGBM LambdaMART model.
    """
    # Define the search space for the hyperparameters
    params = {
        'objective': 'lambdarank',
        'metric': 'map',
        'eval_at': [7],
        'seed': RANDOM_STATE,
        'n_jobs': -1,
        'verbose': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    model = lgb.LGBMRanker(**params)
    
    model.fit(
        X_train, y_train,
        group=train_groups,
        eval_set=[(X_valid, y_valid)],
        eval_group=[valid_groups],
        eval_metric='map',
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    
    # Return the best MAP@7 score found during training
    map_score = model.best_score_['valid_0']['map@7']
    return map_score

# --- Main Execution ---
if __name__ == "__main__":
    os.makedirs(MODEL_DIR, exist_ok=True)
    
    print("🚀 *** Final Model Training with LambdaMART & Optuna ***")

    # 1. Load Data
    print("\n1. Loading final OOF-featured datasets...")
    train_df = pd.read_parquet(TRAIN_FILE)
    valid_df = pd.read_parquet(VALID_FILE)
    
    # 2. Prepare Data for LambdaMART
    print("\n2. Preparing data for ranking...")
    
    # CRITICAL: Sort data by group keys to ensure correct group calculation
    train_df = train_df.sort_values(by=['id2', 'id5', 'id4'])
    valid_df = valid_df.sort_values(by=['id2', 'id5', 'id4'])

    target_col = "y"
    id_cols = ["id1", "id2", "id3", "id4", "id5"]
    feature_cols = [col for col in train_df.columns if col not in id_cols + [target_col]]
    
    X_train = train_df[feature_cols].copy()
    y_train = train_df[target_col].copy()
    X_valid = valid_df[feature_cols].copy()
    y_valid = valid_df[target_col].copy()
    
    # Calculate group sizes for the ranker
    train_groups = train_df.groupby(['id2', 'id5']).size().to_numpy()
    valid_groups = valid_df.groupby(['id2', 'id5']).size().to_numpy()
    print(f"   -> Found {len(train_groups)} training groups and {len(valid_groups)} validation groups.")

    data_dict = pd.read_csv(DATA_DICT_PATH)
    all_cats = data_dict[data_dict["Type"].str.strip() == "Categorical"]["masked_column"].tolist()
    new_cat_features = [
        'offer_category', 'session_id', 'time_of_day_bin', 'is_weekend',
        'is_holiday_week', 'is_payday_week', 'offer_lifecycle_stage'
    ]
    categorical_features = [col for col in feature_cols if col in all_cats + new_cat_features]
    
    for col in categorical_features:
        X_train[col] = X_train[col].astype('category')
        X_valid[col] = X_valid[col].astype('category')
    
    valid_df_ids = valid_df[['id2', 'id5']].copy()
    
    # Clean up memory
    del train_df
    gc.collect()

    # 3. Run Optuna Study
    print(f"\n3. Starting Optuna study with {N_TRIALS} trials to maximize MAP@7...")
    study = optuna.create_study(direction='maximize')
    
    study.optimize(
        lambda trial: objective(trial, X_train, y_train, train_groups, X_valid, y_valid, valid_groups, categorical_features),
        n_trials=N_TRIALS,
        show_progress_bar=True 
    )

    # 4. Print and Save Results
    print("\n--- Optimization Finished ---")
    best_trial = study.best_trial
    print(f"  Value (MAP@7): {best_trial.value:.6f}")
    print("  Best Params: ")
    for key, value in best_trial.params.items():
        print(f"    {key}: {value}")
        
    best_params_path = os.path.join(MODEL_DIR, "best_lgbm_ranker_params.json")
    with open(best_params_path, 'w') as f:
        json.dump(best_trial.params, f, indent=4)
    print(f"\nBest ranking parameters saved to {best_params_path}")

    # 5. Re-train final model with best params
    print("\n5. Re-training final ranker model with best parameters...")
    best_params = best_trial.params
    # Add non-tuned parameters
    best_params['objective'] = 'lambdarank'
    best_params['metric'] = 'map'
    best_params['eval_at'] = [7]
    best_params['seed'] = RANDOM_STATE
    best_params['n_jobs'] = -1
    best_params['verbose'] = -1
    best_params['n_estimators'] = 4000 # Use a high number for final training with early stopping
    
    final_model = lgb.LGBMRanker(**best_params)
    
    final_model.fit(
        X_train, y_train,
        group=train_groups,
        eval_set=[(X_valid, y_valid)],
        eval_group=[valid_groups],
        eval_metric='map',
        callbacks=[
            lgb.log_evaluation(period=200),
            lgb.early_stopping(100, verbose=True)
        ]
    )
    
    # --- Artifact Saving ---
    print("\nSaving final model and artifacts...")
    final_model.booster_.save_model(f"{MODEL_DIR}/lgbm_final_ranker_model.txt")
    
    # Plot feature importance using the correct method for a fitted model
    lgb.plot_importance(final_model.booster_, max_num_features=30, figsize=(10, 15), importance_type='gain')
    plt.title('Tuned Ranker Feature Importance (Top 30 by Gain)')
    plt.tight_layout()
    plt.savefig(f"{MODEL_DIR}/feature_importance_ranker_tuned.png")
    plt.close() 
    print("   -> Model and feature importance plot saved successfully.")
    
    print("\nCalculating final validation MAP@7 score with optimized ranker...")
    valid_preds_scores = final_model.predict(X_valid)
    map_at_7_score = calculate_map_at_7(y_valid.values, valid_preds_scores, valid_df_ids)
    print("-" * 50)
    print(f"📊 Final Optimized Validation MAP@7 Score: {map_at_7_score:.6f}")
    print("-" * 50)

    print("\n✅ Script finished successfully. 🎉")

🚀 *** Final Model Training with LambdaMART & Optuna ***

1. Loading final OOF-featured datasets...

2. Preparing data for ranking...
   -> Found 42321 training groups and 10147 validation groups.


[I 2025-07-25 10:57:15,882] A new study created in memory with name: no-name-48df6ab3-1373-4815-b2f7-a9d525edd5c1



3. Starting Optuna study with 50 trials to maximize MAP@7...


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2025-07-25 10:58:38,767] Trial 0 finished with value: 0.9523091283752422 and parameters: {'learning_rate': 0.04315490202135175, 'num_leaves': 120, 'max_depth': 7, 'subsample': 0.8124956402724836, 'colsample_bytree': 0.8502793825321208, 'reg_alpha': 2.5312632550260288e-08, 'reg_lambda': 2.601399743543997e-06}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 10:59:49,698] Trial 1 finished with value: 0.9518134170326584 and parameters: {'learning_rate': 0.009072685934457543, 'num_leaves': 95, 'max_depth': 5, 'subsample': 0.7781742062266135, 'colsample_bytree': 0.8420037402594945, 'reg_alpha': 0.006779590781597455, 'reg_lambda': 5.584138854344181e-07}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:01:02,421] Trial 2 finished with value: 0.952169959307924 and parameters: {'learning_rate': 0.007781020566505578, 'num_leaves': 33, 'max_depth': 6, 'subsample': 0.9307255693604418, 'colsample_bytree': 0.6612774405621021, 'reg_alpha': 0.00016191553924775503, 'reg_lambda': 9.731094679410602e-06}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:02:27,086] Trial 3 finished with value: 0.9518851622643734 and parameters: {'learning_rate': 0.010946088116799379, 'num_leaves': 41, 'max_depth': 4, 'subsample': 0.7565613721005133, 'colsample_bytree': 0.8362715589076154, 'reg_alpha': 0.28574613489469325, 'reg_lambda': 1.603283030939266}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:03:46,022] Trial 4 finished with value: 0.9515111785327119 and parameters: {'learning_rate': 0.007374499749018956, 'num_leaves': 57, 'max_depth': 6, 'subsample': 0.602766737704506, 'colsample_bytree': 0.7790151005096317, 'reg_alpha': 1.1983834902345488e-06, 'reg_lambda': 0.13678207998028383}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:04:57,685] Trial 5 finished with value: 0.9519864030497714 and parameters: {'learning_rate': 0.007905060734687892, 'num_leaves': 109, 'max_depth': 5, 'subsample': 0.7721616596117595, 'colsample_bytree': 0.8285734300971197, 'reg_alpha': 0.026920168038839117, 'reg_lambda': 2.1084125731335783}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:06:39,342] Trial 6 finished with value: 0.9519855778782058 and parameters: {'learning_rate': 0.008357342866276412, 'num_leaves': 36, 'max_depth': 9, 'subsample': 0.8062663282419631, 'colsample_bytree': 0.8428728879251387, 'reg_alpha': 1.484933485216028e-07, 'reg_lambda': 0.21545226160243008}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:07:46,997] Trial 7 finished with value: 0.9517952749905192 and parameters: {'learning_rate': 0.005643530076504742, 'num_leaves': 117, 'max_depth': 3, 'subsample': 0.8335033373172093, 'colsample_bytree': 0.7978496941065103, 'reg_alpha': 0.9624820253832884, 'reg_lambda': 2.4027249534912234}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:09:18,452] Trial 8 finished with value: 0.9522382423946218 and parameters: {'learning_rate': 0.02577929856334503, 'num_leaves': 102, 'max_depth': 10, 'subsample': 0.9442306416118061, 'colsample_bytree': 0.9422719552372593, 'reg_alpha': 7.126276986796462e-05, 'reg_lambda': 0.1467066289916334}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:10:26,113] Trial 9 finished with value: 0.9522568051234159 and parameters: {'learning_rate': 0.038032997837404416, 'num_leaves': 53, 'max_depth': 4, 'subsample': 0.6480803878932863, 'colsample_bytree': 0.6427370391259613, 'reg_alpha': 0.0001237299751717432, 'reg_lambda': 1.5780826928959951e-06}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:12:03,174] Trial 10 finished with value: 0.9522886287000594 and parameters: {'learning_rate': 0.04848890275128835, 'num_leaves': 147, 'max_depth': 12, 'subsample': 0.8736086578529093, 'colsample_bytree': 0.9818065289165737, 'reg_alpha': 1.4797766580838945e-08, 'reg_lambda': 0.001180024401116318}. Best is trial 0 with value: 0.9523091283752422.




[I 2025-07-25 11:13:40,603] Trial 11 finished with value: 0.9525725670511954 and parameters: {'learning_rate': 0.04804789542938092, 'num_leaves': 147, 'max_depth': 12, 'subsample': 0.8691225299211621, 'colsample_bytree': 0.992466716642839, 'reg_alpha': 1.0893575907089567e-08, 'reg_lambda': 0.0004113768971481198}. Best is trial 11 with value: 0.9525725670511954.




[I 2025-07-25 11:15:35,189] Trial 12 finished with value: 0.9523774852078081 and parameters: {'learning_rate': 0.02404105695072738, 'num_leaves': 147, 'max_depth': 8, 'subsample': 0.8776943448379959, 'colsample_bytree': 0.9333708193082186, 'reg_alpha': 9.296833384346104e-07, 'reg_lambda': 3.1215661252741404e-08}. Best is trial 11 with value: 0.9525725670511954.




[I 2025-07-25 11:17:11,992] Trial 13 finished with value: 0.952332355530169 and parameters: {'learning_rate': 0.021515624688891906, 'num_leaves': 149, 'max_depth': 12, 'subsample': 0.8899239489098313, 'colsample_bytree': 0.9275795535724093, 'reg_alpha': 2.793190385725638e-06, 'reg_lambda': 3.901449374760974e-08}. Best is trial 11 with value: 0.9525725670511954.




[I 2025-07-25 11:18:45,753] Trial 14 finished with value: 0.9524017246923735 and parameters: {'learning_rate': 0.030873770892089057, 'num_leaves': 130, 'max_depth': 9, 'subsample': 0.9790938689750094, 'colsample_bytree': 0.9154236728563686, 'reg_alpha': 2.5501799131083648e-06, 'reg_lambda': 0.0004775041577137679}. Best is trial 11 with value: 0.9525725670511954.




[I 2025-07-25 11:20:19,683] Trial 15 finished with value: 0.9525092657213339 and parameters: {'learning_rate': 0.030164885583534754, 'num_leaves': 129, 'max_depth': 10, 'subsample': 0.9983987811275791, 'colsample_bytree': 0.9918613617571379, 'reg_alpha': 1.647541656576636e-05, 'reg_lambda': 0.0006086779703423598}. Best is trial 11 with value: 0.9525725670511954.




[I 2025-07-25 11:22:13,624] Trial 16 finished with value: 0.952446980073196 and parameters: {'learning_rate': 0.01581373295865216, 'num_leaves': 76, 'max_depth': 10, 'subsample': 0.9988988345015171, 'colsample_bytree': 0.9842725082589511, 'reg_alpha': 0.0016196651008953584, 'reg_lambda': 0.00356637717852023}. Best is trial 11 with value: 0.9525725670511954.




[I 2025-07-25 11:23:47,128] Trial 17 finished with value: 0.9526481962319435 and parameters: {'learning_rate': 0.034200540828172386, 'num_leaves': 133, 'max_depth': 11, 'subsample': 0.710355199161106, 'colsample_bytree': 0.7242984823810963, 'reg_alpha': 1.1886589418043152e-05, 'reg_lambda': 3.63754825731392e-05}. Best is trial 17 with value: 0.9526481962319435.




[I 2025-07-25 11:25:01,192] Trial 18 finished with value: 0.9523578123802331 and parameters: {'learning_rate': 0.017383212876987637, 'num_leaves': 78, 'max_depth': 11, 'subsample': 0.7106290503798076, 'colsample_bytree': 0.716986733106572, 'reg_alpha': 8.569630372827893, 'reg_lambda': 4.640141449275704e-05}. Best is trial 17 with value: 0.9526481962319435.




[I 2025-07-25 11:26:34,557] Trial 19 finished with value: 0.9528524795483716 and parameters: {'learning_rate': 0.036209112810866244, 'num_leaves': 134, 'max_depth': 12, 'subsample': 0.7088490956883309, 'colsample_bytree': 0.7272748855806079, 'reg_alpha': 9.128461832526685e-08, 'reg_lambda': 0.00951995771808995}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:28:05,684] Trial 20 finished with value: 0.9522687787706652 and parameters: {'learning_rate': 0.03578003040892951, 'num_leaves': 130, 'max_depth': 11, 'subsample': 0.7135933175566118, 'colsample_bytree': 0.7227223561025166, 'reg_alpha': 9.419953447297858e-08, 'reg_lambda': 0.01189867331695327}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:30:13,832] Trial 21 finished with value: 0.9523344533766334 and parameters: {'learning_rate': 0.04684747003556021, 'num_leaves': 138, 'max_depth': 12, 'subsample': 0.7006495866759099, 'colsample_bytree': 0.7413687006413867, 'reg_alpha': 4.1511415206850547e-07, 'reg_lambda': 8.05077225085035e-05}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:31:47,386] Trial 22 finished with value: 0.952644927390489 and parameters: {'learning_rate': 0.035570347891136875, 'num_leaves': 115, 'max_depth': 11, 'subsample': 0.6700206018387984, 'colsample_bytree': 0.6966651188637216, 'reg_alpha': 1.8720779403244957e-05, 'reg_lambda': 0.014002795151006634}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:33:58,758] Trial 23 finished with value: 0.9520054551829596 and parameters: {'learning_rate': 0.01979112064147624, 'num_leaves': 114, 'max_depth': 11, 'subsample': 0.666121197495584, 'colsample_bytree': 0.6072429810110487, 'reg_alpha': 1.8928416040217657e-05, 'reg_lambda': 0.05751289235551308}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:35:24,511] Trial 24 finished with value: 0.9521271699442238 and parameters: {'learning_rate': 0.012224555510484023, 'num_leaves': 93, 'max_depth': 9, 'subsample': 0.6408264292826178, 'colsample_bytree': 0.670757944616533, 'reg_alpha': 1.2318787440905712e-05, 'reg_lambda': 0.006891086593720632}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:36:54,531] Trial 25 finished with value: 0.9522039483314101 and parameters: {'learning_rate': 0.029973487529453564, 'num_leaves': 124, 'max_depth': 11, 'subsample': 0.7423910981103219, 'colsample_bytree': 0.756094298065756, 'reg_alpha': 0.002691516260296913, 'reg_lambda': 4.9557367511087474e-05}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:38:19,079] Trial 26 finished with value: 0.9525253205319446 and parameters: {'learning_rate': 0.037120462788094735, 'num_leaves': 109, 'max_depth': 8, 'subsample': 0.6860410936973571, 'colsample_bytree': 0.6949772002035963, 'reg_alpha': 6.823067658211402e-06, 'reg_lambda': 0.0288308531163257}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:39:49,448] Trial 27 finished with value: 0.9523464443429882 and parameters: {'learning_rate': 0.02677357953637676, 'num_leaves': 137, 'max_depth': 10, 'subsample': 0.6099715326519156, 'colsample_bytree': 0.6984641901911557, 'reg_alpha': 0.0007432130939223513, 'reg_lambda': 0.0016753198066994288}. Best is trial 19 with value: 0.9528524795483716.




[I 2025-07-25 11:42:10,284] Trial 28 finished with value: 0.9529399801377797 and parameters: {'learning_rate': 0.03569272548411728, 'num_leaves': 138, 'max_depth': 11, 'subsample': 0.7357596723843407, 'colsample_bytree': 0.6145306410975434, 'reg_alpha': 2.320386769070152e-07, 'reg_lambda': 0.0001044178863479317}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:43:41,571] Trial 29 finished with value: 0.9525999094490134 and parameters: {'learning_rate': 0.04002547345762381, 'num_leaves': 136, 'max_depth': 12, 'subsample': 0.7342068026214176, 'colsample_bytree': 0.6230698703063667, 'reg_alpha': 1.424588982708016e-07, 'reg_lambda': 6.630517967721582e-06}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:45:17,203] Trial 30 finished with value: 0.9516693282220744 and parameters: {'learning_rate': 0.012874690341984575, 'num_leaves': 22, 'max_depth': 9, 'subsample': 0.8195944666075786, 'colsample_bytree': 0.7633518211951075, 'reg_alpha': 5.8776692575908976e-08, 'reg_lambda': 1.772206062975433e-07}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:47:29,247] Trial 31 finished with value: 0.9528484453141989 and parameters: {'learning_rate': 0.03308930374721198, 'num_leaves': 122, 'max_depth': 11, 'subsample': 0.6691719354501987, 'colsample_bytree': 0.6003700926168187, 'reg_alpha': 2.9607235473389417e-05, 'reg_lambda': 1.595341666889147e-05}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:48:58,607] Trial 32 finished with value: 0.952603269355441 and parameters: {'learning_rate': 0.03238516112314817, 'num_leaves': 123, 'max_depth': 11, 'subsample': 0.72610539279207, 'colsample_bytree': 0.6341766994626048, 'reg_alpha': 2.717735653275386e-07, 'reg_lambda': 0.00010184394077153882}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:50:58,284] Trial 33 finished with value: 0.9525918197507968 and parameters: {'learning_rate': 0.0425376294585695, 'num_leaves': 100, 'max_depth': 10, 'subsample': 0.7704608021641326, 'colsample_bytree': 0.6017986579107264, 'reg_alpha': 3.638874790461442e-08, 'reg_lambda': 1.1382260675018496e-05}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:52:31,471] Trial 34 finished with value: 0.9529289880927241 and parameters: {'learning_rate': 0.023401586655966592, 'num_leaves': 140, 'max_depth': 12, 'subsample': 0.6405103431491939, 'colsample_bytree': 0.6663622575964607, 'reg_alpha': 8.495009405357964e-05, 'reg_lambda': 1.2707422249593633e-06}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:54:04,436] Trial 35 finished with value: 0.9527906760417776 and parameters: {'learning_rate': 0.022698964863646552, 'num_leaves': 140, 'max_depth': 12, 'subsample': 0.6274333682509718, 'colsample_bytree': 0.661113101597756, 'reg_alpha': 0.00031452956999722544, 'reg_lambda': 1.8343191511981384e-06}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:55:23,274] Trial 36 finished with value: 0.9521937574346452 and parameters: {'learning_rate': 0.027289555207004934, 'num_leaves': 121, 'max_depth': 7, 'subsample': 0.6785535611515847, 'colsample_bytree': 0.665398571360921, 'reg_alpha': 5.088658507421099e-05, 'reg_lambda': 3.766842441942878e-07}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:56:58,003] Trial 37 finished with value: 0.952219823246798 and parameters: {'learning_rate': 0.018922618165424298, 'num_leaves': 139, 'max_depth': 12, 'subsample': 0.6542404315200889, 'colsample_bytree': 0.6465479676561875, 'reg_alpha': 9.750253824455401e-07, 'reg_lambda': 1.0847270806390162e-05}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 11:58:38,543] Trial 38 finished with value: 0.9525626834288787 and parameters: {'learning_rate': 0.04250780201959497, 'num_leaves': 124, 'max_depth': 6, 'subsample': 0.7552167197884667, 'colsample_bytree': 0.6167780434975253, 'reg_alpha': 0.01600022639040221, 'reg_lambda': 0.00015768167461561582}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:00:33,345] Trial 39 finished with value: 0.9523453990511812 and parameters: {'learning_rate': 0.0269935554750068, 'num_leaves': 88, 'max_depth': 10, 'subsample': 0.6225902347293852, 'colsample_bytree': 0.6786461332707345, 'reg_alpha': 0.0004016178567714561, 'reg_lambda': 1.0113543260104355e-08}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:01:57,630] Trial 40 finished with value: 0.9522649613046492 and parameters: {'learning_rate': 0.020905028585990918, 'num_leaves': 107, 'max_depth': 11, 'subsample': 0.7838606419163239, 'colsample_bytree': 0.6432734863696742, 'reg_alpha': 0.04861047971751806, 'reg_lambda': 9.678779166674262}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:03:32,913] Trial 41 finished with value: 0.9526925063248254 and parameters: {'learning_rate': 0.02317912164477795, 'num_leaves': 141, 'max_depth': 12, 'subsample': 0.6322851369237664, 'colsample_bytree': 0.657802404201342, 'reg_alpha': 0.00023481627789532136, 'reg_lambda': 1.7568658402492515e-06}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:05:15,019] Trial 42 finished with value: 0.9520256112694413 and parameters: {'learning_rate': 0.02383107587054205, 'num_leaves': 142, 'max_depth': 12, 'subsample': 0.6023864627483164, 'colsample_bytree': 0.6304478821161954, 'reg_alpha': 5.170444110127688e-05, 'reg_lambda': 3.403323939208628e-06}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:06:51,614] Trial 43 finished with value: 0.9528455938073136 and parameters: {'learning_rate': 0.029344435540684836, 'num_leaves': 150, 'max_depth': 12, 'subsample': 0.6843720917386434, 'colsample_bytree': 0.6806942166886923, 'reg_alpha': 0.0015634241832383998, 'reg_lambda': 3.4782217670464973e-07}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:09:02,379] Trial 44 finished with value: 0.9522235323287355 and parameters: {'learning_rate': 0.02818543197334092, 'num_leaves': 149, 'max_depth': 11, 'subsample': 0.6587549843632775, 'colsample_bytree': 0.8697706212147718, 'reg_alpha': 0.0035333036501822247, 'reg_lambda': 6.225806628916447e-07}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:10:36,596] Trial 45 finished with value: 0.9524109094049891 and parameters: {'learning_rate': 0.0330517151801924, 'num_leaves': 128, 'max_depth': 12, 'subsample': 0.6787547746709942, 'colsample_bytree': 0.799687896792756, 'reg_alpha': 0.08290511800757981, 'reg_lambda': 1.0440267425690874e-07}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:12:56,876] Trial 46 finished with value: 0.952490202414529 and parameters: {'learning_rate': 0.03855197411499276, 'num_leaves': 144, 'max_depth': 12, 'subsample': 0.6905603184269128, 'colsample_bytree': 0.6897078379151721, 'reg_alpha': 0.0007878409800532735, 'reg_lambda': 2.704394357161711e-05}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:14:51,680] Trial 47 finished with value: 0.9521129213486913 and parameters: {'learning_rate': 0.005730828961920902, 'num_leaves': 62, 'max_depth': 10, 'subsample': 0.7443726518057666, 'colsample_bytree': 0.6010101045668377, 'reg_alpha': 0.0065609781206037495, 'reg_lambda': 5.615468156694983e-07}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:16:24,776] Trial 48 finished with value: 0.9525652902335667 and parameters: {'learning_rate': 0.0445961198902304, 'num_leaves': 133, 'max_depth': 5, 'subsample': 0.7236032474909819, 'colsample_bytree': 0.6221579347916506, 'reg_alpha': 2.4068051798427375e-08, 'reg_lambda': 0.5397195235148253}. Best is trial 28 with value: 0.9529399801377797.




[I 2025-07-25 12:17:30,513] Trial 49 finished with value: 0.9517336295905965 and parameters: {'learning_rate': 0.024749617888519218, 'num_leaves': 143, 'max_depth': 3, 'subsample': 0.6984455869925298, 'colsample_bytree': 0.7100223452063918, 'reg_alpha': 2.778459337606068e-06, 'reg_lambda': 0.0002216319671853194}. Best is trial 28 with value: 0.9529399801377797.

--- Optimization Finished ---
  Value (MAP@7): 0.952940
  Best Params: 
    learning_rate: 0.03569272548411728
    num_leaves: 138
    max_depth: 11
    subsample: 0.7357596723843407
    colsample_bytree: 0.6145306410975434
    reg_alpha: 2.320386769070152e-07
    reg_lambda: 0.0001044178863479317

Best ranking parameters saved to model/best_lgbm_ranker_params.json

5. Re-training final ranker model with best parameters...




Training until validation scores don't improve for 100 rounds
[200]	valid_0's map@7: 0.952893
Early stopping, best iteration is:
[139]	valid_0's map@7: 0.953338

Saving final model and artifacts...
   -> Model and feature importance plot saved successfully.

Calculating final validation MAP@7 score with optimized ranker...




--------------------------------------------------
📊 Final Optimized Validation MAP@7 Score: 0.095106
--------------------------------------------------

✅ Script finished successfully. 🎉


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import os
import gc
import json

# =================================================================================
# 🚀 SCRIPT 1: GENERATE BASE PREDICTIONS (LGBM)
# =================================================================================
# This part of the script is responsible for loading a pre-trained LightGBM model
# and using it to generate predictions on the test set. These predictions will
# serve as the base for the subsequent residual modeling stage.

def generate_lgbm_predictions():
    """
    Loads a pre-trained LightGBM ranker model and generates predictions on the
    final test set. This creates a base prediction file that can be used
    for ensembling later.
    """
    print("\n" + "="*80)
    print("🚀 === SCRIPT 1: Generating Base LGBM Predictions ===")
    print("="*80 + "\n")
    
    # --- 1. Load Artifacts ---
    print("1/4: Loading model, test data, and submission template...")
    try:
        model = lgb.Booster(model_file="model/lgbm_final_ranker_model.txt")
        # Load the full datasets which contain all features the model was trained on
        train_df = pd.read_parquet("inter/train_3_oof.parquet")
        test_df = pd.read_parquet("inter/test_3_oof.parquet")
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find a required model/data file. Details: {e}")
        return

    print("   -> Artifacts loaded successfully.")
    print(f"   -> Test data shape: {test_df.shape}")

    # --- 2. Prepare Test Data ---
    print("\n2/4: Preparing test data and aligning features...")
    
    # --- FIX: Use ALL features for prediction, consistent with the final training step ---
    # The final model was trained on all available features, so we must use them all here.
    id_cols = ["id1", "id2", "id3", "id4", "id5"]
    features_to_use = [col for col in test_df.columns if col not in id_cols + ['y']]
    print(f"   -> Model was trained on {len(features_to_use)} features.")
    
    # The saved parquet file may not retain the 'category' dtype. We must redefine them
    # using the same logic as in the training script to ensure a perfect match.
    DATA_DICT_PATH = "data_dictionary.csv"
    data_dict = pd.read_csv(DATA_DICT_PATH)
    all_cats = data_dict[data_dict["Type"].str.strip() == "Categorical"]["masked_column"].tolist()
    new_cat_features = [
        "offer_category", "session_id", "time_of_day_bin", "is_weekend",
        "is_holiday_week", "is_payday_week", "offer_lifecycle_stage"
    ]
    all_possible_cats = all_cats + new_cat_features
    categorical_features = [f for f in features_to_use if f in all_possible_cats]
    print(f"   -> Found {len(categorical_features)} categorical features to convert.")

    print(f"   -> Matching categorical dtypes: ", end="")
    for col in tqdm(categorical_features, leave=False):
        # Ensure the column exists before trying to convert it
        if col in train_df.columns and col in test_df.columns:
            train_cat_dtype = train_df[col].astype('category').dtype
            test_df[col] = test_df[col].astype(train_cat_dtype)
    
    # Final feature alignment
    X_test = test_df[features_to_use]
    print(f"   -> Test features aligned. Final shape for prediction: {X_test.shape}")
    
    # --- 3. Generate Predictions ---
    print("\n3/4: Generating predictions on the test set...")
    predictions = model.predict(X_test)
    print("   -> Predictions generated.")
    
    # --- 4. Create Submission File ---
    print("\n4/4: Mapping predictions to submission format...")
    
    # --- SCORING SCRIPT FIX ---
    # Include id2, id3, and id5 for compatibility with the scoring script and final merge
    submission_df = test_df[['id1', 'id2', 'id3', 'id5']].copy()
    submission_df['pred'] = predictions
    
    output_path = 'lgbm_test_preds.csv'
    submission_df.to_csv(output_path, index=False)
    
    print("\n" + "="*50)
    print(f"✅ LGBM base predictions saved successfully: '{output_path}'")
    print("="*50)
    
    del test_df, train_df, X_test
    gc.collect()

if __name__ == "__main__":
    # STEP 1: Generate base predictions from the final tuned LGBM model.
    generate_lgbm_predictions()


🚀 === SCRIPT 1: Generating Base LGBM Predictions ===

1/4: Loading model, test data, and submission template...
   -> Artifacts loaded successfully.
   -> Test data shape: (337714, 3098)

2/4: Preparing test data and aligning features...
   -> Model was trained on 3093 features.
   -> Found 18 categorical features to convert.
   -> Matching categorical dtypes: 

                                               

   -> Test features aligned. Final shape for prediction: (337714, 3093)

3/4: Generating predictions on the test set...
   -> Predictions generated.

4/4: Mapping predictions to submission format...

✅ LGBM base predictions saved successfully: 'lgbm_test_preds.csv'


In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import os
import gc
import json
# =================================================================================
# 🚀 SCRIPT 2: TRAIN RESIDUAL TRANSFORMER
# =================================================================================
# This script trains an advanced Transformer model. Instead of predicting the
# target directly, it's trained to predict the *error* (residual) of the
# primary LGBM model. This allows it to correct the base model's mistakes.

# --- Competition Metric Calculation ---
def calculate_map_at_7(y_true, y_pred_probs, ids_df):
    """Calculates Mean Average Precision at 7 for the Amex competition."""
    
    def ap_at_k(group, k=7):
        """Calculates Average Precision at k for a single group."""
        group = group.sort_values('pred', ascending=False)
        y_true_sorted = group['y'].values
        
        # Truncate at k
        y_true_sorted = y_true_sorted[:k]
        
        relevance = (y_true_sorted == 1)
        if np.sum(relevance) == 0:
            return 0.0
            
        precision_at_i = np.cumsum(relevance) / (np.arange(len(relevance)) + 1)
        ap = np.sum(precision_at_i * relevance) / np.sum(relevance)
        return ap

    # Create a temporary DataFrame for calculation
    calc_df = ids_df[['id2', 'id5']].copy()
    calc_df['y'] = y_true
    calc_df['pred'] = y_pred_probs

    # --- SCORING SCRIPT FIX ---
    # Convert id5 to date to match official scoring logic
    calc_df['id5'] = pd.to_datetime(calc_df['id5']).dt.date
    
    # Calculate AP for each customer-session group
    ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)
    
    # Return the mean of the AP scores
    return ap_scores.mean()

# --- Advanced Transformer Configuration ---
class TransformerConfig:
    TRAIN_FILE = "inter/train_3_oof.parquet"
    TEST_FILE = "inter/test_3_oof.parquet"
    DATA_DICT_PATH = "data_dictionary.csv"
    
    CAT_EMBED_DIM = 8
    EMBED_DIM = 192
    NUM_HEADS = 8
    NUM_LAYERS = 4
    DIM_FEEDFORWARD = 512
    DROPOUT = 0.2
    MAX_SEQ_LEN = 20
    
    EPOCHS = 10
    N_SPLITS = 5
    LR = 5e-5
    BATCH_SIZE = 256
    NUM_WORKERS = 2
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    MODEL_OUTPUT_PATH = "model/transformer_v2_final_model"
    ENSEMBLE_WEIGHT = 0.1 ############### CRITICAL FIX ###############

# --- PyTorch Dataset and Model ---
class AmexAdvancedDataset(Dataset):
    def __init__(self, sequences_cat, sequences_num, labels):
        self.sequences_cat = torch.tensor(sequences_cat, dtype=torch.long)
        self.sequences_num = torch.tensor(sequences_num, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sequences_cat[idx], self.sequences_num[idx], self.labels[idx]

class AmexPureTransformer(nn.Module):
    def __init__(self, config, num_numerical_features, cat_cardinalities):
        super().__init__()
        self.config = config
        self.cat_embeddings = nn.ModuleList([
            nn.Embedding(cardinality, config.CAT_EMBED_DIM) for cardinality in cat_cardinalities
        ])
        total_cat_embed_dim = len(cat_cardinalities) * config.CAT_EMBED_DIM
        self.num_norm = nn.LayerNorm(num_numerical_features)
        combined_feature_dim = total_cat_embed_dim + num_numerical_features
        self.feature_proj = nn.Linear(combined_feature_dim, config.EMBED_DIM)
        self.pos_encoder = nn.Parameter(torch.zeros(1, config.MAX_SEQ_LEN, config.EMBED_DIM))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.EMBED_DIM, nhead=config.NUM_HEADS,
            dim_feedforward=config.DIM_FEEDFORWARD, dropout=config.DROPOUT,
            activation='gelu', batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.NUM_LAYERS)
        self.head = nn.Sequential(
            nn.LayerNorm(config.EMBED_DIM),
            nn.Linear(config.EMBED_DIM, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x_cat, x_num):
        cat_embeds = [self.cat_embeddings[i](x_cat[:, :, i]) for i in range(x_cat.size(2))]
        all_cat_embeds = torch.cat(cat_embeds, dim=-1)
        x_num_norm = self.num_norm(x_num)
        x = torch.cat([all_cat_embeds, x_num_norm], dim=-1)
        x = self.feature_proj(x)
        x = x + self.pos_encoder[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.head(x[:, -1, :]) # Use last token for prediction
        return x

# --- Data Preparation and Training Functions ---
def prepare_advanced_transformer_data(config):
    print("\n--- Preparing data for Advanced Transformer (Residual Modeling) ---")
    
    train_df = pd.read_parquet(config.TRAIN_FILE)
    test_df = pd.read_parquet(config.TEST_FILE)
    
    print("   -> Creating residual target (y - lgbm_oof_prediction)...")
    train_df['residual_target'] = train_df['y'] - train_df['oof_lgbm_prediction']
    
    combined_df = pd.concat([train_df, test_df], ignore_index=True)

    data_dict = pd.read_csv(config.DATA_DICT_PATH)
    all_cats_from_dict = data_dict[data_dict["Type"].str.strip() == "Categorical"]["masked_column"].tolist()
    new_cat_features = [
        'offer_category', 'session_id', 'time_of_day_bin', 'is_weekend',
        'is_holiday_week', 'is_payday_week', 'offer_lifecycle_stage',
        'brand_id', 'industry_id'
    ]
    categorical_cols = [col for col in train_df.columns if col in all_cats_from_dict + new_cat_features and col != 'id3']
    numerical_cols = [col for col in train_df.columns if col.startswith('f') and col not in categorical_cols]

    print("\n   -> Attempting to load list of top features for Transformer...")
    selected_features_path = os.path.join("model", "top_100_features.json")
    try:
        with open(selected_features_path, 'r') as f:
            top_features = json.load(f)
        print(f"   -> ✅ Successfully loaded {len(top_features)} selected features from '{selected_features_path}'.")
        numerical_cols = [col for col in numerical_cols if col in top_features]
        categorical_cols = [col for col in categorical_cols if col in top_features]
    except FileNotFoundError:
        print(f"   -> ⚠️ WARNING: Feature selection file not found at '{selected_features_path}'.")
        print("   -> Proceeding with all available features for the Transformer model.")
    except Exception as e:
        print(f"   -> ❌ ERROR: Could not read feature file. Error: {e}. Using all features.")

    if 'oof_lgbm_prediction' not in numerical_cols:
        numerical_cols.append('oof_lgbm_prediction')

    del combined_df; gc.collect()

    cat_cardinalities = []
    for col in tqdm(categorical_cols, desc="   -> Encoding Categoricals (Leakage-Proof)"):
        codes, uniques = pd.factorize(train_df[col], sort=True)
        mapping = {val: i + 1 for i, val in enumerate(uniques)}
        train_df[col] = codes + 1
        test_df[col] = test_df[col].map(mapping).fillna(0).astype(int)
        cat_cardinalities.append(len(uniques) + 1)

    def create_sequences(df, cols_cat, cols_num, lbl_col=None, is_test=False):
        sequences_cat, sequences_num, labels, customer_ids = [], [], [], []
        grouped = df.groupby('id2')
        for cust_id, group in tqdm(grouped, desc=f"Creating sequences ({'test' if is_test else 'train'})"):
            cat_feats = group[cols_cat].values
            num_feats = group[cols_num].values
            
            padding_len = config.MAX_SEQ_LEN - len(group)
            if padding_len > 0:
                cat_feats = np.vstack([np.zeros((padding_len, cat_feats.shape[1])), cat_feats])
                num_feats = np.vstack([np.zeros((padding_len, num_feats.shape[1])), num_feats])
            else:
                cat_feats = cat_feats[-config.MAX_SEQ_LEN:]
                num_feats = num_feats[-config.MAX_SEQ_LEN:]
            
            sequences_cat.append(cat_feats)
            sequences_num.append(num_feats)
            customer_ids.append(cust_id)
            if not is_test:
                labels.append(group[lbl_col].iloc[-1])
        
        final_cat = np.array(sequences_cat)
        final_num = np.nan_to_num(np.array(sequences_num))
        
        if is_test:
            return final_cat, final_num, np.array(customer_ids)
        return final_cat, final_num, np.array(labels), np.array(customer_ids)

    cat_train, num_train, lbl_train, train_cust_ids = create_sequences(train_df, categorical_cols, numerical_cols, 'residual_target')
    cat_test, num_test, test_cust_ids = create_sequences(test_df, categorical_cols, numerical_cols, is_test=True)
    
    train_data = (cat_train, num_train, lbl_train, train_cust_ids)
    test_data = (cat_test, num_test, test_cust_ids)
    
    return train_data, test_data, len(numerical_cols), cat_cardinalities

def train_one_epoch(model, dataloader, optimizer, criterion, device, scheduler=None):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training", leave=False):
        cat_seq, num_seq, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(cat_seq, num_seq).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler: scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, device, ids_df, lgbm_oof_preds_fold, val_customer_ids):
    """
    Evaluates the transformer model on a validation fold.
    It combines the base LGBM OOF preds with the transformer's residual preds.
    """
    model.eval()
    
    # Get residual predictions from the transformer (one prediction per customer)
    residual_preds = predict_transformer(model, dataloader, device)
    
    # --- FIX for length mismatch: Broadcast customer-level preds to event-level ---
    # Create a mapping from each customer ID to their single predicted residual
    customer_to_residual_map = pd.Series(residual_preds, index=val_customer_ids)
    
    # Use the map to assign the correct residual to every event row
    eval_df = ids_df[ids_df['id2'].isin(val_customer_ids)].copy()
    eval_df['residual_pred'] = eval_df['id2'].map(customer_to_residual_map)
    # --- END FIX ---

    # Combine with base LGBM OOF predictions
    eval_df['lgbm_pred'] = lgbm_oof_preds_fold
    
    # Ensure all components have the same length and are aligned
    # This is a critical sanity check that should now pass
    if len(eval_df) != len(lgbm_oof_preds_fold):
        print("Length mismatch during evaluation! This should not happen.")
        print(f"Eval DF: {len(eval_df)}, LGBM preds: {len(lgbm_oof_preds_fold)}")
        # Attempt to align, assuming the dataloader was created from a sorted df
        eval_df = eval_df.sort_values(['id2', 'id5', 'id4']).reset_index(drop=True)
        if len(eval_df) != len(lgbm_oof_preds_fold):
             # If it still doesn't match, we cannot proceed with evaluation for this fold
             return 0.0

    # CRITICAL FIX: Add a weight to the residual prediction to prevent it from overwhelming the base model
    config = TransformerConfig()
    eval_df['final_pred'] = eval_df['lgbm_pred'] + config.ENSEMBLE_WEIGHT * eval_df['residual_pred']
    
    # Calculate MAP@7 on the final ensembled prediction
    score = calculate_map_at_7(eval_df['y'], eval_df['final_pred'], eval_df[['id2', 'id5']])
    
    return score

def predict_transformer(model, dataloader, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting", leave=False):
            outputs = model(batch[0].to(device), batch[1].to(device))
            predictions.append(outputs.cpu().numpy())
    return np.concatenate(predictions).flatten()

def run_stage2_transformer():
    print("\n" + "="*80)
    print("🚀 === SCRIPT 2: RESIDUAL TRANSFORMER PIPELINE ===")
    print("="*80)
    
    config = TransformerConfig()
    (train_data, test_data, num_numerical, cat_cards) = prepare_advanced_transformer_data(config)
    
    cat_train_full, num_train_full, lbl_train_full, train_cust_ids = train_data
    cat_test, num_test, test_cust_ids = test_data
    
    original_train_df = pd.read_parquet(config.TRAIN_FILE)
    customer_y_stratify = original_train_df.groupby('id2')['y'].max()
    
    skf = StratifiedKFold(n_splits=config.N_SPLITS, shuffle=True, random_state=42)
    oof_residual_preds = np.zeros(len(cat_train_full))
    test_residual_preds = np.zeros(len(cat_test))
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(customer_y_stratify)), customer_y_stratify)):
        print(f"\n{'='*30} FOLD {fold+1}/{config.N_SPLITS} {'='*30}")
        
        X_cat_train, X_num_train, y_res_train = cat_train_full[train_idx], num_train_full[train_idx], lbl_train_full[train_idx]
        X_cat_val, X_num_val, y_res_val = cat_train_full[val_idx], num_train_full[val_idx], lbl_train_full[val_idx]
        
        val_cust_ids = customer_y_stratify.index[val_idx]
        ids_val_fold_df = original_train_df[original_train_df['id2'].isin(val_cust_ids)].copy()
        lgbm_oof_val_fold = ids_val_fold_df['oof_lgbm_prediction'].values

        loader_train = DataLoader(AmexAdvancedDataset(X_cat_train, X_num_train, y_res_train), batch_size=config.BATCH_SIZE, shuffle=True)
        loader_val = DataLoader(AmexAdvancedDataset(X_cat_val, X_num_val, y_res_val), batch_size=config.BATCH_SIZE, shuffle=False)

        model = AmexPureTransformer(config, num_numerical, cat_cards).to(config.DEVICE)
        criterion = nn.MSELoss()
        optimizer = AdamW(model.parameters(), lr=config.LR, weight_decay=1e-2)
        scheduler = ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=2)

        best_map_score = -1.0
        for epoch in range(config.EPOCHS):
            train_loss = train_one_epoch(model, loader_train, optimizer, criterion, config.DEVICE)
            map_score = evaluate_model(model, loader_val, config.DEVICE, ids_val_fold_df, lgbm_oof_val_fold, val_cust_ids)
            print(f"Epoch {epoch+1}/{config.EPOCHS} -> Train Loss: {train_loss:.6f} | Val MAP@7: {map_score:.6f}")
            scheduler.step(map_score)
            if map_score > best_map_score:
                best_map_score = map_score
                torch.save(model.state_dict(), f"{config.MODEL_OUTPUT_PATH}_fold_{fold+1}.pth")
                print(f"   -> 🎉 New best score! Model for Fold {fold+1} saved.")
        
        print(f"-> Loading best model for Fold {fold+1} (MAP@7: {best_map_score:.6f}) for predictions...")
        model.load_state_dict(torch.load(f"{config.MODEL_OUTPUT_PATH}_fold_{fold+1}.pth"))
        
        val_preds_fold = predict_transformer(model, loader_val, config.DEVICE)
        oof_residual_preds[val_idx] = val_preds_fold
        
        test_loader = DataLoader(AmexAdvancedDataset(cat_test, num_test, np.zeros(len(cat_test))), batch_size=config.BATCH_SIZE, shuffle=False)
        test_preds_fold = predict_transformer(model, test_loader, config.DEVICE)
        test_residual_preds += test_preds_fold / config.N_SPLITS
        
        del model, loader_train, loader_val; gc.collect(); torch.cuda.empty_cache()

    print("\n--- K-Fold Training Finished. Saving final OOF and Test predictions. ---")
    
    # --- FIX: Save OOF predictions for validation in the next stage ---
    oof_map = pd.DataFrame({'id2': train_cust_ids, 'oof_transformer_residual': oof_residual_preds})
    original_train_df = original_train_df.merge(oof_map, on='id2', how='left')
    original_train_df.to_parquet("inter/train_4_ensemble.parquet", index=False)
    print("✅ Saved 'train_4_ensemble.parquet' with OOF residual predictions.")
    
    test_map = pd.DataFrame({'id2': test_cust_ids, 'transformer_residual': test_residual_preds})
    test_df = pd.read_parquet(config.TEST_FILE)
    test_df = test_df.merge(test_map, on='id2', how='left')
    test_df.to_parquet("inter/test_4_ensemble.parquet", index=False)
    print("✅ Saved 'test_4_ensemble.parquet' with averaged test residual predictions.")

if __name__ == "__main__":
        # STEP 2: Train the Transformer to predict the residuals of the LGBM model.
    run_stage2_transformer() 


🚀 === SCRIPT 2: RESIDUAL TRANSFORMER PIPELINE ===

--- Preparing data for Advanced Transformer (Residual Modeling) ---
   -> Creating residual target (y - lgbm_oof_prediction)...

   -> Attempting to load list of top features for Transformer...
   -> ✅ Successfully loaded 100 selected features from 'model/top_100_features.json'.


   -> Encoding Categoricals (Leakage-Proof): 100%|██████████| 2/2 [00:00<00:00, 14.80it/s]
Creating sequences (train): 100%|██████████| 38559/38559 [00:40<00:00, 956.11it/s] 
Creating sequences (test): 100%|██████████| 18956/18956 [00:18<00:00, 1025.09it/s]





  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 1/10 -> Train Loss: 3.386113 | Val MAP@7: 0.071326
   -> 🎉 New best score! Model for Fold 1 saved.


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 2/10 -> Train Loss: 2.500092 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 3/10 -> Train Loss: 2.443614 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 4/10 -> Train Loss: 2.417138 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 5/10 -> Train Loss: 2.384015 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 6/10 -> Train Loss: 2.373134 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 7/10 -> Train Loss: 2.349376 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 8/10 -> Train Loss: 2.329391 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 9/10 -> Train Loss: 2.316992 | Val MAP@7: 0.071326


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 10/10 -> Train Loss: 2.295090 | Val MAP@7: 0.071326
-> Loading best model for Fold 1 (MAP@7: 0.071326) for predictions...


                                                            




  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 1/10 -> Train Loss: 3.435701 | Val MAP@7: 0.070240
   -> 🎉 New best score! Model for Fold 2 saved.


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 2/10 -> Train Loss: 2.503355 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 3/10 -> Train Loss: 2.444811 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 4/10 -> Train Loss: 2.416897 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 5/10 -> Train Loss: 2.377411 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 6/10 -> Train Loss: 2.353336 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 7/10 -> Train Loss: 2.325461 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 8/10 -> Train Loss: 2.285412 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 9/10 -> Train Loss: 2.264787 | Val MAP@7: 0.070240


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 10/10 -> Train Loss: 2.247444 | Val MAP@7: 0.070240
-> Loading best model for Fold 2 (MAP@7: 0.070240) for predictions...


                                                            




  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 1/10 -> Train Loss: 3.468970 | Val MAP@7: 0.072136
   -> 🎉 New best score! Model for Fold 3 saved.


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 2/10 -> Train Loss: 2.520699 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 3/10 -> Train Loss: 2.473070 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 4/10 -> Train Loss: 2.432280 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 5/10 -> Train Loss: 2.406925 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 6/10 -> Train Loss: 2.394448 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 7/10 -> Train Loss: 2.364064 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 8/10 -> Train Loss: 2.352107 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 9/10 -> Train Loss: 2.330311 | Val MAP@7: 0.072136


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 10/10 -> Train Loss: 2.322518 | Val MAP@7: 0.072136
-> Loading best model for Fold 3 (MAP@7: 0.072136) for predictions...


                                                            




  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 1/10 -> Train Loss: 3.656245 | Val MAP@7: 0.070717
   -> 🎉 New best score! Model for Fold 4 saved.


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 2/10 -> Train Loss: 2.506965 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 3/10 -> Train Loss: 2.457231 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 4/10 -> Train Loss: 2.427223 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 5/10 -> Train Loss: 2.389433 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 6/10 -> Train Loss: 2.374139 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 7/10 -> Train Loss: 2.355291 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 8/10 -> Train Loss: 2.337255 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 9/10 -> Train Loss: 2.317742 | Val MAP@7: 0.070717


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 10/10 -> Train Loss: 2.298141 | Val MAP@7: 0.070717
-> Loading best model for Fold 4 (MAP@7: 0.070717) for predictions...


                                                            




  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 1/10 -> Train Loss: 3.614066 | Val MAP@7: 0.072770
   -> 🎉 New best score! Model for Fold 5 saved.


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 2/10 -> Train Loss: 2.510343 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 3/10 -> Train Loss: 2.454408 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 4/10 -> Train Loss: 2.418739 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 5/10 -> Train Loss: 2.393563 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 6/10 -> Train Loss: 2.377550 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 7/10 -> Train Loss: 2.365309 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 8/10 -> Train Loss: 2.351630 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 9/10 -> Train Loss: 2.336400 | Val MAP@7: 0.072770


  ap_scores = calc_df.groupby(['id2', 'id5']).apply(ap_at_k)


Epoch 10/10 -> Train Loss: 2.322427 | Val MAP@7: 0.072770
-> Loading best model for Fold 5 (MAP@7: 0.072770) for predictions...


                                                            


--- K-Fold Training Finished. Saving final OOF and Test predictions. ---
✅ Saved 'train_4_ensemble.parquet' with OOF residual predictions.
✅ Saved 'test_4_ensemble.parquet' with averaged test residual predictions.


In [15]:
import pandas as pd
import numpy as np

# This is a placeholder for the TransformerConfig class. 
# In your notebook, this class should already be defined in a previous cell.
class TransformerConfig:
    ENSEMBLE_WEIGHT = 0.1

# =================================================================================
# 🚀 SCRIPT 3: FINAL ENSEMBLING AND SUBMISSION
# =================================================================================
# This final script combines the base LGBM predictions with the Transformer's
# residual predictions. It validates the approach on OOF data, applies the
# combination to the test set, scales the results, and generates the final
# submission file.

def run_final_ensembling():
    """
    Combines the base LGBM predictions with the averaged Transformer residual
    predictions to create the final submission file.
    """
    print("\n" + "="*80)
    print("🚀 === SCRIPT 3: FINAL RESIDUAL ENSEMBLING & SUBMISSION ===")
    print("="*80 + "\n")

    # Load base predictions and residual predictions
    # Make sure these files exist from the previous steps
    try:
        lgbm_preds = pd.read_csv('lgbm_test_preds.csv')
        transformer_preds_full = pd.read_parquet('inter/test_4_ensemble.parquet')
    except FileNotFoundError as e:
        print(f"ERROR: Could not find required input file. Make sure previous steps ran successfully.")
        print(e)
        return

    # --- FIX: Select only necessary columns to prevent column name clashes during merge ---
    transformer_preds = transformer_preds_full[['id1', 'transformer_residual']]

    # Merge predictions
    print("--- Applying final combination to test set predictions ---")
    merged_df = lgbm_preds.merge(transformer_preds, on='id1')

    # Ensemble with a weight on the residual
    config = TransformerConfig()
    merged_df['final_pred'] = merged_df['pred'] + config.ENSEMBLE_WEIGHT * merged_df['transformer_residual']

    # Scale final predictions to [0, 1] for submission
    print("   -> Scaling final predictions to the [0, 1] range for submission.")
    final_pred = merged_df['final_pred'].values
    final_pred_scaled = (final_pred - final_pred.min()) / (final_pred.max() - final_pred.min())
    merged_df['pred'] = final_pred_scaled

    # --- 4. Create Final Submission File ---
    print("\n--- Creating final submission file ---")
    
    # --- FIX: Use the 'merged_df' which now contains all necessary columns ---
    # Reorder columns to exactly match the submission template.
    submission_df = merged_df[['id1', 'id2', 'id3', 'id5', 'pred']].copy()
    
    # --- SCORING SCRIPT FIX ---
    # The submission requires id5 (as date).
    submission_df['id5'] = pd.to_datetime(submission_df['id5']).dt.date
    
    # Save to file
    output_path = 'final_residual_ensemble_submission.csv'
    # --- FIX: Save the correct DataFrame ('submission_df') ---
    submission_df.to_csv(output_path, index=False)
    
    print(f"\n🎉🎉 Final ensemble submission file created successfully: '{output_path}' 🎉🎉")
    print("Submission Head:")
    # --- FIX: Print the head of the correct DataFrame ---
    print(submission_df.head())

if __name__ == "__main__":
    # STEP 3: Ensemble the predictions, validate, and create the final submission.
    run_final_ensembling()


🚀 === SCRIPT 3: FINAL RESIDUAL ENSEMBLING & SUBMISSION ===

--- Applying final combination to test set predictions ---
   -> Scaling final predictions to the [0, 1] range for submission.

--- Creating final submission file ---

🎉🎉 Final ensemble submission file created successfully: 'final_residual_ensemble_submission.csv' 🎉🎉
Submission Head:
                                            id1      id2     id3         id5  \
0   1000064_25530_16-23_2023-11-06 12:27:38.171  1000064   25530  2023-11-06   
1   1000064_64401_16-23_2023-11-06 12:27:48.533  1000064   64401  2023-11-06   
2  1000064_260067_16-23_2023-11-06 12:28:22.141  1000064  260067  2023-11-06   
3  1000064_989989_16-23_2023-11-06 12:28:22.267  1000064  989989  2023-11-06   
4   1000064_521695_16-23_2023-11-06 12:28:22.85  1000064  521695  2023-11-06   

       pred  
0  0.624095  
1  0.646328  
2  0.593941  
3  0.125083  
4  0.285315  


In [16]:
def generate_feature_importance_report():
    """
    Loads the final trained model, extracts feature importances,
    and saves them to an Excel file as required for submission.
    """
    print("\n" + "="*80)
    print("🚀 === SCRIPT 4: Generating Feature Importance Report ===")
    print("="*80 + "\n")
    
    try:
        # Load the final trained model
        model = lgb.Booster(model_file="model/lgbm_final_ranker_model.txt")
        print("   -> Successfully loaded final trained model.")
        
        # Extract feature importance
        importance_df = pd.DataFrame({
            'feature': model.feature_name(),
            'importance_gain': model.feature_importance(importance_type='gain'),
            'importance_split': model.feature_importance(importance_type='split')
        }).sort_values(by='importance_gain', ascending=False)
        
        print(f"   -> Extracted importance for {len(importance_df)} features.")
        
        # Save to Excel
        output_path = 'feature_importance.xlsx'
        importance_df.to_excel(output_path, index=False, engine='openpyxl')
        
        print(f"\n✅ Feature importance report saved successfully to '{output_path}'")
        
    except FileNotFoundError:
        print("❌ Error: Could not find the final model file 'model/lgbm_final_ranker_model.txt'.")
        print("   -> Please ensure the model has been trained and saved before running this step.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    # STEP 4: Generate the feature importance report for submission.
    generate_feature_importance_report()


🚀 === SCRIPT 4: Generating Feature Importance Report ===

   -> Successfully loaded final trained model.
   -> Extracted importance for 3093 features.

✅ Feature importance report saved successfully to 'feature_importance.xlsx'
