# Numerai Modeling: Feature Engineering, Ensembling, and Advanced Training

This notebook demonstrates several advanced modeling techniques for the Numerai tournament:
1. **Feature Engineering**: Creating new features using UMAP, Denoising Autoencoders, Contrastive Learning (placeholder), and CTGAN.
2. **Target Exploration**: Analyzing auxiliary targets.
3. **Base Model Training**: Training LightGBM models on different targets and engineered features.
4. **Stacked Ensembling**: Combining base model predictions using a meta-model.
5. **Era-Invariant Training**: Using a PyTorch MLP with custom loss functions (correlation, era variance penalty, feature exposure penalty).
6. **Model Selection & Upload**: Choosing the final model and preparing for submission.

In [1]:
# Install dependencies
# Removed scipy version pin to potentially resolve numpy import errors
!pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle==2.2.1 seaborn umap-learn tensorflow torch ctgan tqdm

# Inline plots
%matplotlib inline

ImportError: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

## Configuration

In [None]:
import pandas as pd
import numpy as np
import json
import gc
from numerapi import NumerAPI
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, QuantileTransformer
import cloudpickle
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from tqdm.notebook import tqdm
import warnings

# Silence known-noisy warning categories (e.g., from CTGAN or TF)
for _category, _module in (
    (FutureWarning, None),
    (UserWarning, 'sklearn.preprocessing._data'),
):
    if _module is None:
        warnings.filterwarnings('ignore', category=_category)
    else:
        warnings.filterwarnings('ignore', category=_category, module=_module)

# --- Configuration ---
# Dataset release and column names
DATA_VERSION = "v5.0"
MAIN_TARGET = "target_cyrusd_20"
AUX_TARGETS = ["target_victor_20", "target_xerxes_20", "target_teager2b_20"]
TARGET_CANDIDATES = [MAIN_TARGET, *AUX_TARGETS]
ERA_COL = "era"
DATA_TYPE_COL = "data_type"
TARGET_COL = "target"  # Alias for MAIN_TARGET in original notebook
PREDICTION_COL = "prediction"

# Feature-engineering hyperparameters
UMAP_N_COMPONENTS = 50
AE_ENCODING_DIM = 64
CONTRASTIVE_EMB_DIM = 64
CTGAN_EPOCHS = 50  # kept small so the example runs quickly
AE_EPOCHS = 5      # kept small so the example runs quickly

# Stacking ensemble
N_FOLDS = 5                   # folds used for out-of-fold predictions
STACKING_MODEL_TYPE = 'LGBM'  # 'LGBM' or 'Linear'

# PyTorch MLP
MLP_EPOCHS = 5                    # kept small so the example runs quickly
MLP_BATCH_SIZE = 1024             # larger batches may train faster on GPU
MLP_LR = 0.001
VARIANCE_PENALTY_WEIGHT = 0.01    # lambda1
FEATURE_EXPOSURE_WEIGHT = 0.01    # lambda2
TOP_N_FEATURES_FOR_EXPOSURE = 50  # top-N features used in the exposure penalty

# Final-model selection flags
USE_STACKING = True  # True -> use the stacked ensemble
USE_MLP = False      # True -> use the MLP (requires PyTorch)

# Speed-ups for faster iteration
DOWNSAMPLE_TRAIN_ERAS = 4   # keep every Nth training era (e.g., 4 or 10)
DOWNSAMPLE_VALID_ERAS = 4   # keep every Nth validation era (e.g., 4 or 10)
FEATURE_SET_SIZE = "small"  # 'small', 'medium', or 'all'

# pandas display preferences
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.6f}'.format)

# Add comment about GPU potential
# For significant speedups, especially with AE, CTGAN, and MLP, ensure you are running
# in an environment with a GPU and have the necessary GPU versions of 
# TensorFlow and PyTorch installed.

OSError: dlopen(/Users/I570611/miniconda3/envs/lightgbm-env/lib/python3.11/site-packages/lightgbm/lib/lib_lightgbm.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib
  Referenced from: <8FC36893-94B8-343C-9D9F-4CCBFE81B89B> /Users/I570611/miniconda3/envs/lightgbm-env/lib/python3.11/site-packages/lightgbm/lib/lib_lightgbm.dylib
  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (mach-o file, but is an incompatible architecture (have 'arm64', need 'x86_64')), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (mach-o file, but is an incompatible architecture (have 'arm64', need 'x86_64')), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/Users/I570611/miniconda3/envs/lightgbm-env/lib/python3.11/lib-dynload/../../libomp.dylib' (no such file), '/Users/I570611/miniconda3/envs/lightgbm-env/bin/../lib/libomp.dylib' (no such file), '/usr/local/lib/libomp.dylib' (no such file), '/usr/lib/libomp.dylib' (no such file, not in dyld cache)

## 1. Feature Engineering Functions

Define functions to generate new features. Each function fits its transformer/model on the training DataFrame, adds the new feature columns to both DataFrames in place, and returns the modified DataFrames together with the new feature names and the fitted objects.

In [None]:
import umap
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from ctgan import CTGAN

# --- UMAP Feature Creation ---
def umap_feature_creation(df_train, df_transform, feature_cols, n_components=UMAP_N_COMPONENTS, random_state=42):
    """Add UMAP embedding columns to two DataFrames.

    The reducer is fitted on ``df_train[feature_cols]`` only and then used to
    transform both frames. Missing values are replaced with 0.5 and inputs are
    cast to float32 before being passed to UMAP. Both frames are modified in
    place (columns ``umap_feat_0 .. umap_feat_{n_components-1}`` are written).

    Returns:
        (df_train, df_transform, umap_feature_names, reducer)
    """
    print(f"Creating {n_components} UMAP features...")
    umap_feature_names = [f"umap_feat_{i}" for i in range(n_components)]

    def _as_umap_input(frame):
        # Same preprocessing for every frame handed to the reducer.
        return frame[feature_cols].astype(np.float32).fillna(0.5)

    # n_jobs=1 can prevent potential issues
    reducer = umap.UMAP(n_components=n_components, random_state=random_state, n_jobs=1)

    print(" Fitting UMAP on training data...")
    train_matrix = _as_umap_input(df_train)
    reducer.fit(train_matrix)

    print(" Transforming training data...")
    df_train[umap_feature_names] = reducer.transform(train_matrix)

    # Second frame (validation or live) goes through the already-fitted reducer.
    print(" Transforming second dataframe...")
    df_transform[umap_feature_names] = reducer.transform(_as_umap_input(df_transform))

    print("UMAP features created and applied.")
    return df_train, df_transform, umap_feature_names, reducer

# --- Denoising Autoencoder Feature Creation ---
def denoising_autoencoder_features(df_train, df_transform, feature_cols, encoding_dim=AE_ENCODING_DIM, noise_factor=0.1, epochs=AE_EPOCHS, batch_size=1024, random_state=None):
    """Fit a denoising autoencoder on df_train and add its encodings to both frames.

    The autoencoder is trained to reconstruct the clean training features from
    a noise-corrupted copy (Gaussian noise scaled by ``noise_factor``, clipped
    to [0, 1]). The encoder half is then used to produce ``encoding_dim``
    columns named ``ae_feat_0 .. ae_feat_{encoding_dim-1}``, written in place
    to ``df_train`` and ``df_transform``. Missing feature values are replaced
    with 0.5 before training/encoding.

    Args:
        df_train: DataFrame the autoencoder is fitted on.
        df_transform: Second DataFrame (validation or live) to encode.
        feature_cols: Input feature column names.
        encoding_dim: Width of the bottleneck layer / number of new features.
        noise_factor: Multiplier applied to the standard-normal corruption noise.
        epochs: Training epochs.
        batch_size: Batch size used for both training and encoding.
        random_state: Optional seed for the corruption noise. ``None`` keeps the
            noise unseeded. Only the noise is seeded; Keras weight
            initialisation and shuffling are not affected by this argument.

    Returns:
        (df_train, df_transform, ae_feature_names, encoder)
    """
    print(f"Creating {encoding_dim} Denoising AE features...")
    input_dim = len(feature_cols)
    train_data = df_train[feature_cols].astype(np.float32).fillna(0.5).values

    # Add noise to training data for denoising objective.
    # The noise is drawn directly as float32 so the corrupted copy is not
    # silently upcast to float64 (which would double its memory footprint).
    rng = np.random.default_rng(random_state)
    noise = rng.standard_normal(size=train_data.shape, dtype=np.float32)
    noisy_train_data = train_data + np.float32(noise_factor) * noise
    noisy_train_data = np.clip(noisy_train_data, 0., 1.)
    del noise

    # Define Autoencoder
    input_layer = keras.Input(shape=(input_dim,))
    # Simple encoder-decoder structure
    encoded = layers.Dense(encoding_dim * 2, activation='relu')(input_layer)
    encoded = layers.Dense(encoding_dim, activation='relu', name='encoder_output')(encoded) # Naming the layer
    decoded = layers.Dense(encoding_dim * 2, activation='relu')(encoded)
    decoded = layers.Dense(input_dim, activation='sigmoid')(decoded)

    autoencoder = keras.Model(input_layer, decoded)
    encoder = keras.Model(input_layer, encoded) # Separate encoder model (shares weights with autoencoder)

    autoencoder.compile(optimizer='adam', loss='mse')

    # Train Autoencoder on noisy training data to reconstruct original
    print(" Training Denoising Autoencoder...")
    autoencoder.fit(noisy_train_data, train_data,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_split=0.1,
                    verbose=0)

    # Get encoded features for training data.
    # Use the training batch size and silence the progress bar: the Keras
    # default (batch_size=32, verbose on) is slow and floods the cell output.
    print(" Encoding training data...")
    ae_features_train = encoder.predict(train_data, batch_size=batch_size, verbose=0)
    ae_feature_names = [f"ae_feat_{i}" for i in range(encoding_dim)]
    df_train[ae_feature_names] = ae_features_train

    # Get encoded features for the second dataframe
    print(" Encoding second dataframe...")
    transform_data = df_transform[feature_cols].astype(np.float32).fillna(0.5).values
    ae_features_transform = encoder.predict(transform_data, batch_size=batch_size, verbose=0)
    df_transform[ae_feature_names] = ae_features_transform

    print("AE features created and applied.")
    return df_train, df_transform, ae_feature_names, encoder

# --- Contrastive Learning Feature Creation (Placeholder) ---
def contrastive_feature_creation(df_train, df_transform, feature_cols, embedding_dim=CONTRASTIVE_EMB_DIM):
    """Placeholder for contrastive embeddings: writes uniform random columns.

    No model is trained and ``feature_cols`` is not used; each frame receives
    ``embedding_dim`` float32 columns drawn from ``np.random.rand`` (training
    frame first, then the second frame). Both frames are modified in place.

    Returns:
        (df_train, df_transform, contrastive_feature_names, None) -- the last
        element is None because there is no fitted object for this placeholder.
    """
    print(f"Creating {embedding_dim} Contrastive (placeholder) features...")
    # Proper implementation is complex and data-dependent.
    contrastive_feature_names = [f"contrastive_feat_{i}" for i in range(embedding_dim)]

    # Draw all random values first (train, then transform) before writing any column.
    random_blocks = [
        np.random.rand(len(frame), embedding_dim).astype(np.float32)
        for frame in (df_train, df_transform)
    ]
    for frame, block in zip((df_train, df_transform), random_blocks):
        frame[contrastive_feature_names] = block

    print("Contrastive (placeholder) features created.")
    return df_train, df_transform, contrastive_feature_names, None

# --- CTGAN Feature Creation ---
def synthetic_data_ctgan(df_train, df_transform, feature_cols, target_col, n_synthetic_samples_ratio=0.5, epochs=CTGAN_EPOCHS):
    """Fit CTGAN on df_train and add a "distance to synthetic mean" feature to both frames.

    Steps:
      1. Take the training rows with a non-null ``target_col``; fill missing
         feature values with 0.5.
      2. Quantile-transform the features (uniform output) and train CTGAN on
         the transformed features plus the target.
      3. Sample synthetic rows, map the features back with the inverse
         quantile transform, and compute their column-wise mean.
      4. For every row of ``df_train`` and ``df_transform`` write the Euclidean
         distance between its (0.5-filled) features and that mean into a column
         named ``dist_to_synth_mean_{target_col}`` (in place).

    Args:
        df_train: DataFrame CTGAN is fitted on.
        df_transform: Second DataFrame (validation or live).
        feature_cols: Feature column names.
        target_col: Target column included in the CTGAN training table.
        n_synthetic_samples_ratio: Synthetic sample count as a fraction of the
            number of training rows used.
        epochs: CTGAN training epochs.

    Returns:
        (df_train, df_transform, [feature_name], ctgan_model, qt, synthetic_mean).
        If CTGAN fitting fails, the frames are returned unchanged with an empty
        feature list and None for the three fitted objects.
    """
    print(f"Creating synthetic features using CTGAN (target: {target_col})...")

    data_subset_train = df_train[feature_cols + [target_col]].copy().dropna(subset=[target_col])
    data_subset_train[feature_cols] = data_subset_train[feature_cols].fillna(0.5)

    # Use QuantileTransformer
    qt = QuantileTransformer(output_distribution='uniform', random_state=42) # Uniform might work better for CTGAN
    print(" Fitting QuantileTransformer...")
    data_transformed_train = qt.fit_transform(data_subset_train[feature_cols])
    data_transformed_df_train = pd.DataFrame(data_transformed_train, columns=feature_cols, index=data_subset_train.index)
    data_transformed_df_train[target_col] = data_subset_train[target_col].values

    discrete_columns = [] # Assuming no discrete columns for now

    # Train CTGAN.
    # `epochs` is passed to the constructor: supplying it to `fit()` is
    # deprecated in ctgan and only triggers a warning there.
    print(" Training CTGAN...")
    ctgan_model = CTGAN(epochs=epochs, verbose=False)
    try:
        ctgan_model.fit(data_transformed_df_train, discrete_columns)
    except Exception as e:
        # Deliberate best-effort: CTGAN features are optional, so a training
        # failure is reported and the pipeline continues without them.
        print(f"CTGAN fitting failed: {e}. Skipping CTGAN features.")
        return df_train, df_transform, [], None, None, None # Return Nones for objects

    # Generate synthetic samples
    print(" Generating synthetic data...")
    n_synthetic_samples = int(len(data_subset_train) * n_synthetic_samples_ratio)
    synthetic_data_transformed = ctgan_model.sample(n_synthetic_samples)

    # Inverse transform synthetic features
    synthetic_features_original_scale = qt.inverse_transform(synthetic_data_transformed[feature_cols])
    synthetic_df = pd.DataFrame(synthetic_features_original_scale, columns=feature_cols)

    # Calculate mean of synthetic features
    synthetic_mean = synthetic_df.mean(axis=0).astype(np.float32)
    ctgan_feature_name = f"dist_to_synth_mean_{target_col}"

    # Calculate distance feature for training data
    # (uses all rows of df_train, including those with a null target)
    print(" Calculating distance feature for training data...")
    original_features_train = df_train[feature_cols].fillna(0.5).values.astype(np.float32)
    distances_train = np.linalg.norm(original_features_train - synthetic_mean.values, axis=1)
    df_train[ctgan_feature_name] = distances_train

    # Calculate distance feature for the second dataframe
    print(" Calculating distance feature for second dataframe...")
    original_features_transform = df_transform[feature_cols].fillna(0.5).values.astype(np.float32)
    distances_transform = np.linalg.norm(original_features_transform - synthetic_mean.values, axis=1)
    df_transform[ctgan_feature_name] = distances_transform

    print("CTGAN-derived features created and applied.")

    # Release the large intermediates before returning.
    del data_subset_train, data_transformed_train, data_transformed_df_train, synthetic_data_transformed, synthetic_features_original_scale, synthetic_df
    gc.collect()

    return df_train, df_transform, [ctgan_feature_name], ctgan_model, qt, synthetic_mean

print("Feature engineering functions defined.")

## 2. Data Loading and Initial Exploration

Load training data and explore auxiliary targets.

In [None]:
napi = NumerAPI()

# Download metadata and training data
print("Downloading metadata...")
napi.download_dataset(f"{DATA_VERSION}/features.json")
print("Downloading training data...")
napi.download_dataset(f"{DATA_VERSION}/train.parquet")

# Load feature metadata and define feature sets.
# A context manager closes the file handle deterministically
# (a bare `json.load(open(...))` leaves it open until garbage collection).
print("Loading feature metadata...")
with open(f"{DATA_VERSION}/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)
feature_sets = feature_metadata["feature_sets"]
original_feature_cols = feature_sets[FEATURE_SET_SIZE] # Use configured feature set size
target_cols = feature_metadata["targets"]

# Load training data (only the columns that are needed)
print("Loading training data...")
train = pd.read_parquet(
    f"{DATA_VERSION}/train.parquet",
    columns=[ERA_COL, DATA_TYPE_COL] + original_feature_cols + target_cols
)

# Filter for training data type (just in case)
train = train[train[DATA_TYPE_COL] == "train"].copy()
del train[DATA_TYPE_COL]
gc.collect()

# Downsample eras if configured: keep every Nth era in order of first appearance
if DOWNSAMPLE_TRAIN_ERAS > 1:
    print(f"Downsampling training data to every {DOWNSAMPLE_TRAIN_ERAS}th era...")
    train = train[train[ERA_COL].isin(train[ERA_COL].unique()[::DOWNSAMPLE_TRAIN_ERAS])].copy()
    gc.collect()

# Display target columns
print("Target columns in training data:")
display(train[[ERA_COL] + target_cols].head())

### The Main Target

The primary target for Numerai predictions is typically `target_cyrusd_20`. The `target` column is often an alias for this.

In [None]:
# Build `targets_df` (era + targets) and verify that `target` mirrors the main target.
if TARGET_COL not in train.columns:
    # No alias column loaded: keep every target that was read from disk.
    targets_df = train[[ERA_COL] + target_cols]
else:
    alias_is_main = train[TARGET_COL].equals(train[MAIN_TARGET])
    if alias_is_main:
        print(f"'{TARGET_COL}' column confirmed as alias for '{MAIN_TARGET}'.")
    else:
        warnings.warn(f"'{TARGET_COL}' column is present but not equal to '{MAIN_TARGET}'. Check data consistency.")

    # Restrict to era + main target + auxiliary targets; the alias itself is dropped.
    targets_to_keep = [ERA_COL, MAIN_TARGET, *AUX_TARGETS]
    available = [col for col in targets_to_keep if col in train.columns]
    targets_df = train[available].copy()
    # From here on `target_cols` lists only the targets that are both needed and present.
    target_cols = [col for col in (MAIN_TARGET, *AUX_TARGETS) if col in targets_df.columns]

print(f"Using '{MAIN_TARGET}' as the main target.")
print(f"Auxiliary targets being considered: {AUX_TARGETS}")

### Target Names and Correlations

Auxiliary targets represent different stock market return definitions or time horizons (`_20` vs `_60` days). They have varying correlations with the main target, which can be useful for ensembling.

In [None]:
# Tabulate every target from the metadata by base name (rows) and horizon (columns).
all_target_cols = feature_metadata["targets"]
t20s = sorted(t for t in all_target_cols if t.endswith("_20"))
t60s = sorted(t for t in all_target_cols if t.endswith("_60"))
names = sorted({
    t.replace("target_", "").replace("_20", "").replace("_60", "")
    for t in all_target_cols
})

target_map_df = pd.DataFrame(index=names, columns=['20', '60'])
for horizon, horizon_targets in (('20', t20s), ('60', t60s)):
    horizon_suffix = f"_{horizon}"
    for t in horizon_targets:
        name = t.replace("target_", "").replace(horizon_suffix, "")
        if name in target_map_df.index:
            target_map_df.loc[name, horizon] = t

print("Target names grouped by name and horizon:")
# Rows with no target at either horizon are hidden.
display(target_map_df.dropna(how='all'))

In [None]:
# Correlate each retained target with the main target and show the full target correlation matrix.
print(f"\nCorrelations of available targets with {MAIN_TARGET}:")
if MAIN_TARGET not in targets_df.columns:
    print(f"Main target {MAIN_TARGET} not found in the loaded training data.")
else:
    target_frame = targets_df[target_cols]
    corr_with_main = target_frame.corrwith(targets_df[MAIN_TARGET])
    target_corrs = corr_with_main.sort_values(ascending=False).to_frame(f"corr_with_{MAIN_TARGET}")
    display(target_corrs)

    # Heatmap of pairwise target correlations (tick labels suppressed)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        target_frame.corr(),
        cmap="coolwarm",
        xticklabels=False,
        yticklabels=False,
        ax=ax,
    )
    ax.set_title("Target Correlation Matrix")
    plt.show()

## 3. Apply Feature Engineering

Generate UMAP, Denoising Autoencoder, Contrastive (placeholder), and CTGAN features. Fit on training data, transform validation data.

In [None]:
# --- Load Validation Data Before Feature Engineering Fit ---
# Validation must be in memory so the transformers fitted on train can be applied to it.
validation_path = f"{DATA_VERSION}/validation.parquet"
print("\nDownloading validation data...")
napi.download_dataset(validation_path)
print("Loading validation data...")
validation_columns = [ERA_COL, DATA_TYPE_COL, *original_feature_cols, *target_cols]
validation = pd.read_parquet(validation_path, columns=validation_columns)
is_validation_row = validation[DATA_TYPE_COL] == "validation"
validation = validation[is_validation_row].copy()
del validation[DATA_TYPE_COL]
gc.collect()

# Optionally keep only every Nth validation era
if DOWNSAMPLE_VALID_ERAS > 1:
    print(f"Downsampling validation data to every {DOWNSAMPLE_VALID_ERAS}th era...")
    kept_validation_eras = validation[ERA_COL].unique()[::DOWNSAMPLE_VALID_ERAS]
    validation = validation[validation[ERA_COL].isin(kept_validation_eras)].copy()
    gc.collect()

# Embargo the 4 eras following the last (loaded) training era BEFORE feature engineering transform
last_train_era = int(train[ERA_COL].astype(int).max())
eras_to_embargo = [
    str(last_train_era + offset).zfill(4)
    for offset in range(1, 5)
]
is_embargoed = validation[ERA_COL].isin(eras_to_embargo)
validation = validation[~is_embargoed].copy()
print(f"Embargoed eras from validation: {eras_to_embargo}")
gc.collect()

# --- Fit Feature Engineering on Training Data & Transform Both ---
# Each helper fits on `train`, writes its new columns into both `train` and
# `validation`, and returns the new column names plus its fitted object(s).
# The names are accumulated in `engineered_feature_cols`; the fitted objects
# are kept in `fitted_transformers`.
engineered_feature_cols = []
fitted_transformers = {} # Dictionary to store fitted objects

# UMAP
train, validation, umap_feats, fitted_transformers['umap_reducer'] = umap_feature_creation(
    train, validation, original_feature_cols, n_components=UMAP_N_COMPONENTS
)
engineered_feature_cols.extend(umap_feats)
gc.collect()

# Denoising Autoencoder
train, validation, ae_feats, fitted_transformers['ae_encoder'] = denoising_autoencoder_features(
    train, validation, original_feature_cols, encoding_dim=AE_ENCODING_DIM, epochs=AE_EPOCHS
)
engineered_feature_cols.extend(ae_feats)
gc.collect()

# Contrastive Learning (Placeholder)
# The placeholder has no fitted object, so the fourth return value is discarded.
train, validation, contrastive_feats, _ = contrastive_feature_creation(
    train, validation, original_feature_cols, embedding_dim=CONTRASTIVE_EMB_DIM
)
engineered_feature_cols.extend(contrastive_feats)
gc.collect()

# CTGAN (using main target for demonstration)
# If CTGAN fitting fails, `ctgan_feats` is an empty list and the three stored
# objects are None, so the remaining code still runs without a CTGAN feature.
train, validation, ctgan_feats, fitted_transformers['ctgan_model'], fitted_transformers['qt'], fitted_transformers['synthetic_mean'] = synthetic_data_ctgan(
    train, validation, original_feature_cols, MAIN_TARGET, epochs=CTGAN_EPOCHS
)
engineered_feature_cols.extend(ctgan_feats)
gc.collect()

# Update the main feature list
# `feature_cols` (original + engineered) is what the model cells below train on.
feature_cols = original_feature_cols + engineered_feature_cols
print(f"\nTotal number of features after engineering: {len(feature_cols)}")
print(f"Engineered feature names: {engineered_feature_cols}")

print("\nTraining data with engineered features (head):")
display(train[feature_cols].head())
print("\nValidation data with engineered features (head):")
display(validation[feature_cols].head())

## 4. Base Model Training (LightGBM)

Train LightGBM models for each selected target using the original and engineered features.

In [None]:
print("Training LightGBM models on selected targets...")

# Hyperparameters shared by every base model (consider adjusting based on feature set size)
lgbm_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2**4-1,
    colsample_bytree=0.1,
    random_state=42,
    n_jobs=-1,
)

# One regressor per candidate target, keyed by target name
models = {}
for target in tqdm(TARGET_CANDIDATES, desc="Training models"):
    print(f"Training model for {target}...")
    # Rows whose value for this target is missing cannot be used for fitting
    train_target_filtered = train.dropna(subset=[target])

    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(train_target_filtered[feature_cols], train_target_filtered[target])
    models[target] = model
    gc.collect()

print("Base models trained.")

### Base Model Evaluation

Generate predictions on the validation set for each base model and evaluate their individual performance (Correlation).

In [None]:
from numerai_tools.scoring import numerai_corr

print("Generating validation predictions for base models...")
# Impute once, outside the loop: the feature matrix is identical for every model,
# so re-running `fillna` per model only copies the frame repeatedly.
validation_features = validation[feature_cols].fillna(0.5) # Handle potential NaNs introduced by FE on val
validation_preds = pd.DataFrame(index=validation.index)
for target_name, model in models.items():
    pred_col_name = f"prediction_{target_name}"
    validation_preds[pred_col_name] = model.predict(validation_features)

prediction_cols = list(validation_preds.columns)

# Merge predictions back into the validation dataframe.
# Any prediction columns left over from a previous run of this cell are dropped
# first; otherwise `join` raises on the overlapping column names when re-run.
validation = validation.drop(columns=prediction_cols, errors="ignore").join(validation_preds)

print("\nValidation predictions generated:")
display(validation[prediction_cols].head())

# Evaluate individual model correlations
print("\nEvaluating base model correlations...")
# Only rows with a main-target value and a full set of predictions are scored
validation_eval = validation.dropna(subset=[MAIN_TARGET] + prediction_cols)
if validation_eval.empty:
    print("Warning: No valid rows remaining after dropping NaNs for correlation evaluation.")
    correlations = pd.DataFrame(columns=prediction_cols)
    cumsum_corrs = pd.DataFrame(columns=prediction_cols)
else:
    # One row per era, one column per base model
    correlations = validation_eval.groupby(ERA_COL).apply(
        lambda d: numerai_corr(d[prediction_cols], d[MAIN_TARGET])
    )
    cumsum_corrs = correlations.cumsum()

    plt.figure(figsize=(10, 6))
    cumsum_corrs.plot(ax=plt.gca())
    plt.title("Cumulative Correlation of Base Model Validation Predictions")
    plt.xlabel("Era")
    plt.ylabel("Cumulative Correlation")
    plt.xticks([])
    plt.legend(title="Model Target")
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

print("\nSummary metrics for base models:")
def get_summary_metrics(scores, cumsum_scores):
    """Summarise a series of per-era scores.

    Args:
        scores: Per-era scores (e.g. one correlation value per era).
        cumsum_scores: Cumulative sum of ``scores``; used for the drawdown.

    Returns:
        dict with keys ``mean``, ``std``, ``sharpe`` (mean / std, NaN when the
        std is exactly 0) and ``max_drawdown`` (largest drop of the cumulative
        curve below its running maximum, NaN when ``cumsum_scores`` is empty).
    """
    mean = scores.mean()
    std = scores.std()

    if std != 0:
        sharpe = mean / std
    else:
        sharpe = np.nan

    # Drawdown is undefined without at least one cumulative value.
    if cumsum_scores.empty:
        max_drawdown = np.nan
    else:
        running_peak = cumsum_scores.expanding(min_periods=1).max()
        max_drawdown = (running_peak - cumsum_scores).max()

    return dict(mean=mean, std=std, sharpe=sharpe, max_drawdown=max_drawdown)

# One summary row per base-model prediction column; models that could not be
# scored (no correlation column) get an all-NaN row.
base_model_summary = {
    pred_col: (
        get_summary_metrics(correlations[pred_col], cumsum_corrs[pred_col])
        if pred_col in correlations.columns
        else {'mean': np.nan, 'std': np.nan, 'sharpe': np.nan, 'max_drawdown': np.nan}
    )
    for pred_col in prediction_cols
}

summary_df = pd.DataFrame(base_model_summary).T
display(summary_df)

## 5. Stacked Ensembling

Implement a stacked ensemble using out-of-fold predictions from the base models.

In [None]:
# Build out-of-fold (OOF) predictions on the training set: for every base-model
# target, each training row is predicted by a model that was fitted on folds
# not containing that row's era. These OOF columns are the meta-model's inputs.
print("Generating Out-of-Fold (OOF) predictions for stacking...")

# Folds are grouped by era, so all rows of an era land in the same fold.
gkf = GroupKFold(n_splits=N_FOLDS)
oof_preds = pd.DataFrame(index=train.index)

# Store OOF predictions for each base model
# (only the keys of `models` are used here; a fresh model is fitted per fold)
for target_name, model in tqdm(models.items(), desc="Generating OOF preds"):
    print(f" Generating OOF for {target_name}...")
    # Starts as all-NaN; rows with a missing target are never predicted and stay NaN.
    oof_preds_target = pd.Series(index=train.index, dtype=np.float32)
    # Use only non-NaN target rows for training folds, but keep original index for alignment
    train_target_filtered = train.dropna(subset=[target_name]) 
    
    for fold, (train_idx_filtered, val_idx_filtered) in enumerate(gkf.split(train_target_filtered[feature_cols], train_target_filtered[target_name], groups=train_target_filtered[ERA_COL])):
        # Map filtered indices back to original DataFrame indices
        # (gkf.split yields positions within the filtered frame, not index labels)
        train_index_orig = train_target_filtered.iloc[train_idx_filtered].index
        val_index_orig = train_target_filtered.iloc[val_idx_filtered].index

        X_train_fold, X_val_fold = train.loc[train_index_orig, feature_cols], train.loc[val_index_orig, feature_cols]
        y_train_fold = train.loc[train_index_orig, target_name]
        
        # NOTE(review): fold models use 500 trees while the base models that
        # produce the validation predictions use 2000 at the same learning rate,
        # so OOF and validation predictions may differ in scale -- confirm this
        # is acceptable for the meta-model.
        fold_model = lgb.LGBMRegressor(
            n_estimators=500, # Fewer estimators for fold training
            learning_rate=0.01,
            max_depth=5,
            num_leaves=2**4-1,
            colsample_bytree=0.1,
            random_state=fold, # Vary random state per fold
            n_jobs=-1
        )
        fold_model.fit(X_train_fold, y_train_fold)
        
        # Store predictions on the validation part of the fold, using original train index
        oof_preds_target.loc[val_index_orig] = fold_model.predict(X_val_fold)
        
    oof_preds[f"oof_{target_name}"] = oof_preds_target
    gc.collect()

print("OOF predictions generated.")
display(oof_preds.head())

# Prepare training data for the meta-model: OOF predictions as inputs, main target as label
meta_train_features = oof_preds.copy()
meta_train_features[ERA_COL] = train[ERA_COL] # Add era for potential use in meta-model
meta_train_target = train[MAIN_TARGET]

# Drop rows where OOF preds or target are NaN
valid_indices = meta_train_target.notna() & meta_train_features.notna().all(axis=1)
meta_train_features = meta_train_features.loc[valid_indices].copy()
meta_train_target = meta_train_target.loc[valid_indices].copy()

# Only the OOF columns are fed to the meta-model (the era column is excluded)
oof_feature_cols = list(oof_preds.columns)

# Train the meta-model
print(f"\nTraining meta-model ({STACKING_MODEL_TYPE})...")
if STACKING_MODEL_TYPE == 'LGBM':
    meta_model = lgb.LGBMRegressor(
        n_estimators=500,
        learning_rate=0.01,
        max_depth=3,
        num_leaves=2**3-1,
        colsample_bytree=0.8, # Use more features for meta-model
        random_state=42,
        n_jobs=-1
    )
    meta_model.fit(meta_train_features[oof_feature_cols], meta_train_target)
elif STACKING_MODEL_TYPE == 'Linear':
    scaler = StandardScaler() # Define scaler here
    meta_train_features_scaled = scaler.fit_transform(meta_train_features[oof_feature_cols])
    meta_model = Ridge(alpha=1.0, random_state=42)
    meta_model.fit(meta_train_features_scaled, meta_train_target)
    fitted_transformers['stacking_scaler'] = scaler # Store the scaler
else:
    raise ValueError("Invalid STACKING_MODEL_TYPE")

fitted_transformers['meta_model'] = meta_model # Store the meta-model
print("Meta-model trained.")

# Generate predictions on the validation set using the stacking pipeline
print("\nGenerating stacked predictions on validation set...")
# Map each base-model validation prediction to its OOF counterpart BY NAME
# (`prediction_<target>` -> `oof_<target>`), then put the columns in the order
# the meta-model was trained on. A positional rename would silently feed the
# wrong model's predictions into a meta-feature if the two column lists were
# ever ordered differently.
prediction_to_oof = {f"prediction_{target_name}": f"oof_{target_name}" for target_name in models}
meta_val_features = (
    validation[list(prediction_to_oof)]
    .rename(columns=prediction_to_oof)[oof_feature_cols]
    .copy()
)

# Handle potential NaNs in validation base predictions before meta-prediction
meta_val_features = meta_val_features.fillna(meta_val_features.mean()) # Simple mean imputation

if STACKING_MODEL_TYPE == 'Linear':
    scaler_val = fitted_transformers['stacking_scaler'] # Use fitted scaler
    meta_val_features_scaled = scaler_val.transform(meta_val_features)
    stacked_preds = meta_model.predict(meta_val_features_scaled)
else: # LGBM
    stacked_preds = meta_model.predict(meta_val_features)

validation["prediction_stacked"] = stacked_preds

print("Stacked predictions generated.")
display(validation[["prediction_stacked"]].head())

### Stacked Ensemble Evaluation

Evaluate the performance of the stacked ensemble.

In [None]:
print("Evaluating stacked ensemble performance...")

# Score the base models and the stacked ensemble side by side
evaluation_cols_stacking = [*prediction_cols, "prediction_stacked"]

# Keep only rows with a main-target value and every prediction present
validation_eval_stacking = validation.dropna(subset=[MAIN_TARGET] + evaluation_cols_stacking)

if not validation_eval_stacking.empty:
    # Per-era correlation of every prediction column with the main target
    stacked_correlations = validation_eval_stacking.groupby(ERA_COL).apply(
        lambda d: numerai_corr(d[evaluation_cols_stacking], d[MAIN_TARGET])
    )
    stacked_cumsum_corrs = stacked_correlations.cumsum()

    fig, ax = plt.subplots(figsize=(10, 6))
    stacked_cumsum_corrs.plot(ax=ax)
    ax.set_title("Cumulative Correlation including Stacked Ensemble")
    ax.set_xlabel("Era")
    ax.set_ylabel("Cumulative Correlation")
    ax.set_xticks([])
    ax.legend(title="Model")
    ax.grid(True, linestyle='--', alpha=0.5)
    plt.show()
else:
    print("Warning: No valid rows remaining after dropping NaNs for stacking evaluation.")
    stacked_correlations = pd.DataFrame(columns=evaluation_cols_stacking)
    stacked_cumsum_corrs = pd.DataFrame(columns=evaluation_cols_stacking)

print("\nSummary metrics including Stacked Ensemble:")
# Columns without a correlation series get an all-NaN summary row
stacked_summary = {
    pred_col: (
        get_summary_metrics(stacked_correlations[pred_col], stacked_cumsum_corrs[pred_col])
        if pred_col in stacked_correlations.columns
        else {'mean': np.nan, 'std': np.nan, 'sharpe': np.nan, 'max_drawdown': np.nan}
    )
    for pred_col in evaluation_cols_stacking
}

stacked_summary_df = pd.DataFrame(stacked_summary).T
display(stacked_summary_df)

## 6. Era-Invariant Training (PyTorch MLP Option)

Define and train a PyTorch MLP with a custom loss function incorporating negative Pearson correlation, era correlation variance penalty, and feature exposure penalty.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from scipy.stats import rankdata

# --- Define MLP Architecture ---
class SimpleMLP(nn.Module):
    """Feed-forward regressor: input -> 256 -> 128 -> 1 with a sigmoid output.

    Every linear layer is preceded by batch normalisation; the two hidden
    layers use ReLU followed by dropout (p=0.3). The final sigmoid keeps the
    output between 0 and 1.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_1, hidden_2 = 256, 128
        stack = [
            # input normalisation + first hidden layer
            nn.BatchNorm1d(input_dim),
            nn.Linear(input_dim, hidden_1),
            nn.ReLU(),
            nn.Dropout(0.3),
            # second hidden layer
            nn.BatchNorm1d(hidden_1),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Dropout(0.3),
            # output head
            nn.BatchNorm1d(hidden_2),
            nn.Linear(hidden_2, 1),
            nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run a batch of feature rows through the layer stack."""
        return self.layers(x)

# --- Define Custom Loss Functions ---
def pearson_corr(preds, target):
    """Differentiable Pearson correlation between two tensors.

    Both inputs are flattened first, so ``(N,)`` and ``(N, 1)`` tensors can be
    mixed freely. Covariance and both variances use the same 1/N
    normalisation, so perfectly (anti-)correlated inputs score +1 / -1
    regardless of sample size. Mixing a 1/N covariance with the 1/(N-1)
    default of ``torch.std`` would shrink every value by (N-1)/N, which is
    severe for the small per-era slices used by the era variance penalty.

    Args:
        preds: Tensor of predictions.
        target: Tensor of targets with the same number of elements.

    Returns:
        A 0-dim tensor. It is 0.0 when either input is constant, and NaN
        results (e.g. from NaN or empty inputs) are replaced by 0.0.
    """
    preds = preds.reshape(-1)
    target = target.reshape(-1)

    preds_centered = preds - torch.mean(preds)
    target_centered = target - torch.mean(target)

    cov = torch.mean(preds_centered * target_centered)
    preds_var = torch.mean(preds_centered ** 2)
    target_var = torch.mean(target_centered ** 2)

    # Keeping epsilon inside the square root bounds the denominator away from
    # zero and keeps the gradient of the square root finite for constant input.
    epsilon = 1e-6
    denom = torch.sqrt(preds_var * target_var + epsilon ** 2)
    corr = cov / denom

    # If correlation is NaN, return 0
    return torch.nan_to_num(corr, nan=0.0)

def era_correlation_variance_penalty(preds, target, eras):
    """Variance of the per-era prediction/target correlations.

    Eras with fewer than two rows are ignored. When fewer than two usable
    correlations remain, a zero tensor on ``preds``' device is returned.
    The variance is computed by ``torch.var`` with its default settings.
    """
    per_era_corrs = []
    for era_id in torch.unique(eras):
        in_era = eras == era_id
        era_preds, era_target = preds[in_era], target[in_era]
        # A correlation needs at least two observations.
        if len(era_preds) <= 1:
            continue
        per_era_corrs.append(pearson_corr(era_preds, era_target))

    if len(per_era_corrs) <= 1:
        return torch.tensor(0.0, device=preds.device)

    stacked = torch.stack(per_era_corrs)
    # Discard NaN entries before taking the variance.
    usable = stacked[~torch.isnan(stacked)]
    if len(usable) <= 1:
        return torch.tensor(0.0, device=preds.device)
    return torch.var(usable)

def feature_exposure_penalty(preds, features):
    """Mean squared correlation between the predictions and each feature column.

    Columns whose standard deviation is not above 1e-6, and columns whose
    correlation comes out NaN, are left out of the average. If no column
    contributes, a zero tensor on ``preds``' device is returned.
    """
    flat_preds = preds.squeeze()
    squared_corrs = []

    for col_idx in range(features.shape[1]):
        column = features[:, col_idx]
        # Skip columns without measurable variance (this also skips a NaN std).
        if not (torch.std(column) > 1e-6):
            continue
        col_corr = pearson_corr(flat_preds, column)
        if torch.isnan(col_corr):
            continue
        squared_corrs.append(col_corr ** 2)

    if not squared_corrs:
        return torch.tensor(0.0, device=preds.device)
    return torch.mean(torch.stack(squared_corrs))

# --- Training Loop ---
def train_mlp(train_df, feature_cols, target_col, era_col, original_feature_cols, top_n_features=TOP_N_FEATURES_FOR_EXPOSURE):
    """Train a ``SimpleMLP`` on ``train_df`` with the composite correlation loss.

    Per-batch loss::

        -pearson_corr(preds, target)
        + VARIANCE_PENALTY_WEIGHT * era_correlation_variance_penalty(preds, target, eras)
        + FEATURE_EXPOSURE_WEIGHT * feature_exposure_penalty(preds, top_features)

    Args:
        train_df: DataFrame holding ``feature_cols``, ``original_feature_cols``,
            ``target_col`` and ``era_col``. Rows with a missing target are dropped.
        feature_cols: Model input columns; missing values are filled with 0.5.
        target_col: Name of the target column.
        era_col: Name of the era column; its values must be castable to ``int``.
        original_feature_cols: Candidate columns for the exposure penalty.
        top_n_features: Number of ``original_feature_cols`` (ranked by absolute
            correlation with the target) used in the exposure penalty.

    Returns:
        The trained model, moved to the CPU. The same object is also stored in
        ``fitted_transformers['mlp_model']``.
    """
    print("\nTraining PyTorch MLP with custom loss...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Prepare data
    train_target_filtered = train_df.dropna(subset=[target_col])
    features = torch.tensor(train_target_filtered[feature_cols].fillna(0.5).values, dtype=torch.float32).to(device)
    target = torch.tensor(train_target_filtered[target_col].values, dtype=torch.float32).unsqueeze(1).to(device)
    eras = torch.tensor(train_target_filtered[era_col].astype(int).values, dtype=torch.long).to(device)

    # Select top N original features for feature exposure penalty (based on correlation with target)
    feature_corrs = train_target_filtered[original_feature_cols].corrwith(train_target_filtered[target_col])
    top_feature_names = feature_corrs.abs().nlargest(top_n_features).index
    top_features_tensor = torch.tensor(train_target_filtered[top_feature_names].fillna(0.5).values, dtype=torch.float32).to(device)
    print(f"Using top {len(top_feature_names)} original features for exposure penalty.")

    dataset = TensorDataset(features, target, eras, top_features_tensor)
    # BatchNorm1d raises in training mode when a batch holds a single row, so a
    # trailing batch of exactly one sample is dropped. Any other incomplete
    # trailing batch is still used.
    drop_singleton_tail = len(dataset) % MLP_BATCH_SIZE == 1
    dataloader = DataLoader(dataset, batch_size=MLP_BATCH_SIZE, shuffle=True, drop_last=drop_singleton_tail)

    # Initialize model, optimizer, loss weights
    input_dim = len(feature_cols)
    mlp_model = SimpleMLP(input_dim).to(device)
    optimizer = optim.Adam(mlp_model.parameters(), lr=MLP_LR)
    lambda1 = VARIANCE_PENALTY_WEIGHT
    lambda2 = FEATURE_EXPOSURE_WEIGHT

    # Training
    for epoch in tqdm(range(MLP_EPOCHS), desc="Training MLP"):
        epoch_loss = 0.0
        batches_used = 0
        batches_skipped = 0
        mlp_model.train()
        for batch_features, batch_target, batch_eras, batch_top_features in dataloader:
            optimizer.zero_grad()
            preds = mlp_model(batch_features)

            # Calculate loss components
            corr_loss = -pearson_corr(preds, batch_target)
            var_penalty = era_correlation_variance_penalty(preds, batch_target, batch_eras)
            exposure_penalty = feature_exposure_penalty(preds, batch_top_features)

            # Combine losses
            total_loss = corr_loss + lambda1 * var_penalty + lambda2 * exposure_penalty

            if torch.isnan(total_loss):
                # Skip backpropagation for this batch, but keep count so the
                # problem is reported instead of silently hidden.
                batches_skipped += 1
                continue

            total_loss.backward()
            optimizer.step()
            epoch_loss += total_loss.item()
            batches_used += 1

        # Average over the batches that actually contributed to epoch_loss.
        avg_epoch_loss = epoch_loss / batches_used if batches_used > 0 else 0
        print(f"Epoch [{epoch+1}/{MLP_EPOCHS}], Loss: {avg_epoch_loss:.6f}")
        if batches_skipped:
            print(f"Warning: skipped {batches_skipped} batch(es) with NaN loss in epoch {epoch+1}.")

    print("MLP training finished.")
    # Move the model back to the CPU once; it is both stored and returned.
    mlp_model = mlp_model.to('cpu')
    fitted_transformers['mlp_model'] = mlp_model
    return mlp_model

# --- Control Flow for Model Training ---
mlp_model = None  # Stays None unless MLP training completes successfully
if USE_MLP:
    try:
        # An ImportError raised here is reported by the first handler below.
        import torch
        mlp_model = train_mlp(train, feature_cols, MAIN_TARGET, ERA_COL, original_feature_cols)

        # Score the validation set with the freshly trained network.
        print("Generating MLP predictions on validation set...")
        mlp_model.eval()
        with torch.no_grad():
            # Missing feature values are filled with 0.5, as in training.
            val_inputs = validation[feature_cols].fillna(0.5).values
            val_features_tensor = torch.tensor(val_inputs, dtype=torch.float32)
            mlp_preds = mlp_model(val_features_tensor).numpy().squeeze()
        validation["prediction_mlp"] = mlp_preds
        print("MLP predictions generated.")
        display(validation[["prediction_mlp"]].head())

    except ImportError:
        print("PyTorch not found. Skipping MLP training. Set USE_MLP=False or install PyTorch.")
        USE_MLP = False  # No MLP for the rest of the notebook
    except Exception as exc:
        print(f"An error occurred during MLP training or prediction: {exc}")
        # Reset both so later cells fall back to another model.
        mlp_model = None
        USE_MLP = False

# Pick the column that becomes "prediction_final": MLP first, then the stacked
# ensemble, otherwise the base model trained on the main target.
base_prediction_col = f"prediction_{MAIN_TARGET}"
if USE_MLP and mlp_model:
    print("Using MLP Ensemble based on configuration.")
    final_source_col = "prediction_mlp"
elif USE_STACKING:
    print("Using Stacked Ensemble based on configuration.")
    if "prediction_stacked" in validation.columns:
        final_source_col = "prediction_stacked"
    else:
        print("Warning: Stacked predictions not found. Defaulting to main target model.")
        final_source_col = base_prediction_col
else:
    print("Skipping Stacking and MLP training based on configuration.")
    print("Using the base model for the main target as the final prediction.")
    final_source_col = base_prediction_col

validation["prediction_final"] = validation[final_source_col]

## 7. Final Model Evaluation

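Evaluate the chosen final model (the MLP, the Stacked Ensemble, or, when neither is available, the base model for the main target) and compare it with the other prediction columns.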

In [None]:
print("Evaluating final model performance...")

# Work out which prediction column is the final one, only accepting a choice
# whose column is actually present in `validation`.
main_pred_col = f"prediction_{MAIN_TARGET}"
mlp_ready = bool(USE_MLP and mlp_model and "prediction_mlp" in validation.columns)
stacking_ready = bool(USE_STACKING and "prediction_stacked" in validation.columns)

if mlp_ready:
    final_pred_col = "prediction_mlp"
    comparison_cols = [main_pred_col, final_pred_col]
elif stacking_ready:
    final_pred_col = "prediction_stacked"
    comparison_cols = prediction_cols + [final_pred_col]
else:
    # Fallback: the base model for the main target, which must exist.
    final_pred_col = main_pred_col
    comparison_cols = [final_pred_col]
    if final_pred_col not in validation.columns:
        raise ValueError("Could not find a valid prediction column for final evaluation.")

print(f"Evaluating column: {final_pred_col}")

# Keep only the comparison columns that are present in the validation data
existing_comparison_cols = [name for name in comparison_cols if name in validation.columns]
if len(existing_comparison_cols) == 0:
    raise ValueError("No valid columns found for final comparison evaluation.")

# Rows need a non-missing target and non-missing predictions to be scored
validation_eval_final = validation.dropna(subset=[MAIN_TARGET, *existing_comparison_cols])

if not validation_eval_final.empty:
    # Per-era correlation of every comparison column with the main target
    final_correlations = validation_eval_final.groupby(ERA_COL).apply(
        lambda era_df: numerai_corr(era_df[existing_comparison_cols], era_df[MAIN_TARGET])
    )
    final_cumsum_corrs = final_correlations.cumsum()

    fig, ax = plt.subplots(figsize=(10, 6))
    final_cumsum_corrs.plot(ax=ax)
    ax.set_title(f"Cumulative Correlation of Final Model ({final_pred_col}) vs Others")
    ax.set_xlabel("Era")
    ax.set_ylabel("Cumulative Correlation")
    ax.set_xticks([])
    ax.legend(title="Model")
    ax.grid(True, linestyle='--', alpha=0.5)
    plt.show()
else:
    print("Warning: No valid rows remaining after dropping NaNs for final evaluation.")
    # Empty frames keep the summary step below working.
    final_correlations = pd.DataFrame(columns=existing_comparison_cols)
    final_cumsum_corrs = pd.DataFrame(columns=existing_comparison_cols)

print("\nFinal Summary Metrics:")

# One metrics entry per compared column. Columns that are absent from
# `final_correlations` get an all-NaN placeholder entry instead.
final_summary = {}
for pred_col in existing_comparison_cols:
    if pred_col not in final_correlations.columns:
        final_summary[pred_col] = dict.fromkeys(('mean', 'std', 'sharpe', 'max_drawdown'), np.nan)
        continue
    final_summary[pred_col] = get_summary_metrics(
        final_correlations[pred_col],
        final_cumsum_corrs[pred_col],
    )

# Transpose so that each prediction column becomes a row of the summary table.
final_summary_df = pd.DataFrame(final_summary).T
display(final_summary_df)

## 8. Model Upload

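Define the final prediction function for the selected model (MLP, Stacking, or the base main-target model as a fallback) and prepare it for upload. The function applies the *fitted* feature engineering objects, passed in through a `dependencies` dictionary, to the live data before predicting.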

In [None]:
# --- Define Final Prediction Function ---
# This version assumes the necessary fitted objects are passed via the dependencies dictionary
def predict_final(live_features: pd.DataFrame, dependencies: dict) -> pd.DataFrame:
    """Score live data with the configured final model and return percentile ranks.

    The fitted feature-engineering objects found in ``dependencies`` are applied
    to ``live_features`` first. One model is then used, in this order of
    precedence: the MLP (``USE_MLP`` set and a model present), the stacked
    ensemble (``USE_STACKING`` set and a meta-model present), or the base model
    stored under ``MAIN_TARGET``.

    Args:
        live_features: Live data containing at least the original feature columns.
        dependencies: Fitted objects, column lists and flags. The entries read
            with ``[...]`` below are required; those read with ``.get`` are optional.

    Returns:
        A one-column DataFrame (named by ``dependencies['PREDICTION_COL']``)
        indexed like ``live_features`` and holding percentile ranks.
    """
    deps = dependencies

    # Required entries
    original_feature_cols = deps['original_feature_cols']
    feature_cols = deps['feature_cols']  # full list, engineered features included
    models = deps['models']              # base models keyed by target name
    use_mlp = deps['USE_MLP']
    use_stacking = deps['USE_STACKING']
    stacking_type = deps['STACKING_MODEL_TYPE']
    main_target = deps['MAIN_TARGET']
    prediction_col = deps['PREDICTION_COL']

    # Optional fitted objects
    umap_reducer = deps.get('umap_reducer')
    ae_encoder = deps.get('ae_encoder')
    qt = deps.get('qt')  # loaded for completeness; not used by any step below
    synthetic_mean = deps.get('synthetic_mean')
    meta_model = deps.get('meta_model')
    scaler = deps.get('stacking_scaler')
    mlp_net = deps.get('mlp_model')

    # Names of the engineered columns produced by each step
    ae_cols = deps.get('ae_feats', [])
    umap_cols = deps.get('umap_feats', [])
    contrastive_cols = deps.get('contrastive_feats', [])
    ctgan_cols = deps.get('ctgan_feats', [])

    # --- Feature engineering on the live data ---
    print("Applying feature engineering to live data...")
    engineered = live_features.copy()
    base_inputs = engineered[original_feature_cols].astype(np.float32).fillna(0.5)

    if umap_reducer and umap_cols:
        print(" Applying UMAP...")
        engineered[umap_cols] = umap_reducer.transform(base_inputs)

    if ae_encoder and ae_cols:
        print(" Applying AE...")
        engineered[ae_cols] = ae_encoder.predict(base_inputs.values)

    if contrastive_cols:
        # Placeholder step: the columns are filled with uniform random numbers.
        print(" Applying Contrastive (placeholder)...")
        noise_shape = (len(engineered), len(contrastive_cols))
        engineered[contrastive_cols] = np.random.rand(*noise_shape).astype(np.float32)

    if ctgan_cols and synthetic_mean is not None:
        print(" Applying CTGAN distance feature...")
        offsets = base_inputs.values - synthetic_mean.values
        engineered[ctgan_cols[0]] = np.linalg.norm(offsets, axis=1)

    print("Feature engineering applied to live data.")

    # Any model input column that is still missing is created and set to 0.5.
    for col in feature_cols:
        if col in engineered.columns:
            continue
        print(f"Warning: Feature column '{col}' missing in live data. Filling with 0.5.")
        engineered[col] = 0.5

    model_inputs = engineered[feature_cols].fillna(0.5)

    # --- Prediction ---
    if use_mlp and mlp_net:
        print("Generating predictions using MLP model...")
        mlp_net.eval()
        with torch.no_grad():
            input_tensor = torch.tensor(model_inputs.values, dtype=torch.float32)
            raw_scores = mlp_net(input_tensor).numpy().squeeze()

    elif use_stacking and meta_model:
        print("Generating predictions using Stacked Ensemble...")
        base_preds = pd.DataFrame(index=live_features.index)
        oof_cols = [f"oof_{name}" for name in models.keys()]
        for name, base_model in models.items():
            base_preds[f"oof_{name}"] = base_model.predict(model_inputs)

        # Fill any missing base prediction with that column's mean.
        base_preds = base_preds.fillna(base_preds.mean())

        if stacking_type == 'Linear' and scaler:
            raw_scores = meta_model.predict(scaler.transform(base_preds[oof_cols]))
        else:
            raw_scores = meta_model.predict(base_preds[oof_cols])

    else:
        print(f"Generating predictions using base model for {main_target}...")
        raw_scores = models[main_target].predict(model_inputs)

    # Rank predictions for submission
    submission_df = pd.DataFrame({'prediction': raw_scores}, index=live_features.index)
    ranked = submission_df['prediction'].rank(pct=True, method="first")
    return ranked.to_frame(prediction_col)

In [None]:
# --- Quick Test on Live Data ---
print("Downloading live features for testing...")
live_path = f"{DATA_VERSION}/live.parquet"
napi.download_dataset(live_path)
live_features = pd.read_parquet(live_path, columns=original_feature_cols)

# Fitted objects are looked up by name; a missing one is passed on as None.
fitted_object_names = (
    'umap_reducer',
    'ae_encoder',
    'qt',
    'synthetic_mean',
    'meta_model',
    'stacking_scaler',
    'mlp_model',
)

# Dependencies dictionary handed to predict_final for this smoke test
test_dependencies = {
    'models': models,
    **{name: fitted_transformers.get(name) for name in fitted_object_names},
    # Column lists
    'original_feature_cols': original_feature_cols,
    'engineered_feature_cols': engineered_feature_cols,
    'feature_cols': feature_cols,
    'ae_feats': ae_feats,
    'umap_feats': umap_feats,
    'contrastive_feats': contrastive_feats,
    'ctgan_feats': ctgan_feats,
    # Feature-engineering sizes
    'UMAP_N_COMPONENTS': UMAP_N_COMPONENTS,
    'AE_ENCODING_DIM': AE_ENCODING_DIM,
    'CONTRASTIVE_EMB_DIM': CONTRASTIVE_EMB_DIM,
    # Model-selection flags and naming
    'USE_MLP': USE_MLP,
    'USE_STACKING': USE_STACKING,
    'STACKING_MODEL_TYPE': STACKING_MODEL_TYPE,
    'MAIN_TARGET': MAIN_TARGET,
    'PREDICTION_COL': PREDICTION_COL,
}

# Run the final prediction function end to end on the live features
final_predictions = predict_final(live_features, test_dependencies)

print("\nSample of final predictions:")
display(final_predictions.head())

In [None]:
# --- Pickle the Prediction Function and Dependencies ---
print("Pickling the prediction function...")
try:
    # The prediction function together with everything it needs at inference time
    pickle_payload = {
        'predict_fn': predict_final,
        'dependencies': {
            'models': models, # Base LGBM models
            'umap_reducer': fitted_transformers.get('umap_reducer'),
            'ae_encoder': fitted_transformers.get('ae_encoder'),
            'qt': fitted_transformers.get('qt'), # QuantileTransformer for CTGAN
            'synthetic_mean': fitted_transformers.get('synthetic_mean'), # Mean for CTGAN distance
            'meta_model': fitted_transformers.get('meta_model'), # Stacking meta-model
            'stacking_scaler': fitted_transformers.get('stacking_scaler'), # Scaler for linear stacking
            'mlp_model': fitted_transformers.get('mlp_model'), # PyTorch MLP model
            'original_feature_cols': original_feature_cols,
            'engineered_feature_cols': engineered_feature_cols,
            'feature_cols': feature_cols,
            'ae_feats': ae_feats, # List of AE feature names
            'umap_feats': umap_feats, # List of UMAP feature names
            'contrastive_feats': contrastive_feats, # List of Contrastive feature names
            'ctgan_feats': ctgan_feats, # List of CTGAN feature names
            'UMAP_N_COMPONENTS': UMAP_N_COMPONENTS,
            'AE_ENCODING_DIM': AE_ENCODING_DIM,
            'CONTRASTIVE_EMB_DIM': CONTRASTIVE_EMB_DIM,
            'USE_MLP': USE_MLP,
            'USE_STACKING': USE_STACKING,
            'STACKING_MODEL_TYPE': STACKING_MODEL_TYPE,
            'MAIN_TARGET': MAIN_TARGET,
            'PREDICTION_COL': PREDICTION_COL
        }
    }

    # Numerai model uploads expect the pickle to contain a callable that takes
    # the live features (optionally followed by the live benchmark models) and
    # returns the prediction frame. A plain dict cannot be called, so the
    # payload is bound into a wrapper and the wrapper is what gets pickled.
    def predict_for_upload(live_features, live_benchmark_models=None):
        """Entry point for the uploaded model; the benchmark models are not used."""
        return pickle_payload['predict_fn'](live_features, pickle_payload['dependencies'])

    # Third-party libraries (umap, tensorflow, torch, ctgan) are deliberately NOT
    # passed to cloudpickle.register_pickle_by_value. That switch is meant for
    # modules that cannot be imported where the pickle is loaded, so installed
    # libraries stay pickled by reference. Code defined in this notebook is
    # already pickled by value.

    # Pickle the callable (this pulls in the payload it references)
    with open("predict_final_model.pkl", "wb") as f:
        cloudpickle.dump(predict_for_upload, f)
    print("Prediction function and dependencies pickled successfully to predict_final_model.pkl")

except NameError as e:
    print(f"Pickling failed: A required object might not be defined. Error: {e}")
    print("Ensure all models and transformers used in 'predict_final' are trained and available globally or passed correctly.")
except Exception as e:
     print(f"An unexpected error occurred during pickling: {e}")

In [None]:
# Offer the pickle as a browser download when running inside Google Colab
try:
    from google.colab import files as colab_files
    colab_files.download('predict_final_model.pkl')
except ImportError:
    # google.colab is not importable here, so there is nothing to download through.
    print("Skipping download (not in Colab environment).")
except Exception as exc:
    print(f"File download failed: {exc}")

## 9. Conclusion

This notebook demonstrated adding feature engineering, stacked ensembling, and an optional era-invariant MLP training pipeline to the original target ensembling notebook.

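Remember to choose the model you want to submit by setting the `USE_MLP` and `USE_STACKING` flags before pickling and uploading `predict_final_model.pkl` to [numer.ai](https://numer.ai). `USE_MLP` takes precedence when a trained MLP is available, `USE_STACKING` is considered next, and with both disabled the base model for the main target is submitted.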