In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found beneath it.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    full_paths = [os.path.join(folder, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

In [2]:
"""
Fixed Kaggle Submission for Hull Tactical Market Prediction
Handles sequential test data with historical context properly
"""

import os
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.nn as nn
from collections import deque
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import kaggle_evaluation.default_inference_server


# ============================================================================
# CONSTANTS & CONFIGURATION
# ============================================================================
# --- Sequence / network shape -------------------------------------------------
SEQUENCE_LENGTH = 60   # Timesteps buffered before the LSTM is queried
INPUT_DIM = 49         # Placeholder only: load_artifacts() recomputes and overwrites it
HIDDEN_SIZE_1 = 128
HIDDEN_SIZE_2 = 64
HIDDEN_SIZE_3 = 32
MLP_OUTPUT_DIM = 1
DROPOUT = 0.2

# --- Allocation bounds --------------------------------------------------------
MIN_INVESTMENT = 0
MAX_INVESTMENT = 1.5

# --- Width of each feature group fed to the model -----------------------------
CONTINUOUS_COLS = 40   # Number of SVD components kept for the continuous features
MISSING_COLS = 5       # Number of SVD components kept for the missing-value flags
BINARY_COLS = 10       # NOTE(review): not referenced elsewhere in this cell; INPUT_DIM uses len(binary_cols)
LAG_COLS = 3
FOURIER_COLS = 2

# --- Allocation parameters ----------------------------------------------------
ALLOCATION_LOOKBACK = 20  # Use last N predictions for allocation calculation
ALLOCATION_SPAN = 10      # EWM span for smoothing allocations

# --- Model weights location ---------------------------------------------------
BASE_MODEL_URL = '/kaggle/input/hull-lstm-final/pytorch/default/2'
MODEL_URL = os.path.join(BASE_MODEL_URL, 'best_lstm_model.pth')

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
class LSTM_Architecture(nn.Module):
    """Three stacked single-layer LSTMs followed by a small MLP head.

    Each LSTM output is layer-normalised and passed through a shared
    ``nn.Dropout`` before entering the next LSTM. Only the last timestep of
    the third LSTM feeds the head (``Linear -> Tanh -> Linear``).

    Args:
        input_dim: Number of features per timestep.
        hidden_size_1: Hidden width of the first LSTM.
        hidden_size_2: Hidden width of the second LSTM.
        hidden_size_3: Hidden width of the third LSTM.
        mlp_output_dim: Width of the final linear layer.
        dropout: Probability used by the dropout applied after each LayerNorm.
    """

    def __init__(self, input_dim, hidden_size_1=128, hidden_size_2=64,
                 hidden_size_3=32, mlp_output_dim=1, dropout=0.2):
        super().__init__()

        self.input_size = input_dim
        self.hidden_sizes = [hidden_size_1, hidden_size_2, hidden_size_3]
        self.output_size = mlp_output_dim

        self.dropout_layer = nn.Dropout(dropout)

        # nn.LSTM's own `dropout` argument only acts between stacked layers, so
        # with num_layers=1 it does nothing except raise a UserWarning. It is
        # therefore omitted; regularisation comes from `dropout_layer` instead.
        # Parameter names (and so state_dict keys) are unaffected.
        self.lstm1 = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_sizes[0],
                             num_layers=1, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=self.hidden_sizes[0], hidden_size=self.hidden_sizes[1],
                             num_layers=1, batch_first=True)
        self.lstm3 = nn.LSTM(input_size=self.hidden_sizes[1], hidden_size=self.hidden_sizes[2],
                             num_layers=1, batch_first=True)

        self.layernorm1 = nn.LayerNorm(self.hidden_sizes[0])
        self.layernorm2 = nn.LayerNorm(self.hidden_sizes[1])
        self.layernorm3 = nn.LayerNorm(self.hidden_sizes[2])

        self.fc1 = nn.Linear(self.hidden_sizes[-1], 16)
        self.activation = nn.Tanh()
        self.fc2 = nn.Linear(16, self.output_size)

    def forward(self, x):
        """Run a batch of sequences through the network.

        Args:
            x: Tensor laid out as (batch, sequence, input_dim); the LSTMs are
               constructed with ``batch_first=True``.

        Returns:
            The head output with its last dimension squeezed, i.e. shape
            (batch,) when ``mlp_output_dim`` is 1.
        """
        out, _ = self.lstm1(x)
        out = self.layernorm1(out)
        out = self.dropout_layer(out)
        out, _ = self.lstm2(out)
        out = self.layernorm2(out)
        out = self.dropout_layer(out)
        out, _ = self.lstm3(out)
        out = self.layernorm3(out)
        out = self.dropout_layer(out)
        # Keep only the representation of the final timestep.
        out = out[:, -1, :]
        out = self.fc1(out)
        out = self.activation(out)
        out = self.fc2(out)
        return out.squeeze(-1)


# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def simple_allocation(prediction):
    """
    Map one predicted return straight onto a portfolio allocation.

    This is the fallback used when there is no prediction history to
    normalise against.

    Args:
        prediction: Float, expected market excess return (typically -0.01 to 0.01)

    Returns:
        allocation: Float in [MIN_INVESTMENT, MAX_INVESTMENT]
    """
    # Centre and half-width of the allowed allocation interval.
    center = (MIN_INVESTMENT + MAX_INVESTMENT) / 2
    spread = (MAX_INVESTMENT - MIN_INVESTMENT) / 2

    # tanh squashes the scaled prediction into [-1, 1]; with a gain of 100 a
    # +/-1% prediction becomes +/-tanh(1), roughly +/-0.76.
    squashed = np.tanh(100 * prediction)

    # -1 lands on MIN_INVESTMENT, 0 on the centre, +1 on MAX_INVESTMENT.
    return np.clip(center + squashed * spread, MIN_INVESTMENT, MAX_INVESTMENT)


def create_positions(preds, span=ALLOCATION_SPAN):
    """
    Convert a series of predictions to portfolio positions using z-score normalization.

    The predictions are standardised against their own mean and sample
    standard deviation, smoothed with an exponentially weighted mean, squashed
    with tanh and finally mapped onto the allowed allocation interval.

    Args:
        preds: List or array of predictions
        span: EWM span for smoothing

    Returns:
        Array of allocations in [MIN_INVESTMENT, MAX_INVESTMENT], one per
        prediction. An empty input yields an empty array.
    """
    n_preds = len(preds)
    if n_preds == 0:
        # Nothing to allocate. Without this guard the single-element branch
        # below would index preds[0] and raise IndexError.
        return np.array([])
    if n_preds < 2:
        # A single value has no standard deviation; fall back to the direct mapping.
        return np.array([simple_allocation(preds[0])])

    s = pd.Series(preds)

    # Normalize predictions using z-score (epsilon avoids division by zero
    # when every prediction is identical).
    z = (s - s.mean()) / (s.std() + 1e-8)

    # Smooth with exponential weighted moving average
    z = z.ewm(span=span, min_periods=1).mean()

    # Apply non-linear transformation
    signal = np.tanh(2.5 * z)

    # Map from [-1, 1] to [MIN_INVESTMENT, MAX_INVESTMENT]
    midpoint = (MIN_INVESTMENT + MAX_INVESTMENT) / 2
    half_range = (MAX_INVESTMENT - MIN_INVESTMENT) / 2
    allocation = midpoint + signal * half_range

    # Convert to numpy array for consistent indexing
    return np.clip(allocation.values, MIN_INVESTMENT, MAX_INVESTMENT)


def add_fourier_terms_for_inference(df, periods, K=3, prefix="fourier"):
    """Append sine/cosine harmonics of ``date_id`` to a frame.

    Args:
        df: DataFrame containing a ``date_id`` column; it is not modified.
        periods: Mapping of a label to a period length. One family of
            harmonics is generated per entry.
        K: Highest harmonic order; orders 1..K are generated.
        prefix: Leading part of every generated column name.

    Returns:
        A new DataFrame holding the original columns followed by
        ``{prefix}_{label}_sin_{k}`` / ``{prefix}_{label}_cos_{k}`` columns,
        ordered by label, then harmonic order, sine before cosine.
    """
    time_index = df["date_id"].astype(float).values
    trig_functions = (("sin", np.sin), ("cos", np.cos))

    harmonics = {
        f"{prefix}_{label}_{fn_name}_{order}": fn(2 * np.pi * order * time_index / period)
        for label, period in periods.items()
        for order in range(1, K + 1)
        for fn_name, fn in trig_functions
    }

    fourier_frame = pd.DataFrame(harmonics, index=df.index)
    return pd.concat([df, fourier_frame], axis=1)


# ============================================================================
# LOAD TRAINING ARTIFACTS
# ============================================================================
def load_artifacts():
    """Fit the preprocessing objects on train.csv and load the LSTM weights.

    Everything is published through module-level globals so that ``predict``
    can use it: the rolling buffers, the Fourier period table, the two
    volatility regressors, the three scalers, the two SVD reducers, the
    feature-column lists, ``INPUT_DIM`` and the model itself (in eval mode).

    Reads ``train.csv`` from the competition input directory and the weights
    file at ``MODEL_URL``. Returns nothing.
    """
    global model, scaler, scaler_lags, scaler_fouriers, svd, svd_missing
    global model_5, model_21, periods, continuous_cols, binary_cols, missing_cols
    global added_columns, lag_cols, fourier_cols, buffer, prediction_history
    global INPUT_DIM

    print("Loading artifacts...")

    # Initialize buffers
    buffer = deque(maxlen=SEQUENCE_LENGTH)
    prediction_history = deque(maxlen=ALLOCATION_LOOKBACK)

    # Load training data to get column names and train preprocessing
    base_url = "/kaggle/input/hull-tactical-market-prediction"
    df_train = pd.read_csv(os.path.join(base_url, "train.csv"))

    # Define added columns (columns that had NaN in training)
    added_columns = [col for col in df_train.columns if df_train[col].isna().any()]

    # Snapshot the raw NaN pattern now, before it is filled away. This replaces
    # re-reading train.csv from disk once per column further down.
    raw_missing_mask = df_train[added_columns].isna()

    # Fill missing values
    df_train = df_train.ffill().bfill()

    # Define periods for Fourier terms
    total_len = len(df_train)
    periods = {
        "weekly": 5,
        "monthly": 21,
        "yearly": 252,
        "cycle_1": total_len,
        "cycle_2": total_len / 2,
        "cycle_5": total_len / 5
    }

    # Add Fourier terms
    df_train = add_fourier_terms_for_inference(df_train, periods=periods, K=3)

    # Calculate rolling volatilities
    df_train["volatility_5"] = df_train["market_forward_excess_returns"].rolling(5).std()
    df_train["volatility_21"] = df_train["market_forward_excess_returns"].rolling(21).std()

    # Define macro columns (the yearly and cycle_* Fourier terms)
    macro_cols = [c for c in df_train.columns if "cycle" in c or "yearly" in c]

    # Train model_5: regress 5-step rolling volatility on the macro Fourier terms
    mask = ~df_train["volatility_5"].isna() & ~df_train[macro_cols].isna().any(axis=1)
    model_5 = LinearRegression()
    model_5.fit(df_train.loc[mask, macro_cols], df_train.loc[mask, "volatility_5"])

    # Train model_21: same regression for the 21-step rolling volatility
    mask = ~df_train["volatility_21"].isna() & ~df_train[macro_cols].isna().any(axis=1)
    model_21 = LinearRegression()
    model_21.fit(df_train.loc[mask, macro_cols], df_train.loc[mask, "volatility_21"])

    # Create lagged features
    df_train["lagged_market_forward_excess_returns"] = df_train["market_forward_excess_returns"].shift(1)
    df_train["lagged_forward_returns"] = df_train["forward_returns"].shift(1)
    df_train["lagged_risk_free_rate"] = df_train["risk_free_rate"].shift(1)
    # shift(1) leaves the first row without a lag value; drop it.
    df_train = df_train.dropna(subset=['lagged_market_forward_excess_returns'])

    # Add Fourier predictions
    df_train["vol_fourier_5"] = model_5.predict(df_train[macro_cols])
    df_train["vol_fourier_trend"] = model_21.predict(df_train[macro_cols])

    # Define feature columns
    continuous_cols = df_train.filter(regex='^(M|E|I|P|V|S|MOM)').columns.tolist()
    continuous_cols = [c for c in continuous_cols if not c.endswith('_missing') and c in df_train.columns]

    binary_cols = [c for c in df_train.columns if c.startswith("D")]
    missing_cols = [c + "_missing" for c in added_columns]
    lag_cols = ['lagged_market_forward_excess_returns', 'lagged_forward_returns', 'lagged_risk_free_rate']
    fourier_cols = ['vol_fourier_trend', 'vol_fourier_5']

    # Fit scalers
    scaler = StandardScaler()
    scaler.fit(df_train[continuous_cols])

    scaler_lags = StandardScaler()
    scaler_lags.fit(df_train[lag_cols])

    scaler_fouriers = StandardScaler()
    scaler_fouriers.fit(df_train[fourier_cols])

    # Fit SVD
    X_train_continuous_scaled = scaler.transform(df_train[continuous_cols])
    svd = TruncatedSVD(n_components=CONTINUOUS_COLS, algorithm="randomized", random_state=42)
    svd.fit(X_train_continuous_scaled)

    # Create missing indicator columns for training: restrict the raw NaN mask
    # to the rows that survived the dropna above and name the columns to match
    # missing_cols.
    missing_indicators = (
        raw_missing_mask.loc[df_train.index].astype(int).add_suffix("_missing")
    )

    svd_missing = TruncatedSVD(n_components=MISSING_COLS, algorithm="randomized", random_state=42)
    svd_missing.fit(missing_indicators)

    # Calculate actual input dimension
    INPUT_DIM = CONTINUOUS_COLS + LAG_COLS + len(binary_cols) + FOURIER_COLS + MISSING_COLS  # SVD + lags + binary + fourier + SVD_missing

    # Load model
    model = LSTM_Architecture(
        input_dim=INPUT_DIM,
        hidden_size_1=HIDDEN_SIZE_1,
        hidden_size_2=HIDDEN_SIZE_2,
        hidden_size_3=HIDDEN_SIZE_3,
        mlp_output_dim=MLP_OUTPUT_DIM,
        dropout=DROPOUT
    ).to(device)

    state_dict = torch.load(MODEL_URL, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    print(f"Artifacts loaded successfully. Input dimension: {INPUT_DIM}")


# ============================================================================
# PREDICTION FUNCTION
# ============================================================================
def predict(test: pl.DataFrame) -> float:
    """
    Main prediction function called for each timestep.

    The competition serves data sequentially, including historical context,
    so we build up our buffers naturally and make predictions once we have
    enough history.

    Args:
        test: Polars frame for the current timestep. The feature assembly
            below builds exactly one Fourier row, so a single-row frame is
            expected.

    Returns:
        Allocation as a float in [MIN_INVESTMENT, MAX_INVESTMENT]. A neutral
        1.0 is returned while fewer than SEQUENCE_LENGTH rows are buffered,
        when the model output is not finite, and when any exception is raised.
    """
    # Fallback used for warm-up, non-finite model output and errors.
    neutral_allocation = 1.0

    try:
        # Convert to pandas
        row = test.to_pandas()

        # Step 1: Create missing flags (taken before NaNs are filled)
        row_missing_flags = pd.DataFrame(index=row.index)
        for col in added_columns:
            if col in row.columns:
                row_missing_flags[col + "_missing"] = row[col].isna().astype(int)
            else:
                row_missing_flags[col + "_missing"] = 0

        # Step 2: Fill NaNs
        row_filled = row.fillna(0)

        # Step 3: Add Fourier terms
        row_with_fourier = add_fourier_terms_for_inference(row_filled, periods=periods, K=3)

        # Step 4: Get macro columns and predict volatility features
        current_cols = row_with_fourier.columns
        macro_inputs = [c for c in current_cols if "cycle" in c or "yearly" in c]

        pred_trend = model_21.predict(row_with_fourier[macro_inputs])[0]
        pred_5 = model_5.predict(row_with_fourier[macro_inputs])[0]

        # Step 5: Process continuous features (columns absent from the row become 0)
        available_continuous = [c for c in continuous_cols if c in row_filled.columns]
        feat_cont = row_filled[available_continuous].reindex(columns=continuous_cols, fill_value=0)
        feat_cont_scaled = scaler.transform(feat_cont)
        X_svd = pd.DataFrame(
            svd.transform(feat_cont_scaled),
            index=row.index,
            columns=[f"svd_{i}" for i in range(CONTINUOUS_COLS)]
        )

        # Step 6: Process lagged features
        available_lags = [c for c in lag_cols if c in row_filled.columns]
        feat_lags = row_filled[available_lags].reindex(columns=lag_cols, fill_value=0)
        X_lags_scaled = pd.DataFrame(
            scaler_lags.transform(feat_lags),
            index=row.index,
            columns=lag_cols
        )

        # Step 7: Process binary features
        available_binary = [c for c in binary_cols if c in row_filled.columns]
        X_binary = row_filled[available_binary].reindex(columns=binary_cols, fill_value=0)

        # Step 8: Process Fourier predictions (value order follows fourier_cols)
        X_fouriers_raw = pd.DataFrame(
            [[pred_trend, pred_5]],
            columns=fourier_cols,
            index=row.index
        )
        X_fouriers_scaled = pd.DataFrame(
            scaler_fouriers.transform(X_fouriers_raw),
            columns=fourier_cols,
            index=row.index
        )

        # Step 9: Process missing flags
        available_missing = [c for c in missing_cols if c in row_missing_flags.columns]
        feat_missing = row_missing_flags[available_missing].reindex(columns=missing_cols, fill_value=0)
        X_svd_missing = pd.DataFrame(
            svd_missing.transform(feat_missing),
            index=row.index,
            columns=[f"svd_missing_{i}" for i in range(MISSING_COLS)]
        )

        # Step 10: Concatenate all features
        current_feature_df = pd.concat([
            X_svd,
            X_lags_scaled,
            X_binary,
            X_fouriers_scaled,
            X_svd_missing
        ], axis=1)

        # Flatten and add to buffer
        current_feature_vector = current_feature_df.values.flatten()
        buffer.append(current_feature_vector)

        # Check if we have enough history for LSTM
        if len(buffer) < SEQUENCE_LENGTH:
            # Not enough history yet - return neutral allocation
            print(f"Building history: {len(buffer)}/{SEQUENCE_LENGTH}. Returning neutral allocation (1.0)")
            return neutral_allocation

        # Create input tensor for LSTM: (1, SEQUENCE_LENGTH, features)
        x = torch.tensor(np.stack(list(buffer)), dtype=torch.float32).unsqueeze(0).to(device)

        # Make prediction
        with torch.no_grad():
            prediction = model(x).item()

        # np.clip passes NaN straight through, so the clip at the end cannot
        # rescue a non-finite model output. Reject it here, before it is
        # stored in the history used for z-scoring later predictions.
        if not np.isfinite(prediction):
            print(f"Non-finite prediction ({prediction}). Returning neutral allocation (1.0)")
            return neutral_allocation

        # Store prediction
        prediction_history.append(prediction)

        # Calculate allocation based on prediction history
        if len(prediction_history) == 1:
            # First prediction - use simple allocation
            allocation = simple_allocation(prediction)
            print(f"First prediction: {prediction:.6f} -> Allocation: {allocation:.4f}")
        else:
            # Use accumulated predictions for sophisticated allocation
            allocations = create_positions(list(prediction_history))
            allocation = float(allocations[-1])  # Now safe - returns numpy array
            print(f"Prediction: {prediction:.6f} | History: {len(prediction_history)} | Allocation: {allocation:.4f}")

        # Final safety clip
        allocation = float(np.clip(allocation, MIN_INVESTMENT, MAX_INVESTMENT))

        return allocation

    except Exception as e:
        import traceback
        print(f"ERROR in predict(): {e}")
        print(traceback.format_exc())
        # Return neutral allocation on error to avoid format issues
        return neutral_allocation


# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Load all artifacts before starting the server
# Fit the preprocessing objects and load the model before any request arrives.
load_artifacts()

# Hand predict() to the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Without the rerun flag, drive the server through the local gateway against
# the competition input directory; with it, serve requests instead.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()

Loading artifacts...




Artifacts loaded successfully. Input dimension: 59
Building history: 1/60. Returning neutral allocation (1.0)
Building history: 2/60. Returning neutral allocation (1.0)
Building history: 3/60. Returning neutral allocation (1.0)
Building history: 4/60. Returning neutral allocation (1.0)
Building history: 5/60. Returning neutral allocation (1.0)
Building history: 6/60. Returning neutral allocation (1.0)
Building history: 7/60. Returning neutral allocation (1.0)
Building history: 8/60. Returning neutral allocation (1.0)
Building history: 9/60. Returning neutral allocation (1.0)
Building history: 10/60. Returning neutral allocation (1.0)
