# 1. Data Preparation and Feature Engineering

## Imports and Config

In [None]:
import os
import joblib
import pandas as pd
import polars as pl
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import deque
import math
import warnings

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import kaggle_evaluation.default_inference_server

warnings.filterwarnings('ignore')

In [None]:
class Config:
    """Static configuration shared by training and inference.

    All values are class attributes, so they can be read either from the class
    itself or from an instance.
    """

    # Paths of the artifacts written during training and read at inference.
    MODEL_PATH = 'best_transformer_model.pth'
    SCALER_PATH = 'scaler.joblib'
    BEST_K_PATH = 'best_k.joblib'

    # Features
    # Columns whose name starts with one of these prefixes are base features.
    BASE_FEATURE_PREFIXES = ('D', 'E', 'I', 'M', 'P', 'S', 'V')
    # Columns that additionally get lag / diff / rolling statistics.
    KEY_FEATURES = ['M1', 'E1', 'I1', 'P1', 'S1', 'V1', 'M2', 'E2', 'I2', 'P2', 'S2', 'V2']
    ROLLING_WINDOWS = [5, 21, 63]
    LAG_PERIODS = [1, 5, 21]

    # Model & Train
    SEQUENCE_LENGTH = 60
    TRAIN_SPLIT_RATIO = 0.85
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Transformer params
    D_MODEL = 128
    N_HEADS = 8
    N_LAYERS = 4
    DROPOUT = 0.2

    # Train params
    N_EPOCHS = 20
    BATCH_SIZE = 128
    LEARNING_RATE = 1e-4
    WEIGHT_DECAY = 1e-5

    # Post-processing: candidate steepness values for the prediction ->
    # allocation mapping.
    K_VALUES_SEARCH = [10, 50, 100, 200, 400, 600, 800]

    # Number of raw rows kept between inference calls. The model consumes
    # SEQUENCE_LENGTH consecutive rows and each of those rows needs the longest
    # lag / rolling look-back behind it, so both lengths have to be added;
    # keeping only the look-back would leave the oldest rows of every
    # sequence with partially (or not at all) warmed-up features.
    HISTORY_LEN_FOR_FEATURES = SEQUENCE_LENGTH + max(ROLLING_WINDOWS + LAG_PERIODS)

## Competition metric
A local copy of the official metric from https://www.kaggle.com/code/metric/hull-competition-sharpe

In [None]:
# Bounds of the allowed position (fraction of capital invested in the market).
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2


class ParticipantVisibleError(Exception):
    """Raised when a submission violates the rules checked by the metric."""


def to_allocation(predictions, k):
    """Convert raw model predictions into allocations.

    The prediction is multiplied by ``k`` and squashed with a logistic curve
    stretched over ``[MIN_INVESTMENT, MAX_INVESTMENT]``, so a prediction of 0
    maps to the midpoint of that range.

    Args:
        predictions: Scalar or array-like of raw predictions.
        k: Steepness of the squashing curve.

    Returns:
        numpy array (or numpy scalar for scalar input) of allocations.
    """
    scaled = np.asarray(predictions) * k
    midpoint = (MAX_INVESTMENT + MIN_INVESTMENT) / 2
    half_range = (MAX_INVESTMENT - MIN_INVESTMENT) / 2
    # range * sigmoid(z) == midpoint + half_range * tanh(z / 2); the tanh form
    # cannot overflow the way exp(-z) does for large negative z.
    allocations = midpoint + half_range * np.tanh(scaled / 2)
    return np.clip(allocations, MIN_INVESTMENT, MAX_INVESTMENT)


def score(solution: pd.DataFrame, submission: pd.DataFrame) -> float:
    """A local copy of the competition metric.

    Args:
        solution: Frame with ``forward_returns`` and ``risk_free_rate`` columns.
        submission: Frame with a numeric ``prediction`` column, row-aligned
            with ``solution``.

    Returns:
        The annualised Sharpe ratio of the strategy divided by a volatility
        penalty and a return-gap penalty, capped at 1_000_000. Degenerate
        inputs (no rows, undefined or zero strategy volatility, zero market
        volatility) score 0.0.

    Raises:
        ParticipantVisibleError: If predictions are not numeric or fall
            outside ``[MIN_INVESTMENT, MAX_INVESTMENT]``.
    """
    if not pd.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    # Work on a copy so the caller's frame does not gain helper columns.
    solution = solution.copy()
    solution['position'] = submission['prediction'].values

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    n_days = len(solution)
    if n_days == 0:
        return 0.0

    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    trading_days_per_yr = 252

    strategy_std = solution['strategy_returns'].std()
    # The sample standard deviation is NaN with fewer than two usable rows;
    # treat that like zero volatility instead of returning a NaN score.
    if pd.isna(strategy_std) or strategy_std == 0:
        return 0.0

    # Geometric mean of the daily excess returns of the strategy.
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / n_days) - 1

    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Same statistics for a position that is always fully invested.
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / n_days) - 1
    market_std = solution['forward_returns'].std()
    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        return 0.0

    # Penalise volatility above 120% of the market's ...
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
    vol_penalty = 1 + excess_vol
    # ... and (quadratically) any annualised return shortfall versus the market.
    return_gap = max(0, (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap**2) / 100

    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

# 2. Helper classes for the model

In [None]:
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset over row-aligned features ``X`` and targets ``y``.

    Item ``i`` is the window ``X[i : i + seq_len]`` paired with the target of
    the window's last row, ``y[i + seq_len - 1]``.
    """

    def __init__(self, X, y, seq_len):
        """Store the arrays and the window length.

        Args:
            X: Indexable feature rows (sliced with ``X[a:b]``).
            y: Indexable targets, one per row of ``X``.
            seq_len: Number of consecutive rows per window (must be >= 1).

        Raises:
            ValueError: If ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")
        self.X = X
        self.y = y
        self.seq_len = seq_len

    def __len__(self):
        # Fewer rows than one window means "no samples", not a negative
        # length (which makes len() itself raise).
        return max(0, len(self.X) - self.seq_len + 1)

    def __getitem__(self, idx):
        """Return ``(window, target)`` as float32 tensors.

        Negative indices count from the end; anything outside the valid
        range raises ``IndexError`` instead of yielding a truncated window.
        """
        n_windows = len(self)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for dataset of length {n_windows}")
        sequence = self.X[idx : idx + self.seq_len]
        target = self.y[idx + self.seq_len - 1]
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence-first tensor."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """Pre-compute the encoding table.

        Args:
            d_model: Size of the last (embedding) dimension; may be odd.
            dropout: Dropout probability applied after adding the encoding.
            max_len: Longest sequence the table supports.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        # Even columns carry sines, odd columns cosines.
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one cosine column fewer than sine
        # columns, so the frequency vector is trimmed to match (a no-op for
        # even d_model).
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Buffer: follows the module across devices and is stored in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of positions ``0 .. x.size(0) - 1`` and apply dropout.

        Args:
            x: Tensor whose first dimension is the sequence position and
                whose last dimension has size ``d_model``; the middle
                dimension of the table has size 1 and broadcasts.

        Raises:
            RuntimeError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Transformer encoder that regresses one value from the last time step.

    Pipeline: linear embedding -> positional encoding -> encoder stack ->
    two-layer MLP head applied to the final position of the sequence.
    """

    def __init__(self, input_dim, d_model, n_heads, n_layers, dropout, sequence_length):
        super().__init__()
        self.d_model = d_model

        # Project raw features into the model dimension.
        self.input_embedding = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout, max_len=sequence_length)

        # Batch-first encoder stack with a 4x wide feed-forward layer.
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=n_layers)

        # Regression head: d_model -> d_model // 2 -> 1.
        hidden = d_model // 2
        self.reg_head = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, src):
        """Map a ``(batch, seq, input_dim)`` tensor to a ``(batch,)`` tensor."""
        embedded = self.input_embedding(src) * math.sqrt(self.d_model)
        # The positional encoder indexes its table by dim 0, so go
        # sequence-first for that step only and come straight back.
        seq_first = embedded.permute(1, 0, 2)
        batch_first = self.pos_encoder(seq_first).permute(1, 0, 2)
        encoded = self.transformer_encoder(batch_first)
        last_step = encoded[:, -1, :]
        return self.reg_head(last_step).squeeze(-1)

# 3. Advanced & optimized feature engineering

In [None]:
class FeatureEngineer:
    """Derives lag, difference, rolling and per-group features.

    ``fit_transform`` additionally records which columns are base features
    and which were generated, so the same ordered feature list can be reused
    later through ``get_feature_names``.
    """

    def __init__(self, config: "Config"):
        self.config = config
        # Both stay None until fit_transform() has been called.
        self.feature_cols = None
        self.generated_feature_names = None

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Generate features and remember base / generated column names."""
        df_featured = self._generate(df)
        self.feature_cols = self._base_feature_columns(df)
        original_columns = set(df.columns)
        self.generated_feature_names = [c for c in df_featured.columns if c not in original_columns]
        return df_featured

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply feature generation to new data (no state is changed)."""
        return self._generate(df)

    def _base_feature_columns(self, df: pd.DataFrame) -> list:
        """Columns of ``df`` whose name starts with a configured prefix."""
        return [col for col in df.columns if col.startswith(self.config.BASE_FEATURE_PREFIXES)]

    def _generate(self, df_input: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df_input`` with the generated columns appended.

        Generated columns, in order:
          * per key feature: ``_lag_N`` and ``_diff_N`` for every lag period,
            then ``_roll_mean_W`` and ``_roll_std_W`` for every window
            (a window yields a value once at least half of it is filled);
          * per first letter of the base feature names: the row-wise mean
            and standard deviation over the columns starting with it.
        """
        df = df_input.copy()

        feature_cols = self._base_feature_columns(df)
        key_features = [f for f in self.config.KEY_FEATURES if f in df.columns]

        # Collect everything in a dict and concatenate once at the end
        # instead of inserting columns into the frame one by one.
        generated_features = {}
        for col in key_features:
            series = df[col]
            for lag in self.config.LAG_PERIODS:
                generated_features[f'{col}_lag_{lag}'] = series.shift(lag)
                generated_features[f'{col}_diff_{lag}'] = series.diff(lag)
            for window in self.config.ROLLING_WINDOWS:
                rolling = series.rolling(window=window, min_periods=max(1, window // 2))
                generated_features[f'{col}_roll_mean_{window}'] = rolling.mean()
                generated_features[f'{col}_roll_std_{window}'] = rolling.std()

        # Sort the group letters: iterating a set of strings follows the
        # per-process string hash, which would make the order of the group
        # columns (and therefore the model's input layout) differ between
        # runs.
        for group_prefix in sorted({col[0] for col in feature_cols}):
            group_cols = [c for c in feature_cols if c.startswith(group_prefix)]
            group_frame = df[group_cols]
            generated_features[f'group_{group_prefix}_mean'] = group_frame.mean(axis=1)
            generated_features[f'group_{group_prefix}_std'] = group_frame.std(axis=1)

        generated_df = pd.DataFrame(generated_features)
        return pd.concat([df, generated_df], axis=1)

    def get_feature_names(self):
        """Return base feature names followed by generated feature names.

        Raises:
            RuntimeError: If ``fit_transform`` has not been called yet.
        """
        if self.feature_cols is None or self.generated_feature_names is None:
            raise RuntimeError("FeatureEngineer must be fit first.")
        return self.feature_cols + self.generated_feature_names

# 4. Main megaclass

In [None]:
class MarketPredictor:
    """Trains the transformer, persists its artifacts and serves predictions.

    Training (``train``) fits the feature engineer, scaler and model and
    saves the best epoch's artifacts. Inference (``load_artifacts`` followed
    by repeated ``predict`` calls) keeps a rolling history of raw rows so
    features and input sequences can be rebuilt for every new batch.
    """

    # Used when the config does not define FEATURE_NAMES_PATH.
    DEFAULT_FEATURE_NAMES_PATH = 'feature_names.joblib'
    # Allocation returned while warming up or when no finite prediction exists.
    NEUTRAL_ALLOCATION = 1.0

    def __init__(self, config: Config):
        self.config = config
        self.feature_engineer = FeatureEngineer(config)
        self.model = None
        self.scaler = None
        self.best_k = None
        self.feature_names = None

        # Raw (un-featured) rows carried over between predict() calls.
        self.inference_history = pd.DataFrame()

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _fill_numeric_gaps(df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with numeric columns forward- then back-filled."""
        filled = df.copy()
        numeric_cols = filled.select_dtypes(include=np.number).columns
        filled[numeric_cols] = filled[numeric_cols].ffill().bfill()
        return filled

    def _feature_names_path(self) -> str:
        """Location of the persisted feature-name list."""
        return getattr(self.config, 'FEATURE_NAMES_PATH', self.DEFAULT_FEATURE_NAMES_PATH)

    def _history_rows_to_keep(self) -> int:
        """Number of raw rows that must survive between two predict() calls.

        Every one of the SEQUENCE_LENGTH rows fed to the model needs the
        longest lag / rolling look-back behind it, so the two lengths add up.
        """
        lookback = max([*self.config.ROLLING_WINDOWS, *self.config.LAG_PERIODS])
        required = self.config.SEQUENCE_LENGTH + lookback
        return max(required, self.config.HISTORY_LEN_FOR_FEATURES)

    def _build_model(self):
        """Create an untrained model sized for ``self.feature_names``."""
        return TransformerModel(
            input_dim=len(self.feature_names),
            d_model=self.config.D_MODEL,
            n_heads=self.config.N_HEADS,
            n_layers=self.config.N_LAYERS,
            dropout=self.config.DROPOUT,
            sequence_length=self.config.SEQUENCE_LENGTH
        )

    # ----------------------------------------------------------------- training

    def train(self, df_raw: pd.DataFrame):
        """Run the full training cycle of the model.

        ``df_raw`` is not modified. The rows are split chronologically by
        TRAIN_SPLIT_RATIO; after every epoch the validation part is scored
        with the competition metric and the artifacts are saved whenever
        that score improves. After the loop ``self.model`` holds the weights
        of the last epoch; the best epoch is what was written to disk.

        Raises:
            ValueError: If either split has fewer rows than one sequence.
        """
        print("Starting training process...")

        # Fill on a copy so the caller's frame is left untouched.
        df_filled = self._fill_numeric_gaps(df_raw)
        df_featured = self.feature_engineer.fit_transform(df_filled)
        self.feature_names = self.feature_engineer.get_feature_names()

        # Drops the warm-up rows whose lag / rolling features are undefined.
        df_featured = df_featured.dropna(subset=self.feature_names).reset_index(drop=True)
        print(f"Total features: {len(self.feature_names)}")
        print(f"Dataset size after cleaning: {df_featured.shape}")

        TARGET = 'forward_returns'
        X = df_featured[self.feature_names]
        y = df_featured[TARGET]

        train_split_idx = int(len(df_featured) * self.config.TRAIN_SPLIT_RATIO)
        val_df_for_metric = df_featured.iloc[train_split_idx:].reset_index(drop=True)  # used for scoring

        X_train, X_val = X.iloc[:train_split_idx], X.iloc[train_split_idx:]
        y_train, y_val = y.iloc[:train_split_idx], y.iloc[train_split_idx:]

        seq_len = self.config.SEQUENCE_LENGTH
        if len(X_train) < seq_len or len(X_val) < seq_len:
            raise ValueError(
                f"Not enough rows after cleaning: train={len(X_train)}, val={len(X_val)}, "
                f"but each split needs at least SEQUENCE_LENGTH={seq_len} rows."
            )

        # The scaler only ever sees the training part.
        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_val_scaled = self.scaler.transform(X_val)

        train_dataset = TimeSeriesDataset(X_train_scaled, y_train.values, seq_len)
        val_dataset = TimeSeriesDataset(X_val_scaled, y_val.values, seq_len)
        train_loader = DataLoader(train_dataset, batch_size=self.config.BATCH_SIZE, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=2)

        self.model = self._build_model().to(self.config.DEVICE)
        print(f"Model created and moved to {self.config.DEVICE}")

        criterion = nn.MSELoss()
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.config.LEARNING_RATE, weight_decay=self.config.WEIGHT_DECAY)
        # 'max': the scheduler is stepped with the validation score, where
        # higher is better.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.5)

        best_epoch_score = -np.inf

        for epoch in range(self.config.N_EPOCHS):
            self.model.train()
            total_train_loss = 0
            for seq, target in train_loader:
                seq, target = seq.to(self.config.DEVICE), target.to(self.config.DEVICE)
                optimizer.zero_grad()
                prediction = self.model(seq)
                loss = criterion(prediction, target)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.7)
                optimizer.step()
                total_train_loss += loss.item()
            avg_train_loss = total_train_loss / len(train_loader)

            current_best_score, current_best_k = self._validate(val_loader, val_df_for_metric)
            scheduler.step(current_best_score)

            print(f"Epoch {epoch+1}/{self.config.N_EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Score: {current_best_score:.4f} at k={current_best_k}")

            if current_best_score > best_epoch_score:
                best_epoch_score = current_best_score
                self.best_k = current_best_k
                print(f"** New best score! Saving artifacts... Best k is {self.best_k} **")
                self._save_artifacts()

        print(f"\nTraining complete. Best validation score: {best_epoch_score:.4f} with k={self.best_k}")

    def _validate(self, val_loader, val_df_for_metric):
        """Score the held-out part and find the best ``k``.

        Returns:
            ``(best_score, best_k)`` over K_VALUES_SEARCH. ``best_k`` is None
            (and the score ``-inf``) when no candidate produced a score
            greater than ``-inf``.
        """
        self.model.eval()
        val_preds = []
        with torch.no_grad():
            for seq, _ in val_loader:
                seq = seq.to(self.config.DEVICE)
                prediction = self.model(seq)
                val_preds.extend(prediction.cpu().numpy())

        # The first prediction belongs to the last row of the first window,
        # i.e. to validation row SEQUENCE_LENGTH - 1.
        actuals_for_metric = val_df_for_metric.iloc[self.config.SEQUENCE_LENGTH - 1:].copy()

        current_best_score = -np.inf
        current_best_k = None

        for k in self.config.K_VALUES_SEARCH:
            allocations = to_allocation(val_preds, k=k)
            submission_df = pd.DataFrame({'prediction': allocations[:len(actuals_for_metric)]})

            current_score = score(actuals_for_metric, submission_df)

            if current_score > current_best_score:
                current_best_score = current_score
                current_best_k = k

        return current_best_score, current_best_k

    # ---------------------------------------------------------------- artifacts

    def _save_artifacts(self):
        """Save the model weights, the scaler, the best k and the feature names."""
        torch.save(self.model.state_dict(), self.config.MODEL_PATH)
        joblib.dump(self.scaler, self.config.SCALER_PATH)
        joblib.dump(self.best_k, self.config.BEST_K_PATH)
        joblib.dump(self.feature_names, self._feature_names_path())

    def load_artifacts(self):
        """Load the artifacts written by ``_save_artifacts`` for inference."""
        print("Loading artifacts for inference...")
        # Feature names first: their count defines the model's input width.
        self.feature_names = joblib.load(self._feature_names_path())
        self.model = self._build_model()
        self.model.load_state_dict(torch.load(self.config.MODEL_PATH, map_location=self.config.DEVICE))
        self.model.to(self.config.DEVICE)
        self.model.eval()

        self.scaler = joblib.load(self.config.SCALER_PATH)
        self.best_k = joblib.load(self.config.BEST_K_PATH)
        print(f"Artifacts loaded. Model is on {self.config.DEVICE}. Best K is {self.best_k}.")

    # ---------------------------------------------------------------- inference

    def predict(self, test_df_polars: pl.DataFrame) -> pl.DataFrame:
        """
        Makes a prediction for a new batch of data.
        Manages the history for the correct calculation of features and sequences.

        Until SEQUENCE_LENGTH rows have been collected the neutral allocation
        is returned. Afterwards one allocation is computed from the most
        recent SEQUENCE_LENGTH rows and returned once per row of the batch.

        Returns:
            Frame with a single ``prediction`` column.
        """
        test_df = test_df_polars.to_pandas()
        n_rows = len(test_df)

        if self.inference_history.empty:
            self.inference_history = test_df
            return pl.DataFrame({'prediction': [self.NEUTRAL_ALLOCATION] * n_rows})

        combined_df = pd.concat([self.inference_history, test_df], ignore_index=True)

        # Same gap filling as in train(), applied to a copy so the stored
        # history stays raw.
        featured_df = self.feature_engineer.transform(self._fill_numeric_gaps(combined_df))

        if len(featured_df) < self.config.SEQUENCE_LENGTH:
            self.inference_history = combined_df
            return pl.DataFrame({'prediction': [self.NEUTRAL_ALLOCATION] * n_rows})

        sequence_data = featured_df[self.feature_names].tail(self.config.SEQUENCE_LENGTH)
        sequence_data = sequence_data.ffill().bfill()
        scaled_sequence = self.scaler.transform(sequence_data)
        # A feature that is missing for the whole window is still NaN here;
        # 0 is the training mean in standardised space.
        scaled_sequence = np.nan_to_num(scaled_sequence, nan=0.0, posinf=0.0, neginf=0.0)

        with torch.no_grad():
            tensor_sequence = torch.tensor(scaled_sequence, dtype=torch.float32).unsqueeze(0).to(self.config.DEVICE)
            raw_prediction = self.model(tensor_sequence).item()

        if np.isfinite(raw_prediction):
            allocation = float(to_allocation([raw_prediction], k=self.best_k)[0])
        else:
            allocation = self.NEUTRAL_ALLOCATION

        self.inference_history = combined_df.tail(self._history_rows_to_keep())

        # max(..., 1) keeps the single-row answer for an empty batch.
        return pl.DataFrame({'prediction': [allocation] * max(n_rows, 1)})

# 5. Main (Train & Submission using classes)

In [None]:
config = Config()

# ---- Training: fit on the full training file and write the artifacts ----
print("--- Start Model Training ---")
predictor = MarketPredictor(config)
df_train_raw = (
    pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv')
    .sort_values('date_id')
    .reset_index(drop=True)
)
predictor.train(df_train_raw)
print("--- Training finished. Artifacts saved. ---")

# ---- Inference: start from a fresh predictor that uses the saved artifacts ----
predictor = MarketPredictor(config)
predictor.load_artifacts()


def predict(test_df: pl.DataFrame) -> pl.DataFrame:
    """Entry point handed to the inference server; delegates to the predictor."""
    return predictor.predict(test_df)


inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Serve for real only when the environment variable marks a competition
# rerun; otherwise run against the local gateway.
is_competition_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')
if not is_competition_rerun:
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()