# 1. Introduction

## Imports

In [None]:
import os
import gc
import warnings
import joblib
import numpy as np
import pandas as pd
import polars as pl
import catboost
import optuna
import kaggle_evaluation.default_inference_server
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import spearmanr

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation
# warnings emitted by the libraries imported above — consider narrowing it.
warnings.filterwarnings('ignore')
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

## Config

In [None]:
# CSV read with pandas both for training and to seed the inference history.
TRAIN_PATH = '/kaggle/input/hull-tactical-market-prediction/train.csv'
# Directory handed to the inference server's local gateway for offline testing.
LOCAL_GATEWAY_PATH = '/kaggle/input/hull-tactical-market-prediction/'
# File the trained CatBoost model is saved to; training is skipped if it exists.
MODEL_PATH = 'catboost_model.cbm'
# joblib file holding the list of feature column names used to fit the model.
FEATURES_PATH = 'features.joblib'

# Columns that receive lag and rolling-window features in create_features().
TOP_FEATURES_FOR_FE = ['M4', 'V13', 'S5', 'S2', 'D2']
# Row offsets passed to Series.shift() for the lag features.
LAG_PERIODS = [1, 5, 20]
# Window sizes (in rows) for the rolling mean / standard deviation features.
ROLLING_WINDOWS = [5, 20, 60]

# Name of the label column the regressor is trained to predict.
TARGET = 'market_forward_excess_returns'
# Columns removed from the training frame and the history frame when present;
# entries that do not exist in the data are skipped by the callers.
COLS_TO_DROP = ['forward_returns', 'risk_free_rate', 'excess_return', 'E7', 'V10', 'S3', 'M1', 'M14']
# Multiplier applied to the model prediction when it is turned into an
# allocation: clip(1 + BEST_C * prediction, 0, 2).
BEST_C = 0.5

# 2. Auxiliary function for creating features

In [None]:
def create_features(
    df: pd.DataFrame,
    feature_cols=None,
    lag_periods=None,
    rolling_windows=None,
) -> pd.DataFrame:
    """Return a copy of *df* with lag/rolling features added and NaNs imputed.

    For every column named in *feature_cols* that exists in *df*, the result
    gains ``<col>_lag_<k>`` (the column shifted by ``k`` rows) for each ``k``
    in *lag_periods*, and ``<col>_roll_mean_<w>`` / ``<col>_roll_std_<w>``
    (rolling statistics with ``min_periods=1``) for each ``w`` in
    *rolling_windows*. Afterwards every column is forward-filled and any
    value still missing is replaced by that column's median, or by 0 when
    the median itself is undefined (e.g. an all-NaN column).

    Args:
        df: Input frame, rows assumed to be in chronological order. It is
            not modified.
        feature_cols: Column names to engineer; defaults to the module-level
            ``TOP_FEATURES_FOR_FE``.
        lag_periods: Shift offsets in rows; defaults to ``LAG_PERIODS``.
        rolling_windows: Rolling window sizes in rows; defaults to
            ``ROLLING_WINDOWS``.

    Returns:
        A new DataFrame with the original columns followed by the engineered
        ones, containing no missing values.
    """
    if feature_cols is None:
        feature_cols = TOP_FEATURES_FOR_FE
    if lag_periods is None:
        lag_periods = LAG_PERIODS
    if rolling_windows is None:
        rolling_windows = ROLLING_WINDOWS

    df_out = df.copy()

    for col in feature_cols:
        if col not in df_out.columns:
            continue
        source = df_out[col]
        for lag in lag_periods:
            df_out[f'{col}_lag_{lag}'] = source.shift(lag)
        for window in rolling_windows:
            # Build the rolling object once and reuse it for both statistics.
            roller = source.rolling(window=window, min_periods=1)
            df_out[f'{col}_roll_mean_{window}'] = roller.mean()
            df_out[f'{col}_roll_std_{window}'] = roller.std()

    df_out = df_out.ffill()
    for col in df_out.columns:
        if df_out[col].isnull().any():
            median_val = df_out[col].median()
            fill_value = 0 if pd.isna(median_val) else median_val
            # Assign the result back instead of calling fillna(inplace=True)
            # on ``df_out[col]``: the chained in-place form mutates a temporary
            # Series and does nothing under pandas Copy-on-Write.
            df_out[col] = df_out[col].fillna(fill_value)

    return df_out

# 3. CatBoost training

In [None]:
# Train and persist the model only when no saved model file exists yet.
if not os.path.exists(MODEL_PATH):
    print("Model not found. Starting training process...")
    
    train_df = pd.read_csv(TRAIN_PATH)
    if 'date_id' not in train_df.columns:
        train_df['date_id'] = train_df.index
    train_df.drop(columns=[col for col in COLS_TO_DROP if col in train_df.columns], inplace=True)

    # Record which rows carry a real label BEFORE feature engineering.
    # create_features() forward-fills and median-fills every column, TARGET
    # included, so a dropna on TARGET afterwards could never remove anything
    # and unlabeled rows would be trained on imputed targets.
    has_target = train_df[TARGET].notna().to_numpy()

    train_featured = create_features(train_df)
    # create_features() returns a row-for-row copy, so the positional mask
    # taken above lines up with the engineered frame.
    train_featured = train_featured.loc[has_target]
    
    FEATURES = [col for col in train_featured.columns if col not in [TARGET, 'date_id']]
    X = train_featured[FEATURES]
    y = train_featured[TARGET]

    def objective(trial):
        """Return the mean Spearman correlation over 4 time-ordered CV folds.

        Each fold fits a CatBoostRegressor with the trial's hyperparameters
        and scores its predictions on the fold's validation rows.
        """
        params = {
            'objective': 'RMSE',
            'iterations': trial.suggest_int('iterations', 480, 2100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
            'depth': trial.suggest_int('depth', 5, 8),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2.0, 10.0, log=True),
            'random_strength': trial.suggest_float('random_strength', 1e-8, 1.0, log=True),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
            'verbose': 0,
            'early_stopping_rounds': 50
        }
        
        tscv = TimeSeriesSplit(n_splits=4)
        scores = []
        for train_index, val_index in tscv.split(X):
            model_opt = CatBoostRegressor(**params)
            # NOTE(review): the validation fold is used both for early
            # stopping and for the score below, so the score is optimistic.
            model_opt.fit(X.iloc[train_index], y.iloc[train_index], eval_set=(X.iloc[val_index], y.iloc[val_index]))
            preds = model_opt.predict(X.iloc[val_index])
            score, _ = spearmanr(y.iloc[val_index], preds)
            scores.append(score)
        return np.mean(scores)

    study = optuna.create_study(direction='maximize')
    # Stops after 200 trials or 5400 seconds, whichever comes first.
    study.optimize(objective, n_trials=200, timeout=5400)

    print(f"Best params found: {study.best_params}")
    # Refit on all labeled rows with the best trial's parameters. There is no
    # eval_set here, so this fit is not early-stopped.
    final_model = CatBoostRegressor(**study.best_params, verbose=500, random_seed=42)
    final_model.fit(X, y)
    final_model.save_model(MODEL_PATH)
    # Persist the feature names so inference selects the same columns.
    joblib.dump(FEATURES, FEATURES_PATH)
    print("Training complete. Model and features saved.")
    
else:
    print("Model training skipped: Model file already exists.")

# 4. Loading artifacts and initializing the state for inference

In [None]:
# Restore the trained model and its feature list, then seed the rolling
# history that predict() extends on every call.
print("Loading artifacts for inference...")
try:
    model = CatBoostRegressor()
    model.load_model(MODEL_PATH)
    MODEL_FEATURES = joblib.load(FEATURES_PATH)
except Exception as exc:
    # Any load failure is surfaced as a single RuntimeError with the cause inlined.
    raise RuntimeError(f"Could not load model/features. Ensure training was successful. Error: {exc}")

print("Initializing prediction history...")
history_df = pd.read_csv(TRAIN_PATH)

# Drop the configured columns that are present, never the target itself.
cols_to_drop_hist = [
    name for name in COLS_TO_DROP
    if name != TARGET and name in history_df.columns
]
history_df = history_df.drop(columns=cols_to_drop_hist)

# Fall back to the row index when the file carries no explicit date id.
if 'date_id' not in history_df.columns:
    history_df['date_id'] = history_df.index

print("Setup complete. Ready for prediction.")

# 5. Submission with `predict`

In [None]:
def predict(test_df_pl: pl.DataFrame) -> float:
    """Return the allocation for the newest rows delivered by the gateway.

    The incoming polars frame is converted to pandas and appended to the
    module-level ``history_df``. Features are rebuilt on the most recent
    ``max(ROLLING_WINDOWS) + max(LAG_PERIODS) + 5`` rows, and the model is
    evaluated on the last row only. The prediction is mapped to
    ``1 + BEST_C * prediction`` and clipped to the range [0, 2].

    Args:
        test_df_pl: Polars frame with the current observation(s).

    Returns:
        The clipped allocation as a plain Python float.
    """
    global history_df

    incoming = test_df_pl.to_pandas()
    if 'date_id' not in incoming.columns:
        # Continue the id sequence from the history; start at 0 if it is empty.
        if history_df.empty:
            previous_id = -1
        else:
            previous_id = history_df['date_id'].max()
        incoming['date_id'] = previous_id + 1

    history_df = pd.concat([history_df, incoming], ignore_index=True)

    # Only the trailing rows are needed to cover the longest lag and window.
    lookback = max(ROLLING_WINDOWS) + max(LAG_PERIODS) + 5
    featured = create_features(history_df.tail(lookback))

    latest_row = featured.tail(1)[MODEL_FEATURES]
    raw_prediction = model.predict(latest_row)[0]

    allocation = np.clip(1 + BEST_C * raw_prediction, 0, 2)

    gc.collect()

    return float(allocation)

# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# KAGGLE_IS_COMPETITION_RERUN being set selects serving; otherwise the local
# gateway replays the files under LOCAL_GATEWAY_PATH for testing.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("Running local gateway for testing...")
    inference_server.run_local_gateway((LOCAL_GATEWAY_PATH,))
else:
    print("Serving predictions for the competition...")
    inference_server.serve()

print("Submission script finished.")