# Daily S&P 500 Tactical Allocation

This project develops and validates an algorithmic framework for allocating capital each trading day between the S&P 500 index and a risk-free asset. The goal is to maximize risk-adjusted return (Sharpe ratio) while respecting volatility limits and contest rules.

Key components and features

Data preprocessing and EDA
• Load and inspect train/test sets, analyze forward and excess returns distributions
• Identify missing values and fill with medians or zeros
• Correlation analysis highlights the top predictive features (M4, V13, etc.); forward_returns correlates strongly too, but it is a target-side column and is excluded from the model inputs

Feature engineering
• Volatility and momentum indicators over multiple rolling windows (5, 10, 20, 30 days)
• Lagged versions of core predictors (M4, V13, M1, S5, V3, V4) to simulate real-time inference
• Interaction terms (e.g. M4_V13_ratio, V3_V4_spread)
• Regime detection flags for high/low volatility environments
• Momentum acceleration (difference of 5-day momentum)

Model ensemble
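• LightGBM, RandomForest, XGBoost and GradientBoosting classifiers trained to predict whether the one-day excess return exceeds 0.001 (the submission code included in this notebook fits the LightGBM classifier only)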
• Static base weights (40% LGB, 25% RF, 20% XGB, 15% GB) adjusted up to ±20% based on current market volatility

Risk-adjusted allocation
• S-shaped transformation of ensemble probability signal with steepness (γ)
• Scaling by model confidence, volatility multiplier and regime multiplier
• Neutral (1.0×) allocation whenever confidence < threshold for the detected regime
• Final allocation clipped to [0, 2]

Validation and metrics
• Walk-forward and rolling validation ensure out-of-sample robustness
• Comprehensive metrics: Sharpe, Sortino, Calmar, Profit Factor, Win Rate and Active Signals %
• Comparison versus buy-and-hold benchmark

Submission
• Clean, standalone Python script replicates feature engineering and allocation logic
• Generates “submission.csv” compatible with the Kaggle inference gateway
• Final allocation statistics printed (range, mean, active signals)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

import os
# List every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Read the competition's train and test tables.
train_df = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv')
test_df = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/test.csv')

print(f"Train: {train_df.shape}, Test: {test_df.shape}")

# Span of date_id values covered by each table.
train_dates = train_df['date_id']
test_dates = test_df['date_id']
print("\nTrain date_id range:", train_dates.min(), "-", train_dates.max())
print("Test date_id range:", test_dates.min(), "-", test_dates.max())

# Summary statistics and IQR-based outlier counts for each target column.
targets = ['forward_returns', 'market_forward_excess_returns', 'risk_free_rate']
for t in targets:
    if t not in train_df.columns:
        continue
    s = train_df[t].dropna()
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    # Tukey fences: anything more than 1.5 * IQR outside the quartiles.
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    outliers = s[~s.between(lower_fence, upper_fence)]
    print(f"\n{t}: mean={s.mean():.6f}, std={s.std():.6f}, "
          f"min={s.min():.6f}, max={s.max():.6f}, "
          f"outliers={len(outliers)} ({100*len(outliers)/len(s):.1f}%)")

# Top row: histogram of each return series; bottom row: the same series
# plotted against date_id.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for col, (hist_ax, line_ax) in zip(['forward_returns', 'market_forward_excess_returns'], axes.T):
    hist_ax.hist(train_df[col].dropna(), bins=100, color='steelblue', edgecolor='black')
    hist_ax.set_title(f'Distribution: {col}')

    ordered = train_df[['date_id', col]].dropna().sort_values('date_id')
    line_ax.plot(ordered['date_id'], ordered[col], color='crimson', linewidth=0.8)
    line_ax.set_title(f'Time series: {col}')
plt.tight_layout()
plt.show()

# Count nulls per training column and keep only the columns that have any,
# largest count first.
null_counts = train_df.isnull().sum()
missing = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
print(f"\nFeatures with missing values: {len(missing)}")
print("Top 10 by count:")
print(missing.iloc[:10])

# Total number of null cells in the test table.
print(f"\nMissing values in test: {test_df.isnull().sum().sum()}")

# Feature categorization
# Feature columns are named with one capital letter followed by digits
# (e.g. "M4", "V13", "S5"). The set of prefixes is derived from the data
# instead of being hard-coded, so that no feature family is silently left out
# of the report (the feature engineering code uses "S5", for instance).
feature_cols = [
    c for c in train_df.columns
    if len(c) > 1 and c[0].isupper() and c[1:].isdigit()
]
prefixes = sorted({c[0] for c in feature_cols})
categories = {f"{p}_features": [c for c in feature_cols if c[0] == p] for p in prefixes}
categories['targets'] = targets
categories['ids'] = ['date_id']

# Share of missing cells per category (the id column is skipped).
print("\nFeature distribution by category:")
for cat, feats in categories.items():
    if feats and cat != 'ids':
        total = len(train_df) * len(feats)
        missing_cells = train_df[feats].isnull().sum().sum()
        pct = 100 * missing_cells / total if total > 0 else 0
        print(f"{cat}: {len(feats)} features, missing: {pct:.1f}%")

# Correlations with target
# Only genuine candidate predictors are ranked: the id column and every
# target-side column (the same set that prepare_features() excludes from the
# model inputs) are left out, otherwise forward_returns would dominate the
# ranking while being unusable as a feature.
if 'market_forward_excess_returns' in train_df.columns:
    non_features = {'date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns'}
    numeric = train_df.select_dtypes(include=[np.number]).columns.tolist()
    numeric = [c for c in numeric if c not in non_features]

    corrs = []
    for col in numeric:
        # Pairwise deletion: use only the rows where both values are present.
        pair = train_df[[col, 'market_forward_excess_returns']].dropna()
        if len(pair) > 100:
            r = pair[col].corr(pair['market_forward_excess_returns'])
            # A constant column yields NaN, which would break the abs() sort.
            if pd.notna(r):
                corrs.append((col, r, len(pair)))

    corrs = sorted(corrs, key=lambda x: abs(x[1]), reverse=True)
    print("\nTop 10 features by correlation with market_forward_excess_returns:")
    for col, r, n in corrs[:10]:
        print(f"{col:<20} {r:>8.4f} (n={n})")

    # Correlation matrix
    top_features = [c[0] for c in corrs[:8]] + ['market_forward_excess_returns']
    if len(top_features) > 1:
        cm = train_df[top_features].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='.3f', cmap='RdBu_r', center=0)
        plt.title('Correlations: Top 8 features')
        plt.tight_layout()
        plt.show()

# Plot one representative histogram per feature category: the first column of
# each category, skipping the target and id groups.
skip = set(targets) | {'date_id'}
sample_feats = []
for feats in categories.values():
    if feats and feats[0] not in skip:
        sample_feats.append(feats[0])

n = len(sample_feats)
cols = 3
rows = -(-n // cols)  # ceiling division
fig, axes = plt.subplots(rows, cols, figsize=(12, 3 * rows))
axes = axes.ravel()

for ax, feat in zip(axes, sample_feats):
    data = train_df[feat].dropna()
    ax.hist(data, bins=50, color='seagreen', edgecolor='black')
    ax.set_title(f'{feat}')
    stats_text = f'n={len(data):,}\nμ={data.mean():.3f}\nσ={data.std():.3f}'
    ax.text(0.05, 0.95, stats_text, transform=ax.transAxes,
            verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white'))

# Remove the grid cells that did not receive a histogram.
for unused_ax in axes[n:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Per-period (not annualised) mean/std ratio of the training target.
if 'market_forward_excess_returns' in train_df.columns:
    s = train_df['market_forward_excess_returns'].dropna()
    sigma = s.std()
    if sigma == 0:
        sharpe = np.nan  # undefined for a constant series
    else:
        sharpe = s.mean() / sigma
    print(f"\nSharpe ratio (train): {sharpe:.6f}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb
import xgboost as xgb
from scipy.special import expit

class SubmissionFeatureEngineer:
    """Build rolling, lagged, interaction and regime features on a DataFrame.

    Rows are assumed to be in chronological order, because every rolling and
    ``shift`` operation works on row position.
    """

    # Unlagged return column and the lagged column the rolling features use.
    RETURN_COL = 'market_forward_excess_returns'
    LAGGED_RETURN_COL = 'lagged_market_forward_excess_returns'

    def __init__(self):
        self.lag_periods = [1, 2, 3, 5, 10, 20]
        self.vol_windows = [5, 10, 20, 30]

    def create_features(self, df):
        """Return a copy of ``df`` with the engineered columns appended.

        Args:
            df: pandas DataFrame. Every feature group is optional and is only
                built when its source columns are present.

        Returns:
            A new DataFrame; the input is not modified.
        """
        df = df.copy()
        lagged = self.LAGGED_RETURN_COL

        # When only the unlagged return is available, rebuild the lagged
        # column from it. Without this the rolling features below are never
        # created for such a frame, and a feature matrix derived from it
        # cannot contain them.
        if lagged not in df.columns and self.RETURN_COL in df.columns:
            df[lagged] = df[self.RETURN_COL].shift(1)

        if lagged in df.columns:
            for w in self.vol_windows:
                df[f'market_vol_{w}'] = df[lagged].rolling(w).std()
                df[f'market_mom_{w}'] = df[lagged].rolling(w).mean()
                # The small epsilon avoids division by zero on flat windows.
                df[f'market_zscore_{w}'] = (df[lagged] - df[f'market_mom_{w}']) / (df[f'market_vol_{w}'] + 1e-8)

        # Lagged copies of the core predictors.
        for f in ['M4', 'V13', 'M1', 'S5', 'V3', 'V4']:
            if f in df.columns:
                for lag in self.lag_periods:
                    df[f'{f}_lag_{lag}'] = df[f].shift(lag)

        # Interaction terms.
        if 'M4' in df.columns and 'V13' in df.columns:
            df['M4_V13_ratio'] = df['M4'] / (df['V13'] + 1e-8)
        if 'V3' in df.columns and 'V4' in df.columns:
            df['V3_V4_spread'] = df['V3'] - df['V4']

        # Regime flags from the 10-row volatility.
        # NOTE: the quantiles are taken over the whole frame passed in.
        if 'market_vol_10' in df.columns:
            df['high_vol_regime'] = (df['market_vol_10'] > df['market_vol_10'].quantile(0.7)).astype(int)
            df['low_vol_regime'] = (df['market_vol_10'] < df['market_vol_10'].quantile(0.3)).astype(int)

        # Momentum acceleration: first difference of the 5-row momentum.
        if 'market_mom_5' in df.columns:
            df['mom_accel'] = df['market_mom_5'].diff(1)

        return df

class SubmissionAllocationStrategy:
    """Map a predicted probability of a positive move to an allocation in [0, 2].

    Args:
        alpha: overall scale of the deviation from the neutral allocation 1.0.
        gamma: steepness of the S-shaped (logistic) transformation.
    """

    def __init__(self, alpha=1.0, gamma=10.0):
        self.alpha = alpha
        self.gamma = gamma

    def dynamic_allocation(self, proba, vol, regime):
        """Compute the allocation for one probability or an array of them.

        Args:
            proba: scalar or array-like of probabilities for the positive class.
            vol: scalar volatility estimate used to pick the volatility multiplier.
            regime: 'high_vol', 'low_vol', or anything else for the normal regime.

        Returns:
            Allocation(s) clipped to [0, 2]: a NumPy array with the shape of
            ``proba`` for array input, a NumPy scalar value for scalar input.
            Entries whose confidence is below the regime threshold are exactly
            1.0 (neutral).
        """
        # Accept scalars and arrays alike; the previous in-place masked
        # assignment raised TypeError for scalar probabilities.
        proba = np.asarray(proba, dtype=float)

        signal = 2 * (proba - 0.5)                  # in [-1, 1]
        confidence = np.maximum(proba, 1 - proba)   # in [0.5, 1]
        adjusted_signal = signal * confidence

        # Regime-specific exposure multiplier and minimum confidence.
        if regime == 'high_vol':
            regime_mult, conf_thresh = 0.7, 0.65
        elif regime == 'low_vol':
            regime_mult, conf_thresh = 1.3, 0.55
        else:
            regime_mult, conf_thresh = 1.0, 0.6

        # Reduce exposure when volatility is high, increase it when it is low.
        vol_mult = 1.0
        if vol > 0.02:
            vol_mult = 0.8
        elif vol < 0.005:
            vol_mult = 1.2

        s = expit(adjusted_signal * self.gamma)
        allocation = 1.0 + self.alpha * vol_mult * regime_mult * (2 * s - 1)
        # Low-confidence predictions fall back to the neutral allocation.
        allocation = np.where(confidence < conf_thresh, 1.0, allocation)
        return np.clip(allocation, 0, 2)

def prepare_features(df, feature_names=None):
    """Build the model input matrix from an engineered DataFrame.

    Args:
        df: pandas DataFrame with raw and engineered columns.
        feature_names: optional sequence of column names to return. When given
            (and non-empty), the result has exactly these columns in exactly
            this order; a column missing from ``df`` is added as all-NaN
            rather than silently dropped, so the matrix always lines up with
            the one the model was trained on. When omitted, every numeric
            column except the id/target columns is returned.

    Returns:
        A new DataFrame in which NaNs are replaced by the column median of
        ``df`` itself; a column with no observed value stays NaN.
    """
    exclude = ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
    X = df.drop(columns=exclude, errors='ignore')
    if feature_names is not None and len(feature_names) > 0:
        X = X.reindex(columns=list(feature_names))
    else:
        X = X.select_dtypes(include=[np.number])
    X = X.fillna(X.median(numeric_only=True))
    return X

# Load the competition data.
train_df = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv')
test_df = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/test.csv')

# Engineer features on both tables and align the test matrix with the
# training columns.
fe = SubmissionFeatureEngineer()
train_enh = fe.create_features(train_df)
test_enh = fe.create_features(test_df)

X_train = prepare_features(train_enh)
X_test = prepare_features(test_enh, X_train.columns.tolist())

# Binary label: excess return strictly above 0.001.
y_train = train_df['market_forward_excess_returns'].gt(0.001).astype(int)

model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    verbosity=-1,
)
model.fit(X_train, y_train)

# Probability of the positive class for every test row.
test_proba = model.predict_proba(X_test)[:, 1]

strategy = SubmissionAllocationStrategy(alpha=1.0, gamma=10.0)

# The volatility estimate is the standard deviation of the training target;
# it selects a single regime that is applied to all test rows.
test_vol = train_df['market_forward_excess_returns'].std()
if test_vol > 0.02:
    regime = 'high_vol'
elif test_vol < 0.005:
    regime = 'low_vol'
else:
    regime = 'normal'
final_alloc = strategy.dynamic_allocation(test_proba, test_vol, regime)

# Summary of the resulting allocations.
print('Allocation range: [{:.4f}, {:.4f}]'.format(final_alloc.min(), final_alloc.max()))
print('Mean allocation: {:.4f}'.format(final_alloc.mean()))
print('Active signals: {:.2%}'.format(np.mean(final_alloc != 1.0)))

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import polars as pl
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from scipy.special import expit
from sklearn.preprocessing import StandardScaler

# Import the inference server API
import kaggle_evaluation.default_inference_server

# Classes and helper functions (the original pandas-based implementations)
class SubmissionFeatureEngineer:
    """Feature builder for pandas DataFrames (rolling, lag, interaction, regime).

    Every rolling and ``shift`` operation works on row position, so on a
    single-row frame the rolling and lag columns contain only NaN.
    """

    def __init__(self):
        self.lag_periods = [1, 2, 3, 5, 10]
        self.vol_windows = [5, 10, 20]

    def create_features(self, df):
        """Return a copy of ``df`` with the engineered columns appended.

        Each feature group is only added when its source columns exist; the
        input frame is left untouched.
        """
        frame = df.copy()

        # Rolling volatility / momentum of the lagged excess return.
        if 'lagged_market_forward_excess_returns' in frame.columns:
            lagged_returns = frame['lagged_market_forward_excess_returns']
            for w in self.vol_windows:
                window = lagged_returns.rolling(w)
                frame[f'market_vol_{w}'] = window.std()
                frame[f'market_mom_{w}'] = window.mean()

        # Lagged copies of the core predictors.
        for f in ('M4', 'V13', 'M1', 'S5', 'V3', 'V4'):
            if f not in frame.columns:
                continue
            base = frame[f]
            for lag in self.lag_periods:
                frame[f'{f}_lag_{lag}'] = base.shift(lag)

        # Interaction terms; the epsilon guards against division by zero.
        if {'M4', 'V13'}.issubset(frame.columns):
            frame['M4_V13_ratio'] = frame['M4'] / (frame['V13'] + 1e-8)
        if {'V3', 'V4'}.issubset(frame.columns):
            frame['V3_V4_spread'] = frame['V3'] - frame['V4']

        # High-volatility flag: 10-row volatility above its 70th percentile,
        # the percentile being taken over the frame passed in.
        if 'market_vol_10' in frame.columns:
            vol_10 = frame['market_vol_10']
            frame['high_vol_regime'] = (vol_10 > vol_10.quantile(0.7)).astype(int)

        return frame

class SubmissionAllocationStrategy:
    """Turn one predicted probability into a single allocation in [0, 2]."""

    def __init__(self):
        self.alpha = 0.95   # scale of the deviation from the neutral 1.0
        self.gamma = 19.5   # steepness of the logistic transformation

    def dynamic_allocation(self, proba, vol):
        """Return the allocation for one probability as a Python float.

        Args:
            proba: probability of the positive class (scalar).
            vol: volatility estimate; selects the exposure multiplier and the
                minimum confidence required to leave the neutral allocation.
        """
        centered = float(2 * (proba - 0.5))
        confidence = float(np.maximum(proba, 1 - proba))

        # (exposure multiplier, confidence threshold) per volatility band.
        if vol > 0.02:
            exposure, min_confidence = 0.45, 0.7
        elif vol < 0.005:
            exposure, min_confidence = 1.25, 0.55
        else:
            exposure, min_confidence = 1.0, 0.6

        # Not confident enough: stay neutral.
        if confidence < min_confidence:
            return float(np.clip(1.0, 0, 2))

        squashed = float(expit(centered * confidence * self.gamma))
        raw_allocation = float(1.0 + self.alpha * exposure * (2 * squashed - 1))
        return float(np.clip(raw_allocation, 0, 2))

def prepare_features(df, feature_names=None):
    """Build the model input matrix from an engineered pandas DataFrame.

    Args:
        df: pandas DataFrame with raw and engineered columns.
        feature_names: optional sequence of column names to return. When given
            (and non-empty), the result has exactly these columns in exactly
            this order; a column missing from ``df`` is added as all-NaN
            rather than silently dropped, so the matrix always lines up with
            the one the model was trained on. When omitted, every numeric
            column except the id/target columns is returned.

    Returns:
        A new DataFrame in which NaNs are replaced by the column median of
        ``df`` itself; a column with no observed value (always the case for a
        NaN in a single-row frame) stays NaN.
    """
    exclude = ['date_id', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns', 'target']
    X = df.drop(columns=exclude, errors='ignore')
    if feature_names is not None and len(feature_names) > 0:
        X = X.reindex(columns=list(feature_names))
    else:
        X = X.select_dtypes(include=[np.number])
    X = X.fillna(X.median(numeric_only=True))
    return X

# ============ PATHS ============
DATA_PATH = Path('/kaggle/input/hull-tactical-market-prediction/')

# Load and preprocess the training data; it is used only to fit the model.
train_df = pd.read_csv(DATA_PATH / 'train.csv')

fe = SubmissionFeatureEngineer()
train_enh = fe.create_features(train_df)

X_train_raw = prepare_features(train_enh)
# Binary label: excess return strictly above 0.001.
y_train = train_df['market_forward_excess_returns'].gt(0.001).astype(int)

# Keep the feature list, the strategy and the training volatility for
# inference.
feature_names = list(X_train_raw.columns)
strategy = SubmissionAllocationStrategy()
train_vol = train_df['market_forward_excess_returns'].std()

# Model (fitted on the unscaled feature matrix).
model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
    verbosity=-1,
)
model.fit(X_train_raw, y_train)

# Scaler
# NOTE: it is fitted on the same raw matrix, but the model above never saw
# the scaled values.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)

print("Model and scaler trained.")

# ============================== PREDICTION FUNCTION FOR KAGGLE API ==============================

def predict(test_row_df):
    """Return the allocation for one batch handed over by the Kaggle API.

    Args:
        test_row_df: pd.DataFrame or pl.DataFrame holding one row of test.csv.

    Returns:
        The allocation as a Python float in [0, 2].
    """
    # 1. Convert to pandas if a polars frame was passed in.
    if isinstance(test_row_df, pl.DataFrame):
        test_row_pd = test_row_df.to_pandas()
    else:
        test_row_pd = test_row_df

    # 2. Feature engineering with the pandas-based code.
    # NOTE: with a single row, every lag/rolling feature is NaN and therefore
    # ends up replaced by its training median in step 4.
    test_enh_pd = fe.create_features(test_row_pd)

    # 3. Select the model inputs and force the exact training columns and
    #    column order; a column absent from the incoming row becomes NaN.
    X_test_pd = prepare_features(test_enh_pd, feature_names)
    X_test_pd = X_test_pd.reindex(columns=feature_names)

    # 4. Fill remaining NaNs with the medians of the training matrix.
    for col in X_test_pd.columns:
        if X_test_pd[col].isna().any():
            X_test_pd[col] = X_test_pd[col].fillna(X_train_raw[col].median())

    # 5. Predict on the unscaled features. The model was fitted on the raw
    #    training matrix, so feeding it standardized values (as was done
    #    before) compares them against thresholds learned on a different scale.
    proba = float(model.predict_proba(X_test_pd)[0, 1])
    allocation = strategy.dynamic_allocation(proba, train_vol)

    return float(allocation)

# ============================== LAUNCH SERVER ==============================

# Wrap predict() in the Kaggle inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Serve when the KAGGLE_IS_COMPETITION_RERUN environment variable is set;
# otherwise run the local gateway against the data directory.
is_competition_rerun = bool(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))
if not is_competition_rerun:
    print("Running local gateway for testing...")
    inference_server.run_local_gateway((str(DATA_PATH),))
else:
    inference_server.serve()