In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The metric is a **Modified Sharpe Ratio**. Its goal is to find a strategy that is not just profitable on a risk-adjusted basis (the standard Sharpe Ratio), but one that also definitively beats the market without taking reckless risks.

---

## The Core Components (with Formulas)

Let's define our variables for a given day $t$:
* $p_t$ = Your position signal (from 0 to 2)
* $R_{m,t}$ = The market's forward return
* $R_{f,t}$ = The risk-free rate

#### 1. Strategy Daily Return ($R_{s,t}$)
This is the return your portfolio earns each day. It's a weighted average based on your signal: the portion in the market ($p_t$) earns the market return, and the portion in cash ($1-p_t$) earns the risk-free rate.

$$R_{s,t} = p_t \cdot R_{m,t} + (1 - p_t) \cdot R_{f,t}$$

---
#### 2. Annualized Sharpe Ratio ($S$)
This is the standard measure of risk-adjusted return. It uses the **geometric mean** of your strategy's *excess returns* ($R_{s,t} - R_{f,t}$) and divides it by the volatility (standard deviation) of your strategy's returns. It's then annualized by multiplying by $\sqrt{252}$ (the approximate number of trading days in a year).

* Strategy Excess Return: $R_{se,t} = R_{s,t} - R_{f,t}$
* Geometric Mean Excess Return: $\bar{R}_{se,g} = \left( \prod_{t=1}^{N} (1 + R_{se,t}) \right)^{1/N} - 1$
* Strategy Volatility: $\sigma_s = \text{StDev}(R_{s,1}, \dots, R_{s,N})$

$$S = \frac{\bar{R}_{se,g}}{\sigma_s} \cdot \sqrt{252}$$

---
#### 3. Volatility Penalty ($P_v$)
This penalizes you if your strategy's annualized volatility ($V_s$) is more than 20% higher than the market's annualized volatility ($V_m$).

* Strategy Annualized Volatility: $V_s = \sigma_s \cdot \sqrt{252}$
* Market Annualized Volatility: $V_m = \text{StDev}(R_{m,1}, \dots, R_{m,N}) \cdot \sqrt{252}$

$$P_v = 1 + \max\left(0, \frac{V_s}{V_m} - 1.2\right)$$

---
#### 4. Return Penalty ($P_r$)
This heavily penalizes you if your strategy's annualized geometric mean return is lower than the market's. The penalty is **quadratic**, meaning it grows very quickly as the performance gap widens.

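* Annualized Return Gap ($G_r$), expressed in percentage points: $G_r = \max(0, (\bar{R}_{me,g} - \bar{R}_{se,g}) \cdot 100 \cdot 252)$, where $\bar{R}_{me,g}$ is the geometric mean of the market's excess returns $R_{m,t} - R_{f,t}$ (computed the same way as $\bar{R}_{se,g}$).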

$$P_r = 1 + \frac{G_r^2}{100}$$

---
#### 5. Final Adjusted Sharpe Ratio ($S_{\text{adj}}$)
Your final score is the standard Sharpe Ratio, diminished by any penalties you incurred.

$$S_{\text{adj}} = \frac{S}{P_v \cdot P_r}$$

---
## A Step-by-Step Calculation Example

Let's use a simple 3-day period.

| Day (t) | Market Return ($R_{m,t}$) | Risk-Free Rate ($R_{f,t}$) | Your Signal ($p_t$) |
| :--- | :--- | :--- | :--- |
| 1 | +1.0% | 0.01% | 1.5 |
| 2 | -2.0% | 0.01% | 0.2 |
| 3 | +1.5% | 0.01% | 2.0 |

#### Step 1: Calculate Daily Returns
* **Strategy Return ($R_{s,t}$)**:
    * Day 1: $1.5 \cdot 0.01 + (1 - 1.5) \cdot 0.0001 = 0.01495 = +1.495\%$
    * Day 2: $0.2 \cdot -0.02 + (1 - 0.2) \cdot 0.0001 = -0.00392 = -0.392\%$
    * Day 3: $2.0 \cdot 0.015 + (1 - 2.0) \cdot 0.0001 = 0.0299 = +2.99\%$
* **Strategy Excess Return ($R_{se,t}$)**:
    * Day 1: $1.495\% - 0.01\% = +1.485\%$
    * Day 2: $-0.392\% - 0.01\% = -0.402\%$
    * Day 3: $2.99\% - 0.01\% = +2.98\%$
* **Market Excess Return ($R_{me,t}$)**:
    * Day 1: $1.0\% - 0.01\% = +0.99\%$
    * Day 2: $-2.0\% - 0.01\% = -2.01\%$
    * Day 3: $1.5\% - 0.01\% = +1.49\%$

#### Step 2: Calculate Key Statistics (Mean & Std Dev)
Standard deviations here are *sample* standard deviations (divisor $N-1$), which is what the pandas `.std()` call in the scoring code computes.

* **Strategy Geometric Mean**: $\bar{R}_{se,g} = ((1.01485) \cdot (0.99598) \cdot (1.0298))^{1/3} - 1 = 0.01345 = 1.345\%$
* **Market Geometric Mean**: $\bar{R}_{me,g} = ((1.0099) \cdot (0.9799) \cdot (1.0149))^{1/3} - 1 = 0.00145 = 0.145\%$
* **Strategy StDev**: $\sigma_s = \text{StDev}([0.01495, -0.00392, 0.0299]) = 0.01695$
* **Market StDev**: $\sigma_m = \text{StDev}([0.01, -0.02, 0.015]) = 0.01893$

#### Step 3: Calculate the Unadjusted Sharpe Ratio
$$S = \frac{0.01345}{0.01695} \cdot \sqrt{252} \approx 12.6$$

#### Step 4: Calculate the Penalties
* **Volatility Penalty ($P_v$)**:
    * $V_s = 0.01695 \cdot \sqrt{252} \approx 0.2691$ (26.91%)
    * $V_m = 0.01893 \cdot \sqrt{252} \approx 0.3005$ (30.05%)
    * Ratio: $26.91 / 30.05 \approx 0.90$. This is **less than 1.2**, so there is no penalty.
    * $P_v = 1 + \max(0, 0.90 - 1.2) = 1.0$
* **Return Penalty ($P_r$)**:
    * Your strategy's mean (1.345%) is **greater than** the market's mean (0.145%), so the return gap $G_r$ is 0 and there is no penalty.
    * $P_r = 1.0$

#### Step 5: Calculate the Final Adjusted Score
$$S_{\text{adj}} = \frac{12.6}{1.0 \cdot 1.0} = 12.6$$
In this example, because the strategy comfortably beat the market's return and stayed within its volatility budget, the final score is the same as the unadjusted Sharpe Ratio.

# Feature Engineering

In [None]:
import polars as pl
import numpy as np
import xgboost as xgb
import itertools
import os


# --- 1. Metric Implementation (No changes here) ---
def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
    """Score a series of position signals with the penalized Sharpe ratio.

    Args:
        y_true_df: Frame exposing ``to_pandas()`` whose result has the
            ``forward_returns`` and ``risk_free_rate`` columns.
        y_pred_signals: One position per row of ``y_true_df``.

    Returns:
        The annualized Sharpe ratio of the strategy divided by the volatility
        and return penalties, or ``0.0`` when the strategy returns have zero
        standard deviation.
    """
    frame = y_true_df.to_pandas()
    frame['position'] = y_pred_signals

    # Daily strategy return: cash leg earns the risk-free rate, market leg
    # earns the forward return.
    cash_leg = frame['risk_free_rate'] * (1 - frame['position'])
    market_leg = frame['position'] * frame['forward_returns']
    frame['strategy_returns'] = cash_leg + market_leg

    excess_strategy = frame['strategy_returns'] - frame['risk_free_rate']
    geo_strategy = (1 + excess_strategy).prod() ** (1 / len(frame)) - 1
    std_strategy = frame['strategy_returns'].std()
    if std_strategy == 0:
        return 0.0

    days_per_year = 252
    raw_sharpe = geo_strategy / std_strategy * np.sqrt(days_per_year)

    # Volatility penalty: only applies above 1.2x the market's volatility.
    std_market = frame['forward_returns'].std()
    vol_market = std_market * np.sqrt(days_per_year) * 100
    vol_strategy = std_strategy * np.sqrt(days_per_year) * 100
    if vol_market > 0:
        vol_overshoot = max(0, vol_strategy / vol_market - 1.2)
    else:
        vol_overshoot = 0
    penalty_vol = 1 + vol_overshoot

    # Return penalty: quadratic in the annualized shortfall (in percent)
    # relative to the market's geometric mean excess return.
    excess_market = frame['forward_returns'] - frame['risk_free_rate']
    geo_market = (1 + excess_market).prod() ** (1 / len(frame)) - 1
    shortfall = max(0, (geo_market - geo_strategy) * 100 * days_per_year)
    penalty_return = 1 + (shortfall**2) / 100

    return raw_sharpe / (penalty_vol * penalty_return)


# --- 2. Feature Engineering (MODIFIED) ---
def create_and_save_interaction_features(df: pl.DataFrame, batch_size: int = 20, output_dir="features") -> list[str]:
    """
    Generates pairwise interaction features in batches to conserve memory and saves them to disk.

    Base features are all columns of ``df`` except ``date_id``,
    ``forward_returns``, ``risk_free_rate`` and ``target``. For every unordered
    pair of base features three columns are produced, named
    ``{f1}_add_{f2}``, ``{f1}_sub_{f2}`` and ``{f1}_mult_{f2}``. Each batch of
    ``batch_size`` base features is paired with itself and with all features
    from earlier batches, and written to
    ``{output_dir}/interactions_batch_{k}.parquet``.

    Args:
        df: Source frame holding the base feature columns.
        batch_size: Number of base features handled per output file.
        output_dir: Directory for the parquet files; created if missing.

    Returns:
        Paths of the parquet files written, in batch order. Batches that
        yield no interaction columns produce no file.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    excluded = {"date_id", "forward_returns", "risk_free_rate", "target"}
    base_features = [col for col in df.columns if col not in excluded]
    print(f"Starting batched feature generation for {len(base_features)} base features...")

    file_paths = []

    # Iterate through the features in chunks
    for batch_index, start in enumerate(range(0, len(base_features), batch_size)):
        batch_features = base_features[start:start + batch_size]
        previous_features = base_features[:start]

        # Pairs within the current batch first, then current-batch x all
        # earlier features, so every unordered pair is produced exactly once.
        pairs = itertools.chain(
            itertools.combinations(batch_features, 2),
            itertools.product(batch_features, previous_features),
        )

        # Collect lazy expressions and evaluate them in one select() instead
        # of calling with_columns() on a growing frame once per pair.
        interaction_exprs = []
        for f1, f2 in pairs:
            left, right = pl.col(f1), pl.col(f2)
            interaction_exprs.extend([
                (left + right).alias(f'{f1}_add_{f2}'),
                (left - right).alias(f'{f1}_sub_{f2}'),
                (left * right).alias(f'{f1}_mult_{f2}'),
            ])

        if not interaction_exprs:
            continue

        batch_interaction_df = df.select(interaction_exprs)
        file_path = f"{output_dir}/interactions_batch_{batch_index}.parquet"
        batch_interaction_df.write_parquet(file_path)
        file_paths.append(file_path)
        print(f"  ... Saved batch {batch_index} with {batch_interaction_df.width} features to {file_path}")

    return file_paths

# --- Main Script ---
# 1. Load and do initial prep
# Read the raw competition training data and rename the excess-return column
# to 'target', the label name used by the rest of this notebook.
full_train_df = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
full_train_df = full_train_df.rename({'market_forward_excess_returns': 'target'})

# Explicitly cast all columns except date_id to Float64 to ensure they are all numeric
# (strict=False: values that cannot be cast become null instead of raising).
feature_cols = [col for col in full_train_df.columns if col != 'date_id']
full_train_df = full_train_df.with_columns(
    pl.col(feature_cols).cast(pl.Float64, strict=False)
)

# Handle nulls in the base data first: forward-fill every column, then drop
# any row that still contains a null.
# NOTE(review): pl.all() also forward-fills 'target' and the return columns.
base_df = full_train_df.with_columns(pl.all().forward_fill()).drop_nulls()

# 2. Generate and save interaction features in batches
interaction_files = create_and_save_interaction_features(base_df, batch_size=20)

# 3. Load all features for the selection process
print("\nLoading all original and generated features for selection...")
interaction_dfs = [pl.read_parquet(f) for f in interaction_files]
# Combine original data with all generated feature batches horizontally
# (the batch files were built from base_df, so row counts line up).
processed_df = pl.concat([base_df] + interaction_dfs, how="horizontal")

# 4. Chronological split: hold out the last VALIDATION_SIZE rows.
VALIDATION_SIZE = 180
train_df = processed_df.head(-VALIDATION_SIZE)
# We don't need the validation set for feature selection, only training data
ALL_FEATURES = [col for col in train_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
TARGET_COL = "target"
X_train_all = train_df.select(ALL_FEATURES)
y_train = train_df.select(TARGET_COL)

print(f"\nGenerated a total of {len(ALL_FEATURES)} features for selection.")

# 5. Feature Selection using XGBoost Importance
print("\nStarting feature selection...")
N_FEATURES_TO_SELECT = 150
# device='cuda' means this cell expects a GPU-enabled session.
selector_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, random_state=42, n_jobs=-1, tree_method='hist', device='cuda')
selector_model.fit(X_train_all, y_train, verbose=False)

# Rank features by the fitted model's importances and keep the top N.
importances = selector_model.feature_importances_
feature_importance_df = pl.DataFrame({'feature': ALL_FEATURES, 'importance': importances}).sort('importance', descending=True)
selected_features = feature_importance_df.head(N_FEATURES_TO_SELECT).get_column('feature').to_list()

print(f"Selected the top {len(selected_features)} most important features.")
# Keep the selected feature columns plus the label and the two return columns
# the metric needs. This takes ALL rows of processed_df, including the last
# VALIDATION_SIZE rows that were excluded from feature selection.
final_training_data = processed_df.select(
    selected_features + ["target", "forward_returns", "risk_free_rate"]
)
# 6. Save the selected feature columns (with target/return columns) as a
# parquet file; its column names double as the feature list for later cells.
output_filename = "final_training_data_150_features.parquet"
final_training_data.write_parquet(output_filename)

print(f"Successfully saved final training data with {final_training_data.width} columns to '{output_filename}'")

Cross validation

In [None]:
# === Run this in a SEPARATE "Experimentation" Notebook ===

import polars as pl
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

# --- Copy the metric and signal functions from your other notebook ---

def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
    """Compute the competition's adjusted Sharpe ratio for a set of signals.

    ``y_true_df`` is converted via ``to_pandas()`` and must contain the
    ``forward_returns`` and ``risk_free_rate`` columns; ``y_pred_signals``
    supplies the position for each row. The plain annualized Sharpe ratio is
    divided by a volatility penalty and a return penalty. Returns ``0.0`` if
    the strategy's returns have zero standard deviation.
    """
    annual_days = 252

    data = y_true_df.to_pandas()
    data['position'] = y_pred_signals
    data['strategy_returns'] = (
        data['risk_free_rate'] * (1 - data['position'])
        + data['position'] * data['forward_returns']
    )

    # Strategy side: geometric mean of excess returns over std of raw returns.
    strat_excess = data['strategy_returns'] - data['risk_free_rate']
    strat_geo = (1 + strat_excess).prod() ** (1 / len(data)) - 1
    strat_std = data['strategy_returns'].std()
    if strat_std == 0:
        return 0.0
    sharpe_ratio = strat_geo / strat_std * np.sqrt(annual_days)

    # Annualized volatilities in percent; the ratio drives the penalty.
    mkt_std = data['forward_returns'].std()
    mkt_vol_pct = mkt_std * np.sqrt(annual_days) * 100
    strat_vol_pct = strat_std * np.sqrt(annual_days) * 100
    extra_vol = 0
    if mkt_vol_pct > 0:
        extra_vol = max(0, strat_vol_pct / mkt_vol_pct - 1.2)
    volatility_penalty = 1 + extra_vol

    # Market side: same geometric mean, used to measure the return shortfall.
    mkt_excess = data['forward_returns'] - data['risk_free_rate']
    mkt_geo = (1 + mkt_excess).prod() ** (1 / len(data)) - 1
    gap_pct = max(0, (mkt_geo - strat_geo) * 100 * annual_days)
    shortfall_penalty = 1 + (gap_pct**2) / 100

    result = sharpe_ratio / (volatility_penalty * shortfall_penalty)
    return result

def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
    """Turn raw model predictions into position signals in [0.0, 2.0].

    Each prediction is scaled by ``multiplier`` and shifted by 1 (a prediction
    of 0 maps to a position of 1), then clipped to the allowed range.
    """
    lowest_position, highest_position = 0.0, 2.0
    return np.clip(predictions * multiplier + 1, lowest_position, highest_position)

# --- Main Cross-Validation Script ---

# Number of time-series folds. Used for the splitter and for every progress
# message so the two can never disagree.
N_SPLITS = 5

print("Loading pre-processed training data for cross-validation...")
training_df = pl.read_parquet("/kaggle/working/final_training_data_150_features.parquet")

# Everything that is not the label or a metric input is a model feature.
FEATURES = [col for col in training_df.columns if col not in ["target", "forward_returns", "risk_free_rate"]]
TARGET_COL = "target"

X = training_df.select(FEATURES)
y = training_df.select(TARGET_COL)
# Columns the scoring function needs for each test fold.
scorer_info_df = training_df.select(["forward_returns", "risk_free_rate"])

print("\n" + "="*50)
print(f"Starting {N_SPLITS}-Fold Time Series Cross-Validation...")
tscv = TimeSeriesSplit(n_splits=N_SPLITS)
cv_scores = []

# NOTE(review): the feature columns in the parquet were chosen by a model fit
# on most of these same rows, so fold scores may be optimistic - confirm
# whether selection should be repeated inside each fold.
for i, (train_index, test_index) in enumerate(tscv.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    y_test_info = scorer_info_df[test_index]

    # Use your final model parameters for an accurate score estimate
    model = xgb.XGBRegressor(
        objective='reg:squarederror', n_estimators=500, device='cuda',
        learning_rate=0.05, max_depth=5, subsample=0.8, colsample_bytree=0.8,
        n_jobs=-1, random_state=42
    )
    model.fit(X_train, y_train, verbose=False)

    # Predict the held-out fold, map predictions to positions, and score them.
    predictions = model.predict(X_test)
    signals = convert_to_signal(predictions)
    score = calculate_competition_score(y_test_info, signals)
    cv_scores.append(score)
    print(f"  Fold {i+1}/{N_SPLITS} Score: {score:.4f}")

print(f"\nMean CV Score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")
print("="*50)

In [None]:
import os
import polars as pl
import numpy as np
import xgboost as xgb
import kaggle_evaluation.default_inference_server

# --- Global Variables ---
# Fitted XGBoost regressor; assigned by train_model_if_needed().
FINAL_MODEL = None
# Guard flag: once True, train_model_if_needed() returns immediately.
MODEL_IS_TRAINED = False
# Names of the model's input columns, read from the saved training parquet.
FINAL_FEATURE_LIST = [] 
# Recent rows of data: seeded from train.csv by train_model_if_needed() and
# extended with each incoming batch by predict().
HISTORY_BUFFER = None

# --- Helper Functions ---
def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
    """Scale predictions by ``multiplier``, add 1, and clip to [0.0, 2.0]."""
    shifted = predictions * multiplier + 1
    clipped = np.clip(shifted, 0.0, 2.0)
    return clipped

# The fast, "production" function for creating features during the prediction loop
def generate_final_features_optimized(df: pl.DataFrame, feature_list: list[str]) -> pl.DataFrame:
    """Build only the requested feature columns from ``df``.

    A requested name that is already a column of ``df`` is passed through
    unchanged. Any other name is parsed as ``<f1>_<op>_<f2>``, where ``f1`` is
    the text before the first underscore, ``op`` is one of ``add``, ``sub`` or
    ``mult``, and ``f2`` is everything after the second underscore; the
    feature is computed from the two source columns and aliased to the
    requested name.

    Names that cannot be resolved (unknown operator or missing source column)
    are skipped, so the result may have fewer columns than ``feature_list``.

    Args:
        df: Frame holding the base columns.
        feature_list: Feature names to produce, in output order.

    Returns:
        A frame with one column per resolvable feature, or an empty
        ``pl.DataFrame`` when nothing could be resolved.
    """
    # Dispatch table instead of an if/elif chain: an unknown operator now
    # yields None rather than leaving `expr` unbound or stale from the
    # previous loop iteration.
    combine = {
        'add': lambda left, right: left + right,
        'sub': lambda left, right: left - right,
        'mult': lambda left, right: left * right,
    }
    available = set(df.columns)
    final_feature_exprs = []

    for feature_name in feature_list:
        # Exact column match first, so base columns whose own name contains
        # an underscore are not misread as interactions.
        if feature_name in available:
            final_feature_exprs.append(pl.col(feature_name))
            continue

        f1, _, remainder = feature_name.partition('_')
        op, _, f2 = remainder.partition('_')
        builder = combine.get(op)
        if builder is None or f1 not in available or f2 not in available:
            continue
        final_feature_exprs.append(builder(pl.col(f1), pl.col(f2)).alias(feature_name))

    if not final_feature_exprs:
        return pl.DataFrame()
    return df.select(final_feature_exprs)

def train_model_if_needed():
    """Train the final model once and seed the history buffer.

    On the first call this reads the pre-processed parquet, fits
    ``FINAL_MODEL`` on every non-label column, stores those column names in
    ``FINAL_FEATURE_LIST``, and initialises ``HISTORY_BUFFER`` with the last
    35 cleaned rows of the raw training CSV. Subsequent calls return
    immediately because ``MODEL_IS_TRAINED`` is set.
    """
    global FINAL_MODEL, MODEL_IS_TRAINED, FINAL_FEATURE_LIST, HISTORY_BUFFER

    if MODEL_IS_TRAINED:
        return

    print("First prediction call received. Starting one-time model training...")

    # Load the pre-processed training data; every column that is not the
    # label or a return column is a model input.
    prepared = pl.read_parquet("/kaggle/working/final_training_data_150_features.parquet")
    non_feature_cols = ("target", "forward_returns", "risk_free_rate")
    FINAL_FEATURE_LIST = [name for name in prepared.columns if name not in non_feature_cols]
    print(f"Training final model with {len(FINAL_FEATURE_LIST)} pre-processed features.")

    # Train the final XGBoost model
    xgb_params = dict(
        objective='reg:squarederror',
        n_estimators=500,
        device='cuda',
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        n_jobs=-1,
        random_state=42,
    )
    FINAL_MODEL = xgb.XGBRegressor(**xgb_params)
    FINAL_MODEL.fit(
        prepared.select(FINAL_FEATURE_LIST),
        prepared.select("target"),
        verbose=False,
    )

    # Seed the history buffer from the raw data, with the label column renamed
    # to 'target' and the same cast / forward-fill / drop-nulls cleaning.
    raw = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
    raw = raw.rename({'market_forward_excess_returns': 'target'})
    numeric_cols = [name for name in raw.columns if name != 'date_id']
    cleaned = (
        raw.with_columns(pl.col(numeric_cols).cast(pl.Float64, strict=False))
        .with_columns(pl.all().forward_fill())
        .drop_nulls()
    )
    HISTORY_BUFFER = cleaned.tail(35)

    MODEL_IS_TRAINED = True
    print("Model training complete. Ready for predictions.")

# --- Main Prediction Function ---
def predict(test: pl.DataFrame) -> float:
    """Return the position signal for the newest row of ``test``.

    Triggers the one-time model training on the first call, normalises the
    incoming frame to the training schema, appends it to ``HISTORY_BUFFER``,
    and predicts from the most recent row.

    Args:
        test: Incoming batch. Every column except ``date_id`` is cast to
            Float64, the ``lagged_*`` columns are renamed to their training
            names, and ``is_scored`` is dropped if present.

    Returns:
        The clipped position signal as a Python ``float``.
    """
    global HISTORY_BUFFER, FINAL_MODEL, FINAL_FEATURE_LIST

    train_model_if_needed()

    # Standardize incoming data
    feature_cols = [col for col in test.columns if col != 'date_id']
    test = test.with_columns(pl.col(feature_cols).cast(pl.Float64, strict=False))
    rename_mapping = {
        'lagged_forward_returns': 'forward_returns',
        'lagged_risk_free_rate': 'risk_free_rate',
        'lagged_market_forward_excess_returns': 'target',
    }
    test = test.rename(rename_mapping)
    if 'is_scored' in test.columns:
        test = test.drop('is_scored')

    # Cap the buffer at the size train_model_if_needed() seeds it with, so it
    # does not grow by one row on every call for the lifetime of the server.
    max_history_rows = 35
    HISTORY_BUFFER = pl.concat([HISTORY_BUFFER, test], how="vertical").tail(max_history_rows)

    # The generated features are row-wise combinations of columns, so only
    # the newest row has to be evaluated to get the model input.
    latest_features = generate_final_features_optimized(HISTORY_BUFFER.tail(1), FINAL_FEATURE_LIST)

    raw_prediction = FINAL_MODEL.predict(latest_features)[0]
    signal = convert_to_signal(np.array([raw_prediction]))[0]

    return float(signal)

# --- Launch Server ---
# Wrap predict() in the evaluation server. Serve when the
# KAGGLE_IS_COMPETITION_RERUN environment variable is set; otherwise run the
# local gateway against the competition input directory.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)
competition_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')
if not competition_rerun:
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()