In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the input directory, then echo each full path.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import polars as pl
import numpy as np
import xgboost as xgb
import optuna

# --- 1. Metric Implementation ---
def calculate_competition_score(y_true_df: pl.DataFrame, y_pred_signals: np.ndarray) -> float:
    """Score a series of positions with a penalised, annualised Sharpe ratio.

    Args:
        y_true_df: Frame providing the ``forward_returns`` and
            ``risk_free_rate`` columns, one row per period.
        y_pred_signals: Position for each row; it is attached to the frame as
            the ``position`` column.

    Returns:
        The strategy's annualised Sharpe ratio divided by the product of a
        volatility penalty and a return-shortfall penalty, or ``0.0`` when the
        strategy returns have zero standard deviation.
    """
    trading_days_per_yr = 252
    annualisation = np.sqrt(trading_days_per_yr)

    frame = y_true_df.to_pandas()
    frame['position'] = y_pred_signals
    n_rows = len(frame)

    position = frame['position']
    risk_free = frame['risk_free_rate']
    forward = frame['forward_returns']

    def geometric_mean(excess_returns):
        # Per-period geometric mean of a series of excess returns.
        return (1 + excess_returns).prod() ** (1 / n_rows) - 1

    # Un-invested capital earns the risk-free rate, the rest earns the market return.
    strategy_returns = risk_free * (1 - position) + position * forward
    strategy_geo_mean = geometric_mean(strategy_returns - risk_free)
    strategy_std = strategy_returns.std()
    if strategy_std == 0:
        return 0.0

    sharpe = strategy_geo_mean / strategy_std * annualisation

    # Volatility penalty: only volatility beyond 1.2x the market's is penalised.
    market_volatility = forward.std() * annualisation * 100
    strategy_volatility = strategy_std * annualisation * 100
    if market_volatility > 0:
        excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
    else:
        excess_vol = 0
    vol_penalty = 1 + excess_vol

    # Return penalty: grows quadratically with the annualised shortfall versus the market.
    market_geo_mean = geometric_mean(forward - risk_free)
    return_gap = max(0, (market_geo_mean - strategy_geo_mean) * 100 * trading_days_per_yr)
    return_penalty = 1 + (return_gap**2) / 100

    return sharpe / (vol_penalty * return_penalty)


# --- 2. Feature Engineering ---
def create_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add lag and rolling-window statistics for every feature column.

    Every column except ``date_id``, ``forward_returns``, ``risk_free_rate``
    and ``target`` is treated as a feature. For each feature the following
    columns are appended, in this order:

    * ``{feature}_lag_{1,3,5}``: the feature shifted by 1, 3 and 5 rows;
    * ``{feature}_roll_mean_{w}`` / ``{feature}_roll_std_{w}`` for windows of
      10 and 30 rows.

    All derived expressions are collected first and applied in a single
    ``with_columns`` call instead of one call per lag/window, so the frame is
    not rebuilt several times per feature.

    Args:
        df: Input frame; rows are assumed to already be in chronological
            order (the function does not sort them).

    Returns:
        A new frame with the derived columns appended, every column
        forward-filled, and any row that still contains a null dropped.
    """
    excluded = {"date_id", "forward_returns", "risk_free_rate", "target"}
    lags = (1, 3, 5)
    windows = (10, 30)

    # Dynamically identify all feature columns to engineer
    features_to_engineer = [col for col in df.columns if col not in excluded]

    print(f"Starting feature engineering on {len(features_to_engineer)} columns...")

    derived = []
    for feature in features_to_engineer:
        source = pl.col(feature)
        # Lags
        derived.extend(source.shift(lag).alias(f'{feature}_lag_{lag}') for lag in lags)
        # Rolling window statistics
        for window in windows:
            derived.append(source.rolling_mean(window).alias(f'{feature}_roll_mean_{window}'))
            derived.append(source.rolling_std(window).alias(f'{feature}_roll_std_{window}'))

    df_eng = df.with_columns(derived)

    # Forward-fill gaps, then drop rows that still hold nulls (e.g. the
    # leading rows where lags and rolling windows have no history yet).
    return df_eng.with_columns(pl.all().forward_fill()).drop_nulls()

# --- 3. Signal Conversion ---
def convert_to_signal(predictions: np.ndarray, multiplier: float = 400.0) -> np.ndarray:
    """Turn raw predictions into positions bounded to the range [0, 2].

    A prediction of zero maps to a position of 1; each unit of prediction
    moves the position by ``multiplier`` before the result is clipped.

    Args:
        predictions: Raw model outputs.
        multiplier: Scale applied to the predictions before the offset of 1.

    Returns:
        ``predictions * multiplier + 1`` clipped to ``[0.0, 2.0]``.
    """
    lower_bound, upper_bound = 0.0, 2.0
    scaled = multiplier * predictions
    return np.clip(1 + scaled, lower_bound, upper_bound)

# # --- 4. Main Script ---
# # Load and prepare data
# full_train_df = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
# full_train_df = full_train_df.rename({'market_forward_excess_returns': 'target'})
# processed_df = create_features(full_train_df)

# # Chronological split
# VALIDATION_SIZE = 180
# train_df = processed_df.head(-VALIDATION_SIZE)
# validation_df = processed_df.tail(VALIDATION_SIZE)

# FEATURES = [col for col in train_df.columns if col not in ["date_id", "forward_returns", "risk_free_rate", "target"]]
# TARGET_COL = "target"

# X_train = train_df.select(FEATURES)
# y_train = train_df.select(TARGET_COL)
# X_val = validation_df.select(FEATURES)
# y_val_info = validation_df

# print(f"\nTraining with {len(FEATURES)} features.")

# # --- 5. Hyperparameter Tuning with Optuna ---
# def objective(trial):
#     params = {
#         'objective': 'reg:squarederror', 
#         'tree_method': 'hist',
#         'device' : 'cuda',
#         'n_estimators': 1000,
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
#         'max_depth': trial.suggest_int('max_depth', 3, 10),
#         'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#         'random_state': 42, 'n_jobs': -1,
#     }
#     model = xgb.XGBRegressor(**params)
#     model.fit(X_train, y_train, eval_set=[(X_val, y_val_info.select(TARGET_COL))],
#               early_stopping_rounds=50, verbose=False)
#     predictions = model.predict(X_val)
#     signals = convert_to_signal(predictions)
#     score = calculate_competition_score(y_val_info, signals)
#     return score

# print("\nStarting hyperparameter tuning with Optuna...")
# # Suppress Optuna's logging to keep the output clean
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50) # Use a reasonable number of trials

# print(f"Best trial score: {study.best_value}")
# print("Best parameters found: ", study.best_params)

# # --- 6. Train Final Model and Evaluate ---
# print("\nTraining final model with best parameters...")
# best_params = study.best_params
# final_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000,
#                                n_jobs=-1, random_state=42, **best_params)
# final_model.fit(X_train, y_train, eval_set=[(X_val, y_val_info.select(TARGET_COL))], verbose=False)

# raw_predictions = final_model.predict(X_val)
# final_signals = convert_to_signal(raw_predictions)
# final_score = calculate_competition_score(y_val_info, final_signals)

# print("\n" + "="*50)
# print(f"Final Validation Score with Tuned XGBoost: {final_score:.4f}")
# print("="*50)

# For submission

In [None]:
# --- 1. Train the Final Model on All Available Data ---
import kaggle_evaluation.default_inference_server
print("Loading and preparing all training data...")

# Read the complete training CSV and expose the market excess-return column
# under the name 'target'.
full_train_df = pl.read_csv(
    "/kaggle/input/hull-tactical-market-prediction/train.csv"
).rename({'market_forward_excess_returns': 'target'})

# Every column except date_id is cast to Float64 (non-strict cast).
feature_cols = [name for name in full_train_df.columns if name != 'date_id']
full_train_df = full_train_df.with_columns(
    pl.col(feature_cols).cast(pl.Float64, strict=False)
)

# Raw (un-engineered) tail of the training data, kept as the history that
# predict() appends new rows to. The longest rolling window is 30 rows, so
# 35 rows leaves a small margin.
HISTORY_BUFFER = full_train_df.tail(35)

# Engineer lag / rolling features over the full training history.
processed_df = create_features(full_train_df).drop_nulls()

TARGET_COL = "target"
_NON_FEATURE_COLUMNS = ("date_id", "forward_returns", "risk_free_rate", "target")
FEATURES = [name for name in processed_df.columns if name not in _NON_FEATURE_COLUMNS]

X_train, y_train = processed_df.select(FEATURES), processed_df.select(TARGET_COL)

# Hyperparameters hardcoded from an earlier Optuna search.
best_params = dict(
    learning_rate=0.1637303464891451,
    max_depth=10,
    subsample=0.6007188302958633,
    colsample_bytree=0.7562060623680535,
)

print(f"Training final XGBoost model on {X_train.shape[0]} rows with {len(FEATURES)} features...")

# Fixed number of boosting rounds; no validation set is used at this stage.
model_settings = {
    'objective': 'reg:squarederror',
    'n_estimators': 500,
    'n_jobs': -1,
    'random_state': 42,
    **best_params,
}
final_model = xgb.XGBRegressor(**model_settings)
final_model.fit(X_train, y_train, verbose=False)

print("Model training complete.")


# --- 2. Define the Prediction Function for the API ---

# --- 2. Define the Prediction Function for the API (CORRECTED) ---

def predict(test: pl.DataFrame) -> float:
    """Return the position signal for the newest row supplied by the API.

    Per the original notebook, this function is called by the Kaggle API for
    each day in the test set. The incoming row is appended to the module-level
    ``HISTORY_BUFFER`` so that lag and rolling features can be computed, and
    the model's prediction for the last engineered row is converted to a
    signal.

    Args:
        test: Frame holding the new observation(s). Its ``lagged_*`` columns
            are renamed to the training-set names; any column that is not
            part of ``HISTORY_BUFFER`` (for example ``is_scored``) is
            discarded.

    Returns:
        The clipped position signal as a plain ``float``.

    Side effects:
        Rebinds the global ``HISTORY_BUFFER`` to the history plus the new row.
    """
    global HISTORY_BUFFER

    # Standardize column names: rename lagged columns from the test set to
    # match the training set names.
    rename_mapping = {
        'lagged_forward_returns': 'forward_returns',
        'lagged_risk_free_rate': 'risk_free_rate',
        'lagged_market_forward_excess_returns': 'target'  # placeholder target
    }
    test = test.rename(rename_mapping)

    # A vertical concat needs identical column names, order and dtypes, so
    # project the row onto the history schema explicitly rather than relying
    # on the test file's column layout. This also drops extra columns such as
    # 'is_scored' and applies the same non-strict casts used for the history.
    test = test.select(
        [
            pl.col(name).cast(dtype, strict=False)
            for name, dtype in HISTORY_BUFFER.schema.items()
        ]
    )

    # Append the new test row (now with a matching schema) to the history.
    HISTORY_BUFFER = pl.concat([HISTORY_BUFFER, test], how="vertical")

    # Create features on the combined history
    features_df = create_features(HISTORY_BUFFER)

    # We only want to predict on the very last row (the current day)
    latest_features = features_df.tail(1).select(FEATURES)

    # Predict, convert to signal, and return a single float value
    raw_prediction = final_model.predict(latest_features)[0]
    signal = convert_to_signal(np.array([raw_prediction]))[0]

    return float(signal)


# --- 3. Launch the Inference Server ---

# Wrap predict() in the Kaggle-provided inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# The environment variable is set when Kaggle reruns the notebook for the
# competition; any non-empty value counts.
is_competition_rerun = bool(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))

if not is_competition_rerun:
    # Local testing: simulate the API using the public test set as the data source.
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    # This branch is required by the Kaggle environment.
    inference_server.serve()
