In [1]:
import os
import pandas as pd
import polars as pl
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from collections import deque
import kaggle_evaluation.default_inference_server

# --- PART 1: INITIALIZATION AND TRAINING (Unchanged) ---
print("--- Initializing and training the final model... ---")

# 1.1: Read the competition training file.
full_train_df = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv')

# 1.2: Build the modelling frame.
# Only rows with date_id >= cutoff_date_id are kept. They are ordered by date_id,
# gaps are forward-filled, and whatever is still missing at the top of a column
# is back-filled.
cutoff_date_id = 3283
df_subset = (
    full_train_df.loc[full_train_df['date_id'] >= cutoff_date_id]
    .copy()
    .sort_values('date_id')
    .ffill()
    .bfill()
)

# 20-row rolling means for a fixed list of columns; a listed column that is not
# in the frame is skipped silently.
top_features_for_rolling = ['M4', 'V13', 'P8', 'S5']
for feature in top_features_for_rolling:
    if feature not in df_subset.columns:
        continue
    df_subset[f'{feature}_roll_mean_20'] = df_subset[feature].rolling(window=20).mean()

# Interaction term, added only when both inputs are present.
if {'S5', 'V13'}.issubset(df_subset.columns):
    df_subset['S5_x_V13'] = df_subset['S5'] * df_subset['V13']

# rolling(window=20) yields NaN until 20 rows are available, so this drops the
# warm-up rows along with any other row that still contains a NaN.
df_featured = df_subset.dropna()

# 1.3: Rank the candidate columns with a first random forest.
print("Performing feature selection...")
y_temp = df_featured['market_forward_excess_returns']
X_temp = df_featured.drop(
    columns=['date_id', 'market_forward_excess_returns', 'forward_returns', 'risk_free_rate'],
    errors='ignore',
)

feature_selector_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
).fit(X_temp, y_temp)

importances = feature_selector_model.feature_importances_
feature_importance_df = pd.DataFrame({'feature': X_temp.columns, 'importance': importances})
top_30_features = (
    feature_importance_df
    .sort_values('importance', ascending=False)['feature']
    .head(30)
    .tolist()
)

df_lean = df_featured[['date_id', 'market_forward_excess_returns', *top_30_features]]

# 1.4: Refit a forest with the same hyper-parameters on the 30 selected columns.
print("Training the final model on the lean feature set...")
y_final_train = df_lean['market_forward_excess_returns']
X_final_train = df_lean[top_30_features]

final_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
).fit(X_final_train, y_final_train)
print("Model training complete.")

# 1.5: State shared with predict(): the last 50 engineered training rows seed the
# feature history, and a 5-slot deque holds the most recent raw predictions.
history_for_features = df_featured.tail(50).copy()
raw_predictions_history = deque(maxlen=5)

print("--- Initialization complete. Ready for inference. ---")


### FIX: Add a function to transform the raw prediction to a valid [0, 2] allocation.
# Multiplier from predicted return to allocation size: a predicted return of 1%
# (0.01) maps to the maximum allocation of 2.0 (0.01 * 200 = 2.0).
SCALE_FACTOR = 200

def prediction_to_allocation(prediction: float) -> float:
    """Convert a raw return prediction into an allocation in the range [0, 2].

    Args:
        prediction: Predicted return. Non-positive values and NaN mean
            "do not invest".

    Returns:
        ``prediction * SCALE_FACTOR`` limited to at most 2.0 for positive
        predictions, and 0.0 for non-positive or NaN predictions. The result
        is always a built-in ``float``.
    """
    # Written as `not (prediction > 0)` rather than `prediction <= 0` on purpose:
    # every comparison against NaN is False, so this form also sends NaN to 0.0
    # instead of letting it flow through the scaling and clipping as NaN.
    if not prediction > 0:
        return 0.0

    # Scale the positive prediction to an allocation size.
    allocation = prediction * SCALE_FACTOR

    # np.clip enforces the [0, 2] bounds (this also maps +inf to 2.0); float()
    # turns the NumPy scalar into the built-in float promised by the signature.
    return float(np.clip(allocation, 0.0, 2.0))


# --- PART 2: THE PREDICTION FUNCTION (CORRECTED AND ROBUST) ---

def predict(test: pl.DataFrame) -> float:
    """
    Produce the allocation for the test data handed over by the inference server.

    The incoming rows are appended to the module-level ``history_for_features``
    buffer, gaps are filled (forward-fill, then 0) and the buffer is cut back to
    its last 50 rows. The rolling-mean and interaction features are rebuilt on
    that buffer, the newest row is scored with ``final_model``, the score is
    averaged with the other entries kept in ``raw_predictions_history`` and the
    average is mapped to an allocation by ``prediction_to_allocation``.

    Args:
        test: Polars frame holding the new observation(s). Only the last row of
            the updated history is scored.

    Returns:
        The allocation as a built-in float.
    """
    # Only this name is rebound here; the model, the feature lists and the
    # prediction deque are merely read or mutated in place.
    global history_for_features

    # Append the new rows and fill gaps before any feature is derived:
    # forward-fill first, then 0 for cells that have no earlier value.
    combined = pd.concat([history_for_features, test.to_pandas()], ignore_index=True)
    combined.ffill(inplace=True)
    combined.fillna(0, inplace=True)

    # Keep a bounded window so every call stays cheap and memory does not grow.
    history_for_features = combined.tail(50).reset_index(drop=True)

    # Rebuild the engineered columns on the trimmed window. min_periods=1 keeps
    # the rolling mean defined even where fewer than 20 rows precede a row.
    for base in top_features_for_rolling:
        if base not in history_for_features.columns:
            continue
        history_for_features[f'{base}_roll_mean_20'] = (
            history_for_features[base].rolling(window=20, min_periods=1).mean()
        )

    if {'S5', 'V13'}.issubset(history_for_features.columns):
        history_for_features['S5_x_V13'] = history_for_features['S5'] * history_for_features['V13']

    # Score the most recent row using exactly the columns the model was fit on.
    latest_row = history_for_features.tail(1)
    raw_prediction = final_model.predict(latest_row[top_30_features])[0]

    # Smooth by averaging with the raw predictions still held in the deque.
    raw_predictions_history.append(raw_prediction)
    smoothed_prediction = np.mean(raw_predictions_history)

    ### FIX: Transform the smoothed prediction into a valid allocation.
    return float(prediction_to_allocation(smoothed_prediction))


# --- PART 3: SERVING THE MODEL (Unchanged) ---

# Hand the predict callback to the Kaggle evaluation server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is unset or empty, run the local gateway
# against the competition input directory; otherwise call serve().
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()

--- Initializing and training the final model... ---
Performing feature selection...
Training the final model on the lean feature set...
Model training complete.
--- Initialization complete. Ready for inference. ---


In [2]:
# Load the submission parquet from the working directory and preview its first
# five rows as the cell output.
read_submission = pd.read_parquet('/kaggle/working/submission.parquet')
read_submission.head(n=5)

Unnamed: 0,date_id,prediction
0,8980,0.068154
1,8981,0.037205
2,8982,0.029828
3,8983,0.049998
4,8984,0.04125
