# In [None]:
# =========================
# Imports
# =========================
import os
import pandas as pd
import numpy as np
from pathlib import Path
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import polars as pl
import kaggle_evaluation.default_inference_server

# =========================
# Global Constants
# =========================
# Lower / upper bound that the blended prediction is clipped to before it is
# written to the submission and served by predict().
MIN_INVESTMENT, MAX_INVESTMENT = 0.0, 2.0

# Directory holding the competition's train.csv / test.csv; also handed to the
# local gateway when the notebook is not running as a competition rerun.
DATA_PATH = Path("/kaggle/input/hull-tactical-market-prediction/")

# =========================
# Load Data
# =========================
# Training rows containing a missing value in ANY column are discarded, so the
# models are fit on fully-populated rows only.
train_df = pd.read_csv(DATA_PATH / "train.csv").dropna()

# Test rows are deliberately NOT filtered: every test date_id needs a
# prediction, and preprocessing() already zero-fills missing feature values.
# Dropping incomplete rows here would silently remove their date_ids from the
# submission and from the lookup used by predict().
test_df = pd.read_csv(DATA_PATH / "test.csv")

# date_id -> forward_returns for the retained training rows.
true_targets = dict(zip(train_df["date_id"], train_df["forward_returns"]))

# =========================
# Feature List
# =========================
# Model inputs: columns E1..E20 followed by a hand-picked set of S/I/P columns.
MAIN_FEATURES = [f"E{i}" for i in range(1, 21)] + [
    "S1", "S2", "S5", "I2", "P8", "P9", "P10", "P12", "P13",
]

# =========================
# Preprocessing
# =========================
def preprocessing(df, typ="train"):
    """Return a new frame holding only the model columns, with NaNs set to 0.

    Args:
        df: Input frame containing at least the columns in ``MAIN_FEATURES``
            (plus ``forward_returns`` when ``typ == "train"``). It is not
            modified.
        typ: ``"train"`` keeps the ``forward_returns`` target next to the
            features; any other value keeps the features only.

    Returns:
        A DataFrame with the selected columns in ``MAIN_FEATURES`` order
        (target last, if present) and missing values replaced by 0.

    Raises:
        KeyError: If a required column is absent from ``df``.
    """
    wanted = list(MAIN_FEATURES)
    if typ == "train":
        wanted.append("forward_returns")
    # Column selection already yields a new object, and fillna() without
    # inplace returns another one, so the caller's frame is left untouched.
    return df[wanted].fillna(0)

# Build the training design matrix and target from the preprocessed frame.
train_df_prep = preprocessing(train_df, typ="train")
y_train = train_df_prep["forward_returns"]
X_train = train_df_prep.drop(columns="forward_returns")

# =========================
# CatBoost Model
# =========================
# Hyper-parameters forwarded verbatim to CatBoostRegressor.
# NOTE(review): od_type / od_wait look like early-stopping (overfitting
# detector) settings, but fit() below receives no eval_set — confirm they have
# any effect in this configuration.
cat_params = {
    "iterations": 3500,
    "learning_rate": 0.015,
    "depth": 8,
    "l2_leaf_reg": 4.0,
    "min_child_samples": 40,
    "colsample_bylevel": 0.8,
    "od_wait": 150,
    "random_seed": 42,  # fixed seed for repeatable fits
    "od_type": 'Iter',
    "loss_function": 'RMSE',
    "bootstrap_type": 'Bayesian',
    "grow_policy": 'SymmetricTree',
    "logging_level": 'Silent',
}

# Fit on the full preprocessed training set (no validation split).
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(X_train, y_train)

# =========================
# LightGBM Model
# =========================
# Second regressor of the blend, trained on the same features and target as
# the CatBoost model.
# NOTE(review): subsample=0.8 is set while subsample_freq is left at its
# default — confirm row subsampling is actually active in this configuration.
lgb_model = LGBMRegressor(
    n_estimators=2500,
    learning_rate=0.02,
    max_depth=8,
    num_leaves=80,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=1.0,
    reg_lambda=2.0,
    random_state=42,  # fixed seed for repeatable fits
    verbosity=-1,
).fit(X_train, y_train)

# =========================
# Predict Both Models
# =========================
# Score the local test set with both models on identical features.
X_test = preprocessing(test_df, typ="test")
pred_cat, pred_lgb = (model.predict(X_test) for model in (cat_model, lgb_model))

# Fixed-weight blend (72% CatBoost, 28% LightGBM), bounded to the allowed
# investment range.
pred_final = np.clip(
    0.72 * pred_cat + 0.28 * pred_lgb,
    MIN_INVESTMENT,
    MAX_INVESTMENT,
)

# =========================
# Save Submission Parquet
# =========================
# One row per test date_id with its blended prediction; the frame keeps
# test_df's index, which is not written to the parquet file.
submission = test_df[["date_id"]].assign(prediction=pred_final)
submission.to_parquet("submission.parquet", index=False)

print("Saved submission.parquet successfully!")

# =========================
# Prepare inference mapping
# =========================
# Private copy of the blended predictions, indexed by row position in test_df.
pred_final_array = pred_final.copy()
# date_id -> row position in test_df / pred_final_array (a repeated date_id
# keeps its last position).
test_index_map = dict(zip(test_df["date_id"], range(len(test_df))))

def predict(test_row: pl.DataFrame) -> float:
    """Return the prediction for the single row served by the gateway.

    Predictions precomputed from the local ``test.csv`` are reused when the
    row's ``date_id`` is known. A ``date_id`` that was not in that file is
    scored on the fly with the fitted models instead of silently receiving a
    constant, so rows the notebook has never seen still get a real prediction.

    Args:
        test_row: One-row polars frame containing ``date_id`` and, for live
            scoring, the columns listed in ``MAIN_FEATURES``.

    Returns:
        The blended prediction clipped to
        ``[MIN_INVESTMENT, MAX_INVESTMENT]``; ``0.0`` when the row is unknown
        and cannot be turned into model features.
    """
    date_id = int(test_row.select("date_id").to_series().item())
    idx = test_index_map.get(date_id)
    if idx is not None:
        return float(pred_final_array[idx])

    # Unknown date_id: build the same zero-filled feature frame that the batch
    # path uses. The float cast guards against all-null columns arriving with
    # a non-numeric dtype.
    try:
        features = preprocessing(test_row.to_pandas(), "test").astype(float)
    except (KeyError, ValueError):
        # Feature columns are missing or non-numeric: keep the original
        # conservative fallback rather than crashing the inference server.
        return 0.0

    # Must stay in sync with the weights used to build pred_final.
    blended = 0.72 * cat_model.predict(features) + 0.28 * lgb_model.predict(features)
    return float(np.clip(blended[0], MIN_INVESTMENT, MAX_INVESTMENT))

# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Without the rerun flag in the environment, exercise the server through the
# local gateway against DATA_PATH; with it, serve the real evaluation.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway((str(DATA_PATH),))
else:
    inference_server.serve()