In [None]:
%%time
import os
import numpy as np
import pandas as pd
import polars as pl

from sklearn.linear_model import Ridge , LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler , FunctionTransformer
from sklearn.ensemble import RandomForestRegressor, StackingRegressor , BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import kaggle_evaluation.default_inference_server

In [None]:
# Read the competition's train and test tables from the Kaggle input mount.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/hull-tactical-market-prediction/train.csv",
        "/kaggle/input/hull-tactical-market-prediction/test.csv",
    )
)

# Preprocessing

In [None]:
# Schema check on the raw frames: list every column that exists in only one
# of `train` / `test`, in the order the columns appear in their own frame.
print("columns ONLY in train before adjusting lags risk free and exc frw ret\n")

only_in_train = train.columns[~train.columns.isin(test.columns)]
for name in only_in_train:
    print(name)

print("\ncolumns ONLY in test before adjusting lags risk free and exc frw ret\n")

only_in_test = test.columns[~test.columns.isin(train.columns)]
for name in only_in_test:
    print(name)

In [None]:
# Add one-row-lagged copies of the three return/rate columns to `train`.
# Each new column holds the previous row's value of its source column.
lag_sources = {
    "lagged_forward_returns": "forward_returns",
    "lagged_risk_free_rate": "risk_free_rate",
    "lagged_market_forward_excess_returns": "market_forward_excess_returns",
}
for lagged_name, source_name in lag_sources.items():
    train[lagged_name] = train[source_name].shift(1)

# First pass at a target/feature split.
# NOTE(review): `target` and `y` are reassigned in a later cell, and `X` /
# `features` are not referenced again anywhere in the visible notebook.
target = "market_forward_excess_returns"
exclude_cols = ["date_id", target]
features = list(filter(lambda name: name not in exclude_cols, train.columns))

y = train[target]
X = train[features]

In [None]:
# Same schema check as before, now that the lagged columns exist on `train`.
print("columns ONLY in train after adjusting lags risk free and exc frw ret\n")

for name in train.columns:
    if name in test.columns:
        continue
    print(name)

print("\ncolumns ONLY in test after adjusting lags risk free and exc frw ret\n")

for name in test.columns:
    if name in train.columns:
        continue
    print(name)

In [None]:
# The model is trained on the raw forward return; every other column of
# `train` (including `date_id` and the lagged columns) stays in `x` for now.
target = "forward_returns" 

y = train[target]
x = train.drop(columns=target)

In [None]:
# Compare the candidate feature frame `x` against `test`, column by column.
print("columns ONLY in X after adjusting lags risk free and exc frw ret\n")

only_in_x = x.columns[~x.columns.isin(test.columns)]
for name in only_in_x:
    print(name)

print("\ncolumns ONLY in test after adjusting lags risk free and exc frw ret\n")

only_in_test = test.columns[~test.columns.isin(x.columns)]
for name in only_in_test:
    print(name)

In [None]:
# Align the two schemas: remove from `x` the same-row return/rate columns and
# remove `is_scored` from `test`, so both frames expose the same columns.
# `errors="ignore"` keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
x = x.drop(columns=["risk_free_rate", "market_forward_excess_returns"], errors="ignore")
test = test.drop(columns=["is_scored"], errors="ignore")

In [None]:
# Final schema check after the drops: anything printed below is a column that
# `x` and `test` still do not share.
print("columns ONLY in X after adjusting lags risk free and exc frw ret\n")

for name in x.columns:
    if name in test.columns:
        continue
    print(name)

print("\ncolumns ONLY in test after adjusting lags risk free and exc frw ret\n")

for name in test.columns:
    if name in x.columns:
        continue
    print(name)

# Feature engineering

In [None]:
def create_lags(data, lags):
    """
    Build a table of shifted copies of a one-dimensional sequence.

    Parameters:
    -----------
    data : pd.Series or list-like
        Source values in time order. Any existing index is discarded and
        replaced by a fresh 0..n-1 range before shifting.
    lags : int or list of ints
        Shift amount(s) handed to ``Series.shift``; a bare int is treated as
        a one-element list.

    Returns:
    --------
    pd.DataFrame
        One column per requested lag, named 'lag_{n}'.
    """
    series = pd.Series(data).reset_index(drop=True)
    if isinstance(lags, int):
        lags = [lags]

    shifted = {}
    for step in lags:
        shifted[f'lag_{step}'] = series.shift(step)
    return pd.DataFrame(shifted)

In [None]:
def create_rolling_features(data, windows, functions=['mean']):
    """
    Compute trailing-window aggregates of a one-dimensional sequence.

    Parameters:
    -----------
    data : pd.Series or list-like
        Source values; the index is reset to 0..n-1 before rolling.
    windows : int or list of ints
        Window size(s); a bare int is treated as a one-element list.
    functions : str or list of str
        Name(s) of methods available on the pandas rolling object
        ('mean', 'max', 'min', 'std', ...); a bare str is treated as a
        one-element list.

    Returns:
    --------
    pd.DataFrame
        One column per (window, function) pair, named 'roll_{func}_{w}'.

    Raises:
    -------
    ValueError
        If a requested name is not an attribute of the rolling object.
    """
    series = pd.Series(data).reset_index(drop=True)
    if isinstance(windows, int):
        windows = [windows]
    if isinstance(functions, str):
        functions = [functions]

    out = pd.DataFrame()
    for size in windows:
        roller = series.rolling(window=size)
        for agg_name in functions:
            # Guard first so the happy path below stays unindented.
            if not hasattr(roller, agg_name):
                raise ValueError(f"Unsupported function: {agg_name}")
            aggregate = getattr(roller, agg_name)
            out[f'roll_{agg_name}_{size}'] = aggregate()
    return out

In [None]:
def create_diff_features(data, lags):
    """
    Create difference-from-past features.

    Parameters:
    - data: pd.Series, DataFrame column, or any list-like (list, tuple,
      numpy array). Anything that is not already a pandas object is wrapped
      in a pd.Series so that `.diff` is available.
    - lags: int or list of ints; each value is passed as the period to `.diff`.
      A bare int is treated as a one-element list.

    Returns:
    - pd.DataFrame with one 'diff_{lag}' column per requested lag
    """
    # Previously only `list` was coerced, so tuples and numpy arrays failed
    # with AttributeError on `.diff`. pandas objects are passed through as-is.
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data = pd.Series(data)
    if isinstance(lags, int):
        lags = [lags]

    diff_df = pd.DataFrame()
    for lag in lags:
        diff_df[f'diff_{lag}'] = data.diff(lag)

    return diff_df

In [None]:
def prepare_features_for_col(
    col,
    col_name,
    lag_values=None,
    win_values=None,
    win_methods=None,
    diff_values=None,
    is_a_target=False
):
    """
    Derive lag, rolling-window and difference features from one column.

    Parameters:
    ----------
    col : list, pandas.Series, or numpy.ndarray
        Column values; wrapped in a Series when not one already.
    col_name : str
        Label embedded in every generated column name.
    lag_values : list[int], optional
        Periods for ``Series.shift``; columns are named 'lag_{n}_{col_name}'.
    win_values : list[int], optional
        Rolling window sizes. Used only together with ``win_methods``.
    win_methods : list[str], optional
        Names of rolling aggregations ('mean', 'max', 'min', 'sum', ...);
        columns are named 'win_{method}_{window}_{col_name}'.
    diff_values : list[int], optional
        Periods for ``Series.diff``; columns are named 'diff_{n}_{col_name}'.
    is_a_target : bool, default False
        When True the unmodified column itself is left out of the result and
        only the derived columns are returned.

    Returns:
    -------
    pandas.DataFrame
        Generated features, indexed like the input column.

    Raises:
    ------
    ValueError
        If a name in ``win_methods`` is not an attribute of the rolling object.
    """
    series = col if isinstance(col, pd.Series) else pd.Series(col)
    out = pd.DataFrame(index=series.index)

    # The raw column is carried along unless it is flagged as a target.
    if not is_a_target:
        out[f"{col_name}"] = series

    # Lags
    for step in (lag_values or ()):
        out[f"lag_{step}_{col_name}"] = series.shift(step)

    # Rolling aggregates: both the sizes and the method names must be given.
    if win_values and win_methods:
        for size in win_values:
            roller = series.rolling(size)
            for agg in win_methods:
                if not hasattr(roller, agg):
                    raise ValueError(f"Method '{agg}' is not supported for rolling windows.")
                out[f"win_{agg}_{size}_{col_name}"] = getattr(roller, agg)()

    # Differences
    for period in (diff_values or ()):
        out[f"diff_{period}_{col_name}"] = series.diff(period)

    return out

In [None]:
def prepare_features_for_df(
    df,
    lag_values=[1, 2,5,7,10,15,20,-1,-2,-5,-7,-10,-20 ],
    win_values=[2, 3 , 5 , 7 , 10 , 15 , 30 ],
    win_methods=["mean", "max"],
    diff_values=[1, 2, 5 , 7 , 10 , 15 , 20 ,-1,-2, -5,-7,-10,-15,-20 ]
):
    """
    Expand every column of `df` into lag, rolling-window and difference
    features and return them side by side.

    Parameters:
    ----------
    df : pandas.DataFrame
        Input frame; each column is passed to `prepare_features_for_col`.
    lag_values : list[int]
        Periods for the lag features.
    win_values : list[int]
        Rolling window sizes.
    win_methods : list[str]
        Rolling aggregation names.
    diff_values : list[int]
        Periods for the difference features.

    Returns:
    -------
    pandas.DataFrame
        All generated columns, with gaps forward-filled and then
        backward-filled. Columns that are entirely NaN stay NaN. An input
        with no columns yields an empty DataFrame.

    NOTE(review): the default `lag_values` / `diff_values` contain negative
    periods, and a negative shift/diff reads values from later rows. On
    time-ordered data this is look-ahead; pass backward-looking periods
    explicitly unless that is intended. The defaults are left unchanged here
    so existing callers keep getting the same columns.
    """
    # Build one feature block per column, then concatenate once; growing a
    # frame with pd.concat inside the loop re-copies everything each time.
    per_column = [
        prepare_features_for_col(
            col=df[name],
            col_name=name,
            lag_values=lag_values,
            win_values=win_values,
            win_methods=win_methods,
            diff_values=diff_values
        )
        for name in list(df.columns)
    ]
    if not per_column:
        # pd.concat refuses an empty list; mirror the old empty result.
        return pd.DataFrame()

    combined = pd.concat(per_column, axis=1)
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    # One forward + backward pass is enough: repeating it changes nothing.
    return combined.ffill().bfill()

In [None]:
# Smoke-test the feature builder on a tiny three-row, two-column frame.
smpdf = pd.DataFrame([[3, 4], [4, 5], [5, 6]], columns=['A', 'B'])

# Whatever is still NaN after the builder's own forward/backward fill
# (columns that came out entirely NaN) is set to 0 for display.
smpfeatdf = prepare_features_for_df(smpdf)
smpfeatdf = smpfeatdf.fillna(0)
smpfeatdf

# Model engineering

In [None]:
def signed_log1p(values):
    """Compress magnitudes while keeping the sign: sign(v) * log1p(|v|)."""
    return np.log1p(np.abs(values)) * np.sign(values)


# Restrict the feature builder to backward-looking lags and differences.
# The defaults of prepare_features_for_df also contain negative periods, and a
# negative shift/diff reads rows that come *later* in the frame. Because `x`
# carries `lagged_forward_returns` (forward_returns shifted by one row), a lag
# of -1 on that column reproduces the training target `forward_returns` inside
# the feature matrix, i.e. target leakage. Only the positive periods from the
# defaults are kept.
causal_periods = [1, 2, 5, 7, 10, 15, 20]

# A named function (rather than a lambda) keeps the transformer picklable and
# avoids reusing the name `x`, which is the training frame in this notebook.
log_transformer = FunctionTransformer(signed_log1p)
feature_maker = FunctionTransformer(
    prepare_features_for_df,
    validate=False,
    kw_args={"lag_values": causal_periods, "diff_values": causal_periods},
)

# Number of trees / boosting rounds shared by all base learners.
n_est_cmn = 50

# Base learners of the stack. Bagging variants over DecisionTree,
# LinearRegression, Ridge, SVR and KNN were sketched here earlier but were
# disabled; if they are reinstated, note that recent scikit-learn releases
# expect `estimator=` rather than `base_estimator=` on BaggingRegressor.
base_models = [
    ("xgb", XGBRegressor(
        n_estimators=n_est_cmn,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )),
    ("lgbm", LGBMRegressor(
        n_estimators=n_est_cmn,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=-1
    )),
    ("rf", RandomForestRegressor(
        n_estimators=n_est_cmn,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    )),
]

# Stacked model with Ridge as meta-learner
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=Ridge(alpha=1.0),
    n_jobs=-1,
    passthrough=True  # if True, meta-model also sees raw features
)

# Full pipeline: feature expansion -> median imputation -> standardisation
# -> signed log compression -> stacked regressor.
model = Pipeline([
    ("Feature maker", feature_maker),
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("log_norm", log_transformer),
    ("stacked", stacked_model)
])

# Fit
model.fit(x, y)

In [None]:
previous_allocation = 1.0  # neutral exposure before the first prediction

def predict(test: pl.DataFrame) -> float:
    """
    Turn one batch of test data into a smoothed allocation.

    The Polars frame is converted to pandas and reindexed to the training
    columns (`x.columns`); columns absent from the input are filled with 0.
    The first model prediction is mapped to `1.0 + 50 * prediction`, clipped
    to [0, 2], then blended 80/20 with the allocation returned by the
    previous call. The blended value is stored in the module-level
    `previous_allocation` and returned.

    NOTE(review): presumably the gateway delivers a single row per call; if
    so, the lag / rolling / difference features built inside the pipeline
    have no history to draw on at inference time. TODO confirm.
    """
    global previous_allocation

    # Polars -> pandas, restricted to (and ordered like) the training columns.
    frame = test.to_pandas().reindex(columns=x.columns, fill_value=0)
    predicted_return = model.predict(frame)[0]

    # Scale around a neutral 1.0 and keep the result inside [0, 2].
    raw_allocation = np.clip(1.0 + 50 * predicted_return, 0.0, 2.0)

    # Exponential-style smoothing against the previous call's output.
    blended = 0.8 * raw_allocation + 0.2 * previous_allocation
    previous_allocation = blended
    return float(blended)

In [None]:
# Hand `predict` to the competition's default inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Without the KAGGLE_IS_COMPETITION_RERUN environment variable, drive the
# server through the local gateway against the input directory; with it set,
# serve requests instead.
if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.run_local_gateway(("/kaggle/input/hull-tactical-market-prediction/",))
else:
    inference_server.serve()