In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the competition training set and preview its first rows.
df_train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/hull-tactical-market-prediction/train.csv"
)
df_train.head()

In [None]:
# Show every column name of the training frame.
print(df_train.columns)

In [None]:
# Columns D1..D9 are handled as categorical features; cast them in one pass.
cat_cols = ["D" + str(idx) for idx in range(1, 10)]
df_train = df_train.astype({name: "category" for name in cat_cols})

In [None]:
# Per-column dtype and non-null counts of the training frame.
df_train.info()

In [None]:
# Read the competition test set and preview its first rows.
df_test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/hull-tactical-market-prediction/test.csv"
)
df_test.head()

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
# Summary statistics of the training frame.
df_train.describe()

In [None]:
# ## Changing dtypes to get categorical features ##
# categorical_x_vars = df_train[['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9']].columns
# cols_x  = df_train.columns.difference(['forward_returns' , 'market_forward_excess_returns' , 'risk_free_rate'])


In [None]:
def preprocess(df, drop_cols=None):
    """Impute missing values and keep track of the high-null columns.

    A frame is treated as training data when it contains a
    ``forward_returns`` column; any other frame is treated as test data.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is copied; the caller's object is not modified.
    drop_cols : list[str] | None
        Test mode only: columns to remove before imputation (missing names
        are ignored). In training mode the argument is overwritten.

    Returns
    -------
    tuple[pd.DataFrame, list[str] | None]
        The imputed frame, and the column list to reuse for test data: in
        training mode the columns with more than 50% nulls, otherwise the
        ``drop_cols`` value that was passed in.
    """
    df = df.copy()

    # Identify training mode (has forward_returns)
    is_train = "forward_returns" in df.columns

    if is_train:
        print("Training data Preprocess")
        # NOTE: the high-null columns are only *reported* here, not removed
        # from the training frame. The notebook drops them from df_train in a
        # later cell, so removing them here would break that cell.
        drop_cols = [c for c in df.columns if df[c].isnull().mean() > 0.5]
    elif drop_cols is not None:
        # For test data, drop the same columns that training flagged
        df = df.drop(columns=drop_cols, errors="ignore")

    # Fill missing values: median for numeric columns, mode for everything else.
    for col in df.columns:
        series = df[col]
        # Use a dtype-family check so float32/int32/nullable numeric columns get
        # the median too (a literal 'float64'/'int64' comparison sent them to the
        # mode branch). Booleans are excluded so they keep using the mode.
        is_numeric = pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series)
        if is_numeric:
            df[col] = series.fillna(series.median())
        else:
            modes = series.mode()
            # An all-null column has no mode; leave it untouched.
            if len(modes) > 0:
                df[col] = series.fillna(modes.iloc[0])

    return df, drop_cols


# Preprocess the training data first; this also returns the list of high-null columns
df_train, drop_cols = preprocess(df_train)

# Reuse that list so the same columns are dropped from the test data
df_test, _ = preprocess(df_test, drop_cols=drop_cols)


In [None]:
# Compare train and test shapes after preprocessing.
print(df_train.shape)
print(df_test.shape)

In [None]:
## Creating lagged features ##
def lagged_variables_creation(df: pd.DataFrame, mode: str = 'train') -> pd.DataFrame:
    """Add lagged copies of the return / rate columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; rows are assumed to be in chronological order because
        lags are built with positional ``shift``. The frame is copied.
    mode : str
        ``"train"``: lags 1, 2, 3, 5 and 10 are derived from
        ``forward_returns``, ``risk_free_rate`` and
        ``market_forward_excess_returns``.
        Any other value (``"test"``): lags 2, 3, 5 and 10 are derived from the
        ``lagged_*`` columns already present in the frame. Those columns are
        left under their original names; renaming them to ``*_1`` is up to the
        caller.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the ``lagged_<name>_<i>`` columns appended.
    """
    df = df.copy()
    base_names = ('forward_returns', 'risk_free_rate', 'market_forward_excess_returns')

    if mode == "train":
        for i in [1, 2, 3, 5, 10]:
            for name in base_names:
                df[f'lagged_{name}_{i}'] = df[name].shift(i)

    else:  # mode == 'test'
        for i in [2, 3, 5, 10]:
            for name in base_names:
                # The provided lagged_<name> column is already one step behind
                # (it is what gets renamed to *_1), so lag i needs only i - 1
                # further steps. shift(i) would yield lag i + 1 and would not
                # match the training feature of the same name.
                df[f'lagged_{name}_{i}'] = df[f'lagged_{name}'].shift(i - 1)

    return df


In [None]:
# Build lag features: train lags come from the raw columns, test lags from the supplied lagged_* columns.
df_train = lagged_variables_creation(df_train , mode = "train")
df_test = lagged_variables_creation(df_test , mode = 'test')

In [None]:
## Give the test lag columns the same names as the training lag-1 features ##
df_test = df_test.rename(
    columns={
        "lagged_forward_returns": "lagged_forward_returns_1",
        "lagged_risk_free_rate": "lagged_risk_free_rate_1",
        "lagged_market_forward_excess_returns": "lagged_market_forward_excess_returns_1",
    }
)


In [None]:
## Remove columns that train and test do not share ##
# Each print shows the set difference right before the matching drop.
print(df_train.columns.difference(df_test.columns))
df_train = df_train.drop(
    ['E7', 'M1', 'M13', 'M14', 'M6', 'S3', 'V10', 'V9',
     'market_forward_excess_returns', 'risk_free_rate'],
    axis=1,
)
print(df_test.columns.difference(df_train.columns))
df_test = df_test.drop(['is_scored'], axis=1)

In [None]:
## Backward-fill the NaNs that shifting leaves at the top of each lag column ##
# NOTE(review): bfill copies later values into earlier rows — confirm this is intended.
cols_lag = [name for name in df_train.columns if name.find('lagged_') != -1]
df_train[cols_lag] = df_train[cols_lag].bfill(axis=0)
df_test[cols_lag] = df_test[cols_lag].bfill(axis=0)

In [None]:
# Final shapes after feature engineering and column alignment.
print(f'Training data {df_train.shape}')
print(f'Testing data {df_test.shape}')

In [None]:
## Data Splitting for train test ##
from sklearn.model_selection import train_test_split 

# Features are every column except the target.
cols_x = df_train.columns.difference(['forward_returns'])

# Hold out the LAST 20% of rows instead of a shuffled sample. The lag features
# tie each row to its neighbours, so a random split leaks information between
# train and validation and makes the validation error look better than it is.
# Assumes df_train rows are in chronological order (the lags were built with
# positional shifts).
trainx, testx, trainy, testy = train_test_split(
    df_train[cols_x],
    df_train['forward_returns'],
    test_size=0.2,
    shuffle=False,
)
print('Train size' , trainx.shape)
print('Val size' , testx.shape)

In [None]:
# ## Optuna for hyperparameter optimization ##
# import lightgbm as lgb
# import optuna
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import TimeSeriesSplit
# import numpy as np




# X = trainx
# Y = trainy

# tscv = TimeSeriesSplit(n_splits=5)

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 2000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
#         'num_leaves': trial.suggest_int('num_leaves', 16, 256),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
#         'lambda_l1': trial.suggest_float('lambda_l1', 1e-3, 10.0, log=True),
#         'lambda_l2': trial.suggest_float('lambda_l2', 1e-3, 10.0, log=True),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 200),
#         'max_depth': trial.suggest_int('max_depth', 3, 25),
#         'objective': 'regression',
#         'metric': 'rmse',
#         'verbosity': -1,
#         'n_jobs': -1 , 
#         'device' : 'gpu'
#     }

#     rmse_scores = []

#     for train_idx, test_idx in tscv.split(X):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

#         model = lgb.LGBMRegressor(**params)

    
#         model.fit(
#             X_train, y_train,
#             eval_set=[(X_test, y_test)],
#             eval_metric='rmse',
#         )

#         preds = model.predict(X_test)
#         rmse = mean_squared_error(y_test, preds, squared=False)
#         rmse_scores.append(rmse)

#     return np.mean(rmse_scores)

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)


In [None]:
# print("Best params:", study.best_params)
# print("Best score:", study.best_value)



In [None]:
## Model training ##
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

## Hyper-parameters obtained from the Optuna search ##
params = {
    'n_estimators': 1945,
    'learning_rate': 0.04053314077367818,
    'num_leaves': 173,
    'feature_fraction': 0.8710874581442012,
    'bagging_fraction': 0.6037115183441955,
    'bagging_freq': 3,
    'lambda_l1': 0.9517420004019254,
    'lambda_l2': 0.7270178245553806,
    'min_data_in_leaf': 108,
    'max_depth': 25,
}

# Fit the regressor on the training split.
model = lgb.LGBMRegressor(**params)
model.fit(trainx, trainy)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out split and place predictions next to the true targets.
preds = model.predict(testx)
val_pred = testy.to_frame('actual').assign(predicted=preds)
val_pred.head()

In [None]:
# Compute the MSE once and derive the RMSE from it. The `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6, so taking the square root explicitly works on every version.
mse = mean_squared_error(val_pred.actual , val_pred.predicted)
print('Mean Squared Error', mse)
print('Root Mean Squared Error' , np.sqrt(mse))

In [None]:
import os
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import kaggle_evaluation.default_inference_server

from gc import collect
from warnings import filterwarnings
filterwarnings("ignore")

# ===============================
# Constants
# ===============================
# Clip bounds and scale factor referenced only by the commented-out
# predictions_to_signal helper below; not used by the active code path.
MIN_SIGNAL = 0.0
MAX_SIGNAL = 2.0
SIGNAL_MULTIPLIER = 1.0


# ===============================
# Utility functions
# ===============================



# def predictions_to_signal(predictions):
#     """Convert raw model predictions to valid signals in [0, 2]."""
#     signals = np.clip(predictions * SIGNAL_MULTIPLIER, MIN_SIGNAL, MAX_SIGNAL)
#     return signals


# Position taken when the prediction clears the threshold.
ALPHA_FOR_SCORER = 0.600132
# Threshold a raw prediction must exceed to take a position.
TAU_ABS_FOR_SCORER = 9.43717e-05
# Allowed range of the returned position.
MIN_INVESTMENT = 0.0
MAX_INVESTMENT = 2.0
TRADING_DAYS = 252

def post_process_signal(y_pred,
                        *,
                        tau: float = TAU_ABS_FOR_SCORER,
                        alpha: float = ALPHA_FOR_SCORER,
                        min_investment: float = MIN_INVESTMENT,
                        max_investment: float = MAX_INVESTMENT):
    """Map raw predictions to a two-level position.

    Each prediction strictly greater than ``tau`` becomes ``alpha``; every
    other prediction (including NaN) becomes 0. The result is then clipped to
    ``[min_investment, max_investment]``.

    Returns a flat 1-D float array with one entry per input value.
    """
    flat_pred = np.asarray(y_pred, dtype=float).ravel()
    above_threshold = flat_pred > tau
    position = np.where(above_threshold, alpha, 0.0)
    return position.clip(min_investment, max_investment)




# Feature names (and order) taken from the fitted model; predict() reindexes
# incoming frames to exactly this list.
cols_x = model.feature_name_
cat_cols = [name for name in cols_x if name.startswith("D")]

print(f"Model loaded with {len(cols_x)} features.")





def predict(test: pl.DataFrame) -> float:
    """
    Inference-server callback: turn one incoming Polars frame into a position.

    Runs the same steps as the notebook's test pipeline (categorical cast,
    preprocess, lag creation, rename, back-fill), aligns the columns to the
    model's feature list, and returns the first post-processed prediction as
    a plain float.
    """
    frame = test.to_pandas()

    # Cast the model's categorical features that are present in this frame.
    for name in cat_cols:
        if name in frame.columns:
            frame[name] = frame[name].astype("category")

    # Missing-value imputation; the returned drop list is not needed here.
    frame, _ = preprocess(frame)

    # Derive the longer lags, then rename the supplied lag columns to *_1.
    frame = lagged_variables_creation(frame, mode="test")
    frame = frame.rename(columns={
        "lagged_forward_returns": "lagged_forward_returns_1",
        "lagged_risk_free_rate": "lagged_risk_free_rate_1",
        "lagged_market_forward_excess_returns": "lagged_market_forward_excess_returns_1"
    })

    # NOTE(review): if the server passes one row per call, the shifted lag
    # columns are all-NaN and this back-fill cannot fill them — confirm.
    lag_names = [name for name in frame.columns if 'lagged_' in name]
    frame[lag_names] = frame[lag_names].bfill()

    # Keep exactly the model's features, in its order; absent ones become 0.
    features = frame.reindex(columns=cols_x, fill_value=0)

    position = post_process_signal(model.predict(features))
    return float(np.asarray(position).ravel()[0])



# Inference Server
# ===============================
print("Starting inference server...")

inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Outside a competition rerun, exercise predict() through the local gateway
# against the bundled input files; during a rerun, serve requests instead.
if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.run_local_gateway(("/kaggle/input/hull-tactical-market-prediction/",))
else:
    inference_server.serve()