# Imports

In [None]:
import os
from pathlib import Path
import datetime

from tqdm import tqdm
from dataclasses import dataclass, asdict

import pandas as pd
import polars as pl 
import numpy as np
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

import kaggle_evaluation.default_inference_server

import warnings
warnings.filterwarnings("ignore", message="use_inf_as_na option is deprecated")


In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for full_path in full_paths:
        print(full_path)

# Data Loading

In [None]:
# ============ PATHS ============
# Directory that holds the competition CSV files.
DATA_PATH: Path = Path('/kaggle/input/hull-tactical-market-prediction/')

# ============ RETURNS TO SIGNAL CONFIGS ============
# Lower bound applied to the daily signal.
MIN_SIGNAL: float = 0.0
# Upper bound applied to the daily signal.
MAX_SIGNAL: float = 2.0
# Factor applied to predicted returns when they are converted to a signal.
SIGNAL_MULTIPLIER: float = 400.0

# ============ MODEL CONFIGS ============
# Number of cross-validation folds used in model fitting.
CV: int = 10
# ElasticNet mixing parameter.
L1_RATIO: float = 0.5
# Penalty multipliers: 100 log-spaced values from 1e-4 to 1e2.
ALPHAS: np.ndarray = np.logspace(start=-4, stop=2, num=100)
# Maximum number of iterations.
MAX_ITER: int = 1_000_000

In [None]:
# Allowed position range checked by the scoring function.
MIN_INVESTMENT, MAX_INVESTMENT = 0.0, 2.0

In [None]:
@dataclass
class DatasetOutput:
    """Container for the feature frames, target series and fitted scaler."""

    X_train: pl.DataFrame
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler

@dataclass
class ElasticNetParameters:
    """Hyper-parameters for an ElasticNet fit; ``l1_ratio`` is checked on creation."""

    l1_ratio: float
    cv: int
    alphas: np.ndarray
    max_iter: int

    def __post_init__(self):
        """Raise ``ValueError`` when ``l1_ratio`` is below 0 or above 1."""
        below_range = self.l1_ratio < 0
        above_range = self.l1_ratio > 1
        if below_range or above_range:
            raise ValueError("Wrong initializing value for ElasticNet l1_ratio")
        
@dataclass(frozen=True)
class RetToSignalParameters:
    """Immutable settings for scaling and clipping predicted returns into a signal."""

    signal_multiplier: float
    min_signal: float = MIN_SIGNAL
    max_signal: float = MAX_SIGNAL

# Return-to-signal settings; the clipping bounds keep their class defaults.
ret_signal_params = RetToSignalParameters(signal_multiplier=SIGNAL_MULTIPLIER)

# ElasticNet hyper-parameters assembled from the module-level configuration.
enet_params = ElasticNetParameters(
    max_iter=MAX_ITER,
    alphas=ALPHAS,
    cv=CV,
    l1_ratio=L1_RATIO,
)

# Utility Functions

In [None]:
def load_trainset() -> pl.DataFrame:
    """
    Read ``train.csv`` and prepare it for modelling.

    The ``market_forward_excess_returns`` column is renamed to ``target``,
    every column except ``date_id`` is cast to ``Float64`` (non-strict), and
    the final 10 rows are left out.

    Returns:
        pl.DataFrame: The prepared training frame.
    """
    raw = pl.read_csv(DATA_PATH / "train.csv")
    renamed = raw.rename({'market_forward_excess_returns': 'target'})
    as_float = renamed.with_columns(
        pl.exclude('date_id').cast(pl.Float64, strict=False)
    )
    # A negative count keeps everything except the trailing 10 rows.
    return as_float.head(-10)

def load_testset() -> pl.DataFrame:
    """
    Read ``test.csv`` and prepare it for modelling.

    The ``lagged_forward_returns`` column is renamed to ``target`` and every
    column except ``date_id`` is cast to ``Float64`` (non-strict).

    Returns:
        pl.DataFrame: The prepared testing frame.
    """
    raw = pl.read_csv(DATA_PATH / "test.csv")
    renamed = raw.rename({'lagged_forward_returns': 'target'})
    float_cast = pl.exclude('date_id').cast(pl.Float64, strict=False)
    return renamed.with_columns(float_cast)

def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """
    Creates new features and cleans a DataFrame.

    Two derived columns are added, ``U1 = I2 - I1`` and
    ``U2 = M11 / ((I2 + I9 + I7) / 3)``. The frame is then narrowed to
    ``date_id``, ``target`` and the kept features. Null entries in each kept
    feature are filled from that column's exponentially weighted mean
    (``com=0.5``), and every row that still contains a null is dropped.

    Args:
        df (pl.DataFrame): The input Polars DataFrame. It must contain
            ``date_id``, ``target`` and the raw columns referenced below.

    Returns:
        pl.DataFrame: The DataFrame with new features, selected columns, and no null values.
    """
    # Builtin generic, matching the ``list[str]`` annotations used by the
    # other helpers in this file (``List`` is not imported here).
    vars_to_keep: list[str] = [
        "M1", "M2", "V1", "V2", "S2", "E2", "E3", "P9", "S1", "S5", "I2", "P8",
        "P10", "P12", "P13", "U1", "U2",
    ]

    return (
        df.with_columns(
            (pl.col("I2") - pl.col("I1")).alias("U1"),
            (pl.col("M11") / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3)).alias("U2")
        )
        .select(["date_id", "target"] + vars_to_keep)
        .with_columns([
            pl.col(col).fill_null(pl.col(col).ewm_mean(com=0.5))
            for col in vars_to_keep
        ])
        # Applies to every selected column, so rows with a null target go too.
        .drop_nulls()
    )
    
def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Stack the train and test frames vertically on the columns they share.

    Args:
        train (pl.DataFrame): The training DataFrame.
        test (pl.DataFrame): The testing DataFrame.

    Returns:
        pl.DataFrame: Train rows followed by test rows, restricted to the
        columns present in both frames (in train's column order).
    """
    test_columns = set(test.columns)
    shared: list[str] = [name for name in train.columns if name in test_columns]

    stacked_parts = [frame.select(shared) for frame in (train, test)]
    return pl.concat(stacked_parts, how="vertical")

def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput:
    """
    Splits the data into features (X) and target (y), and scales the features.

    The scaler is fitted on the training features only and then applied to
    the test features.

    Args:
        train (pl.DataFrame): The processed training DataFrame; must contain
            ``target`` and every column named in ``features``.
        test (pl.DataFrame): The processed testing DataFrame; same requirements.
        features (list[str]): Names of the feature columns to use in the model,
            in the order they should appear in the scaled frames.

    Returns:
        DatasetOutput: A dataclass containing the scaled feature sets, target series, and the fitted scaler.
    """
    # Select by name rather than dropping the non-feature columns: this keeps
    # the scaler input in exactly the order of `features`, so the schema
    # labels applied below can never be attached to the wrong column.
    X_train = train.select(features)
    y_train = train.get_column('target')
    X_test = test.select(features)
    y_test = test.get_column('target')

    scaler = StandardScaler()

    X_train_scaled_np = scaler.fit_transform(X_train)
    X_train = pl.from_numpy(X_train_scaled_np, schema=features)

    # Reuse the statistics learned on the training set.
    X_test_scaled_np = scaler.transform(X_test)
    X_test = pl.from_numpy(X_test_scaled_np, schema=features)

    return DatasetOutput(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        scaler=scaler,
    )
def convert_ret_to_signal(
    ret_arr: np.ndarray,
    params: RetToSignalParameters
) -> np.ndarray:
    """
    Turn predicted returns into a bounded trading signal.

    Each prediction is multiplied by ``params.signal_multiplier``, shifted up
    by 1, and clipped to ``[params.min_signal, params.max_signal]``.

    Args:
        ret_arr (np.ndarray): The array of predicted returns.
        params (RetToSignalParameters): Parameters for scaling and clipping the signal.

    Returns:
        np.ndarray: The clipped trading signal.
    """
    lower, upper = params.min_signal, params.max_signal
    unclipped = ret_arr * params.signal_multiplier + 1
    return np.clip(unclipped, lower, upper)

class ParticipantVisibleError(Exception):
    """Error raised by ``score`` when the submission or market data is invalid."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    Predictions are treated as positions. The strategy's annualised Sharpe
    ratio is divided by a volatility penalty (strategy volatility above 1.2x
    the market's) and a return penalty (strategy mean excess return below the
    market's).

    Args:
        solution (pd.DataFrame): Must contain ``forward_returns`` and
            ``risk_free_rate``. A copy is modified, not the caller's frame.
        submission (pd.DataFrame): Must contain a numeric ``prediction``
            column; it is assigned onto ``solution`` by index.
        row_id_column_name (str): Not used inside this function.

    Returns:
        float: The adjusted Sharpe ratio capped at 1_000_000, or ``0.0`` when
        the strategy returns have zero standard deviation.

    Raises:
        ParticipantVisibleError: If predictions are not numeric, a position is
            outside ``[MIN_INVESTMENT, MAX_INVESTMENT]``, or the market
            volatility is zero.
    """

    if not pd.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    # This function modifies the solution df, so pass a copy
    solution = solution.copy()
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    # The uninvested fraction earns the risk-free rate, the rest the market return.
    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    # Geometric mean of the per-period excess returns.
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        # Return 0 instead of crashing if std is zero
        return 0.0

    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        raise ParticipantVisibleError('Division by zero, market std is zero')

    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [None]:
# Load both frames, then show the end of train and the start of test.
train: pl.DataFrame = load_trainset()
test: pl.DataFrame = load_testset()
for _preview in (train.tail(3), test.head(3)):
    print(_preview)

In [None]:
# Stack train and test on their shared columns so the feature engineering
# runs once over a single frame.
df: pl.DataFrame = join_train_test_dataframes(train, test)
df = create_example_dataset(df=df)
# Split back by date_id membership in the originally loaded frames.
train: pl.DataFrame = df.filter(pl.col('date_id').is_in(train.get_column('date_id')))
test: pl.DataFrame = df.filter(pl.col('date_id').is_in(test.get_column('date_id')))

# Every column except the row id and the target is a model feature.
FEATURES: list[str] = [col for col in test.columns if col not in ['date_id', 'target']]

dataset: DatasetOutput = split_dataset(train=train, test=test, features=FEATURES)

X_train: pl.DataFrame = dataset.X_train
X_test: pl.DataFrame = dataset.X_test
# The targets are Series (see DatasetOutput), not DataFrames.
y_train: pl.Series = dataset.y_train
y_test: pl.Series = dataset.y_test
scaler: StandardScaler = dataset.scaler

# Preprocessing and EDA

In [None]:
# Convert to pandas and turn infinities into NaN.
train_pd, test_pd = (
    frame.to_pandas().replace([np.inf, -np.inf], np.nan)
    for frame in (train, test)
)

# Print structure and summary statistics for each frame.
for _heading, _frame in (
    ("✅ TRAIN SET OVERVIEW", train_pd),
    ("\n✅ TEST SET OVERVIEW", test_pd),
):
    print(_heading)
    print(_frame.info())
    print(_frame.describe().T)

In [None]:
# Histogram (50 bins) of the training target with a KDE overlay.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_pd['target'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Target (market_forward_excess_returns)")
ax.set_xlabel("Target Value")
plt.show()

# Basic stats
target_summary = train_pd['target'].describe()
print(target_summary)

In [None]:
# Line plot of the training target against date_id.
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=train_pd, x='date_id', y='target', linewidth=0.8, ax=ax)
ax.set_title("Target Over Time")
ax.set_xlabel("Date ID")
ax.set_ylabel("Target")
plt.show()

In [None]:
# Store a 30-row rolling mean of the target as a new column, then plot it.
train_pd['target_rolling'] = train_pd['target'].rolling(window=30).mean()
fig, ax = plt.subplots(figsize=(12, 4))
sns.lineplot(data=train_pd, x='date_id', y='target_rolling', color='orange', ax=ax)
ax.set_title("30-Day Rolling Mean of Target")
plt.show()

In [None]:
# Subset of features shown in the distribution, heatmap and boxplot figures.
FEATURES_TO_PLOT = [
    "M1", "M2", "V1", "V2", "S2", "E2",
    "E3", "P9", "S1", "S5", "I2", "P8",
]

# One 30-bin histogram per selected feature.
train_pd.hist(column=FEATURES_TO_PLOT, figsize=(14, 10), bins=30)
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
corr = train_pd.corr(numeric_only=True)

# Correlation with target.
# The target's own entry (always 1.0) and 'target_rolling' (a column derived
# from the target, present only if the rolling-mean cell has run) are removed
# so they do not take up places in the ranking.
target_corr = (
    corr['target']
    .drop(labels=['target', 'target_rolling'], errors='ignore')
    .sort_values(ascending=False)
)
print("Top Positive Correlations:\n", target_corr.head(10))
print("\nTop Negative Correlations:\n", target_corr.tail(10))

# Heatmap of the selected features plus the target.
heatmap_columns = FEATURES_TO_PLOT + ['target']
plt.figure(figsize=(10, 8))
sns.heatmap(corr.loc[heatmap_columns, heatmap_columns],
            cmap='coolwarm', center=0, annot=False)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Restrict to the model features so the "Low Variance Features" listing is not
# led by non-feature columns such as date_id, target or target_rolling.
variance = train_pd[FEATURES].var().sort_values(ascending=True)
print("Low Variance Features:\n", variance.head(10))

# Boxplot of top features for outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=train_pd[FEATURES_TO_PLOT])
plt.xticks(rotation=45)
plt.title("Feature Boxplots (Outlier Detection)")
plt.show()

# Top Performing Models

## LightGBM

In [None]:
import lightgbm as lgb

# Estimator settings: GPU device 0 on platform 0, 100 estimators.
_LGBM_SETTINGS = dict(
    n_estimators=100,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

# Fit the regressor on the scaled training features.
model = lgb.LGBMRegressor(**_LGBM_SETTINGS)
model.fit(X_train, y_train)

In [None]:
def predict(test: pl.DataFrame) -> float:
    """
    Produce the trading signal for an incoming test frame.

    The frame's ``lagged_forward_returns`` column is renamed to ``target``,
    the features are built with ``create_example_dataset``, scaled with the
    module-level ``scaler``, and passed to the module-level ``model``. The
    first prediction is converted to a signal.

    Args:
        test (pl.DataFrame): Raw test rows containing ``lagged_forward_returns``.

    Returns:
        float: The clipped signal for the first predicted row.
    """
    renamed = test.rename({'lagged_forward_returns': 'target'})
    feature_frame: pl.DataFrame = create_example_dataset(renamed).select(FEATURES)
    scaled_values: np.ndarray = scaler.transform(feature_frame)
    scaled_frame: pl.DataFrame = pl.from_numpy(scaled_values, schema=FEATURES)
    first_prediction: float = model.predict(scaled_frame)[0]
    return convert_ret_to_signal(first_prediction, ret_signal_params)

In [None]:
print("\n--- Calculating Local Score for LightGBM Model ---")

# 1. Load the ground truth solution (train data), keeping only the columns
#    the metric needs. A load failure is reported and then re-raised.
try:
    solution_df = pd.read_csv(DATA_PATH / "train.csv")
    solution_df = solution_df[['date_id', 'forward_returns', 'risk_free_rate']]
except Exception as e:
    print(f"Error loading solution data: {e}")
    raise

# 2. Generate predictions for the train set (these are in-sample predictions).
train_date_ids = train.get_column('date_id').to_numpy()
X_train_np = X_train.to_numpy()

# LightGBM raw predictions (expected returns)
raw_preds_train = model.predict(X_train_np)

# Convert raw returns to final 0–2 signals
final_signals_train = convert_ret_to_signal(raw_preds_train, ret_signal_params)

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'date_id': train_date_ids,
    'prediction': final_signals_train
})

# 3. Align both DataFrames on date_id: keep only ids present in both, sorted,
#    with a fresh index so the metric can pair rows by position.
common_ids = set(solution_df['date_id']) & set(submission_df['date_id'])


def _restrict_and_sort(frame, ids):
    """Keep rows whose date_id is in ``ids``, ordered by date_id, index reset."""
    kept = frame[frame['date_id'].isin(ids)]
    return kept.sort_values('date_id').reset_index(drop=True)


solution_df = _restrict_and_sort(solution_df, common_ids)
submission_df = _restrict_and_sort(submission_df, common_ids)

# 4. Evaluate with the provided score function
row_id_col = 'date_id'

if solution_df.empty or submission_df.empty:
    print("Could not calculate score: DataFrames are empty after alignment.")
else:
    try:
        local_score = score(solution_df, submission_df, row_id_col)
        print(f"\nLightGBM Local Adjusted Sharpe Ratio: {local_score:.5f}")
    except ParticipantVisibleError as e:
        print(f"Metric Error: {e}")
    except Exception as e:
        print(f"Unexpected Error: {e}")


In [None]:
# This change was made by shad jamil.
print("Add additional models", "Light GBM", sep="\n")