# I've created a score comparison table for each model. 
## Let's try improving the models by adjusting the parameters.
### -----------------------------------------------------------
### ver2: Added Submit DEMO from [TMORODER](https://www.kaggle.com/code/morodertobias/hull-leak-safe-baseline)
### ver4: Added Overfitting vs Performance 

In [None]:
import numpy as np
import pandas as pd
import pandas.api.types
import polars as pl
import pathlib
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import (
    LinearRegression, Ridge, ElasticNet, Lars, Lasso,
    BayesianRidge, HuberRegressor, QuantileRegressor, RANSACRegressor,
    TheilSenRegressor, PoissonRegressor, TweedieRegressor, GammaRegressor,
    MultiTaskElasticNet
)
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Allowed range for a daily position (fraction of capital placed in the market).
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2


class ParticipantVisibleError(Exception):
    """Raised by ``hull_score`` when a submission or its inputs are invalid."""


def hull_score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market, and strategies whose mean excess return falls
    short of the market's.

    Args:
        solution: Frame with ``forward_returns`` and ``risk_free_rate`` columns.
            It is NOT modified; the function works on a copy.
        submission: Frame with a numeric ``prediction`` column holding the
            position for each row. Values are matched to ``solution`` by index.
        row_id_column_name: Unused by this implementation; kept for interface
            compatibility with callers that pass it.

    Returns:
        float: The calculated adjusted Sharpe ratio, capped at 1,000,000.

    Raises:
        ParticipantVisibleError: If predictions are non-numeric, fall outside
            ``[MIN_INVESTMENT, MAX_INVESTMENT]``, or either the strategy or the
            market standard deviation is zero.
    """

    if not pandas.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    # Work on a copy so the helper columns added below never leak into the
    # caller's DataFrame.
    solution = solution.copy()
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    # Capital not placed in the market earns the risk-free rate.
    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio (geometric mean of excess returns over
    # the standard deviation of raw strategy returns, annualized).
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        raise ParticipantVisibleError('Division by zero, strategy std is zero')
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        raise ParticipantVisibleError('Division by zero, market std is zero')

    # Calculate the volatility penalty: only volatility beyond 120% of the
    # market's is penalized. (market_volatility is known to be non-zero here.)
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty: quadratic in the annualized percentage
    # shortfall of the strategy versus the market; zero when the strategy wins.
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [None]:
# --- Notebook-wide configuration -------------------------------------------

# Directory holding the competition files (train.csv is read from here).
BASE_DIR = pathlib.Path("/kaggle/input/hull-tactical-market-prediction")

# Seed passed to the boosting models.
SEED = 888

# Number of trailing date_ids held out of training and used as the test split.
TEST_SKIP = 180

# Model inputs (same features as in the hull starter notebook). U1 and U2 are
# engineered columns derived after loading. The order is significant: it fixes
# the column order of the arrays passed to the scaler and the models.
FEATURES = [
    "S2",
    "E2",
    "E3",
    "P8",
    "P9",
    "P10",
    "P12",
    "P13",
    "S1",
    "S5",
    "I2",
    "U1",
    "U2",
]

# Non-feature columns carried along for splitting and scoring.
INFO_COLS = ["date_id", "forward_returns", "risk_free_rate"]

# NOTE(review): the four settings below are not referenced by the visible
# cells; presumably left over from linear-model tuning -- confirm before removal.
CV = 5
L1_RATIO = 0.5
ALPHAS = np.logspace(-4, 2, num=100)
MAX_ITER = 1_000_000

In [None]:
# Load the training file and add the two engineered features:
#   U1 = I2 - I1
#   U2 = M11 divided by the mean of I2, I9 and I7
data = pd.read_csv(BASE_DIR / "train.csv")
data = data.assign(
    U1=data["I2"] - data["I1"],
    U2=data["M11"] / ((data["I2"] + data["I9"] + data["I7"]) / 3),
)

# Keep only the modelling/scoring columns and discard rows with any missing value.
data = data[FEATURES + INFO_COLS].dropna()

# Rows with date_id above this threshold form the held-out test split.
max_train_date = data["date_id"].max() - TEST_SKIP
print("max train date_id:", max_train_date)

In [None]:
# Build a hindsight ("oracle") position series from the realized returns and
# score it. The same series is later used as the regression target.
solution = data.copy()

# Geometric mean of the market's daily excess return over all rows.
market_excess_returns = solution["forward_returns"].sub(solution["risk_free_rate"])
market_excess_cumulative = market_excess_returns.add(1).prod()
market_mean_excess_return = market_excess_cumulative ** (1 / len(solution)) - 1

# c is the constant daily excess return which, earned only on the fraction of
# days with a positive market excess return, compounds to the market's
# geometric mean excess return.
c = (1 + market_mean_excess_return) ** (
    1 / (market_excess_returns > 0).mean()
) - 1

# Position that would turn each day's market excess return into c; negative
# ratios (down days) are floored at 0 and large ratios capped at 2.
submission = pd.DataFrame(
    {"prediction": (c / market_excess_returns).clip(0, 2)}
)
print("best score train:", hull_score(solution, submission, ""))

In [None]:
# Chronological split: everything up to max_train_date is used for fitting,
# the remaining trailing date_ids are held out for evaluation.
train = data.loc[data["date_id"] <= max_train_date].copy()
test = data.loc[data["date_id"] > max_train_date].copy()
print("train shape:", train.shape)
print("test shape:", test.shape)

# Regression target: the hindsight position computed for every row of `data`.
# Select the Series explicitly instead of assigning the whole DataFrame to a
# single column; pandas aligns it to train's index, so only train rows are used.
train["target"] = submission["prediction"]

# Definition of Model List

In [None]:
# --- Let's try to optimize the parameters ---

# Hyper-parameters for the three gradient-boosting libraries.
# NOTE(review): confirm that 'smape' is a metric name LightGBM recognizes.
lgb_params = dict(
    learning_rate=0.01,
    n_estimators=100,
    verbosity=-1,
    objective='regression',
    metric='smape',
    seed=SEED,
)

# 'objective' and 'eval_metric' are intentionally not set here, so XGBoost
# falls back to its own defaults for both.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=100,
    random_state=SEED,
    n_jobs=-1,
)

cat_params = dict(
    learning_rate=0.01,
    n_estimators=100,
    custom_metric='SMAPE',
    verbose=0,
    random_seed=SEED,
)

# Second-degree polynomial expansion followed by ordinary least squares.
poly_lr = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('linear', LinearRegression())
])

# Candidate regressors keyed by the display name used in logs and plots.
# The evaluation loop iterates this dict in insertion order.
models = {
    # Gradient-boosting libraries
    "LightGBM": LGBMRegressor(**lgb_params),
    "XGBoost": XGBRegressor(**xgb_params),
    "CatBoost": CatBoostRegressor(**cat_params),
    # Linear family
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
    "Lars": Lars(),
    "Lasso": Lasso(),
    "BayesianRidge": BayesianRidge(),
    # Robust / quantile regressors
    "Huber": HuberRegressor(),
    "QuantileRegressor": QuantileRegressor(quantile=0.5, solver='highs'),
    "RANSAC": RANSACRegressor(random_state=42),
    "TheilSen": TheilSenRegressor(random_state=42),
    # Generalized linear models
    "Poisson": PoissonRegressor(),
    "Tweedie": TweedieRegressor(),
    # Non-linear models
    "PolynomialFeatures+LR": poly_lr,
    "SVR": SVR(),
    "MLPRegressor": MLPRegressor(random_state=42, max_iter=500),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
}


# Evaluation

In [None]:
X_train = train[FEATURES].values
y_train = train["target"].values

# Fit the scaler on the training rows only, then reuse it for the test rows.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)

# The held-out features do not depend on the model, so extract and scale them
# once instead of on every loop iteration.
X_test = test[FEATURES].values
X_test_scaled = sc.transform(X_test)

# --- Evaluation Loop for Each Model ---
# Each entry: {"model": name, "sharpe_ratio": test score, "overfit_ratio": test/train}.
results = []
# NOTE(review): this splitter is not used by the loop below.
cv = KFold(n_splits=3, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"Running model: {name}")

    try:
        # Fit on the scaled training data and predict both splits; predictions
        # are clipped to the position range accepted by hull_score.
        model.fit(X_train_scaled, y_train)
        y_train_pred = np.clip(model.predict(X_train_scaled), 0.0, 2.0)
        y_test_pred = np.clip(model.predict(X_test_scaled), 0.0, 2.0)

        # Calculate the Sharpe ratio. The prediction frames carry the original
        # row index because hull_score matches positions to rows by index.
        y_train_pred = pd.DataFrame({'prediction': y_train_pred}, index=train.index)
        y_test_pred = pd.DataFrame({'prediction': y_test_pred}, index=test.index)
        train_score = hull_score(train, y_train_pred, '')
        test_score = hull_score(test, y_test_pred, '')

        # A zero train score would raise ZeroDivisionError and throw away an
        # otherwise valid test score; report the ratio as NaN instead.
        overfit_ratio = test_score / train_score if train_score != 0 else np.nan

        #print("score public train:", train_score)
        print("score public test:", test_score)
        print("overfit ratio:", overfit_ratio)

        results.append({"model": name, "sharpe_ratio": test_score, "overfit_ratio": overfit_ratio})

    except Exception as e:
        # Best effort: one failing model must not abort the comparison. Record
        # it with the same keys as a successful run so the results table keeps
        # a consistent schema.
        print(f"Could not process model {name}: {e}")
        results.append({"model": name, "sharpe_ratio": np.nan, "overfit_ratio": np.nan})


# Visualization of Results

In [None]:
# --- Visualization of Results ---
# Rank models by test score (best first); rows containing NaN are dropped.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="sharpe_ratio", ascending=False)
    .dropna()
)

# Figure 1: horizontal bar chart of the score per model.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(x="sharpe_ratio", y="model", data=results_df, palette="viridis", ax=ax)
ax.set_title("Score by Model", fontsize=16)
ax.set_xlabel("Score", fontsize=12)
ax.set_ylabel("Model", fontsize=12)
ax.grid(axis='x', linestyle='--', alpha=0.7)
plt.show()

# Figure 2: score against overfit ratio, one labelled point per model.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(x=results_df["sharpe_ratio"], y=results_df["overfit_ratio"], color='tomato', s=100, alpha=0.8)

# Place each label slightly left of and above its point.
for score, ratio, label in zip(
    results_df["sharpe_ratio"], results_df["overfit_ratio"], results_df["model"]
):
    ax.text(score - 0.02, ratio + 0.09, label, fontsize=10, va='center')

ax.set_xlabel("Score", fontsize=12)
ax.set_ylabel('Overfit Ratio', fontsize=12)
ax.set_title('Overfitting vs Performance', fontsize=16)
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

print(results_df)


# Submit

In [None]:
# https://www.kaggle.com/code/morodertobias/hull-leak-safe-baseline

import kaggle_evaluation.default_inference_server

def predict(test: pl.DataFrame) -> float:
    """Return the clipped position for the first row of an inference batch.

    Reads the module-level ``sc`` (fitted scaler) and ``model`` (fitted
    regressor), so both must be defined before the inference server calls this.

    Args:
        test: Batch handed over by the inference server. It must contain the raw
            columns needed for the engineered features (I1, I2, I7, I9, M11),
            the remaining ``FEATURES`` columns, and ``date_id``.

    Returns:
        The model prediction for the first row, clipped to [0.0, 2.0], as a
        built-in ``float``.
    """
    data = test.to_pandas()
    # Same feature engineering as applied to the training data.
    data["U1"] = data["I2"] - data["I1"]
    data["U2"] = data["M11"] / ((data["I2"] + data["I9"] + data["I7"]) / 3)
    X = data[FEATURES].values
    X_scaled = sc.transform(X)
    y = model.predict(X_scaled)
    # Convert the NumPy scalar to a plain float so the return value matches
    # the annotation.
    pred = float(np.clip(y, 0.0, 2.0)[0])
    # Positional access: take the first row whatever the index labels are.
    print(f"date_id: {data['date_id'].iloc[0]} -> prediction: {pred:>.4f}")
    return pred

# Rebind the global `model` read by predict(): after the evaluation loop it
# still refers to the last entry of `models`, so fit a fresh MLP for submission.
model = MLPRegressor(max_iter=500, random_state=42)
model.fit(X_train_scaled, y_train)

inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Outside a competition rerun, exercise predict() through the local gateway
# against the files in BASE_DIR; during a rerun, serve requests instead.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway((BASE_DIR.as_posix(),))
else:
    inference_server.serve()