In [None]:
import os
from pathlib import Path
import datetime

from tqdm import tqdm
from dataclasses import dataclass, asdict

import polars as pl 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor

import kaggle_evaluation.default_inference_server

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
import pandas.api.types

# Inclusive bounds that every submitted position must respect.
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2


class ParticipantVisibleError(Exception):
    """Error raised by `score` when a submission or its data cannot be scored."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """Compute the penalised, annualised Sharpe ratio of a submission.

    Each row's position (``submission['prediction']``) is applied to that row's
    ``forward_returns``; the remaining ``1 - position`` earns ``risk_free_rate``.
    The annualised Sharpe ratio of the resulting strategy is then divided by
    two penalties:

    * a volatility penalty, active when strategy volatility exceeds 1.2x the
      market volatility;
    * a quadratic return penalty, active when the strategy's mean excess
      return falls short of the market's.

    Args:
        solution: Frame with ``forward_returns`` and ``risk_free_rate``
            columns. It is copied internally and never modified.
        submission: Frame with a numeric ``prediction`` column. Values are
            assigned to ``solution`` by index alignment, so both frames are
            expected to share the same index.
        row_id_column_name: Accepted for interface compatibility; not used.

    Returns:
        The adjusted Sharpe ratio, capped at 1_000_000.

    Raises:
        ParticipantVisibleError: If predictions are non-numeric, fall outside
            ``[MIN_INVESTMENT, MAX_INVESTMENT]``, or if the strategy or market
            standard deviation is zero.
    """
    if not pandas.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    # Work on a private copy. The previous `solution = solution` was a no-op,
    # so the helper columns below were written into the caller's DataFrame.
    solution = solution.copy()
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    # Blend of the risk-free asset and the market, weighted by the position.
    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Geometric mean of the per-row excess returns of the strategy.
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        raise ParticipantVisibleError('Division by zero, strategy std is zero')
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Same statistics for a position of 1 everywhere (the market itself).
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        raise ParticipantVisibleError('Division by zero, market std is zero')

    # Zero market volatility has already been rejected above, so the ratio is
    # safe to compute without a further guard.
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2)
    vol_penalty = 1 + excess_vol

    # Annualised shortfall (in percent) versus the market; zero when the
    # strategy matches or beats it.
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [None]:
# Load the competition training set.
train_data = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")

# Discard every column that has more than 3000 missing values.
missing_per_column = train_data.isna().sum()
features_delete = missing_per_column[missing_per_column > 3000].index.tolist()
train_data = train_data.drop(columns=features_delete)

# Fill the remaining gaps, column by column, with that column's own mean.
features_fill = [name for name in train_data.columns if train_data[name].isna().any()]
for column_name in features_fill:
    column_mean = train_data[column_name].mean()
    train_data[column_name] = train_data[column_name].fillna(column_mean)

In [None]:
# Order rows by date so the hold-out split below is strictly out-of-time.
# A stable sort leaves the frame untouched when it is already ordered.
train_ordered = train_data.sort_values("date_id", kind="stable")

# Feature matrix: everything except the target, the date key and any column
# whose name marks it as forward-looking or as the risk-free rate.
X = train_ordered.drop(columns=["forward_returns", "date_id"])
leak_cols = [c for c in X.columns if "forward" in c or "risk_free_rate" in c]
X = X.drop(columns=leak_cols)
y = train_ordered["forward_returns"]

# The scaler is fit on every row because this same fitted instance is reused
# by `predict` at inference time.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Chronological hold-out: the last 20% of rows form the validation set.
# The default shuffled split would place future days in the training fold and
# make the validation score optimistic for this daily time series.
X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, shuffle=False
)

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
from tqdm import tqdm

In [None]:
# Fit on the training split only, so the validation split stays unseen.
model = xgb.XGBRegressor(max_depth=2, random_state=42, n_estimators=358)
model.fit(X_train_scaled, y_train)

train_predict = model.predict(X_train_scaled)
val_predict = model.predict(X_val_scaled)


def _make_score_inputs(predictions, targets):
    """Build the row ids and the (solution, submission) frames for `score`.

    Predicted returns are mapped to positions with ``pred * 100 + 1`` and
    clipped to [0, 2], the same mapping `predict` uses at inference.
    """
    row_ids = np.arange(len(predictions))
    submission = pd.DataFrame({
        "row_id": row_ids,
        "prediction": np.clip(predictions * 100 + 1, 0, 2)
    })
    solution = pd.DataFrame({
        "row_id": row_ids,
        "forward_returns": targets.values,
        # NOTE(review): the risk-free rate is set to zero here, so these
        # scores are computed without a risk-free component.
        "risk_free_rate": np.zeros(len(targets))
    })
    return row_ids, solution, submission


train_idx, solution_train, submission_train = _make_score_inputs(train_predict, y_train)
val_idx, solution_val, submission_val = _make_score_inputs(val_predict, y_val)

score_value_train = score(solution_train, submission_train, row_id_column_name="row_id")
score_value_val = score(solution_val, submission_val, row_id_column_name="row_id")

# Report both scores with distinct labels so they cannot be confused.
print("Train score:", np.round(score_value_train, 3))
print("Validation score:", np.round(score_value_val, 3))

In [None]:
# Final model used by `predict`: trained once on every available row.
# A second `fit` on the training split used to follow this one; because `fit`
# retrains from scratch it discarded the full-data model, so the submission
# only learned from 80% of the rows.
model = xgb.XGBRegressor(max_depth=2, random_state=42, n_estimators=358)
model.fit(X_scaled, y)

def predict(test_df: "pl.DataFrame") -> float:
    """Return the allocation for the first row of `test_df`.

    The frame is converted to pandas, aligned to the training feature columns
    (`X.columns`), imputed with the training column means, scaled with the
    fitted `scaler` and passed to `model`. The predicted return is mapped to
    an allocation with ``1 + 100 * pred`` and clipped to [0, 2].

    Args:
        test_df: Frame exposing ``to_pandas()``; only its first row is used
            for the returned allocation.

    Returns:
        The allocation as a float in [0.0, 2.0].
    """
    features = test_df.to_pandas()

    # Align to the training layout: columns the model never saw are dropped,
    # and training columns absent from the test frame appear as NaN.
    features = features.reindex(columns=X.columns)

    # Training data was mean-imputed before fitting, so apply the same
    # imputation here. This covers both absent columns and missing values in
    # present columns; previously the latter reached the model as NaN.
    features = features.fillna(X.mean())

    features_scaled = scaler.transform(features)
    pred = model.predict(features_scaled)[0]

    allocation = np.clip(1.0 + 100 * pred, 0.0, 2.0)

    return float(allocation)

In [None]:
# Wrap `predict` in the evaluation API's default inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if __name__ == "__main__":
    if not os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
        # Environment variable unset or empty: run the local gateway against
        # the competition input directory.
        inference_server.run_local_gateway(("/kaggle/input/hull-tactical-market-prediction/",))
    else:
        # Environment variable set: start serving.
        inference_server.serve()