In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import polars as pl

# Read the competition's train and test CSV files into Polars DataFrames.
train_df, test_df = (
    pl.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/hull-tactical-market-prediction/train.csv",
        "/kaggle/input/hull-tactical-market-prediction/test.csv",
    )
)

### Features
* **date_id** - An identifier for a single trading day.
* **M*** - Market Dynamics/Technical features.
* **E*** - Macro Economic features.
* **I*** - Interest Rate features.
* **P*** - Price/Valuation features.
* **V*** - Volatility features.
* **S*** - Sentiment features.
* **MOM*** - Momentum features.
* **D*** - Dummy/Binary features.
* **forward_returns** - The returns from buying the S&P 500 and selling it a day later. Train set only.
* **risk_free_rate** - The federal funds rate. Train set only.
* **market_forward_excess_returns** - Forward returns relative to expectations. Computed by subtracting the rolling five-year mean forward returns and winsorizing the result using a median absolute deviation (MAD) with a criterion of 4. Train set only.

Target: **forward_returns**


In [None]:
# Report how many columns each table has.
print("Total Number of Features [Train Set]", train_df.width)
print("Total Number of Features [Test Set]", test_df.width)  # +1 for `is_scored`

In [None]:
# Summary statistics of the training table as loaded, before any casting or null filling.
train_df.describe()

## Preprocessing

### Converting `String` -> `Float`

In [None]:
# Columns that were read with a string dtype are re-cast to 64-bit floats.
string_columns = [name for name, dtype in train_df.schema.items() if dtype == pl.Utf8]
train_df = train_df.with_columns(
    [pl.col(name).cast(pl.Float64) for name in string_columns]
)

### Fill `null` -> 0

In [None]:
# Replace every null in the table with 0 (same rule for all columns).
# NOTE: `train_df` is overwritten, so null counts taken from it after this
# point are always zero.
train_df = train_df.fill_null(0)

In [None]:
# Summary statistics again, now on the float-cast, null-filled table.
train_df.describe()

## EDA

### Mini Histograms of Numeric Features

In [None]:
import matplotlib.pyplot as plt
import math
numeric_cols = [name for name, dtype in train_df.schema.items() if dtype in (pl.Int64, pl.Float64)]

# Gather the values of every numeric column that is neither empty nor constant;
# such columns would give an empty or single-bar histogram.
data = {}
for name in numeric_cols:
    values = train_df[name].drop_nulls().to_numpy()
    if values.size == 0 or np.all(values == values[0]):
        continue
    data[name] = values

# Grid geometry: five panels per row, as many rows as needed.
n = len(data)
cols = 5
rows = math.ceil(n / cols)

plt.rcParams.update({
    "figure.dpi": 160,
    "axes.titlesize": 8,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7
})

fig, axes = plt.subplots(rows, cols, figsize=(cols * 2.2, rows * 1.8), constrained_layout=True)
axes = axes.ravel()

# One histogram per retained column; bin count is sqrt(N) clamped to [8, 40].
for ax, (title, values) in zip(axes, data.items()):
    n_bins = int(min(40, max(8, math.sqrt(values.size))))
    ax.hist(values, bins=n_bins, color="#4682B4", edgecolor="white", alpha=0.9)
    ax.set_title(title, fontsize=8, pad=2)
    ax.tick_params(axis="both", which="major", labelsize=6)
    ax.grid(alpha=0.2, linewidth=0.3)

# Hide the unused panels at the end of the last row.
for unused_ax in axes[n:]:
    unused_ax.axis("off")

fig.suptitle("Mini Histograms — Numeric Columns", fontsize=10, weight="bold", y=1.02)
plt.show()


### Correlation Heatmap of Numeric Features

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

num_cols = [
    name
    for name, dtype in train_df.schema.items()
    if dtype in (pl.Int64, pl.Float64) and name != "date_id"
]
df = train_df.select([pl.col(name).cast(pl.Float64) for name in num_cols]).to_pandas()

# Treat infinities as missing, then discard all-missing and zero-variance columns,
# since neither has a defined correlation.
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how="all")
df = df.loc[:, df.std(skipna=True) != 0]

if df.shape[1] == 0:
    raise SystemExit("no usable numeric columns for correlation after filtering")

# Pairwise correlations; undefined or infinite entries are shown as 0.
corr = df.corr().fillna(0).replace([np.inf, -np.inf], 0)

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(corr, dtype=bool))
n = len(corr.columns)

plt.rcParams.update({"figure.dpi": 350})
side = max(3, min(12, n * 0.18))  # figure edge grows with column count, clamped to [3, 12]
figsize = (side, side)
cmap = "Spectral_r"
norm = mpl.colors.Normalize(vmin=-1, vmax=1)
tick_font = max(5, 9 - n // 10)

plt.figure(figsize=figsize)
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    norm=norm,
    vmin=-1,
    vmax=1,
    square=True,
    linewidths=0.3,
    cbar_kws={"shrink": 0.65, "aspect": 20},
    annot=(n <= 20),  # cell labels only when the matrix is small enough to read
    fmt=".2f",
    annot_kws={"size": 6},
)
sns.heatmap(corr, **heatmap_options)

plt.xticks(rotation=90, fontsize=tick_font)
plt.yticks(rotation=0, fontsize=tick_font)
plt.title("Correlation Heatmap", fontsize=10, weight="bold", pad=6)
plt.tight_layout()
plt.show()

Although we could select features based on their correlation with `forward_returns` to focus on the most predictive variables, in this experiment we retain all features so that the neural network can learn the underlying relationships directly from the data.

In [None]:
import math
import warnings
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress only the specific FutureWarning from seaborn/pandas
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

sns.set_theme(style="whitegrid", context="notebook")

plt.rcParams.update({
    "figure.dpi": 300,
    "axes.titlesize": 9,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7
})

# Float64 pandas copy of every numeric column except `date_id`; infinities become NaN.
numeric_cols = [c for c, t in train_df.schema.items() if t in (pl.Int64, pl.Float64) and c != "date_id"]
df = train_df.select([pl.col(c).cast(pl.Float64) for c in numeric_cols]).to_pandas()
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# 4 x 2 grid of diagnostic panels, addressed as a flat array.
rows, cols = 4, 2
fig, axes = plt.subplots(rows, cols, figsize=(11, 11), constrained_layout=True)
axes = axes.ravel()

# Panel 0: distribution of the target.
sns.histplot(df["forward_returns"].dropna(), bins=50, kde=True, color="#2E86AB", ax=axes[0])
axes[0].set_title("Forward Return Distribution")
axes[0].set_xlabel("")
axes[0].set_ylabel("")

# Panel 1: risk-free rate against market excess returns.
sns.scatterplot(
    x=df["risk_free_rate"],
    y=df["market_forward_excess_returns"],
    alpha=0.45, s=12, color="#1f77b4", ax=axes[1]
)
axes[1].set_title("Risk-Free vs Market Excess")
axes[1].set_xlabel("")
axes[1].set_ylabel("")

# Panel 2: target over time, indexed by date_id when available, else by row position.
if "date_id" in train_df.columns:
    time_x = train_df["date_id"].to_numpy()
    time_y = train_df["forward_returns"].fill_null(0).to_numpy()
    axes[2].plot(time_x, time_y, linewidth=0.7, color="#4C72B0")
else:
    axes[2].plot(df.index, df["forward_returns"].fillna(0), linewidth=0.7, color="#4C72B0")
axes[2].set_title("Forward Returns Over Time")
axes[2].set_xlabel("")
axes[2].set_ylabel("")

# Panel 3: the ten columns with the largest absolute correlation to the target.
if "forward_returns" in df.columns:
    corrs = df.corr()["forward_returns"].drop("forward_returns") \
        .sort_values(key=lambda x: x.abs(), ascending=False).head(10)
else:
    corrs = pd.Series(dtype=float)

sns.barplot(x=corrs.values, y=corrs.index, palette="Spectral", ax=axes[3])
axes[3].set_title("Top 10 Features by Corr with Forward Return")
axes[3].set_xlabel("Correlation")
axes[3].set_ylabel("")

# Panel 4: share of missing values per column.
# `train_df` had its nulls replaced by 0 during preprocessing, so its null
# counts are always zero; measure missingness on a fresh read of the raw CSV.
raw_train_df = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
nulls = [
    raw_train_df[c].null_count() / raw_train_df.height * 100
    for c in raw_train_df.columns
]
null_df = pl.DataFrame({"column": raw_train_df.columns, "null_pct": nulls}) \
    .sort("null_pct", descending=True).to_pandas()
sns.barplot(data=null_df.head(10), x="null_pct", y="column", palette="rocket", ax=axes[4])
axes[4].set_title("Top 10 Features by Missing %")
axes[4].set_xlabel("Missing (%)")
axes[4].set_ylabel("")

# Panels 5 and 6: box and violin plots of the six most correlated columns.
top_features = corrs.index[:6].tolist()
if top_features:
    sns.boxplot(data=df[top_features], orient="h", fliersize=1, palette="vlag", ax=axes[5])
axes[5].set_title("Outlier Check — Top Features")
axes[5].set_xlabel("")
axes[5].set_ylabel("")

if top_features:
    sns.violinplot(data=df[top_features], orient="h", scale="width",
                   inner="quartile", palette="muted", ax=axes[6])
axes[6].set_title("Violin — Top Feature Distributions")
axes[6].set_xlabel("")
axes[6].set_ylabel("")

# Panel 7: 10-row rolling standard deviation of the three most correlated columns.
top_corr_features = corrs.index[:3].tolist()
if top_corr_features:
    df_time = train_df.select(["date_id"] + [pl.col(c).cast(pl.Float64) for c in top_corr_features]).to_pandas()
    df_time = df_time.groupby("date_id").mean().rolling(window=10).std()
    df_time.plot(ax=axes[7], linewidth=0.9)
    axes[7].legend(top_corr_features, fontsize=6, loc="upper right")
axes[7].set_title("Rolling Volatility (10-Day)")
axes[7].set_xlabel("")
axes[7].set_ylabel("")

fig.suptitle("Key Insights for Feature Engineering", fontsize=9, weight="bold", y=1.02)
plt.show()

## Modeling using Simple FF-NN Approach

In [None]:
import torch
import torch.nn.functional as F

class TorchNN(torch.nn.Module):
    """A single linear layer whose output is multiplied by 10.

    Args:
        num_inputs: Size of each input feature vector.
        num_outputs: Size of each output vector.
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        # Attribute name is part of the checkpoint keys ("linear.weight", "linear.bias").
        self.linear = torch.nn.Linear(in_features=num_inputs, out_features=num_outputs)

    def forward(self, x):
        """Apply the linear layer to `x` and scale the result by 10."""
        return 10 * self.linear(x)


### Data Preparation

In [None]:
# Append a one-row-lagged copy of each return/rate column, named "lagged_<column>".
# The first row of every lagged column is null because nothing precedes it.
train_df = train_df.with_columns([
    pl.col(source).shift(1).alias("lagged_" + source)
    for source in (
        "forward_returns",
        "risk_free_rate",
        "market_forward_excess_returns",
    )
])

In [None]:

# Remove the two same-day columns that now exist only in lagged form, and use
# `forward_returns` as the training label.
train_df = (
    train_df
    .drop(["risk_free_rate", "market_forward_excess_returns"])
    .rename({"forward_returns": "label"})
)


In [None]:
# Skip the first row: the shift(1) used for the lagged columns leaves them null there.
train_df = train_df.slice(1)

In [None]:
# Everything except the day identifier and the label is a model input.
non_feature_columns = ["date_id", "label"]
excluded_names = set(non_feature_columns)
feature_columns = [name for name in train_df.columns if name not in excluded_names]

In [None]:
# Ordered (unshuffled) split: the leading 80% of rows train the model and the
# trailing 20% are held out for validation.
validation_fraction = 0.2
train_size = train_df.height
split_index = int(train_size * (1 - validation_fraction))

train_slice = train_df.head(split_index)
validation_slice = train_df.slice(split_index)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Convert the Polars slices to torch tensors (features and labels separately).
X_train = train_slice.select(feature_columns).to_torch()
y_train = train_slice.select("label").to_torch()
X_val = validation_slice.select(feature_columns).to_torch()
y_val = validation_slice.select("label").to_torch()

# Cast to float32 and move to the selected device.
# DataFrame.to_torch() already returns a 2-D tensor, so the previous
# `.unsqueeze(1)` produced (n, 1, 1) labels. reshape(-1, 1) yields the intended
# (n, 1) column whatever the incoming rank is.
X_train = X_train.to(torch.float32).to(device)
y_train = y_train.to(torch.float32).reshape(-1, 1).to(device)
X_val = X_val.to(torch.float32).to(device)
y_val = y_val.to(torch.float32).reshape(-1, 1).to(device)

In [None]:
from torch.utils.data import TensorDataset, DataLoader
# Mini-batch size shared by both loaders.
batch_size = 8

# Pair features with labels so that each batch yields an (X, y) tuple.
train_ds = TensorDataset(X_train, y_train)
val_ds = TensorDataset(X_val, y_val)

# Training batches are shuffled; validation batches keep the row order.
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)

In [None]:
# import torch
# import torch.nn.functional as F

# device = "cuda" if torch.cuda.is_available() else "cpu"

# model = TorchNN(num_inputs=97, num_outputs=1).to(device)
# model = model.double()
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)  # no weight decay for now

# n_epochs = 25
# best_val_mse = float("inf")

# for epoch in range(1, n_epochs + 1):
#     model.train()
#     train_losses = []
    
#     for xb, yb in train_loader:
#         xb = xb.to(device).double()
#         yb = yb.to(device).double().view(-1, 1)

#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = F.mse_loss(preds, yb)
#         loss.backward()
#         optimizer.step()

#         train_losses.append(loss.item())

#     avg_train = sum(train_losses) / len(train_losses)

#     model.eval()
#     val_losses = []
#     with torch.no_grad():
#         for xb, yb in validation_loader:
#             xb = xb.to(device).double()
#             yb = yb.to(device).double().view(-1, 1)
#             preds = model(xb)
#             val_losses.append(F.mse_loss(preds, yb).item())

#     avg_val = sum(val_losses) / len(val_losses)

#     print(f"Epoch {epoch:02d} | train_mse: {avg_train:.6f} | val_mse: {avg_val:.10f}")

#     if avg_val < best_val_mse:
#         best_val_mse = avg_val
#         torch.save(model.state_dict(), "hull-tactical-market-prediction-model-v1.pth")
#         print(f"  -> saved best model (val_mse={best_val_mse:.10f})")

# print(f"\nTraining complete. Best val_mse: {best_val_mse:.10f}")

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# Size the input layer from the actual feature list instead of a hard-coded 97,
# so the model stays in sync if columns are added or removed upstream.
model = TorchNN(num_inputs=len(feature_columns), num_outputs=1).to(device).double()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)

best_val_sharpe = -1e9      # best validation mean/std ratio seen so far
patience = 6                # epochs without improvement tolerated before stopping
no_improve = 0
trans_cost_coeff = 0.02     # weight of the mean-|prediction| penalty in the loss
n_epochs = 1

for epoch in range(1, n_epochs+1):
    model.train()
    train_losses = []
    # optionally set a larger batch_size in your DataLoader
    for xb, yb in train_loader:
        # The model is float64, so inputs are cast per batch; view(-1, 1) forces
        # the label into a column matching the prediction shape.
        xb = xb.to(device).double()
        yb = yb.to(device).double().view(-1,1)
        preds = model(xb)
        # simpler, more stable base loss: maximize expected return
        # (negative mean of prediction * label)
        base_loss = -torch.mean(preds * yb)
        # penalize large signals / turnover (approx)
        loss = base_loss + trans_cost_coeff * torch.mean(torch.abs(preds))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        train_losses.append(loss.item())

    # validation: compute val_sharpe across all batches (no grad)
    model.eval()
    all_val_rets = []
    with torch.no_grad():
        for xb_v, yb_v in validation_loader:
            xb_v = xb_v.to(device).double()
            yb_v = yb_v.to(device).double().view(-1,1)
            p = model(xb_v)
            all_val_rets.append((p * yb_v).cpu())
    all_val_rets = torch.cat(all_val_rets, dim=0)
    val_mean = float(torch.mean(all_val_rets))
    # The small constant keeps the ratio finite when the std is zero.
    val_std = float(torch.std(all_val_rets) + 1e-8)
    val_sharpe = val_mean / val_std

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f"Epoch {epoch:02d} | train_loss: {avg_train_loss:.6f} | val_sharpe: {val_sharpe:.6f}")

    # Checkpoint on improvement; otherwise count toward early stopping.
    if val_sharpe > best_val_sharpe:
        best_val_sharpe = val_sharpe
        no_improve = 0
        torch.save(model.state_dict(), "hull-tactical-market-prediction-model-v1.pth")
        print(f" -> saved best model (val_sharpe={best_val_sharpe:.6f})")
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping (no improvement).")
            break

print("Done. Best val_sharpe:", best_val_sharpe)

## Submission Inference

In [None]:
import os
import kaggle_evaluation.default_inference_server

import pandas as pd
import polars as pl

In [None]:
import torch
import polars as pl
import numpy as np

# Restore the saved state dict into a fresh float64 model for inference.
# The input width comes from the feature list used for training rather than a
# hard-coded 97, so it always matches the checkpoint being loaded.
checkpoint = torch.load("hull-tactical-market-prediction-model-v1.pth", map_location=device)
model_local = TorchNN(num_inputs=len(feature_columns), num_outputs=1).to(device)
model_local = model_local.double()
model_local.load_state_dict(checkpoint)
model_local.eval()

In [None]:
def predict(test: pl.DataFrame) -> float:
    """
    Inference function for Hull Tactical Market Prediction.

    - Safely drops 'date_id' and 'is_scored' columns if present.
    - Applies the same preprocessing as training: every column is cast to
      Float64 and missing values are replaced by 0, so a null feature cannot
      turn the prediction into NaN.
    - Converts the Polars DataFrame to a torch tensor and runs `model_local`.
    - Returns a float signal in range [0, 2].

    Args:
        test: Feature rows for the day to predict. The conversion to a scalar
            assumes the model produces exactly one value, i.e. a single row.

    Returns:
        The model output clipped to [0.0, 2.0].
    """
    # Drop unwanted columns safely
    drop_cols = [c for c in ("date_id", "is_scored") if c in test.columns]
    if drop_cols:
        test = test.drop(drop_cols)

    # Mirror the training-time cleaning. strict=False turns unparseable values
    # into null instead of raising, and they are then filled with 0 as well.
    test = (
        test
        .with_columns(pl.all().cast(pl.Float64, strict=False))
        .fill_null(0)
        .fill_nan(0)
    )

    # Convert features to numpy -> tensor
    X = test.to_numpy()
    X = torch.from_numpy(X).double().to(device)

    # Run model inference
    model_local.eval()
    with torch.no_grad():
        pred = model_local(X)

    # Convert tensor -> float
    signal = float(pred.cpu().numpy().squeeze())

    signal = np.clip(signal, 0.0, 2.0)

    # Prediction Debug
    print("DEBUG SIGNAL: ", signal)
    return float(signal)

In [None]:

# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Without the rerun environment variable, exercise the server through the local
# gateway on the input directory; with it set, serve requests instead.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()

In [None]:
# Load the submission file from the working directory (presumably written by the
# inference-server run above) and display it as the cell output.
read_submission = pd.read_parquet('/kaggle/working/submission.parquet')
read_submission

In [None]:
import numpy as np
import pandas as pd
import pandas.api.types

# Allowed range for a submitted position.
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2


class ParticipantVisibleError(Exception):
    """Raised by `score` for invalid predictions or degenerate inputs."""
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Args:
        solution: Frame with 'forward_returns' and 'risk_free_rate' columns.
            It is not modified; the computation runs on a copy.
        submission: Frame with a numeric 'prediction' column, aligned with
            `solution` by index.
        row_id_column_name: Accepted for interface compatibility; not used.

    Returns:
        float: The calculated adjusted Sharpe ratio, capped at 1,000,000.

    Raises:
        ParticipantVisibleError: If predictions are non-numeric or outside
            [MIN_INVESTMENT, MAX_INVESTMENT], or if the strategy or market
            standard deviation is zero.
    """

    if not pandas.api.types.is_numeric_dtype(submission['prediction']):
        raise ParticipantVisibleError('Predictions must be numeric')

    # Work on a copy so the caller's frame does not gain the helper columns
    # ('position', 'strategy_returns') as a side effect.
    solution = solution.copy()
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    # Blend of the risk-free rate and the market return, weighted by position.
    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio (geometric mean excess return over std)
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        raise ParticipantVisibleError('Division by zero, strategy std is zero')
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        raise ParticipantVisibleError('Division by zero, market std is zero')

    # Calculate the volatility penalty: only volatility beyond 1.2x the market's counts
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty: quadratic in the annualized shortfall vs. the market
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [None]:
import pandas as pd

# Step 1: read the submission that was written to the working directory
# (columns: date_id, prediction).
submission = pd.read_parquet('/kaggle/working/submission.parquet')

# Step 2: read the ground-truth columns from train.csv. Point this at another
# file if the true returns for these dates live elsewhere.
solution_raw = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv',
                           usecols=['date_id', 'forward_returns', 'risk_free_rate'])

# Step 3: keep only the dates present in the submission, then put both frames
# in the same date order with a fresh 0..n-1 index.
matched_rows = solution_raw.merge(submission[['date_id']], on='date_id', how='inner')
solution = matched_rows.sort_values('date_id').reset_index(drop=True)
submission = submission.sort_values('date_id').reset_index(drop=True)

# Step 4: sanity checks before scoring.
assert len(solution) == len(submission), "Solution and submission lengths differ."
assert solution['date_id'].equals(submission['date_id']), "date_id order mismatch."
assert pd.api.types.is_numeric_dtype(submission['prediction']), "prediction must be numeric."

# Step 5: run the metric. `row_id_column_name` is not used inside `score`;
# 'date_id' is passed only to match the interface.
solution_inputs = solution[['forward_returns', 'risk_free_rate']].copy()
prediction_inputs = submission[['prediction']].copy()
score_value = score(solution_inputs, prediction_inputs, row_id_column_name='date_id')

print("Adjusted Sharpe (custom metric):", score_value)
