In [None]:
from dataclasses import dataclass

import os

import math

import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import kaggle_evaluation.default_inference_server

import torch
import torch.nn as nn # defining our neural network
import torch.optim as optim # training our neural network
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset # loading data in batches

## Config

In [None]:
@dataclass
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.epochs``).

    Every setting is an annotated dataclass field with a default. The defaults
    therefore stay available on the class itself, and ``Config(epochs=3)`` can
    build an instance with overridden values for an experiment.
    """

    # --- prediction -> signal mapping (used by convert_ret_to_signal) ---
    # These three come first so positional construction keeps its original order.
    signal_multiplier: float = 8.0
    min_signal: float = 0.0
    max_signal: float = 2.0
    base_signal: float = 1.0

    # --- FFN training (SGD) ---
    learning_rate: float = 5e-4
    momentum: float = 0.95
    epochs: int = 10

    # --- data locations ---
    train_path: str = "/kaggle/input/hull-tactical-market-prediction/train.csv"
    test_path: str = "/kaggle/input/hull-tactical-market-prediction/test.csv"

    # Column the model is trained to predict.
    # Alternative target: "forward_returns"
    target_column: str = "market_forward_excess_returns"
    

## Loading and Processing Data

In [None]:

# Load data (paths come from Config so they are defined in exactly one place)
train = pd.read_csv(Config.train_path)
test = pd.read_csv(Config.test_path)

# Identifier and label columns are never used as model inputs.
exclude = ["date_id", 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']

# A column is kept as a feature only if it has at most this many missing
# values in the training set.
max_missing = 1006
nan_counts = train.isna().sum()  # one vectorized pass instead of a per-column loop
test_cols = [
    col for col in train.columns
    if nan_counts[col] <= max_missing and col not in exclude
]

# Columns to keep
print(f"test_cols: {test_cols}")
train_cols = ["forward_returns", "risk_free_rate", "market_forward_excess_returns"]
base_col = ["date_id"]

# Apply filtering
train_filtered = train[base_col + test_cols + train_cols]
test_filtered = test[base_col + test_cols]

window = 50  # number of days in each normalization window (not used in this cell)

# remove duplicate columns
train_filtered = train_filtered.loc[:, ~train_filtered.columns.duplicated()]
test_filtered = test_filtered.loc[:, ~test_filtered.columns.duplicated()]

print(f"Number of rows in train: {len(train_filtered)}")
print(f"Number of rows in test: {len(test_filtered)}")


In [None]:
# Histogram of the raw training target, 50 bins.
target = train_filtered[Config.target_column]
fig, ax = plt.subplots()
ax.hist(target, bins=50)
plt.show()

In [None]:
# Mean and standard deviation of the target, displayed as a tuple.
target.mean(), target.std()

In [None]:
# Share of observations whose deviation from the mean lies in the half-open
# band [-k*sigma, k*sigma), for k = 1, 2, 3.
n = len(target)
mu = target.mean()
sigma = target.std()
centered = target - mu
one_sigma_prop, two_sigma_prop, three_sigma_prop = [
    ((centered >= -k * sigma) & (centered < k * sigma)).sum() / n
    for k in (1, 2, 3)
]
one_sigma_prop, two_sigma_prop, three_sigma_prop

The values suggest that the target is not normally distributed. A candidate alternative is the t-distribution, which has heavier tails and can also be denser within 1 sigma.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gennorm, norm

# Standardize data
target = train_filtered[Config.target_column].dropna()
z = (target - target.mean()) / target.std()

# Fit generalized normal
params = gennorm.fit(z)  # returns (beta, loc, scale)
beta, loc, scale = params
print(f"Fitted generalized normal: beta={beta:.3f}, loc={loc:.3f}, scale={scale:.3f}")

# x-range and PDFs
x = np.linspace(-6, 6, 1000)
pdf_fit = gennorm.pdf(x, *params)
pdf_norm = norm.pdf(x)

# Plot
plt.figure(figsize=(8,5))
plt.hist(z, bins=50, density=True, alpha=0.4, color='gray', label='Standardized empirical')
# The Greek letter is written as mathtext so the label is pure ASCII and cannot
# be garbled by a file-encoding mismatch.
plt.plot(x, pdf_fit, label=rf'GenNorm $\beta$={beta:.2f}', linewidth=2)
plt.plot(x, pdf_norm, 'k--', label='Normal (0,1)')
plt.xlabel("Standardized Value (z-score)")
plt.ylabel("Density")
plt.title("Empirical Data vs Fitted Generalized Normal Distribution")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import laplace, norm

# z-score the target (rows with a missing target are dropped first)
target = train_filtered[Config.target_column].dropna()
z = (target - target.mean()) / target.std()

# Fit of the Laplace location and scale to the z-scores
loc, scale = laplace.fit(z)
print(f"Laplace fit: loc={loc:.3f}, scale={scale:.3f}")

# Densities evaluated on a common grid
x = np.linspace(-6, 6, 1000)
pdf_laplace = laplace.pdf(x, loc=loc, scale=scale)
pdf_norm = norm.pdf(x)

# Empirical histogram with the fitted Laplace and the standard normal on top
fig, ax = plt.subplots(figsize=(8,5))
ax.hist(z, bins=50, density=True, alpha=0.4, color='gray', label='Standardized empirical data')
ax.plot(x, pdf_laplace, label=f'Laplace fit (loc={loc:.2f}, scale={scale:.2f})', linewidth=2)
ax.plot(x, pdf_norm, 'k--', label='Normal (0,1)', linewidth=1.5)

# Title, axis labels and legend
ax.set_title("Empirical Data vs Fitted Laplace and Normal Distributions", fontsize=14)
ax.set_xlabel("Standardized Value (z-score)", fontsize=12)
ax.set_ylabel("Density", fontsize=12)
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# How many rows contain a missing value in any column
num_nan_train = train_filtered.isna().any(axis="columns").sum()
num_nan_test = test_filtered.isna().any(axis="columns").sum()

print(f"Number of rows with NaN in train: {num_nan_train}")
print(f"Number of rows with NaN in test: {num_nan_test}")

# Feature matrix and target vector, restricted to rows where every feature is present
X = train_filtered[test_cols]
y = train_filtered[Config.target_column]

mask = X.notna().all(axis="columns")
X = X.loc[mask]
y = y.loc[mask]

In [None]:
# Shapes of the feature matrix and target after dropping rows with missing features.
X.shape, y.shape

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Preview the first values of the target.
y.head()

## FFN Model Definition

In [None]:

# Subclassing nn.Module gives us the boilerplate for free
# (parameter initialization, etc.).
class FFN(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear -> ReLU -> Linear."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        Args:
            input_size: size of the input
            hidden_size: size of each of the two hidden layers
            output_size: size of the output layer
        """
        super().__init__()

        # three fully connected layers; nn.Linear takes (in_features, out_features)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

        # one activation module, applied after each hidden layer
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the forward pass; no activation is applied to the output layer."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)



## Dataset Definition

In [None]:

class SP500Dataset(Dataset):
    """Dataset yielding ``(features, target)`` pairs as float32 tensors.

    The feature frame and the target series are converted to tensors once, at
    construction time, so ``__getitem__`` is a cheap tensor index rather than a
    pandas row lookup plus a fresh tensor allocation for every sample.
    """

    def __init__(self, data, target_column):
        """
        Args:
            data: DataFrame of features, one row per sample.
            target_column: despite its name, the Series of target VALUES (not a
                column label), matched to ``data`` by position.

        Raises:
            ValueError: if ``data`` and ``target_column`` differ in length.
        """
        if len(data) != len(target_column):
            raise ValueError(
                f"features have {len(data)} rows but target has {len(target_column)} values"
            )

        # Kept under their original names for any code that inspects them.
        self.dataframe = data
        self.target = target_column

        # One-off conversion; later changes to the DataFrame/Series are not reflected.
        self._features = torch.tensor(data.to_numpy(), dtype=torch.float32)
        self._targets = torch.tensor(target_column.to_numpy(), dtype=torch.float32)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # return a tuple of (features, target), selected by position
        return self._features[idx], self._targets[idx]



## Training Loop

In [None]:

# Density and CDF of the Laplace distribution with the current `loc` / `scale`,
# evaluated on [-1, 1]. The curves are labeled so they can be told apart.
x = np.linspace(-1, 1, 201)
lpdf = laplace.pdf(x, loc=loc, scale=scale)
lcdf = laplace.cdf(x, loc=loc, scale=scale)
plt.plot(x, lcdf, label="Laplace CDF")
plt.plot(x, lpdf, label="Laplace PDF")
plt.legend()
plt.show()

In [None]:

dataset = SP500Dataset(X, y)

# Iterable that yields shuffled mini-batches of (features, target)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model: one output value per sample
model = FFN(input_size=len(X.columns), hidden_size=10, output_size=1)

# Stochastic gradient descent with momentum
optimizer = optim.SGD(
    model.parameters(),
    lr=Config.learning_rate,
    momentum=Config.momentum
)

# Mean absolute error (L1) loss for the regression target
criterion = nn.L1Loss()

loss_graph = []  # loss of every batch, in training order
for epoch in range(Config.epochs): # loop over the dataset multiple times
    epoch_loss = 0.0
    for inputs, targets in dataloader:
        optimizer.zero_grad() # zero the parameter gradients

        # squeeze(-1) removes only the output dimension. A bare squeeze() would
        # also collapse a final batch of size 1 to a 0-d tensor, which no
        # longer matches the shape of `targets`.
        outputs = model(inputs).squeeze(-1) # forward pass

        loss = criterion(outputs, targets) # compute loss
        loss.backward() # backward pass
        optimizer.step() # update parameters

        batch_loss = loss.item()
        loss_graph.append(batch_loss)
        epoch_loss += batch_loss

    # Report the mean of the per-batch losses; the loss of the last shuffled
    # batch alone is a noisy sample.
    print(f'Epoch {epoch+1}, Loss: {epoch_loss / max(len(dataloader), 1)}')



In [None]:
# Per-batch training loss in the order it was recorded.
fig, ax = plt.subplots()
ax.plot(loss_graph)
plt.show()

In [None]:
train_data = torch.tensor(X.to_numpy(), dtype=torch.float32)

# Call the module itself rather than .forward(), and disable autograd since
# this is inference only.
with torch.no_grad():
    output = model(train_data).view(-1).numpy()

# Both series are standardized with the same mu / sigma. The histograms are
# semi-transparent and labeled so the second does not hide the first.
plt.hist((output - mu) / sigma, bins=50, density=True, alpha=0.5, label="Predicted")
plt.hist((y - mu)/sigma, bins=50, density=True, alpha=0.5, label="Actual")
plt.legend()
print(output.shape, y.shape)
plt.show()

## Prediction

In [None]:
def convert_ret_to_signal(
    ret_arr: np.ndarray,
) -> np.ndarray:
    """
    Turn predicted returns into a trading signal.

    The signal is ``ret_arr * Config.signal_multiplier + Config.base_signal``,
    clipped to the range ``[Config.min_signal, Config.max_signal]``.

    Args:
        ret_arr (np.ndarray): The predicted returns.

    Returns:
        np.ndarray: The scaled, shifted and clipped signal.
    """
    scaled = ret_arr * Config.signal_multiplier + Config.base_signal
    return np.clip(scaled, Config.min_signal, Config.max_signal)

In [None]:
def predict(test: pl.DataFrame) -> float:
    """Produce the trading signal for one batch handed over by the inference server.

    Args:
        test: polars DataFrame containing at least the columns in ``test_cols``.
            It must hold exactly one row, because the model output is read
            with ``.item()``.

    Returns:
        The signal as a plain Python float. If the model output is not finite
        (for example because a feature is missing), ``Config.base_signal`` is
        returned instead, so a NaN is never submitted.
    """
    features = test.to_pandas()[test_cols].to_numpy()
    inputs = torch.tensor(features, dtype=torch.float32)

    # Call the module itself rather than .forward(); no gradients are needed here.
    with torch.no_grad():
        pred = float(model(inputs).item())

    # NaN/inf would pass straight through np.clip, so fall back to the base signal.
    if not math.isfinite(pred):
        return float(Config.base_signal)

    return float(convert_ret_to_signal(pred))
    


In [None]:
# On the hidden test set, inference_server.serve must be called within 15 minutes
# of the notebook starting, or the gateway will throw an error. A model that needs
# longer than that to load can be loaded during the very first `predict` call,
# which does not have the usual 1 minute response deadline.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

competition_data_dirs = ('/kaggle/input/hull-tactical-market-prediction/',)
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Outside a competition rerun: exercise predict through the local gateway.
    inference_server.run_local_gateway(competition_data_dirs)
else:
    inference_server.serve()

In [None]:
# Read the submission file back from /kaggle/working and display it.
submission = pd.read_parquet("/kaggle/working/submission.parquet")
submission