In [None]:
import pandas as pd
import os
import polars as pl
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.optim as optim
import kaggle_evaluation.jane_street_inference_server

Load the features and responders metadata, together with the sample test and lags partitions.

In [None]:
# Root folder of the competition data; later cells build their paths from it.
DIR = "./jane-street-real-time-market-data-forecasting"

# The two CSV files that sit directly under DIR.
features, responders = (
    pl.read_csv(f"{DIR}/{stem}.csv") for stem in ("features", "responders")
)

# The date_id=0 partition of the test and lags parquet datasets.
test, lags = (
    pl.read_parquet(f"{DIR}/{stem}.parquet/date_id=0/part-0.parquet")
    for stem in ("test", "lags")
)

In [None]:
class NeuralNetwork(nn.Module):
    """Small fully connected regressor producing one value per input row.

    The stack is: batch normalisation over the inputs, two linear layers each
    followed by ReLU, and a final linear layer with a single output unit.

    Args:
        input_size: number of features in each input row.
        hidden_size_1: width of the first hidden layer.
        hidden_size_2: width of the second hidden layer.
    """

    def __init__(self, input_size, hidden_size_1, hidden_size_2):
        super().__init__()
        widths = (input_size, hidden_size_1, hidden_size_2)

        # Input normalisation first, then one Linear+ReLU pair per hidden layer.
        stack = [nn.BatchNorm1d(input_size)]
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Single-unit output head.
        stack.append(nn.Linear(hidden_size_2, 1))

        self.neural_net = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return a ``(rows, 1)`` tensor."""
        return self.neural_net(x)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mean-squared-error objective for the regression target.
loss_fn = nn.MSELoss()

# 82 inputs, hidden widths 64 and 16.
# NOTE(review): 82 presumably equals the number of columns left after the
# training cell drops the responder/weight columns — confirm against the data.
model = NeuralNetwork(82, 64, 16).to(device)

# Adam with the library's default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Single training pass over train partitions 0-9, read one partition at a time.
base_str = "/train.parquet"
filename = "/part-0.parquet"
batch_size = 64

running_loss = 0.0   # sum of squared errors over every sample trained on
total_num_data = 0   # number of samples that actually went through the optimizer
model.train()

for partition_idx in range(10):
    partition = "/partition_id=" + str(partition_idx)
    train = pl.read_parquet(DIR + base_str + partition + filename)

    # Target: responder_6, with missing values replaced by 0.
    Y_train = train['responder_6'].fill_null(0)

    # Inputs: everything except the responder columns, 'weight' and (when the
    # partition has it) 'is_scored'; missing values replaced by 0.
    X_train = train.select(pl.all().exclude("^responder.*$"))
    X_train = X_train.drop(
        [name for name in ('weight', 'is_scored') if name in X_train.columns]
    )
    X_train = X_train.fill_null(0)

    # The model's parameters are float32, so force the same dtype on the data.
    X_train = X_train.to_torch().to(torch.float32)
    Y_train = Y_train.to_torch().to(torch.float32)
    num_data = X_train.shape[0]

    # Walk the partition in consecutive mini-batches; the last one may be short.
    for start in range(0, num_data, batch_size):
        stop = start + batch_size
        rows_in_batch = min(stop, num_data) - start

        # BatchNorm1d in training mode cannot compute statistics from a single
        # row, so a one-row leftover batch is skipped instead of crashing.
        if rows_in_batch < 2:
            continue

        batch = X_train[start:stop].to(device)
        batch_outputs = Y_train[start:stop].to(device)

        optimizer.zero_grad()
        outputs = model(batch).reshape(-1)   # (rows, 1) -> (rows,)
        loss = loss_fn(outputs, batch_outputs)
        loss.backward()
        optimizer.step()

        # loss is the mean over the batch; weight it by the batch size so the
        # final division by the sample count yields a true per-sample average.
        running_loss += loss.item() * rows_in_batch
        total_num_data += rows_in_batch

avg_loss = running_loss / total_num_data if total_num_data else float("nan")
print(f"avg_loss: {avg_loss}")

In [None]:
def predict(test, lags):
    """Predict ``responder_6`` for one batch of test rows.

    This is the callback handed to the inference server.

    Args:
        test: anything ``pl.DataFrame(...)`` accepts. It must contain the
            columns ``row_id``, ``weight`` and ``is_scored``; every remaining
            column is fed to the model as a feature.
        lags: when not ``None`` it is stored in the module-level ``lags_``.
            It is not otherwise used by this function.

    Returns:
        A ``pl.DataFrame`` with exactly the columns ``row_id`` and
        ``responder_6`` (Float32), one row per input row, in input order.
    """
    global lags_

    model.eval()

    if lags is not None:
        lags_ = lags

    test = pl.DataFrame(test)
    num_data = test.height
    row_ids = test['row_id']

    # Keep only the model inputs, replace missing values by 0 and convert to
    # a float32 tensor (the dtype of the model's parameters).
    inputs = test.drop(['weight', 'is_scored', 'row_id']).fill_null(0)
    inputs = inputs.to_torch().to(torch.float32)

    # Score in chunks of 64 rows. Stepping the start offset by the chunk size
    # covers every row exactly once, including a short final chunk.
    batch_size = 64
    chunks = []
    with torch.no_grad():
        for start in range(0, num_data, batch_size):
            data = inputs[start:start + batch_size].to(device)
            chunks.append(model(data).reshape(-1).cpu())

    # An empty input yields no chunks; fall back to an empty float32 tensor.
    outputs = torch.cat(chunks) if chunks else torch.empty(0)

    predictions = row_ids.to_frame().with_columns(
        pl.Series('responder_6', outputs.numpy(), dtype=pl.Float32)
    )

    # Checks on the frame handed back to the inference server.
    assert predictions.columns == ['row_id', 'responder_6']
    assert len(predictions) == num_data

    return predictions

In [None]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Environment variable is set: hand control to the server.
    inference_server.serve()
else:
    # Otherwise run the local gateway. The paths are built from DIR, the same
    # data root the loading and training cells read from, so the whole
    # notebook runs against a single location.
    inference_server.run_local_gateway(
        (
            DIR + '/test.parquet',
            DIR + '/lags.parquet',
        )
    )