In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from data_processing import get_matrices, apply_kmeans_labels
import torch
import torch.nn as nn

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from data_processing import filter_columns

In [4]:
# Load the raw training table; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/training.csv')

In [5]:
# Split the raw table into model inputs X and regression targets y using the project helper.
# NOTE(review): the SettingWithCopyWarning text recorded under this cell points at
# `data.drop(..., inplace=True)` calls — presumably inside get_matrices; confirm and fix it there.
X, y = get_matrices(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('row_id', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('security_id', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('initiator', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(f'time{i}', axis=1, inpl

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
# One cluster label per training row, produced by the project helper.
# NOTE(review): the helper receives y_train as well as X_train — confirm what it clusters on,
# because targets are not available when predicting on unseen rows.
clusters = apply_kmeans_labels(X_train, y_train)

In [8]:
# Naive baseline model.
from models import NaiveRepeaterModel

# Plain-array form of the training inputs; the SVM training cell reuses X_np as well.
X_np = X_train.values
naive_model = NaiveRepeaterModel(output_length=20)  # 20 is presumably the width of y — TODO confirm
naive_model.fit(X_np)
# In-sample predictions on the training inputs.
preds_naive = naive_model.predict(X_np)

In [9]:
# Multi-output regression: one regressor is trained per cluster label.
# The number of clusters is read from the labels rather than assumed.
from models import MultiOutputRegression
from torch import optim

# Seed torch's global RNG so torch-side randomness (e.g. weight initialisation)
# is repeatable between runs; 42 matches the random_state used for the split.
torch.manual_seed(42)

X_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_tensor = torch.tensor(y_train.values, dtype=torch.float32)
cluster_tensor = torch.tensor(clusters, dtype=torch.int64)

cluster_models = {}
num_clusters = cluster_tensor.max().item() + 1

# Layer sizes follow the data instead of the previously hard-coded 104 / 20,
# so the cell keeps working if the feature or target columns change.
input_dim = X_tensor.shape[1]
output_dim = y_tensor.shape[1]

# The loss object holds no per-model state, so a single instance is shared.
criterion = nn.MSELoss()

for cluster_id in range(num_clusters):
    # Boolean mask selecting the training rows that belong to this cluster.
    idx = (cluster_tensor == cluster_id)
    X_sub = X_tensor[idx]
    y_sub = y_tensor[idx]

    reg_model = MultiOutputRegression(input_dim=input_dim, output_dim=output_dim)
    optimizer = optim.Adam(reg_model.parameters(), lr=0.01)

    # Nothing switches the model to eval mode during training, so set train mode once.
    reg_model.train()
    for epoch in range(100):
        # Full-batch step on this cluster's rows.
        preds = reg_model(X_sub)
        loss = criterion(preds, y_sub)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    cluster_models[cluster_id] = reg_model

In [10]:
# Train the Gaussian Discriminant Analysis model to predict the cluster label from the inputs.
from models import GaussianDiscriminantAnalysis

gda_model = GaussianDiscriminantAnalysis()
# Reuses the X_tensor / cluster_tensor built in the multi-output regression cell.
gda_model.fit(X_tensor, cluster_tensor)

In [11]:
# Train the SVM classifier to predict the cluster label from the inputs.
from models import SVMClassifier

svm_model = SVMClassifier()
# Unlike the GDA model, the SVM is fed the NumPy array and the raw `clusters` labels.
svm_model.fit(X_np, clusters)

In [12]:
# Train the transformer regressor with full-batch updates on the whole training set.
from models import TransformerRegressor
# Imported here as well so this cell does not silently depend on another cell's import.
from torch import optim

# Seed torch's global RNG so torch-side randomness in this cell (e.g. weight
# initialisation) is repeatable; 42 matches the random_state used for the split.
torch.manual_seed(42)

transformer_model = TransformerRegressor()
criterion = nn.MSELoss()
optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)

epochs = 100

# Nothing switches the model to eval mode during training, so set train mode once.
transformer_model.train()

for epoch in range(epochs):
    preds = transformer_model(X_tensor)  # shape: (N, 20)
    loss = criterion(preds, y_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 10, Loss: 0.4860
Epoch 20, Loss: 0.2174
Epoch 30, Loss: 0.0810
Epoch 40, Loss: 0.0210
Epoch 50, Loss: 0.0066
Epoch 60, Loss: 0.0036
Epoch 70, Loss: 0.0022
Epoch 80, Loss: 0.0019
Epoch 90, Loss: 0.0016
Epoch 100, Loss: 0.0016


In [22]:
from sklearn.metrics import mean_squared_error, r2_score
from models import KMEANS

In [17]:
# Testing: array forms of the held-out split.
# getattr with a default returns the object itself when it has no `.values` attribute.
X_test_np = getattr(X_test, 'values', X_test)
y_test_np = getattr(y_test, 'values', y_test)

In [20]:
# Score the naive baseline on the held-out split.
# The baseline is re-fitted on the test inputs first; per the original author's note it
# just stores the last 2 values of each row.
# NOTE(review): confirm that fit() never looks at targets, otherwise this is fitting on test data.
naive_model.fit(X_test_np)
y_pred = naive_model.predict(X_test_np)
mse, r2 = mean_squared_error(y_test_np, y_pred), r2_score(y_test_np, y_pred)
print(
    f"Naive Repeater Model MSE: {mse:.4f}",
    f"Naive Repeater Model R²: {r2:.4f}",
    sep="\n",
)

Naive Repeater Model MSE: 0.0002
Naive Repeater Model R²: 0.9998


In [28]:
# Evaluate the per-cluster regressors on the held-out split.
# NOTE(review): the routing labels come from a fresh KMEANS fit on y_test, i.e. on the test
# targets. That leaks target information into the prediction, and the new cluster ids are not
# guaranteed to match the ids the entries of `cluster_models` were trained under. Confirm, and
# consider routing with a classifier fitted on the training split instead.
kmeans_model = KMEANS(n_clusters=5)
cluster_preds = kmeans_model.fit(y_test)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

# One batched forward pass per cluster instead of one batch-of-1 pass per test row.
cluster_ids = np.asarray(cluster_preds)
preds = [None] * len(X_test_tensor)
with torch.no_grad():
    for cluster_id in np.unique(cluster_ids):
        # int(...) so the lookup uses the same plain-int keys the training cell stored.
        model = cluster_models[int(cluster_id)]
        model.eval()

        # Positions of the test rows routed to this cluster.
        rows = np.flatnonzero(cluster_ids == cluster_id)
        batch_pred = model(X_test_tensor[torch.from_numpy(rows)]).numpy()

        # Put each prediction back at its original row position.
        for row, pred in zip(rows, batch_pred):
            preds[row] = pred

y_pred = np.stack(preds)
mse_lin_reg = mean_squared_error(y_test_np, y_pred)
r2_lin_reg = r2_score(y_test_np, y_pred)
print(mse_lin_reg)
print(r2_lin_reg)

0.00352776056807124
0.9963915007868895


In [32]:
# Tensor form of the test-split cluster labels; the GDA fit in the following cell consumes it.
cluster_tensor_test = torch.tensor(cluster_preds, dtype=torch.int64)

In [33]:
# NOTE(review): both classifiers are re-created and fitted on the test split here, and the
# evaluation cell then scores them on that same split, so the reported accuracy is in-sample.
# This also replaces the svm_model / gda_model objects that were fitted on the training split.
# Confirm whether the train-fitted models should be evaluated instead (the cluster ids would
# then have to come from the same labelling that was used for training).
svm_model = SVMClassifier()
svm_model.fit(X_test, cluster_preds)
svm_preds = svm_model.predict_batch(X_test)

gda_model = GaussianDiscriminantAnalysis()
gda_model.fit(X_test_tensor, cluster_tensor_test)
# The GDA model is queried one row at a time.
gda_preds = [gda_model.predict(row) for row in X_test_tensor]

In [34]:
from sklearn.metrics import accuracy_score, classification_report

In [35]:
# Print accuracy and the per-class report for each classifier, measured against cluster_preds.
for header, predicted in (
    ("SVM Evaluation", svm_preds),
    ("\nGDA Evaluation", gda_preds),
):
    print(header)
    print("Accuracy:", accuracy_score(cluster_preds, predicted))
    print(classification_report(cluster_preds, predicted))

SVM Evaluation
Accuracy: 0.9787397540983607
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       595
           1       0.97      0.98      0.98      1135
           2       1.00      0.97      0.99       292
           3       0.99      0.98      0.99      1343
           4       0.98      0.95      0.96       539

    accuracy                           0.98      3904
   macro avg       0.98      0.98      0.98      3904
weighted avg       0.98      0.98      0.98      3904


GDA Evaluation
Accuracy: 0.9444159836065574
              precision    recall  f1-score   support

           0       0.99      0.82      0.90       595
           1       0.96      0.95      0.95      1135
           2       1.00      0.99      1.00       292
           3       0.90      1.00      0.95      1343
           4       0.97      0.92      0.94       539

    accuracy                           0.94      3904
   macro avg       0.96      0.94      0.9

In [36]:
# Predict the held-out targets with the trained transformer in eval mode.
transformer_model.eval()

# Gradients are not needed for inference.
with torch.no_grad():
    y_pred_tensor = transformer_model(X_test_tensor)  # shape: (N, 20)

# The tensor was produced under no_grad, so it converts to NumPy directly.
y_pred = y_pred_tensor.numpy()

In [37]:
# Held-out error metrics for the transformer predictions.
mse, r2 = mean_squared_error(y_test_np, y_pred), r2_score(y_test_np, y_pred)
print(
    "Transformer Evaluation",
    f"MSE: {mse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

Transformer Evaluation
MSE: 0.0017
R² Score: 0.9983
