In [31]:
# Import packages

import random
import numpy as np
import pandas as pd
from typing import Tuple
from pandas.api.types import CategoricalDtype


import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.preprocessing import StandardScaler


In [32]:
# Load data
# Directory holding the CSV inputs, resolved relative to the kernel's working directory.
# Change it here once instead of editing every read below.
DATA_DIR = "../../data"

X_trn = pd.read_csv(f"{DATA_DIR}/X_trn.csv")    # training features
y_trn = pd.read_csv(f"{DATA_DIR}/y_trn.csv")    # training targets
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")  # test features

In [33]:
# Set seed
SEED = 2025

# Seed each global RNG in turn: PyTorch, NumPy's legacy global generator, Python's `random`
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(SEED)

In [34]:
# MDN hyperparameters
# N_MIX, HIDDEN and N_HIDDEN_LAYERS are the default arguments of `MDN.__init__`
# (bound when the class is defined); ACTIVATION is looked up each time an MDN is built.
N_MIX = 6                # default `n_mixtures`: output width of the logits / means / logvars heads
HIDDEN = 128             # default `hidden`: width of every hidden Linear layer
N_HIDDEN_LAYERS = 2      # default `n_layers`: number of Linear + activation pairs in `MDN.net`
ACTIVATION = nn.ReLU     # activation class (not an instance); instantiated once per hidden layer

In [35]:
# Training hyperparameters
# NOTE(review): none of these names is referenced in the cells visible in this section;
# the per-line notes are inferred from the names only — confirm against the training loop.
BATCH_SIZE = 256      # presumably the DataLoader mini-batch size
LR = 1e-3             # presumably the optimizer learning rate
WEIGHT_DECAY = 1e-5   # presumably the optimizer weight-decay coefficient
EPOCHS = 60           # presumably the number of passes over the training split
VAL_FRAC = 0.15       # presumably the fraction of training rows held out for validation
CLIP_GRAD = None      # presumably a gradient-clipping threshold, with None meaning "off" — TODO confirm

In [None]:
# Preprocessing

def load_and_preprocess() -> Tuple[np.ndarray, np.ndarray, np.ndarray, "StandardScaler"]:
    """One-hot encode, standardise and convert the notebook's frames to float32 arrays.

    Takes no arguments: it reads the module-level ``X_trn``, ``y_trn`` and ``X_test``
    data frames and does not modify them.

    Steps
    -----
    1. Stack train and test features so ``pd.get_dummies`` produces the same dummy
       columns for both.
    2. One-hot encode every object, pandas-string or categorical column
       (``drop_first=False``, i.e. one column per level).
    3. Split the encoded frame back into train / test by row position.
    4. Fit a ``StandardScaler`` on the *train* rows of the remaining (non-categorical)
       columns and apply it to both splits. Dummy columns are left unscaled.

    Returns
    -------
    X_trn_np : np.ndarray, float32
        Encoded and scaled training features.
    y_trn_np : np.ndarray, float32
        ``y_trn`` flattened to 1-D with ``ravel()``.
    X_test_np : np.ndarray, float32
        Encoded and scaled test features (same columns as ``X_trn_np``).
    scaler : StandardScaler
        The scaler used for the non-categorical columns. It is left unfitted when
        there are no such columns.
    """

    def _is_categorical(dtype) -> bool:
        # `dtype == "object"` alone misses pandas' dedicated string dtype, which would
        # then be handed to the scaler and fail; `is_string_dtype` covers that case.
        return (
            dtype == "object"
            or isinstance(dtype, CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    # Combine to ensure consistent one-hot encoding
    all_X = pd.concat([X_trn, X_test], axis=0, ignore_index=True)

    cat_cols = [c for c in all_X.columns if _is_categorical(all_X[c].dtype)]
    num_cols = [c for c in all_X.columns if c not in cat_cols]

    if len(cat_cols) > 0:
        all_X = pd.get_dummies(all_X, columns=cat_cols, drop_first=False)

    # Split data after one-hot encoding (reset_index returns new frames, so the
    # column assignments below do not write into `all_X`)
    n_trn = len(X_trn)
    X_trn_enc = all_X.iloc[:n_trn].reset_index(drop=True)
    X_test_enc = all_X.iloc[n_trn:].reset_index(drop=True)

    # Scale numerical data using statistics from the training rows only.
    # StandardScaler rejects a frame with zero columns, so skip scaling when every
    # feature is categorical.
    scaler = StandardScaler()
    if num_cols:
        X_trn_enc[num_cols] = scaler.fit_transform(X_trn_enc[num_cols])
        X_test_enc[num_cols] = scaler.transform(X_test_enc[num_cols])

    # Transform to numpy arrays
    X_trn_np = X_trn_enc.values.astype(np.float32)
    y_trn_np = y_trn.values.astype(np.float32).ravel()
    X_test_np = X_test_enc.values.astype(np.float32)

    return X_trn_np, y_trn_np, X_test_np, scaler

# Run the preprocessing once and keep the arrays and the scaler as notebook-level variables
X_trn_np, y_trn_np, X_test_np, scaler = load_and_preprocess()
# Shape report: train features, train targets, then test features
print("Train:", X_trn_np.shape, y_trn_np.shape, " Test:", X_test_np.shape)

Train: (31607, 35) (31607,)  Test: (5578, 35)


In [None]:
# Define the MDN (mixture density network) architecture

class MDN(nn.Module):
    """Mixture density network: a shared MLP (`net`) feeding three linear heads.

    `net` is `n_layers` repetitions of ``Linear -> ACTIVATION()``. Each head maps the
    output of `net` to `n_mixtures` values: mixture logits, component means and
    component log-variances.
    """

    def __init__(self, input_dim, n_mixtures=N_MIX, hidden=HIDDEN, n_layers=N_HIDDEN_LAYERS):
        super().__init__()

        # Feature widths through the shared MLP: input_dim, then `hidden` per hidden layer
        widths = [input_dim] + [hidden] * n_layers

        trunk = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            trunk.extend((nn.Linear(fan_in, fan_out), ACTIVATION()))
        self.net = nn.Sequential(*trunk)

        # With no hidden layers `net` is empty and the heads read the raw input
        head_in = widths[-1]
        self.logits = nn.Linear(head_in, n_mixtures)   # unnormalised mixture weights
        self.means = nn.Linear(head_in, n_mixtures)    # component means
        self.logvars = nn.Linear(head_in, n_mixtures)  # component log-variances

    def forward(self, x):
        """Return the tuple ``(logits, means, logvars)`` computed from `x`."""
        features = self.net(x)
        return self.logits(features), self.means(features), self.logvars(features)

In [None]:
# Define the MDN negative log-likelihood (negated because PyTorch optimizers minimize the loss)
def mdn_nll(y, logits, means, logvars, eps=1e-8):
    """Mean negative log-likelihood of `y` under a univariate Gaussian mixture.

    For each row, the density is ``sum_k softmax(logits)_k * N(y; means_k, var_k)``
    with ``var_k = exp(logvars_k) + eps``. The return value is minus the mean of the
    per-row log-density.

    Parameters
    ----------
    y : torch.Tensor
        Targets. A 1-D tensor is turned into a column (``unsqueeze(1)``); in either
        case it must be expandable to the shape of `means`.
    logits, means, logvars : torch.Tensor
        Mixture parameters with the mixture components along dim 1.
    eps : float, default 1e-8
        Non-negative floor added to each variance.

    Returns
    -------
    torch.Tensor
        Scalar loss.

    Notes
    -----
    The variance term is evaluated in log space. Computing ``exp(logvars)`` first and
    taking its log afterwards overflows to ``inf`` for large `logvars` (about 89 in
    float32) and makes the loss infinite; ``logaddexp`` yields the same quantity,
    ``log(exp(logvars) + eps)``, without forming the exponential.
    """
    import math  # local import: `math` is not among the notebook's top-level imports

    # Set the right dimensions of the Tensor
    if y.dim() == 1:
        y = y.unsqueeze(1)

    log_weights = torch.log_softmax(logits, dim=1)

    # log(exp(logvars) + eps); log(eps) is -inf when eps == 0, which leaves logvars as-is
    log_eps = torch.full_like(logvars, eps).log()
    log_var = torch.logaddexp(logvars, log_eps)

    sq = (y.expand_as(means) - means) ** 2

    # Per-component Gaussian log-density: -0.5 * (log(2*pi*var) + (y - mean)^2 / var)
    comp_logprob = -0.5 * (math.log(2.0 * math.pi) + log_var + sq * torch.exp(-log_var))

    # Mix in log space, then average over rows
    log_prob = torch.logsumexp(log_weights + comp_logprob, dim=1)
    return -log_prob.mean()