In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

# Load dataset
data_bool = pd.read_parquet("data.parquet")

# Separate features and labels
y = data_bool.iloc[:, 0].values  # First column = labels
X = data_bool.iloc[:, 1:].values  # Remaining = features

# Normalize features to N(0, I)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data: Train (70%) | Validation (10%) | Test (20%)
X_temp, X_test, y_temp, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=10)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.125, random_state=10) 

print("Shapes:")
print("Train:", X_train.shape)
print("Validation:", X_val.shape)
print("Test:", X_test.shape)

# Define MLP model
class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 2000)
        self.bn1 = nn.BatchNorm1d(2000)
        self.dropout1 = nn.Dropout(0.02)
        self.fc2 = nn.Linear(2000, 200)
        self.bn2 = nn.BatchNorm1d(200)
        self.dropout2 = nn.Dropout(0.02)
        self.fc3 = nn.Linear(200, 20)
        self.bn3 = nn.BatchNorm1d(20)
        self.dropout3 = nn.Dropout(0.02)
        self.fc4 = nn.Linear(20, 1)
        self.sigmoid = nn.Sigmoid()
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01)

    def forward(self, x):
        x = self.leaky_relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.leaky_relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.leaky_relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        x = self.sigmoid(self.fc4(x)).squeeze()
        return x

# Train model with validation
def train_mlp(X_train, y_train, X_val, y_val, epochs=300, batch_size=64, lr=0.0003, l1_lambda=1e-5):
    model = MLP(X_train.shape[1])
    model.train()
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.5)

    # DataLoaders
    train_loader = DataLoader(TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                            torch.tensor(y_train, dtype=torch.float32)),
                              batch_size=batch_size, shuffle=True)
    
    val_tensor = torch.tensor(X_val, dtype=torch.float32)
    val_labels = torch.tensor(y_val, dtype=torch.float32)

    for epoch in range(epochs):
        model.train()
        all_preds, all_labels = [], []

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)

            l1_reg = sum(torch.norm(param, 1) for name, param in model.named_parameters() if "weight" in name)
            loss = criterion(outputs, batch_y) + l1_lambda * l1_reg

            loss.backward()
            optimizer.step()

            preds = (outputs.detach().cpu().numpy() > 0.5).astype(int)
            all_preds.extend(preds)
            all_labels.extend(batch_y.cpu().numpy().astype(int))

        train_acc = accuracy_score(all_labels, all_preds)

        # Validation accuracy
        model.eval()
        with torch.no_grad():
            val_preds = (model(val_tensor).cpu().numpy() > 0.5).astype(int)
            val_acc = accuracy_score(y_val, val_preds)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} | Loss: {loss.item():.4f}")

    return model

# Train the model
model = train_mlp(X_train, y_train, X_val, y_val)
torch.save(model.state_dict(), "mlp_model.pth")
print("Model saved as mlp_model.pth")

# Test evaluation
model.eval()
with torch.no_grad():
    test_preds = (model(torch.tensor(X_test, dtype=torch.float32)).numpy() > 0.5).astype(int)
    test_acc = accuracy_score(y_test, test_preds)
    print(f"Final Test Accuracy: {test_acc:.4f}")



Shapes:
Train: (6544, 19887)
Validation: (935, 19887)
Test: (1870, 19887)
Epoch 10/300 - Train Acc: 0.9783 | Val Acc: 0.9754 | Loss: 2.0318
Epoch 20/300 - Train Acc: 0.9921 | Val Acc: 0.9882 | Loss: 1.8075
Epoch 30/300 - Train Acc: 0.9940 | Val Acc: 0.9925 | Loss: 1.6284
Epoch 40/300 - Train Acc: 0.9951 | Val Acc: 0.9925 | Loss: 1.5886
Epoch 50/300 - Train Acc: 0.9960 | Val Acc: 0.9925 | Loss: 1.6386
Epoch 60/300 - Train Acc: 0.9972 | Val Acc: 0.9925 | Loss: 1.6093
Epoch 70/300 - Train Acc: 0.9977 | Val Acc: 0.9936 | Loss: 1.5367
Epoch 80/300 - Train Acc: 0.9976 | Val Acc: 0.9925 | Loss: 1.5115
Epoch 90/300 - Train Acc: 0.9988 | Val Acc: 0.9936 | Loss: 1.5094
Epoch 100/300 - Train Acc: 0.9991 | Val Acc: 0.9904 | Loss: 1.4964
Epoch 110/300 - Train Acc: 0.9992 | Val Acc: 0.9936 | Loss: 1.4982
Epoch 120/300 - Train Acc: 0.9997 | Val Acc: 0.9914 | Loss: 1.4810
Epoch 130/300 - Train Acc: 0.9997 | Val Acc: 0.9925 | Loss: 1.4766
Epoch 140/300 - Train Acc: 0.9995 | Val Acc: 0.9936 | Loss: 1.47