# Neural Network Classification

#### Tabular Data Classification with PyTorch

***

Table of contents:

0. Imports
1. Setup Device
2. Load and Prepare Data
3. Create Dataset Class
4. Build the Neural Network Model
5. Create Visualization Functions
6. Training and Evaluation Function
7. Run the Model and show results

### 0. Imports

In [16]:
import pandas as pd
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

from helpMethods.data import to_dataloader, train_val_split
from helpMethods.training import fit
from helpMethods.utils import get_device

### 1. Setup Device

If available, GPU will be used to speed up training.


In [17]:
DEVICE = get_device()


Running pytorch version (2.5.1) with backend = mps


### 2. Load and Prepare Data

We load the dataset using the shared `data_preparation` function. The function encodes target column, one-hot encodes categorical features, and checks for class imbalance. It also drops irrelevant columns and converts boolean columns to integers. The dataset is then split into features and target variable.


In [18]:
# Load the dataset

# if full dataset, uncomment the line below
# df = pd.read_csv('Data/cleaned_data.csv')

# if small sample dataset, uncomment the line below
df = pd.read_csv('../Data/top_1000_cleaned.csv')

def data_preparation(dataframe):
    dataframe = pd.get_dummies(dataframe, columns=['Medlemstype'], drop_first=True)
    dataframe = pd.get_dummies(dataframe, columns=['Region'], drop_first=True)
    dataframe['Aktiv_Deltager'] = dataframe['Aktiv_Deltager'].map({'Ja': 1, 'Nej': 0})

    # Balancing the dataset
    df_majority = dataframe[dataframe['Aktiv_Deltager'] == 0]
    df_minority = dataframe[dataframe['Aktiv_Deltager'] == 1]
    df_minority_upsampled = df_minority.sample(len(df_majority), replace=True, random_state=42)
    dataframe = pd.concat([df_majority, df_minority_upsampled])
    dataframe = dataframe.sample(frac=1, random_state=42)  # Shuffle the dataset

    # Splitting the data into features and target variable
    features = dataframe.drop(columns=[
        'Aktiv_Deltager',
        'Kontakt_ID',
        'Kontakt_OK',
        'Antal_Aktiv',
        'Status_Aarsag',
        'Status',
        'Startdato',

        # remove below comment, optional case
        # 'Antal'
    ])
    features = features.astype({col: int for col in features.select_dtypes(bool).columns})
    target = dataframe['Aktiv_Deltager']

    return features, target

### 3. Create Dataset Class

We define a custom PyTorch Dataset class for our tabular data. The reason for this is to allow PyTorch to handle the data in a way that is compatible with its DataLoader, which is used for batching and shuffling the data during training.


In [19]:
class TabularDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

### 4. Build the Neural Network Model

We define a neural network with multiple fully connected layers for binary classification. The model consists of three fully connected layers with ReLU activations and dropout for regularization. The final layer outputs two classes, for whether the user is likely to sell lottery tickets or not.


In [20]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(64, 32)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(32, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x

### 5. Create visualization functions

We define functions to plot the training and validation loss and accuracy curves. These plots help us visualize the model's performance over epochs, allowing us to identify potential overfitting or underfitting.

In [None]:
def plot_loss(history: dict):

    plt.figure(figsize=(8, 5))
    plt.plot(history["train_loss"], label="Train Loss")
    plt.plot(history["val_loss"], label="Validation Loss")
    plt.title("Loss Curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)

    plt.savefig("Visualisations/Loss_curve.png", dpi=300, bbox_inches='tight')

def plot_accuracy(history: dict):

    plt.figure(figsize=(8, 5))
    plt.plot(history["train_acc"], label="Train Accuracy")
    plt.plot(history["val_acc"], label="Validation Accuracy")
    plt.title("Accuracy Curve")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy (%)")
    plt.legend()
    plt.grid(True)

    plt.savefig("Visualisations/Accuracy_curve.png", dpi=300, bbox_inches='tight')

### 6. Training and Evaluation Function

This function handles the entire process of preparing data, training the model, evaluating and visualizing it.


In [21]:
def train_nn_model(data):
    # Data preparation
    features, target = data_preparation(data)

    # Splitting the data into training, validation, and test sets
    X_tensor = torch.FloatTensor(features.values)
    y_tensor = torch.LongTensor(target.values)

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X_tensor, y_tensor, test_size=0.2, stratify=y_tensor, random_state=42
    )

    train_set = TabularDataset(X_train_val, y_train_val)
    test_set = TabularDataset(X_test, y_test)
    train_set, val_set = train_val_split(train_set, val_ratio=0.2, seed=42)

    train_loader = to_dataloader(train_set, batch_size=64, shuffle=True)
    val_loader = to_dataloader(val_set, batch_size=64, shuffle=False)
    test_loader = to_dataloader(test_set, batch_size=64, shuffle=False)

    model = NeuralNet(X_tensor.shape[1]).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss()

    model, history = fit(model, train_loader, val_loader, DEVICE, optimizer, loss_fn, 10)

    y_true, y_pred, y_prob = [], [], []
    model.eval()
    with torch.no_grad():
        for features_batch, labels_batch in test_loader:
            features_batch, labels_batch = features_batch.to(DEVICE), labels_batch.to(DEVICE)
            out = model(features_batch)
            _, predictions = torch.max(out, 1)
            probabilities = torch.softmax(out, dim=1)
            y_true.extend(labels_batch.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
            y_prob.extend(probabilities[:, 1].cpu().numpy())

    plot_loss(history)
    plot_accuracy(history)
    report = classification_report(y_true, y_pred, output_dict=True)
    auc = roc_auc_score(y_true, y_prob) if len(set(y_true)) == 2 else None

    return {
        'accuracy': report['accuracy'],
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1': report['1']['f1-score'],
        'auc': auc
    }

### 7. Run the Model

At last, we'll run the model and display the results.


In [22]:
# Run the model and print the results
print(train_nn_model(df))

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.