In [1]:
from tqdm import tqdm
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

We will be looking at a two-layer neural network (three columns of nodes: input, hidden, and output). The input size is the number of nodes feeding into the first of the two layers. (In a deeper network, say six layers, every layer has its own input size, depending on which layer you are looking at.)

In [2]:
class BasicMLP(nn.Module):
    """
    Feedforward network with a single hidden layer.

    The two linear layers are stored as separate attributes (``fc1`` and
    ``fc2``) and wired together by hand in ``forward``.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # input -> hidden
        self.fc1 = nn.Linear(input_size, hidden_size)
        # hidden -> output (raw scores, no activation applied here)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # ReLU is the standard non-linear activation; alternatives such as
        # SELU or GELU exist but are not used here.
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return logits

# The class above wires the layers together by hand; the class below builds the same network with nn.Sequential, which is more convenient when there are many layers.

class BasicMLP2(nn.Module):
    """
    Feedforward network with a single hidden layer, assembled with nn.Sequential.

    The whole stack (linear -> ReLU -> linear) lives in the single attribute ``fc``.
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(inplace=True),  # overwrites the first layer's output in place
            nn.Linear(hidden_size, output_size),
        ]
        # The * unpacks the list into positional arguments for nn.Sequential.
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.fc(x)
        return logits
    

class BasicMLP3(nn.Module):
    """
    Feedforward network with a configurable number of hidden layers.

    Layout: (Linear -> ReLU) input block, then ``n_hidden_layers`` repetitions of
    (Linear -> ReLU) of width ``hidden_size``, then a final Linear output layer.
    """
    def __init__(self, input_size, hidden_size, output_size, n_hidden_layers):
        super().__init__()
        first_linear = nn.Linear(input_size, hidden_size)
        self.input_layer = nn.Sequential(first_linear, nn.ReLU(inplace=True))
        self.hidden_layers = self._make_hidden_layers(n_hidden_layers, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)

    def _make_hidden_layers(self, n_hidden_layers, hidden_size):
        """Build ``n_hidden_layers`` (Linear, ReLU) pairs and chain them in one nn.Sequential."""
        blocks = [
            module
            for _ in range(n_hidden_layers)
            for module in (nn.Linear(hidden_size, hidden_size), nn.ReLU(inplace=True))
        ]
        # The * is plain Python: it unpacks the list into positional arguments.
        return nn.Sequential(*blocks)

    def forward(self, x):
        features = self.input_layer(x)
        features = self.hidden_layers(features)
        return self.output_layer(features)


In [13]:
# --- Preprocessing recipe (declared up front; fitted further down) ---
numerical_cols = ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']
categorical_cols = ['Sex', 'Pclass']

# Numeric columns: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# --- Load the data and separate features from the target ---
df = pd.read_csv("https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv")
X = df.drop(columns=["Survived", "Name"])
y = df["Survived"]

# --- Split into training, validation, and test sets ---
# First hold out 10% for testing, then take 1/9 of the remaining 90%
# (i.e. another 10% of the full data) for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y.values, test_size=0.1, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, test_size=1/9, random_state=42
)

# Fit the preprocessor on the training split only, then reuse the fitted
# transformation for the validation and test splits.
X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)
X_test = preprocessor.transform(X_test)

In PyTorch, we can define custom datasets by subclassing `torch.utils.data.Dataset` and implementing the `__len__` and `__getitem__` methods. In our case this might look unnecessarily complicated, but it will be useful when we want to work with more complex datasets such as image data. Moreover, we can use the `torch.utils.data.DataLoader` class to load the data in batches and shuffle it.

In [14]:
class TitanicDataset(Dataset):
    """
    Prepare the Titanic dataset for classification.

    Args:
        X: Feature matrix with one row per sample, as a NumPy array or a torch
            tensor. Stored as a float32 tensor.
        y: Labels, one per row of ``X``, as a NumPy array or a torch tensor.
            The dtype is kept as given.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # torch.as_tensor accepts NumPy arrays and tensors alike, so the
        # attributes are always set regardless of which kind is passed in
        # (including one of each).
        self.X = torch.as_tensor(X).float()
        self.y = torch.as_tensor(y)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # A dict keeps the fields self-describing; returning the tuple
        # (self.X[i], self.y[i]) would work just as well.
        # For datasets that do not fit in memory (e.g. images), this method
        # could load each sample on demand instead of indexing a stored tensor.
        return {"X": self.X[i], "label": self.y[i]}

In [15]:
def evaluation_metrics(y_true: np.ndarray, y_preds: np.ndarray):
    """
    Score predictions against the ground truth.

    Args:
        y_true: Ground-truth labels.
        y_preds: Predicted labels, aligned with ``y_true``.

    Returns:
        A ``(metrics, conf_matrix)`` tuple: ``metrics`` is a dict with the keys
        ``accuracy``, ``f1_score``, ``precision`` and ``recall``; ``conf_matrix``
        is the confusion matrix computed over the labels ``[0, 1]``.
    """
    # TODO: Add generalised labels
    conf_matrix = confusion_matrix(y_true, y_preds, labels=[0, 1])

    # zero_division=0.0 makes an undefined score evaluate to 0.0.
    metrics = {
        "accuracy": accuracy_score(y_true, y_preds),
        "f1_score": f1_score(y_true, y_preds, zero_division=0.0),
        "precision": precision_score(y_true, y_preds, zero_division=0.0),
        "recall": recall_score(y_true, y_preds, zero_division=0.0),
    }
    return metrics, conf_matrix


@torch.inference_mode()
def perform_inference(model, dataloader, device: str, loss_fn=None):
    """
    Run ``model`` over every batch of ``dataloader`` on ``device`` and collect predictions.

    Each batch must be a dict with an ``"X"`` entry (model inputs). A ``"label"``
    entry is required when ``loss_fn`` is given and optional otherwise.

    Args:
        model: The network to evaluate. Its train/eval mode is restored on exit.
        dataloader: Iterable yielding batch dicts as described above.
        device: Device the inputs (and labels) are moved to.
        loss_fn: Optional callable ``loss_fn(outputs, labels)``; when given, the
            loss is computed per batch.

    Returns:
        A tuple ``(y_true, y_preds, loss)``:
            * ``y_true`` - concatenated labels, or ``None`` if no batch carried labels.
            * ``y_preds`` - concatenated predicted class indices (argmax over dim 1).
            * ``loss`` - mean of the per-batch losses, or ``None`` if ``loss_fn`` is ``None``.
    """
    was_training = model.training
    # Evaluation mode: dropout is disabled and batch-norm layers use their running statistics.
    model.eval()
    y_preds = []
    y_true = []
    losses = []

    print("[inference.py]: Running inference...")
    try:
        for batch in tqdm(dataloader):
            inputs = batch["X"].to(device)
            outputs = model(inputs)
            if loss_fn is not None:
                labels = batch["label"].to(device)
                losses.append(loss_fn(outputs, labels).item())
                y_true.append(labels.cpu().numpy())
            elif "label" in batch:
                # No loss requested, but still hand the labels back so callers can score the predictions.
                y_true.append(batch["label"].cpu().numpy())

            # Softmax is monotonic, so taking the argmax of the raw outputs picks the same class.
            y_preds.append(outputs.argmax(dim=1).cpu().numpy())
    finally:
        # Put the model back into whichever mode the caller had it in.
        model.train(was_training)

    # np.concatenate raises on an empty list, so guard the "no batches" / "no labels" cases.
    y_preds = np.concatenate(y_preds) if y_preds else np.empty(0, dtype=np.int64)
    y_true = np.concatenate(y_true) if y_true else None
    return y_true, y_preds, np.mean(losses) if losses else None

We will see more uses of Dataset and DataLoader in the future.

In [None]:
# Create datasets and dataloaders
train_dataset = TitanicDataset(X_train, y_train)
valid_dataset = TitanicDataset(X_valid, y_valid)
test_dataset = TitanicDataset(X_test, y_test)

train_dataloader = DataLoader(train_dataset, batch_size=32, num_workers=4, shuffle=True)  # Shuffle the training data on each epoch
valid_dataloader = DataLoader(valid_dataset, batch_size=128, num_workers=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=4, shuffle=False)

# Training configuration and bookkeeping
output_dir = "./models/"
os.makedirs(os.path.join(output_dir, "saved_models"), exist_ok=True)
n_epochs = 50
batches_done = 0
best_loss_val = float("inf")  # lowest validation loss seen so far
epoch_metrics = {}  # epoch index -> losses and metrics for that epoch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = len(y.unique())
model = BasicMLP(X_train.shape[1], 128, num_classes).to(device)
optimiser = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss().to(device)


print("[train.py]: Starting Training...")
for epoch in tqdm(range(n_epochs)):
    model.train()  # make sure training mode is on, whatever the previous evaluation left behind
    # Per-epoch accumulators. These deliberately do not reuse the name `y_train`:
    # rebinding it would replace the training labels that `train_dataset` is built
    # from if this cell is run again.
    y_preds = []
    y_train_epoch = []
    losses = []
    for data in tqdm(train_dataloader, leave=False):
        inputs, labels = data["X"].to(device), data["label"].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimiser.step()
        optimiser.zero_grad(set_to_none=True)

        # Softmax is monotonic, so the argmax of the raw outputs is the predicted class.
        preds = outputs.argmax(dim=1)
        y_preds.append(preds.cpu().numpy())
        y_train_epoch.append(labels.cpu().numpy())
        losses.append(loss.item())

        batches_done += 1

    loss_train = torch.tensor(losses).mean().item()
    y_train_epoch, y_preds = np.concatenate(y_train_epoch), np.concatenate(y_preds)
    train_metrics, train_conf_matrix = evaluation_metrics(y_train_epoch, y_preds)

    y_val, y_preds_val, loss_val = perform_inference(model, valid_dataloader, device, loss_fn)
    val_metrics, val_conf_matrix = evaluation_metrics(y_val, y_preds_val)

    print(f"EPOCH: {epoch}")
    print(
        f'[TRAINING METRICS] Loss: {loss_train} | Accuracy: {train_metrics["accuracy"]} | '
        f'F1: {train_metrics["f1_score"]} | Precision: {train_metrics["precision"]} | Recall: {train_metrics["recall"]}'
    )
    print(
        f'[VALIDATION METRICS] Loss: {loss_val} | Accuracy: {val_metrics["accuracy"]} | '
        f'F1: {val_metrics["f1_score"]} | Precision: {val_metrics["precision"]} | Recall: {val_metrics["recall"]}'
    )

    # Save the model checkpoint if the validation loss is the best we've seen so far
    if loss_val < best_loss_val:
        # Stored as a plain float so the checkpoint carries a built-in number
        # rather than a NumPy scalar.
        best_loss_val = float(loss_val)
        print(f"[train.py]: Found new best model at epoch {epoch}. Saving model...")
        # Build the checkpoint after updating best_loss_val so the saved value
        # belongs to the weights being saved.
        checkpoint = {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimiser_state_dict": optimiser.state_dict(),
            "best_loss_val": best_loss_val,
        }
        torch.save(
            checkpoint,
            os.path.join(output_dir, "saved_models", "best_model.pt"),
        )

    epoch_metrics[epoch] = {
        "train_loss": loss_train,
        "train_metrics": train_metrics,
        "valid_loss": loss_val,
        "valid_metrics": val_metrics,
    }

## Plot metrics

In [None]:
# One row per epoch; the dict keys (epoch numbers) become the index.
epoch_metrics_df = pd.DataFrame.from_dict(epoch_metrics, orient="index")

sns.set_style("whitegrid")
# Draw both curves on one explicit axes so they can share labels and a title.
fig, ax = plt.subplots()
sns.lineplot(data=epoch_metrics_df, x=epoch_metrics_df.index, y="train_loss", label="Training Loss", ax=ax)
sns.lineplot(data=epoch_metrics_df, x=epoch_metrics_df.index, y="valid_loss", label="Validation Loss", ax=ax)
# Without this, the y-axis label is taken from a single column name although both losses are shown.
ax.set(xlabel="Epoch", ylabel="Loss", title="Training vs. validation loss")
plt.show()

Try training a bigger model (`BasicMLP3` with more hidden layers) and see if you can get better results. 