### Problem to Solve:

This is a comparative study of different Machine Learning approaches.
We have investigated various neural network design choices and compared them on
the Kaggle Titanic Dataset: https://www.kaggle.com/datasets/yasserh/titanic-dataset.
We have compared linear classification, logistic regression, MLP (1 and 2D),
4D MLP with a residual network, and have researched and implemented the tabular autoencoder.
We have concluded that some of these designs were great at classifying our data
(Linear, Logistic, MLP),
and some are not meant to be used for this type of data,
(Tabular autoencoder).
After a thourough investigation, we have determined that MLP is the best
at classifying the Titanic data, and have deployed it into a simple geussing game.
In the game a users geuss is compared with the ML model to see if the user made the correct choice.

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    Dataset
)
from torch.optim import (Optimizer, Adam)
from torch.optim import SGD
from torch import nn

import warnings
warnings.filterwarnings('ignore')

In [4]:
filepath = "Titanic-Dataset.csv"

# Get data in pandas dataframe
df = pd.read_csv(filepath)

### Step One:

We start by cleaning up the data.

In [4]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df[['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
    
    df = df.replace("male", 0)
    df = df.replace("female", 1)
    # Replace NaN
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # Get rid of blanks, get only cabin class - shows upper to lower decks, 
    # then 1-hot encode (seperate each class to binary columns)
    # Will end up with around 8 binary columns
    # Cabin_U = unknown
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Cabin'] = df['Cabin'].str[0]
    df = pd.get_dummies(df, columns=['Cabin'])

    # 1-hot encoding for 3 binary columns repping port of embarkment
    df = pd.get_dummies(df, columns=['Embarked'])

    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)

    return df

In [5]:
cleaned_df = clean_df(df)
print(cleaned_df)

     Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Cabin_A  \
0           0       3    0  22.000000      1      0   7.2500        0   
1           1       1    1  38.000000      1      0  71.2833        0   
2           1       3    1  26.000000      0      0   7.9250        0   
3           1       1    1  35.000000      1      0  53.1000        0   
4           0       3    0  35.000000      0      0   8.0500        0   
..        ...     ...  ...        ...    ...    ...      ...      ...   
886         0       2    0  27.000000      0      0  13.0000        0   
887         1       1    1  19.000000      0      0  30.0000        0   
888         0       3    1  29.699118      1      2  23.4500        0   
889         1       1    0  26.000000      0      0  30.0000        0   
890         0       3    0  32.000000      0      0   7.7500        0   

     Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  \
0          0        0        0        0     

In [6]:
# Convert to Torch dataset with tensors
# Need to update this to split test data, ie make 10% of dataset test data, 90% is training
def make_dataset(df: pd.DataFrame) -> Dataset:
    features = df.drop(columns=['Survived'])
    
    # Ytrue
    target = df['Survived']

    x_values = torch.tensor(features.values, dtype=torch.float32)
    y_values = torch.tensor(target.values, dtype=torch.int64)
    return TensorDataset(x_values,y_values)

In [7]:
train_dataset = make_dataset(cleaned_df)

In [8]:
def make_dataloader(dataset: Dataset, batch_size:int, shuffle:bool) -> DataLoader:
    return DataLoader(dataset, batch_size, shuffle)

In [9]:
train_dataloader = make_dataloader(train_dataset, shuffle=False, batch_size=5)

### Step Two:

We start to design our various ML models to test out and explore the performance on this dataset.

In [10]:
"""

# Linear Classifier Model

"""
class LinearClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear = nn.Linear(num_features, 2)
    
    def forward(self, x):
        return self.linear(x)

In [11]:
"""

# Logistic Regression Model

"""
class LogisticRegression(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        # Makes logit
        self.linear = nn.Linear(num_features, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [12]:
"""

1 Layer MLP Classifier Model

"""
class MLPClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.output = nn.Linear(100, 2)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.output(x)
        return x

In [13]:
"""

2 Layer MLP Classifier Model

"""
class MLP2DClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(100,50)
        self.output = nn.Linear(50,2)
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.linear2(x)
        x = self.output(x)
        return x

In [14]:
"""

4 Layer Residual MLP Classifier Model

"""
class MLP4DResidualClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()

        self.act = nn.ReLU()
        self.num_features = num_features

        # Main layers
        self.linear1 = nn.Linear(num_features, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, 32)
        self.linear4 = nn.Linear(32, 16)

        # Skip connections
        self.skip1 = nn.Linear(num_features, 128)
        self.skip2 = nn.Linear(128, 64)
        self.skip3 = nn.Linear(64, 32)
        self.skip4 = nn.Linear(32, 16)
        
        self.output = nn.Linear(16,2)
    def forward(self, x):
        s1 = self.skip1(x)
        x = self.act(self.linear1(x) + s1)

        s2 = self.skip2(x)
        x = self.act(self.linear2(x) + s2)

        s3 = self.skip3(x)
        x = self.act(self.linear3(x) + s3)

        s4 = self.skip4(x)
        x = self.act(self.linear4(x) + s4)

        return self.output(x)

In [15]:
"""

Tabular Autoencoder Model

"""
class TabularAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=8):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

### Step Three:

We now begin to train and save the models

In [16]:
n_samples, n_features = train_dataset.tensors[0].shape
n_epochs = 100

In [17]:
def lin_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for (x,target) in dataloader:
            pred = model(x)
            l = loss(pred, target)

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [18]:
# Training linear
lin = LinearClassifier(n_features)
loss = nn.CrossEntropyLoss()

# May want to adjust learning rate
optimizer = SGD(lin.parameters(), lr=0.001)
# May want less epochs
lin_train(lin, loss, optimizer, train_dataloader, n_epochs)
torch.save(lin, "lin_model.mdl")

epoch 1: w = 0.085, loss = 0.23908569
epoch 11: w = 0.130, loss = 0.20649195
epoch 21: w = 0.141, loss = 0.17081665
epoch 31: w = 0.152, loss = 0.14239536
epoch 41: w = 0.164, loss = 0.12135758
epoch 51: w = 0.176, loss = 0.10577343
epoch 61: w = 0.189, loss = 0.09395583
epoch 71: w = 0.201, loss = 0.08475528
epoch 81: w = 0.212, loss = 0.07742137
epoch 91: w = 0.223, loss = 0.07145692


In [19]:
def log_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, target in dataloader:
            target = target.float().unsqueeze(1)
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [23]:
# Training Logistic
log = LogisticRegression(n_features)
loss = nn.BCELoss()

# May want to adjust learning rate
optimizer = SGD(log.parameters(), lr=0.001)
log_train(log,loss,optimizer,train_dataloader,n_epochs)
torch.save(log, "log_model.mdl")

epoch 1: w = -0.073, loss = 0.34014407
epoch 11: w = -0.191, loss = 0.29816529
epoch 21: w = -0.245, loss = 0.26943448
epoch 31: w = -0.276, loss = 0.24700762
epoch 41: w = -0.298, loss = 0.22827701
epoch 51: w = -0.315, loss = 0.21221903
epoch 61: w = -0.331, loss = 0.19829655
epoch 71: w = -0.346, loss = 0.18614033
epoch 81: w = -0.360, loss = 0.17546216
epoch 91: w = -0.372, loss = 0.16602771


In [25]:
def mlp_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, target in dataloader:
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
                
        if epoch % 10 == 0:
            w = model.linear1.weight
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [21]:
# Training MLP

loss = nn.CrossEntropyLoss()

# 1 layer
mlp = MLPClassifier(n_features)
optimizer = Adam(mlp.parameters(), lr=0.001)
mlp_train(mlp,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp, "mlp_onelayer_model.mdl")

epoch 1: w = 0.014, loss = 0.31397393
epoch 11: w = -0.086, loss = 0.12209155
epoch 21: w = -0.146, loss = 0.09382787
epoch 31: w = -0.183, loss = 0.08696169
epoch 41: w = -0.212, loss = 0.07449991
epoch 51: w = -0.261, loss = 0.05678044
epoch 61: w = -0.343, loss = 0.04736802
epoch 71: w = -0.407, loss = 0.03767274
epoch 81: w = -0.439, loss = 0.03173331
epoch 91: w = -0.495, loss = 0.03259815


In [26]:
# 2 layers
mlp2 = MLP2DClassifier(n_features)
optimizer = Adam(mlp2.parameters(), lr=0.001)
mlp_train(mlp2,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp2, "mlp_twolayer_model.mdl")

ValueError: Using a target size (torch.Size([5])) that is different to the input size (torch.Size([5, 2])) is deprecated. Please ensure they have the same size.

In [27]:
# 4 layers residual
mlp4 = MLP4DResidualClassifier(n_features)
optimizer = Adam(mlp4.parameters(), lr=0.001)
mlp_train(mlp4,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp4, "mlp_fourlayer_model.mdl")

ValueError: Using a target size (torch.Size([5])) that is different to the input size (torch.Size([5, 2])) is deprecated. Please ensure they have the same size.

In [28]:
def ta_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, _ in dataloader:
            x_prime = x[0]
            
            pred = model(x)
            l = loss(pred, x_prime)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
                
        if epoch % 10 == 0:
            print(f'epoch {epoch + 1}: loss = {l:.8f}')

In [29]:
ta = TabularAutoencoder(input_dim = n_features, latent_dim = 8)
optimizer = Adam(ta.parameters(), lr=1e-3)
loss = nn.MSELoss()
ta_train(ta, loss, optimizer, train_dataloader, n_epochs)


epoch 1: loss = 9.08781910
epoch 11: loss = 30.30890656
epoch 21: loss = 30.91572762
epoch 31: loss = 27.79380608
epoch 41: loss = 26.39601898
epoch 51: loss = 25.52437973
epoch 61: loss = 24.55498505
epoch 71: loss = 23.73088646
epoch 81: loss = 22.80644989
epoch 91: loss = 21.95965004
