### Problem to Solve:

This is a comparative study of different Machine Learning approaches.
We have investigated various neural network design choices and compared them on
the Kaggle Titanic Dataset: https://www.kaggle.com/datasets/yasserh/titanic-dataset.
We have compared linear classification, logistic regression, MLP (1 and 2D),
4D MLP with a residual network, and have researched and implemented the tabular autoencoder.
We have concluded that some of these designs were great at classifying our data
(Linear, Logistic, MLP),
and some are not meant to be used for this type of data,
(Tabular autoencoder).
After a thourough investigation, we have determined that MLP is the best
at classifying the Titanic data, and have deployed it into a simple geussing game.
In the game a users geuss is compared with the ML model to see if the user made the correct choice.

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    Dataset
)
from torch.optim import (Optimizer, Adam)
from torch.optim import SGD
from torch import nn

import warnings
warnings.filterwarnings('ignore')

In [2]:
filepath = "Titanic-Dataset.csv"

# Get data in pandas dataframe
df = pd.read_csv(filepath)

### Step One:

We start by cleaning up the data.

In [3]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df[['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
    
    df = df.replace("male", 0)
    df = df.replace("female", 1)
    # Replace NaN
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # Get rid of blanks, get only cabin class - shows upper to lower decks, 
    # then 1-hot encode (seperate each class to binary columns)
    # Will end up with around 8 binary columns
    # Cabin_U = unknown
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Cabin'] = df['Cabin'].str[0]
    df = pd.get_dummies(df, columns=['Cabin'])

    # 1-hot encoding for 3 binary columns repping port of embarkment
    df = pd.get_dummies(df, columns=['Embarked'])

    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)

    return df

In [4]:
cleaned_df = clean_df(df)
print(cleaned_df)

     Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Cabin_A  \
0           0       3    0  22.000000      1      0   7.2500        0   
1           1       1    1  38.000000      1      0  71.2833        0   
2           1       3    1  26.000000      0      0   7.9250        0   
3           1       1    1  35.000000      1      0  53.1000        0   
4           0       3    0  35.000000      0      0   8.0500        0   
..        ...     ...  ...        ...    ...    ...      ...      ...   
886         0       2    0  27.000000      0      0  13.0000        0   
887         1       1    1  19.000000      0      0  30.0000        0   
888         0       3    1  29.699118      1      2  23.4500        0   
889         1       1    0  26.000000      0      0  30.0000        0   
890         0       3    0  32.000000      0      0   7.7500        0   

     Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  \
0          0        0        0        0     

In [5]:
# Convert to Torch dataset with tensors
# Need to update this to split test data, ie make 10% of dataset test data, 90% is training
def make_dataset(df: pd.DataFrame) -> Dataset:
    features = df.drop(columns=['Survived'])
    
    # Ytrue
    target = df['Survived']

    x_values = torch.tensor(features.values, dtype=torch.float32)
    y_values = torch.tensor(target.values, dtype=torch.int64)
    return TensorDataset(x_values,y_values)

In [6]:
train_dataset = make_dataset(cleaned_df)

In [7]:
def make_dataloader(dataset: Dataset, batch_size:int, shuffle:bool) -> DataLoader:
    return DataLoader(dataset, batch_size, shuffle)

In [8]:
train_dataloader = make_dataloader(train_dataset, shuffle=False, batch_size=5)

### Step Two:

We start to design our various ML models to test out and explore the performance on this dataset.

In [9]:
"""

# Linear Classifier Model

"""
class LinearClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear = nn.Linear(num_features, 2)
    
    def forward(self, x):
        return self.linear(x)

In [10]:
"""

# Logistic Regression Model

"""
class LogisticRegression(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        # Makes logit
        self.linear = nn.Linear(num_features, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [11]:
"""

1 Layer MLP Classifier Model

"""
class MLPClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.output = nn.Linear(100, 2)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.output(x)
        return x

In [23]:
"""

2 Layer MLP Classifier Model

"""
class MLP2DClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(100, 50)
        self.output = nn.Linear(50, 2)
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.linear2(x)
        x = self.output(x)
        return x

In [24]:
"""

4 Layer Residual MLP Classifier Model

"""
class MLP4DResidualClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()

        self.act = nn.ReLU()
        self.num_features = num_features

        # Main layers
        self.linear1 = nn.Linear(num_features, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, 32)
        self.linear4 = nn.Linear(32, 16)

        # Skip connections
        self.skip1 = nn.Linear(num_features, 128)
        self.skip2 = nn.Linear(128, 64)
        self.skip3 = nn.Linear(64, 32)
        self.skip4 = nn.Linear(32, 16)
        
        self.output = nn.Linear(16, 2)
        
    def forward(self, x):
        s1 = self.skip1(x)
        x = self.act(self.linear1(x) + s1)

        s2 = self.skip2(x)
        x = self.act(self.linear2(x) + s2)

        s3 = self.skip3(x)
        x = self.act(self.linear3(x) + s3)

        s4 = self.skip4(x)
        x = self.act(self.linear4(x) + s4)

        return self.output(x)

In [25]:
"""

Tabular Autoencoder Model

"""
class TabularAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=8):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

### Step Three:

We now begin to train and save the models

In [26]:
n_samples, n_features = train_dataset.tensors[0].shape
n_epochs = 100

In [27]:
def lin_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for (x,target) in dataloader:
            pred = model(x)
            l = loss(pred, target)

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [28]:
# Training linear
lin = LinearClassifier(n_features)
loss = nn.CrossEntropyLoss()

# May want to adjust learning rate
optimizer = SGD(lin.parameters(), lr=0.001)
# May want less epochs
lin_train(lin, loss, optimizer, train_dataloader, n_epochs)
torch.save(lin, "models/lin_model.mdl")

epoch 1: w = 0.108, loss = 0.11851604
epoch 11: w = 0.173, loss = 0.11577205
epoch 21: w = 0.180, loss = 0.10613435
epoch 31: w = 0.181, loss = 0.09775220
epoch 41: w = 0.184, loss = 0.09046432
epoch 51: w = 0.189, loss = 0.08408478
epoch 61: w = 0.195, loss = 0.07850707
epoch 71: w = 0.202, loss = 0.07361709
epoch 81: w = 0.209, loss = 0.06931734
epoch 91: w = 0.216, loss = 0.06552503


In [29]:
def log_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, target in dataloader:
            target = target.float().unsqueeze(1)
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [30]:
# Training Logistic
log = LogisticRegression(n_features)
loss = nn.BCELoss()

# May want to adjust learning rate
optimizer = SGD(log.parameters(), lr=0.001)
log_train(log,loss,optimizer,train_dataloader,n_epochs)
torch.save(log, "models/log_model.mdl")

epoch 1: w = -0.158, loss = 0.25580165
epoch 11: w = -0.229, loss = 0.23033464
epoch 21: w = -0.261, loss = 0.21212526
epoch 31: w = -0.280, loss = 0.19752729
epoch 41: w = -0.294, loss = 0.18513270
epoch 51: w = -0.307, loss = 0.17433584
epoch 61: w = -0.319, loss = 0.16481116
epoch 71: w = -0.330, loss = 0.15634452
epoch 81: w = -0.341, loss = 0.14877635
epoch 91: w = -0.352, loss = 0.14198035


In [31]:
def mlp_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, target in dataloader:
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
                
        if epoch % 10 == 0:
            w = model.linear1.weight
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [32]:
# Training MLP

loss = nn.CrossEntropyLoss()

# 1 layer
mlp = MLPClassifier(n_features)
optimizer = Adam(mlp.parameters(), lr=0.001)
mlp_train(mlp,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp, "models/mlp_onelayer_model.mdl")

epoch 1: w = -0.232, loss = 0.28272519
epoch 11: w = -0.307, loss = 0.10031798
epoch 21: w = -0.357, loss = 0.08048259
epoch 31: w = -0.397, loss = 0.07825620
epoch 41: w = -0.436, loss = 0.07690136
epoch 51: w = -0.473, loss = 0.06662951
epoch 61: w = -0.506, loss = 0.05416132
epoch 71: w = -0.530, loss = 0.05258003
epoch 81: w = -0.544, loss = 0.05037854
epoch 91: w = -0.554, loss = 0.04954880


In [33]:
# 2 layers
mlp2 = MLP2DClassifier(n_features)
optimizer = Adam(mlp2.parameters(), lr=0.001)
mlp_train(mlp2,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp2, "models/mlp_twolayer_model.mdl")

epoch 1: w = 0.055, loss = 0.36326748
epoch 11: w = -0.030, loss = 0.10424865
epoch 21: w = -0.043, loss = 0.08040251
epoch 31: w = -0.086, loss = 0.06774805
epoch 41: w = -0.180, loss = 0.06460786
epoch 51: w = -0.268, loss = 0.05663018
epoch 61: w = -0.355, loss = 0.05575206
epoch 71: w = -0.432, loss = 0.04975513
epoch 81: w = -0.505, loss = 0.04463483
epoch 91: w = -0.577, loss = 0.03764702


In [34]:
# 4 layers residual
mlp4 = MLP4DResidualClassifier(n_features)
optimizer = Adam(mlp4.parameters(), lr=0.001)
mlp_train(mlp4,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp4, "models/mlp_fourlayer_model.mdl")

epoch 1: w = 0.195, loss = 0.27086285
epoch 11: w = 0.173, loss = 0.09795909
epoch 21: w = 0.199, loss = 0.07923308
epoch 31: w = 0.186, loss = 0.06166134
epoch 41: w = 0.172, loss = 0.09448048
epoch 51: w = 0.211, loss = 0.06841534
epoch 61: w = 0.208, loss = 0.04775996
epoch 71: w = 0.196, loss = 0.04419958
epoch 81: w = 0.207, loss = 0.03953584
epoch 91: w = 0.213, loss = 0.03753933


In [35]:
def ta_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, _ in dataloader:
            x_prime = x[0]
            
            pred = model(x)
            l = loss(pred, x_prime)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
                
        if epoch % 10 == 0:
            print(f'epoch {epoch + 1}: loss = {l:.8f}')

In [37]:
ta = TabularAutoencoder(input_dim = n_features, latent_dim = 8)
optimizer = Adam(ta.parameters(), lr=1e-3)
loss = nn.MSELoss()
ta_train(ta, loss, optimizer, train_dataloader, n_epochs)

torch.save(mlp2, "models/tae_model.mdl")

epoch 1: loss = 14.24577332
epoch 11: loss = 25.25287819
epoch 21: loss = 25.09706879
epoch 31: loss = 24.72102547
epoch 41: loss = 24.02478790
epoch 51: loss = 23.58013153
epoch 61: loss = 23.23166275
epoch 71: loss = 22.80758286
epoch 81: loss = 22.57798386
epoch 91: loss = 22.35825348
