### Problem to Solve:

This is a comparative study of different Machine Learning approaches.
We have investigated various neural network design choices and compared them on
the Kaggle Titanic Dataset: https://www.kaggle.com/datasets/yasserh/titanic-dataset.
We have compared linear classification, logistic regression, MLP (1 and 2D),
4D MLP with a residual network, and have researched and implemented the tabular autoencoder.
We have concluded that some of these designs were great at classifying our data
(Linear, Logistic, MLP),
and some are not meant to be used for this type of data,
(Tabular autoencoder).
After a thourough investigation, we have determined that MLP is the best
at classifying the Titanic data, and have deployed it into a simple geussing game.
In the game a users geuss is compared with the ML model to see if the user made the correct choice.

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    Dataset
)
from torch.optim import (Optimizer, Adam)
from torch.optim import SGD
from torch import nn

import warnings
warnings.filterwarnings('ignore')

In [2]:
filepath = "Titanic-Dataset.csv"

# Get data in pandas dataframe
df = pd.read_csv(filepath)

### Step One:

We start by cleaning up the data.

In [7]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df[['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
    
    df = df.replace("male", 0)
    df = df.replace("female", 1)
    # Replace NaN
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # Get rid of blanks, get only cabin class - shows upper to lower decks, 
    # then 1-hot encode (seperate each class to binary columns)
    # Will end up with around 8 binary columns
    # Cabin_U = unknown
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Cabin'] = df['Cabin'].str[0]
    df = pd.get_dummies(df, columns=['Cabin'])

    # 1-hot encoding for 3 binary columns repping port of embarkment
    df = pd.get_dummies(df, columns=['Embarked'])

    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)

    return df

In [8]:
cleaned_df = clean_df(df)
print(cleaned_df)

     Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Cabin_A  \
0           0       3    0  22.000000      1      0   7.2500        0   
1           1       1    1  38.000000      1      0  71.2833        0   
2           1       3    1  26.000000      0      0   7.9250        0   
3           1       1    1  35.000000      1      0  53.1000        0   
4           0       3    0  35.000000      0      0   8.0500        0   
..        ...     ...  ...        ...    ...    ...      ...      ...   
886         0       2    0  27.000000      0      0  13.0000        0   
887         1       1    1  19.000000      0      0  30.0000        0   
888         0       3    1  29.699118      1      2  23.4500        0   
889         1       1    0  26.000000      0      0  30.0000        0   
890         0       3    0  32.000000      0      0   7.7500        0   

     Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  \
0          0        0        0        0     

In [9]:
# Convert to Torch dataset with tensors
# Need to update this to split test data, ie make 10% of dataset test data, 90% is training
def make_dataset(df: pd.DataFrame) -> Dataset:
    features = df.drop(columns=['Survived'])
    
    # Ytrue
    target = df['Survived']

    x_values = torch.tensor(features.values, dtype=torch.float32)
    y_values = torch.tensor(target.values, dtype=torch.int64)
    return TensorDataset(x_values,y_values)

In [10]:
train_dataset = make_dataset(cleaned_df)

In [11]:
def make_dataloader(dataset: Dataset, batch_size:int, shuffle:bool) -> DataLoader:
    return DataLoader(dataset, batch_size, shuffle)

In [12]:
train_dataloader = make_dataloader(train_dataset, shuffle=False, batch_size=5)

### Step Two:

We start to design our various ML models to test out and explore the performance on this dataset.

In [13]:
"""

# Linear Classifier Model

"""
class LinearClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear = nn.Linear(num_features, 2)
    
    def forward(self, x):
        return self.linear(x)

In [14]:
"""

# Logistic Regression Model

"""
class LogisticRegression(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        # Makes logit
        self.linear = nn.Linear(num_features, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [15]:
"""

1 Layer MLP Classifier Model

"""
class MLPClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.output = nn.Linear(100, 2)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.output(x)
        return x

In [16]:
"""

2 Layer MLP Classifier Model

"""
class MLP2DClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(100, 50)
        self.output = nn.Linear(50, 2)
        
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.linear2(x)
        x = self.output(x)
        return x

In [17]:
"""or any

4 Layer Residual MLP Classifier Model

"""
class MLP4DResidualClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()

        self.act = nn.ReLU()
        self.num_features = num_features

        # Main layers
        self.linear1 = nn.Linear(num_features, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, 32)
        self.linear4 = nn.Linear(32, 16)

        # Skip connections
        self.skip1 = nn.Linear(num_features, 128)
        self.skip2 = nn.Linear(128, 64)
        self.skip3 = nn.Linear(64, 32)
        self.skip4 = nn.Linear(32, 16)
        
        self.output = nn.Linear(16, 2)
        
    def forward(self, x):
        s1 = self.skip1(x)
        x = self.act(self.linear1(x) + s1)

        s2 = self.skip2(x)
        x = self.act(self.linear2(x) + s2)

        s3 = self.skip3(x)
        x = self.act(self.linear3(x) + s3)

        s4 = self.skip4(x)
        x = self.act(self.linear4(x) + s4)

        return self.output(x)

In [28]:
"""

Tabular Autoencoder Model

"""
class TabularAutoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim=8):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, latent_dim),
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 16),
            nn.ReLU(),
            nn.Linear(16, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

### Step Three:

We now begin to train and save the models

In [29]:
n_samples, n_features = train_dataset.tensors[0].shape
n_epochs = 100

In [30]:
def lin_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for (x,target) in dataloader:
            pred = model(x)
            l = loss(pred, target)

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            total_loss += l.item()

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch}: w = {w[0][0].item():.3f}, loss = {l:.8f}, total_loss = {total_loss:.4f}')

In [31]:
# Training linear
lin = LinearClassifier(n_features)
loss = nn.CrossEntropyLoss()

# May want to adjust learning rate
optimizer = SGD(lin.parameters(), lr=0.001)
# May want less epochs
lin_train(lin, loss, optimizer, train_dataloader, n_epochs)
torch.save(lin, "models/lin_model.mdl")

epoch 10: w = 0.157, loss = 0.30379084, total_loss = 138.9426
epoch 20: w = 0.191, loss = 0.24724936, total_loss = 128.0752
epoch 30: w = 0.206, loss = 0.20415820, total_loss = 121.1879
epoch 40: w = 0.220, loss = 0.17249052, total_loss = 116.5114
epoch 50: w = 0.233, loss = 0.14862877, total_loss = 112.9642
epoch 60: w = 0.245, loss = 0.13021740, total_loss = 110.2590
epoch 70: w = 0.257, loss = 0.11571588, total_loss = 108.1717
epoch 80: w = 0.268, loss = 0.10408817, total_loss = 106.5378
epoch 90: w = 0.278, loss = 0.09461974, total_loss = 105.2396
epoch 100: w = 0.287, loss = 0.08680409, total_loss = 104.1929


In [32]:
def log_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for x, target in dataloader:
            target = target.float().unsqueeze(1)
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            total_loss += l.item()
            

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch}: w = {w[0][0].item():.3f}, loss = {l:.8f}, total_loss = {total_loss:.4f}')

In [33]:
# Training Logistic
log = LogisticRegression(n_features)
loss = nn.BCELoss()

# May want to adjust learning rate
optimizer = SGD(log.parameters(), lr=0.001)
log_train(log,loss,optimizer,train_dataloader,n_epochs)
torch.save(log, "models/log_model.mdl")

epoch 10: w = -0.090, loss = 0.21280563, total_loss = 116.4880
epoch 20: w = -0.181, loss = 0.19022161, total_loss = 110.5007
epoch 30: w = -0.220, loss = 0.17497328, total_loss = 106.8198
epoch 40: w = -0.242, loss = 0.16306059, total_loss = 103.9601
epoch 50: w = -0.258, loss = 0.15303203, total_loss = 101.6191
epoch 60: w = -0.271, loss = 0.14434722, total_loss = 99.6729
epoch 70: w = -0.283, loss = 0.13674350, total_loss = 98.0404
epoch 80: w = -0.295, loss = 0.13004620, total_loss = 96.6603
epoch 90: w = -0.306, loss = 0.12411778, total_loss = 95.4854
epoch 100: w = -0.317, loss = 0.11884392, total_loss = 94.4788


In [34]:
def mlp_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for x, target in dataloader:
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            total_loss += l.item()
                
        if epoch % 10 == 0:
            w = model.linear1.weight
            print(f'epoch {epoch}: w = {w[0][0].item():.3f}, loss = {l:.8f}, total_loss = {total_loss:.4f}')

In [35]:
# Training MLP

loss = nn.CrossEntropyLoss()

# 1 layer
mlp = MLPClassifier(n_features)
optimizer = Adam(mlp.parameters(), lr=0.001)
mlp_train(mlp,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp, "models/mlp_onelayer_model.mdl")

epoch 10: w = -0.127, loss = 0.11859322, total_loss = 82.2418
epoch 20: w = -0.147, loss = 0.08677053, total_loss = 78.1386
epoch 30: w = -0.158, loss = 0.08098578, total_loss = 75.4810
epoch 40: w = -0.168, loss = 0.07231505, total_loss = 77.6081
epoch 50: w = -0.178, loss = 0.07026675, total_loss = 74.0961
epoch 60: w = -0.191, loss = 0.06030358, total_loss = 71.9251
epoch 70: w = -0.199, loss = 0.05834802, total_loss = 68.8279
epoch 80: w = -0.201, loss = 0.05212197, total_loss = 68.7105
epoch 90: w = -0.200, loss = 0.04799564, total_loss = 65.6423
epoch 100: w = -0.202, loss = 0.04922508, total_loss = 64.8338


In [36]:
# 2 layers
mlp2 = MLP2DClassifier(n_features)
optimizer = Adam(mlp2.parameters(), lr=0.001)
mlp_train(mlp2,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp2, "models/mlp_twolayer_model.mdl")

epoch 10: w = 0.047, loss = 0.09897653, total_loss = 80.8777
epoch 20: w = 0.058, loss = 0.07110681, total_loss = 77.1249
epoch 30: w = 0.077, loss = 0.05992853, total_loss = 75.2195
epoch 40: w = 0.101, loss = 0.05771663, total_loss = 73.3120
epoch 50: w = 0.111, loss = 0.05571553, total_loss = 71.0313
epoch 60: w = 0.125, loss = 0.05121632, total_loss = 69.1391
epoch 70: w = 0.122, loss = 0.04961312, total_loss = 68.1810
epoch 80: w = 0.131, loss = 0.04892419, total_loss = 65.8632
epoch 90: w = 0.138, loss = 0.04627061, total_loss = 64.6637
epoch 100: w = 0.145, loss = 0.04573051, total_loss = 63.1262


In [37]:
# 4 layers residual
mlp4 = MLP4DResidualClassifier(n_features)
optimizer = Adam(mlp4.parameters(), lr=0.001)
mlp_train(mlp4,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp4, "models/mlp_fourlayer_model.mdl")

epoch 10: w = 0.181, loss = 0.09603711, total_loss = 81.6603
epoch 20: w = 0.181, loss = 0.08851510, total_loss = 75.3010
epoch 30: w = 0.181, loss = 0.08842213, total_loss = 71.9302
epoch 40: w = 0.181, loss = 0.10740285, total_loss = 68.4317
epoch 50: w = 0.181, loss = 0.07169863, total_loss = 67.1694
epoch 60: w = 0.181, loss = 0.04554076, total_loss = 64.4520
epoch 70: w = 0.181, loss = 0.03830257, total_loss = 63.8072
epoch 80: w = 0.181, loss = 0.04722589, total_loss = 62.9862
epoch 90: w = 0.181, loss = 0.03841064, total_loss = 67.9770
epoch 100: w = 0.181, loss = 0.03018275, total_loss = 60.5111


In [40]:
def ta_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(1, epochs + 1):
        total_loss = 0
        for x, _ in dataloader:
            x_prime = x[0]
            
            pred = model(x)
            l = loss(pred, x_prime)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()

            total_loss += l.item()
                
        if epoch % 10 == 0:
            print(f'epoch {epoch}: loss = {l:.8f}, total_loss = {total_loss:.4f}')

In [41]:
ta = TabularAutoencoder(input_dim = n_features, latent_dim = 8)
optimizer = Adam(ta.parameters(), lr=1e-3)
loss = nn.MSELoss()
ta_train(ta, loss, optimizer, train_dataloader, n_epochs)

torch.save(mlp2, "models/tae_model.mdl")

epoch 10: loss = 30.11790276, total_loss = 19016.2615
epoch 20: loss = 29.80267715, total_loss = 18565.0683
epoch 30: loss = 28.63719940, total_loss = 18291.9598
epoch 40: loss = 25.97786140, total_loss = 18054.6605
epoch 50: loss = 24.59546280, total_loss = 17911.9967
epoch 60: loss = 23.84953690, total_loss = 17823.4341
epoch 70: loss = 23.19057846, total_loss = 17732.9634
epoch 80: loss = 22.73792458, total_loss = 17660.4718
epoch 90: loss = 22.81102562, total_loss = 17601.6074
epoch 100: loss = 22.60476494, total_loss = 17579.1379


|Final Total Loss|Linear Classifier|Logistic Regression|MLP-1D|MLP-2D|MLP-4D|
|-|-|-|-|-|-|
|100 Epochs|104.1929|94.4788|64.8338|63.1262|60.5111|
|1000 Epochs|
