In [68]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    Dataset
)
from torch.optim import (Optimizer, Adam)
from torch.optim import SGD
from torch import nn

import warnings
warnings.filterwarnings('ignore')

In [69]:
filepath = "Titanic-Dataset.csv"

# Get data in pandas dataframe
df = pd.read_csv(filepath)

### Step One:

We start by cleaning up the data.

In [70]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df[['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]
    
    df = df.replace("male", 0)
    df = df.replace("female", 1)
    # Replace NaN
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # Get rid of blanks, get only cabin class - shows upper to lower decks, 
    # then 1-hot encode (seperate each class to binary columns)
    # Will end up with around 8 binary columns
    # Cabin_U = unknown
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Cabin'] = df['Cabin'].str[0]
    df = pd.get_dummies(df, columns=['Cabin'])

    # 1-hot encoding for 3 binary columns repping port of embarkment
    df = pd.get_dummies(df, columns=['Embarked'])

    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)

    return df

In [71]:
cleaned_df = clean_df(df)
print(cleaned_df)

     Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Cabin_A  \
0           0       3    0  22.000000      1      0   7.2500        0   
1           1       1    1  38.000000      1      0  71.2833        0   
2           1       3    1  26.000000      0      0   7.9250        0   
3           1       1    1  35.000000      1      0  53.1000        0   
4           0       3    0  35.000000      0      0   8.0500        0   
..        ...     ...  ...        ...    ...    ...      ...      ...   
886         0       2    0  27.000000      0      0  13.0000        0   
887         1       1    1  19.000000      0      0  30.0000        0   
888         0       3    1  29.699118      1      2  23.4500        0   
889         1       1    0  26.000000      0      0  30.0000        0   
890         0       3    0  32.000000      0      0   7.7500        0   

     Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  \
0          0        0        0        0     

In [77]:
# Convert to Torch dataset with tensors
# Need to update this to split test data, ie make 10% of dataset test data, 90% is training
def make_dataset(df: pd.DataFrame) -> Dataset:
    features = df.drop(columns=['Survived'])
    
    # Ytrue
    target = df['Survived']

    x_values = torch.tensor(features.values, dtype=torch.float32)
    y_values = torch.tensor(target.values, dtype=torch.int64)
    return TensorDataset(x_values,y_values)

In [78]:
train_dataset = make_dataset(cleaned_df)

In [74]:
def make_dataloader(dataset: Dataset, batch_size:int, shuffle:bool) -> DataLoader:
    return DataLoader(dataset, batch_size, shuffle)

In [75]:
train_dataloader = make_dataloader(train_dataset, shuffle=False, batch_size=5)

### Step Two:

We start to design our various ML models to test out and explore the performance on this dataset.

In [36]:
"""

# Linear Classifier Model

"""
class LinearClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear = nn.Linear(num_features, 2)
    
    def forward(self, x):
        return self.linear(x)

In [37]:
"""

# Logistic Regression Model

"""
class LogisticRegression(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        # Makes logit
        self.linear = nn.Linear(num_features, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x))

In [38]:
"""

1 Layer MLP Classifier Model

"""
class MLPClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.output = nn.Linear(100, 2)
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.output(x)
        return x

In [39]:
"""

2 Layer MLP Classifier Model

"""
class MLP2DClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(100,50)
        self.output = nn.Linear(50,2)
    def forward(self, x):
        x = self.linear1(x)
        x = self.act1(x)
        x = self.linear2(x)
        x = self.output(x)
        return x

In [40]:
"""

4 Layer Residual MLP Classifier Model

"""
class MLP4DResidualClassifier(nn.Module):
    def __init__(self, num_features):
        super().__init__()

        self.act = nn.ReLU()
        self.num_features = num_features

        # Main layers
        self.linear1 = nn.Linear(num_features, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, 32)
        self.linear4 = nn.Linear(32, 16)

        # Skip connections
        self.skip1 = nn.Linear(num_features, 128)
        self.skip2 = nn.Linear(128, 64)
        self.skip3 = nn.Linear(64, 32)
        self.skip4 = nn.Linear(32, 16)
        
        self.output = nn.Linear(16,2)
    def forward(self, x):
        s1 = self.skip1(x)
        x = self.act(self.linear1(x) + s1)

        s2 = self.skip2(x)
        x = self.act(self.linear2(x) + s2)

        s3 = self.skip3(x)
        x = self.act(self.linear3(x) + s3)

        s4 = self.skip4(x)
        x = self.act(self.linear4(x) + s4)

        return self.output(x)

### Step Three:

We now begin to train and save the models

In [80]:
n_samples, n_features = train_dataset.tensors[0].shape
n_epochs = 100

In [53]:
def lin_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for (x,target) in dataloader:
            pred = model(x)
            l = loss(pred, target)

            optimizer.zero_grad()
            l.backward()
            optimizer.step()

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [54]:
# Training linear
lin = LinearClassifier(n_features)
loss = nn.CrossEntropyLoss()

# May want to adjust learning rate
optimizer = SGD(lin.parameters(), lr=0.001)
# May want less epochs
lin_train(lin, loss, optimizer, train_dataloader, n_epochs)
torch.save(lin, "lin_model.mdl")

epoch 1: w = 0.142, loss = 0.59377247
epoch 11: w = 0.146, loss = 0.57468140
epoch 21: w = 0.146, loss = 0.56229663
epoch 31: w = 0.145, loss = 0.55359000
epoch 41: w = 0.144, loss = 0.54691315
epoch 51: w = 0.143, loss = 0.54134536
epoch 61: w = 0.143, loss = 0.53638387
epoch 71: w = 0.143, loss = 0.53176123
epoch 81: w = 0.143, loss = 0.52734107
epoch 91: w = 0.144, loss = 0.52305800


In [56]:
def log_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, target in dataloader:
            target = target.float().unsqueeze(1)
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            

        if epoch % 10 == 0:
            [w,b] = model.parameters()
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [57]:
# Training Logistic
log = LogisticRegression(n_features)
loss = nn.BCELoss()

# May want to adjust learning rate
optimizer = SGD(log.parameters(), lr=0.001)
log_train(log,loss,optimizer,train_dataloader,n_epochs)
torch.save(log, "log_model.mdl")

epoch 1: w = -0.074, loss = 0.20454602
epoch 11: w = -0.100, loss = 0.40410495
epoch 21: w = -0.122, loss = 0.39756119
epoch 31: w = -0.139, loss = 0.39344776
epoch 41: w = -0.152, loss = 0.39111623
epoch 51: w = -0.162, loss = 0.39008528
epoch 61: w = -0.169, loss = 0.38999808
epoch 71: w = -0.175, loss = 0.39059016
epoch 81: w = -0.180, loss = 0.39166415
epoch 91: w = -0.184, loss = 0.39307275


In [63]:
def mlp_train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    for epoch in range(epochs):
        for x, target in dataloader:
            pred = model(x)
            l = loss(pred, target)
            
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
                
        if epoch % 10 == 0:
            w = model.linear1.weight
            print(f'epoch {epoch + 1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')

In [65]:
# Training MLP

loss = nn.CrossEntropyLoss()

# 1 layer
mlp = MLPClassifier(n_features)
optimizer = Adam(mlp.parameters(), lr=0.001)
mlp_train(mlp,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp, "mlp_onelayer_model.mdl")

epoch 1: w = -0.095, loss = 0.43417811
epoch 11: w = -0.148, loss = 0.39460459
epoch 21: w = -0.177, loss = 0.38648927
epoch 31: w = -0.197, loss = 0.37094721
epoch 41: w = -0.204, loss = 0.35725096
epoch 51: w = -0.208, loss = 0.35009229
epoch 61: w = -0.211, loss = 0.34017166
epoch 71: w = -0.213, loss = 0.32246652
epoch 81: w = -0.216, loss = 0.30527335
epoch 91: w = -0.217, loss = 0.28432623


In [66]:
# 2 layers
mlp2 = MLP2DClassifier(n_features)
optimizer = Adam(mlp2.parameters(), lr=0.001)
mlp_train(mlp2,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp2, "mlp_twolayer_model.mdl")

epoch 1: w = 0.121, loss = 0.47413000
epoch 11: w = 0.119, loss = 0.38618481
epoch 21: w = 0.120, loss = 0.36881888
epoch 31: w = 0.108, loss = 0.34620664
epoch 41: w = 0.078, loss = 0.33320415
epoch 51: w = 0.042, loss = 0.31737164
epoch 61: w = 0.012, loss = 0.30967286
epoch 71: w = -0.006, loss = 0.29498753
epoch 81: w = -0.016, loss = 0.27107331
epoch 91: w = -0.022, loss = 0.26264268


In [67]:
# 4 layers residual
mlp4 = MLP4DResidualClassifier(n_features)
optimizer = Adam(mlp4.parameters(), lr=0.001)
mlp_train(mlp4,loss,optimizer,train_dataloader,n_epochs)

torch.save(mlp4, "mlp_fourlayer_model.mdl")

epoch 1: w = 0.153, loss = 0.39594677
epoch 11: w = 0.130, loss = 0.32388675
epoch 21: w = 0.071, loss = 0.30200589
epoch 31: w = 0.041, loss = 0.33498892
epoch 41: w = 0.026, loss = 0.28398848
epoch 51: w = 0.052, loss = 0.31911257
epoch 61: w = 0.060, loss = 0.30112976
epoch 71: w = 0.082, loss = 0.28604266
epoch 81: w = 0.098, loss = 0.27376878
epoch 91: w = 0.108, loss = 0.27897996


In [None]:
# We train all, whichever performs best we integrate into final game (system integration part 3)