In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import (
    DataLoader,
    TensorDataset,
    Dataset
)
from torch.optim import (Optimizer, Adam)
from torch.optim import SGD
from torch import nn

In [3]:
# Location of the Titanic passenger CSV; a relative path, so it is resolved
# against the notebook's current working directory.
filepath = "Titanic-Dataset.csv"

# Get data in pandas dataframe
df = pd.read_csv(filepath)

In [None]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw Titanic frame into an all-numeric modelling table.

    Steps:
      * keep only the label ('Survived') and the feature columns used for modelling,
      * encode 'Sex' as 0 (male) / 1 (female),
      * fill missing 'Age' values with the column mean,
      * reduce 'Cabin' to its first letter ('U' when missing) and one-hot encode it,
      * one-hot encode 'Embarked' (rows with a missing port get all-zero indicators).

    Args:
        df: Raw passenger data containing at least the columns selected below.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If 'Sex' holds a non-null value other than 'male'/'female'.
    """
    columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
    # Explicit copy so the column assignments below never write into the caller's frame.
    df = df[columns].copy()

    # Encode only the 'Sex' column. A frame-wide replace("male", 0) scans every column
    # and relies on implicit object->int downcasting, which pandas has deprecated.
    sex_codes = df['Sex'].map({'male': 0, 'female': 1})
    unexpected = df.loc[sex_codes.isna() & df['Sex'].notna(), 'Sex'].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in 'Sex' column: {list(unexpected)}")
    df['Sex'] = sex_codes

    # Replace NaN
    df['Age'] = df['Age'].fillna(df['Age'].mean())

    # Keep only the cabin's leading letter; passengers without a recorded
    # cabin are grouped under 'U' (unknown), which becomes column Cabin_U.
    df['Cabin'] = df['Cabin'].str[0].fillna('U')

    # One-hot encode the cabin letter and the port of embarkation in one pass,
    # emitting integer 0/1 columns directly instead of booleans.
    df = pd.get_dummies(df, columns=['Cabin', 'Embarked'], dtype=int)

    return df

In [5]:
# Apply the cleaning pipeline to the raw frame and show the result.
# NOTE(review): print() dumps the frame as plain text; ending the cell with
# `cleaned_df.head()` would give a compact rich display instead.
cleaned_df = clean_df(df)
print(cleaned_df)

     Survived  Pclass  Sex        Age  SibSp  Parch     Fare  Cabin_A  \
0           0       3    0  22.000000      1      0   7.2500        0   
1           1       1    1  38.000000      1      0  71.2833        0   
2           1       3    1  26.000000      0      0   7.9250        0   
3           1       1    1  35.000000      1      0  53.1000        0   
4           0       3    0  35.000000      0      0   8.0500        0   
..        ...     ...  ...        ...    ...    ...      ...      ...   
886         0       2    0  27.000000      0      0  13.0000        0   
887         1       1    1  19.000000      0      0  30.0000        0   
888         0       3    1  29.699118      1      2  23.4500        0   
889         1       1    0  26.000000      0      0  30.0000        0   
890         0       3    0  32.000000      0      0   7.7500        0   

     Cabin_B  Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_U  \
0          0        0        0        0     

  df = df.replace("female", 1)


In [None]:
# Convert the cleaned DataFrame to a Torch dataset of tensors.
# TODO: split off test data here, e.g. hold out 10% of the dataset for testing and train on the remaining 90%.
def make_datset(df: pd.DataFrame) -> Dataset:
    """Wrap a cleaned frame as a TensorDataset of (features, labels).

    Every column except 'Survived' becomes a float32 feature; the
    'Survived' column becomes the int64 label (y_true).
    """
    label_column = 'Survived'

    inputs = torch.tensor(df.drop(columns=[label_column]).values, dtype=torch.float32)
    labels = torch.tensor(df[label_column].values, dtype=torch.int64)

    return TensorDataset(inputs, labels)

In [13]:
# Tensor dataset built from the full cleaned frame (no train/test split is made here).
train_dataset = make_datset(cleaned_df)

In [14]:
def make_dataloader(dataset: Dataset, batch_size:int, shuffle:bool) -> DataLoader:
    """Build a DataLoader over `dataset` with the given batch size and shuffle setting."""
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return loader

In [15]:
# Mini-batches of 5 samples, served in dataset order (shuffle=False) on every pass.
# NOTE(review): training loaders are usually shuffled each epoch; confirm the fixed order is intentional.
train_dataloader = make_dataloader(train_dataset, shuffle=False, batch_size=5)

In [None]:
# Plan: train a linear classifier, a logistic regression, and two types of MLP. We may also compare
# custom vs. built-in implementations, e.g. hand-written gradient descent vs. the built-in optimizer.step().

In [None]:
# Linear classifier model
class LinearClassifier(nn.Module):
    """Single affine layer that maps each feature vector to two class scores."""

    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear = nn.Linear(in_features=num_features, out_features=2)

    def forward(self, x):
        """Return the two raw (unnormalised) scores for every row of `x`."""
        scores = self.linear(x)
        return scores

In [None]:
# Logistic Regression Model
class LogisticRegression(nn.Module):
    """Logistic regression: one linear unit followed by a sigmoid.

    The output has one column per sample holding a value in (0, 1).
    """

    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        # Produces the logit (pre-sigmoid score).
        self.linear = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x):
        """Squash the logit of each row of `x` through a sigmoid."""
        logit = self.linear(x)
        return torch.sigmoid(logit)

In [None]:
# 1-layer MLP and 2-layer MLP
# Build 1 using custom 
class MLPClassifier(nn.Module):
    """Perceptron with one hidden layer and two output scores per sample.

    Layout: Linear(num_features -> 100) -> ReLU -> Linear(100 -> 2).
    The outputs are raw scores; no softmax is applied here.
    """

    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        # Hidden layer: projects the input features onto 100 hidden units.
        self.linear1 = nn.Linear(in_features=num_features, out_features=100)
        # Non-linearity applied to the hidden units.
        self.act1 = nn.ReLU()
        # Output layer - binary classification for titanic survival.
        self.output = nn.Linear(in_features=100, out_features=2)

    def forward(self, x):
        """Run `x` through the hidden layer, the ReLU and the output layer."""
        hidden = self.act1(self.linear1(x))
        return self.output(hidden)

In [None]:
# 2-layer MLP
class MLP2DClassifier(nn.Module):
    """Perceptron with two hidden layers and two output scores per sample.

    Layout: Linear(num_features -> 100) -> ReLU -> Linear(100 -> 50) -> ReLU -> Linear(50 -> 2).
    The outputs are raw scores; no softmax is applied here.
    """

    def __init__(self, num_features):
        super().__init__()
        self.num_features = num_features
        self.linear1 = nn.Linear(num_features, 100)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(100, 50)
        # Without a non-linearity here, linear2 and output would compose into a
        # single affine map and the second hidden layer would add no capacity.
        # ReLU has no parameters, so the state_dict keys are unaffected.
        self.act2 = nn.ReLU()
        self.output = nn.Linear(50, 2)

    def forward(self, x):
        """Run `x` through both hidden layers (each followed by ReLU) and the output layer."""
        x = self.linear1(x)
        x = self.act1(x)
        x = self.linear2(x)
        x = self.act2(x)
        x = self.output(x)
        return x

In [None]:
# Deeper network: 4 hidden layers, optionally with residual (skip) connections.
# This will probably overfit, which makes it a good investigation for the assignment.
# An encoder would also count as investigation, even though it doesn't go with this data.
# Skip connections are opt-in via `residual=True`; the default builds the plain 4-layer MLP.
class MLP4DResidualClassifier(nn.Module):
    """Perceptron with four hidden layers and two output scores per sample.

    Layout: num_features -> 128 -> 64 -> 32 -> 16 -> 2, with a ReLU after every
    hidden layer. The outputs are raw scores; no softmax is applied here.

    Args:
        num_features: Number of input features per sample.
        residual: When True, hidden layers 2-4 each add a skip connection from
            their input to their pre-activation output. Because the width shrinks
            at every layer, an identity skip is impossible, so each skip is a
            bias-free linear projection to the new width. When False (default),
            no skip layers are created and the network is a plain MLP.
    """

    def __init__(self, num_features, residual=False):
        super().__init__()
        self.num_features = num_features
        self.residual = residual
        self.linear1 = nn.Linear(num_features, 128)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(128, 64)
        self.act2 = nn.ReLU()
        self.linear3 = nn.Linear(64, 32)
        self.act3 = nn.ReLU()
        self.linear4 = nn.Linear(32, 16)
        # Without a non-linearity here, linear4 and output would compose into a
        # single affine map and the fourth hidden layer would add no capacity.
        self.act4 = nn.ReLU()
        self.output = nn.Linear(16, 2)
        if residual:
            # Created last so the main layers are constructed in the same order
            # whether or not the skip projections exist.
            self.skip2 = nn.Linear(128, 64, bias=False)
            self.skip3 = nn.Linear(64, 32, bias=False)
            self.skip4 = nn.Linear(32, 16, bias=False)

    def _hidden_block(self, x, index):
        """Apply hidden layer `index` (2-4): linear, optional projected skip, then ReLU."""
        out = getattr(self, f'linear{index}')(x)
        if self.residual:
            out = out + getattr(self, f'skip{index}')(x)
        return getattr(self, f'act{index}')(out)

    def forward(self, x):
        """Run `x` through the four hidden layers and the output layer."""
        x = self.act1(self.linear1(x))
        for index in (2, 3, 4):
            x = self._hidden_block(x, index)
        return self.output(x)

In [None]:
# Open question: do we want to compare custom vs. built-in implementations?
# We could write our own cross-entropy function and gradient descent, and compare them to the built-in MLP training.
# We would do this for whichever MLP performs best.

In [None]:
# Train
def train(model: nn.Module, loss: nn.Module, optimizer: Optimizer, dataloader: DataLoader, epochs: int):
    """Fit `model` by mini-batch gradient descent and report progress.

    Args:
        model: Network to optimise; called as `model(x)` on every batch.
        loss: Criterion called as `loss(pred, target)`; must return a scalar tensor.
        optimizer: Optimizer already bound to `model`'s parameters.
        dataloader: Yields `(x, target)` batches.
        epochs: Number of full passes over `dataloader`.

    Returns:
        A list with one entry per epoch: the mean of the per-batch losses
        (NaN for an epoch in which the dataloader yielded no batches).
    """
    model.train()
    history = []
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for (x, target) in dataloader:
            # Clear gradients before the backward pass so stale gradients left on
            # the parameters by earlier code cannot leak into the first step.
            optimizer.zero_grad()
            pred = model(x)
            l = loss(pred, target)
            l.backward()
            optimizer.step()
            running_loss += l.item()
            num_batches += 1

        mean_loss = running_loss / num_batches if num_batches else float('nan')
        history.append(mean_loss)

        # Report every 10 epochs. The mean epoch loss is printed rather than the
        # last batch's loss, and no parameter is unpacked: `[w, b] = model.parameters()`
        # only works for models with exactly two parameter tensors.
        if epoch % 10 == 0:
            print(f'epoch {epoch + 1}: loss = {mean_loss:.8f}')

    return history

In [None]:
# For training
# Sample and feature counts are read from the shape of the dataset's first tensor (the features).
n_samples, n_features = train_dataset.tensors[0].shape
# Number of full passes over the data used by every training call below.
n_epochs = 100

In [None]:
# Training linear
lin = LinearClassifier(n_features)
loss = nn.MSELoss()
# nn.MSELoss compares predictions and targets element-wise, so it needs float
# targets shaped like the model output, (batch, 2). train_dataset stores int64
# class labels of shape (batch,), so build one-hot float targets for this model only.
lin_features, lin_labels = train_dataset.tensors
lin_targets = nn.functional.one_hot(lin_labels, num_classes=2).to(torch.float32)
lin_dataloader = make_dataloader(
    TensorDataset(lin_features, lin_targets),
    batch_size=train_dataloader.batch_size,
    shuffle=False,
)
# May want to adjust learning rate
# NOTE(review): the features are not rescaled anywhere above, so plain SGD may
# need a smaller learning rate (or normalised inputs) to stay stable - confirm by running.
optimizer = SGD(lin.parameters(), lr=0.001)
# May want less epochs
train(lin, loss, optimizer, lin_dataloader, n_epochs)

In [None]:
# Training Logistic
log = LogisticRegression(n_features)
loss = nn.BCELoss()
# nn.BCELoss needs float targets with the same shape as the model output, (batch, 1).
# train_dataset stores int64 labels of shape (batch,), so build a loader with
# float labels reshaped to a column for this model only.
log_features, log_labels = train_dataset.tensors
log_dataset = TensorDataset(log_features, log_labels.to(torch.float32).unsqueeze(1))
log_dataloader = make_dataloader(
    log_dataset,
    batch_size=train_dataloader.batch_size,
    shuffle=False,
)
# May want to adjust learning rate
optimizer = SGD(log.parameters(), lr=0.001)
train(log, loss, optimizer, log_dataloader, n_epochs)

In [None]:
# Training MLP
# Each MLP below emits two scores per sample, and all three are trained with
# the same cross-entropy criterion on the shared training loader.
loss = nn.CrossEntropyLoss()

# 1 layer
mlp = MLPClassifier(num_features=n_features)
optimizer = Adam(params=mlp.parameters(), lr=0.001)
train(model=mlp, loss=loss, optimizer=optimizer, dataloader=train_dataloader, epochs=n_epochs)

# 2 layers
mlp2 = MLP2DClassifier(num_features=n_features)
optimizer = Adam(params=mlp2.parameters(), lr=0.001)
train(model=mlp2, loss=loss, optimizer=optimizer, dataloader=train_dataloader, epochs=n_epochs)

# 4 layers residual - note residual not added yet
mlp4 = MLP4DResidualClassifier(num_features=n_features)
optimizer = Adam(params=mlp4.parameters(), lr=0.001)
train(model=mlp4, loss=loss, optimizer=optimizer, dataloader=train_dataloader, epochs=n_epochs)

In [None]:
# We train all of the models; whichever performs best gets integrated into the final game (system integration, part 3).