# Feed Forward Neural Network

In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader

from utils import DataHandlerTitantic

Load the data and split into train and validation sets.

In [4]:
dh = DataHandlerTitantic(34545234)
dh.load_data("data/train.csv", "data/test.csv")
dh.shuffle_split(0.9)
dh.full_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Get the device we'll be using.

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


## First model

We'll use socio-economic class, gender, age, fare, port of embark, number of siblings/spouses aboard and number of parents/childern aboard as features. Let's impute the missing values, and do a little preprocessing.

In [6]:
impute_columns = [
    "Pclass_1",
    "Pclass_2",
    "Pclass_3",
    "IsFemale",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S"
]

dh1 = dh.to_is_female()\
        .make_dummies(["Pclass", "Embarked"])\
        .impute_values(impute_columns, strategy="knn")
dh1.train.head()

Unnamed: 0,PassengerId,Survived,Name,IsFemale,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
763,764,1,"Carter, Mrs. William Ernest (Lucile Polk)",1.0,36.0,1.0,2.0,113760,120.0,B96 B98,1.0,0.0,0.0,0.0,0.0,1.0
632,633,1,"Stahelin-Maeglin, Dr. Max",0.0,32.0,0.0,0.0,13214,30.5,B50,1.0,0.0,0.0,1.0,0.0,0.0
237,238,1,"Collyer, Miss. Marjorie ""Lottie""",1.0,8.0,0.0,2.0,C.A. 31921,26.25,,0.0,1.0,0.0,0.0,0.0,1.0
491,492,0,"Windelov, Mr. Einar",0.0,21.0,0.0,0.0,SOTON/OQ 3101317,7.25,,0.0,0.0,1.0,0.0,0.0,1.0
590,591,0,"Rintamaki, Mr. Matti",0.0,35.0,0.0,0.0,STON/O 2. 3101273,7.125,,0.0,0.0,1.0,0.0,0.0,1.0


Now set up the data sets and data loaders.

In [7]:
batch_size = 64

feature_columns1 = [
    "Pclass_1",
    "Pclass_2",
    "Pclass_3",
    "IsFemale",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked_C",
    "Embarked_Q",
    "Embarked_S"
]

train_dataset1 = dh1.get_train_pytorch_dataset(feature_columns1)
eval_dataset1 = dh1.get_eval_pytorch_dataset(feature_columns1)

train_dataloader1 = DataLoader(train_dataset1, batch_size=batch_size)
eval_dataloader1 = DataLoader(eval_dataset1, batch_size=batch_size)

Define the model. We'll use a hidden layer with 6 units.

In [8]:
class Model1(nn.Module):

    def __init__(self):
        super().__init__()
        self.stack = nn.Sequential(
            nn.Linear(11, 6),
            nn.ReLU(),
            nn.Linear(6, 2)
        )

    def forward(self, x):
        return self.stack(x)

model1 = Model1().to(device)
print(model1)

Model1(
  (stack): Sequential(
    (0): Linear(in_features=11, out_features=6, bias=True)
    (1): ReLU()
    (2): Linear(in_features=6, out_features=2, bias=True)
  )
)


We'll use cross entropy loss and stochastic gradient descent.

In [9]:
loss_fn1 = nn.CrossEntropyLoss()
optimiser1 = torch.optim.SGD(model1.parameters(), lr=1e-3)

This is the generic training loop, which we'll use for all the models.

In [10]:
def train(dataloader, model, loss_fn, optimiser):
    
    # Put the model in training mode
    model.train()
    
    # The number of datapoints
    size = len(dataloader.dataset)
    
    # Loop over each batch of datapoints
    for batch, (X, y) in enumerate(dataloader):
        
        # Make sure the data is on the device
        X, y = X.to(device), y.to(device)
        
        # Make a prediction using the current parameters
        pred = model(X)
        
        # Compute the loss for this prediction
        loss = loss_fn(pred, y)
        
        # Set all the gradients to zero
        optimiser.zero_grad()
        
        # Propagate the loss backwards to compute the gradients
        loss.backward()
        
        # Do one step of optimisation
        optimiser.step()
        
        # Print the loss function and train error every so often
        # if batch % 100 == 0:
        #     loss_val = loss.item()
        #     train_err = 1 - (pred.argmax(1) == y).count_nonzero() / len(y)
        #     pos = batch * len(X)
        #     print(f"loss: {loss_val:>7f}, train err: {train_err:09.5%} "
        #          f"[{pos:>5d}/{size:>5d}]")

This function outputs the error for either the train set or the evaluation set.

In [11]:
def print_error(dataloader, dataloader_name, model):
    
    # Put the model in evaluation mode
    model.eval()
    
    # Get the number of datapoints
    size = len(dataloader.dataset)

    # The number of correct predictions
    correct = 0

    # We don't want to be computing the gradients
    with torch.no_grad():

        # Loop through the minibatches
        for X, y in dataloader:

            # Make sure the data is on the device
            X, y = X.to(device), y.to(device)

            # Compute the model predictions
            pred = model(X)

            # Update with the number of correct predictions
            correct += (pred.argmax(1) == y).count_nonzero()

    # Compute the error for the whole dataset
    error = 1 - correct / len(dataloader.dataset)

    # Output it
    print(f"{dataloader_name.capitalize()} error: {error:09.5%}")

Let's train the model.

In [12]:
epochs = 5000
for t in range(1, epochs+1):
    train(train_dataloader1, model1, loss_fn1, optimiser1)
    if t % 1000 == 0:
        print(f"Epoch {t}")
        print("----------------------------------")
        print_error(train_dataloader1, "train", model1)
        print_error(eval_dataloader1, "validation", model1)
        print("")

Epoch 1000
----------------------------------
Train error: 22.09738%
Validation error: 20.00000%

Epoch 2000
----------------------------------
Train error: 19.85019%
Validation error: 17.77778%

Epoch 3000
----------------------------------
Train error: 19.97503%
Validation error: 17.77778%

Epoch 4000
----------------------------------
Train error: 19.47566%
Validation error: 17.77778%

Epoch 5000
----------------------------------
Train error: 18.72659%
Validation error: 16.66667%



## Slower learning rate

Let's try with a slower learning rate.

In [13]:
# Define the model
model2 = Model1().to(device)

# Use the same data handler
dh2 = dh1

# Use the same data sets
train_dataset2 = train_dataset1
eval_dataset2 = eval_dataset1

# Define new data loaders
train_dataloader2 = DataLoader(train_dataset2, batch_size=batch_size)
eval_dataloader2 = DataLoader(eval_dataset2, batch_size=batch_size)

# Use the same loss function, and SGD with a lower learning rate
loss_fn2 = nn.CrossEntropyLoss()
optimiser2 = torch.optim.SGD(model2.parameters(), lr=1e-2)

# Train the model
epochs = 5000
for t in range(1, epochs+1):
    train(train_dataloader2, model2, loss_fn2, optimiser2)
    if t % 1000 == 0:
        print(f"Epoch {t}")
        print("----------------------------------")
        print_error(train_dataloader2, "train", model2)
        print_error(eval_dataloader2, "validation", model2)
        print("")

Epoch 1000
----------------------------------
Train error: 20.84894%
Validation error: 21.11111%

Epoch 2000
----------------------------------
Train error: 20.09987%
Validation error: 16.66667%

Epoch 3000
----------------------------------
Train error: 19.47566%
Validation error: 15.55555%

Epoch 4000
----------------------------------
Train error: 19.47566%
Validation error: 15.55555%

Epoch 5000
----------------------------------
Train error: 19.10113%
Validation error: 15.55555%



## Using Adam optimiser

This time, let's try using the Adam optimiser instead of SGD.

In [14]:
# Define the model
model3 = Model1().to(device)

# Use the same data handler
dh3 = dh1

# Use the same data sets
train_dataset3 = train_dataset1
eval_dataset3 = eval_dataset1

# Define new data loaders
train_dataloader3 = DataLoader(train_dataset3, batch_size=batch_size)
eval_dataloader3 = DataLoader(eval_dataset3, batch_size=batch_size)

# Use the same loss function, and the Adam optimiser
loss_fn3 = nn.CrossEntropyLoss()
optimiser3 = torch.optim.Adam(model3.parameters())

# Train the model
epochs = 5000
for t in range(1, epochs+1):
    train(train_dataloader3, model3, loss_fn3, optimiser3)
    if t % 1000 == 0:
        print(f"Epoch {t}")
        print("----------------------------------")
        print_error(train_dataloader3, "train", model3)
        print_error(eval_dataloader3, "validation", model3)
        print("")

Epoch 1000
----------------------------------
Train error: 17.72784%
Validation error: 16.66667%

Epoch 2000
----------------------------------
Train error: 17.10362%
Validation error: 16.66667%

Epoch 3000
----------------------------------
Train error: 17.22847%
Validation error: 16.66667%

Epoch 4000
----------------------------------
Train error: 16.97878%
Validation error: 16.66667%

Epoch 5000
----------------------------------
Train error: 16.85393%
Validation error: 16.66667%



## Using two hidden layers

Let's try using a network with two hidden layers, first increasing dimension, then decreasing.

In [17]:
class Model4(nn.Module):

    def __init__(self):
        super().__init__()
        self.stack = nn.Sequential(
            nn.Linear(11, 16),
            nn.ReLU(),
            nn.Linear(16, 6),
            nn.ReLU(),
            nn.Linear(6, 2)
        )

    def forward(self, x):
        return self.stack(x)

model4 = Model4().to(device)
print(model4)

Model4(
  (stack): Sequential(
    (0): Linear(in_features=11, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=6, bias=True)
    (3): ReLU()
    (4): Linear(in_features=6, out_features=2, bias=True)
  )
)


Train the model.

In [19]:
# Use the same data handler
dh4 = dh1

# Use the same data sets
train_dataset4 = train_dataset1
eval_dataset4 = eval_dataset1

# Define new data loaders
train_dataloader4 = DataLoader(train_dataset4, batch_size=batch_size)
eval_dataloader4 = DataLoader(eval_dataset4, batch_size=batch_size)

# Use the cross entropy loss function, and SGD
loss_fn4 = nn.CrossEntropyLoss()
optimiser4 = torch.optim.SGD(model4.parameters(), lr=1e-3)

# Train the model
epochs = 5000
for t in range(1, epochs+1):
    train(train_dataloader4, model4, loss_fn4, optimiser4)
    if t % 1000 == 0:
        print(f"Epoch {t}")
        print("----------------------------------")
        print_error(train_dataloader4, "train", model4)
        print_error(eval_dataloader4, "validation", model4)
        print("")

Epoch 1000
----------------------------------
Train error: 30.21224%
Validation error: 32.22222%

Epoch 2000
----------------------------------
Train error: 25.34332%
Validation error: 24.44444%

Epoch 3000
----------------------------------
Train error: 18.72659%
Validation error: 13.33333%

Epoch 4000
----------------------------------
Train error: 17.97753%
Validation error: 15.55555%

Epoch 5000
----------------------------------
Train error: 18.35206%
Validation error: 16.66667%

