In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import torch 
import torch.nn as nn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        fullPath = os.path.join(root, name)
        print(fullPath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Some Helper Functions

In [None]:
def printUniqueCounts(df, columnName: str = None):
    """Print every distinct value of a column with its number of occurrences.

    Args:
        df: A pandas DataFrame, or a Series when ``columnName`` is omitted.
        columnName: Name of the column of ``df`` to summarise. When ``None``,
            ``df`` itself is used as the column.

    Prints one ``value: count`` line per distinct value, highest count first.
    Missing values are counted with ``isnull()``.
    """
    column = df if columnName is None else df[columnName]

    counts = []
    for title in column.unique():
        # NaN never compares equal to itself, so missing values cannot be
        # counted with ``==`` and need isnull() instead.
        matches = column.isnull() if pd.isna(title) else column == title
        # Vectorised sum rather than iterating the Series element by element.
        counts.append((title, int(matches.sum())))

    # list.sort is stable, so ties keep their first-appearance order.
    counts.sort(key=lambda item: item[1], reverse=True)

    for title, count in counts:
        print(str(title) + ": " + str(count))

In [None]:
# Load the Titanic training set, report its (rows, columns) shape and preview it.
trainCsvPath = '/kaggle/input/titanic/train.csv'
trainData = pd.read_csv(trainCsvPath)
print(trainData.shape)
trainData.head()


Now it's time to clean up the data for use with our ML algorithm and do some feature engineering.

First lets see which columns have NaN / null values:

In [None]:
# Report every column that contains at least one NaN / null entry.
for name in trainData.columns:
    missing = int(trainData[name].isnull().sum())
    if missing > 0:
        print(f"{name} has {missing} missing data points")

We will fill in missing age values with average age for now. In the future we may want to infer the age from the other columns, but for now this will have to suffice.

In [None]:
# Fill in missing age values with the average age.
avgAge = trainData["Age"].mean()
print("Average age: " + str(avgAge))
# fillna returns a new Series instead of modifying the column in place, so the
# result has to be assigned back; otherwise the NaNs silently stay in trainData.
trainData["Age"] = trainData["Age"].fillna(avgAge)
trainData["Age"]

To gain a better idea of how to handle the missing values in embarked, lets first see how many people embarked from each port:

In [None]:
# Show how many passengers are recorded for each value of "Embarked".
printUniqueCounts(trainData, columnName="Embarked")

There are only 3 ports of embarkation, which makes this feature a good candidate for one-hot encoding. Missing values will simply be recorded as 'False' for all embarkation ports.

In [None]:
# One-hot encode "Embarked": the column is replaced by one indicator column
# per distinct value.
embarkedColumns = ["Embarked"]
trainData = pd.get_dummies(data=trainData, columns=embarkedColumns)
trainData.head()

The remaining column with missing data is Cabin. This is really two features: cabin area (the letter) and cabin number (the integer). For the sake of simplicity, lets assume the cabin number is not useful, but that the cabin area is. This is a reasonable assumption because the cabin area is likely to have a large effect on the location of the passenger, whereas the cabin number is less likely to have a large effect.

In [None]:

def getCabinLetter(cabin: str):
    """Return the leading letter(s) of a cabin string, or None.

    Args:
        cabin: Raw cabin value, e.g. ``"C85"``. Non-string values (such as
            ``None`` or a float NaN) are accepted and yield ``None``.

    Returns:
        The run of ASCII letters at the start of ``cabin``, or ``None`` when
        ``cabin`` is not a string or does not start with a letter.
    """
    if not isinstance(cabin, str):
        return None
    # "+" rather than "*": with "*" the pattern always matches (possibly the
    # empty string), so a cabin without a leading letter came back as ""
    # instead of None and the no-match branch could never run.
    match = re.match(r"[a-zA-Z]+", cabin)
    return match[0] if match else None
    
# Reduce every cabin entry to its leading letter and print the distribution.
cabinColumn = trainData["Cabin"]
cabinLetter = cabinColumn.apply(getCabinLetter)

print("Cabin Letters")
printUniqueCounts(cabinLetter)


We can see there are only 8 valid options for cabin letter. Being a small number this is a good candidate for one-hot encoding, again solving the missing data issue.

In [None]:
# Swap the raw cabin strings for their letter, then one-hot encode that letter.
trainData = pd.get_dummies(
    trainData.assign(Cabin=cabinLetter),
    columns=["Cabin"],
)
trainData.head()

That covers all the missing data, so now we just need to deal with non-numeric data. Lets start by converting Male/Female and True/False to 0/1

In [None]:
# Encode sex as a flag: True for "male", False otherwise.
trainData["Sex"] = trainData["Sex"] == "male"

# Convert only the boolean columns to 0/1. Multiplying the whole frame by 1
# also multiplied the free-text columns, which only works as long as every
# cell in them happens to support `* 1`.
boolColumns = trainData.select_dtypes(include="bool").columns
trainData = trainData.astype({name: "int64" for name in boolColumns})

printUniqueCounts(cabinLetter)
# The preview must be the final expression of the cell, otherwise the notebook
# evaluates it and throws the result away without rendering it.
trainData.head()

Name and Ticket remain as non-numeric data. We probably could do some NLP to make useful inferences out of the names, but that would be a lot of work and likely would not yield a big difference in results. Ticket is also unlikely to be very useful info. So let's just delete both columns, along with PassengerId.

In [None]:
# Remove the columns that are not passed on to the model.
columnsToDrop = ["Name", "Ticket", "PassengerId"]
trainData = trainData.drop(columns=columnsToDrop)
trainData.head()

Finally, let's do a little bit of feature engineering. Most of these features look pretty useful as is, but SibSp and Parch stand out as potentially useful targets for a little bit of engineering.
SibSp: siblings and spouses
Parch: parents and children

We can combine these two to get family size. Total family size might be an important metric so lets make an additional column for family size 

In [None]:

# Family size = SibSp + Parch, appended as the last column.
famSize = trainData["SibSp"] + trainData["Parch"]
trainData.insert(loc=len(trainData.columns), column="FamSize", value=famSize)
trainData.head()

Finally we have to normalize the data. Lets use zscore normalization on Age and Fare:
**temporarily dropping Age to fix bugs**

In [None]:
from scipy.stats import zscore

# Replace Age and Fare by their z-scores, column by column.
colsToNormalize = ["Age", "Fare"]
zScored = trainData[colsToNormalize].apply(zscore)
trainData[colsToNormalize] = zScored

# Split into target and features. Age is left out of the features for now.
# NOTE(review): zscore presumably propagates NaN by default, so Age is only
# usable here once its missing values have been filled - confirm.
trainDataY = trainData["Survived"]
trainDataX = trainData.drop(columns=["Survived", "Age"])
trainDataX.head()




Now we have prepared the data for our ML algorithm.

Lets create a model with 3 hidden layers, of 20, 8 and 5 neurons. Each hidden layer will have a ReLU activation function. The final output neuron will have a sigmoid activation function.
For the loss function we will use binary cross entropy.

In [None]:
inCount = trainDataX.shape[1]  # number of inputs
layerWeights = []
layerBiases = []

relu = nn.ReLU()
sigmoid = nn.Sigmoid()
# Each entry is [node count, activation function].
layers = [[20, relu], [8, relu], [5, relu], [1, sigmoid]]
loss = nn.BCELoss()
learningRate = 0.01

lastNodeCount = inCount
for nodeCount, _activation in layers:
    # Zero-mean weights scaled by sqrt(2 / fan_in) (He-style initialisation).
    # torch.rand draws from [0, 1), i.e. all-positive weights and biases: the
    # pre-activations then grow with every layer and the final sigmoid
    # saturates at 1, leaving essentially no gradient to learn from.
    scale = (2.0 / lastNodeCount) ** 0.5
    weights = torch.randn((lastNodeCount, nodeCount)) * scale
    # Enable gradients only after scaling so the stored tensor is a leaf and
    # receives a .grad during backward().
    layerWeights.append(weights.requires_grad_())
    layerBiases.append(torch.zeros(nodeCount, requires_grad=True))
    lastNodeCount = nodeCount


def forward(X):
    """Run a batch of inputs through every layer and return the final output.

    Each layer computes ``activation(A @ weights + biases)`` using the
    module-level ``layers``, ``layerWeights`` and ``layerBiases`` lists.

    Args:
        X: Input tensor whose last dimension matches the number of rows of
            the first weight matrix.

    Returns:
        The activations produced by the last layer.
    """
    A = X
    for (_, activation), weights, biases in zip(
        layers, layerWeights, layerBiases
    ):
        z = torch.matmul(A, weights) + biases
        A = activation(z)
    # Without this return the function fell off the end and yielded None.
    return A
        
        
# Convert features and labels to float32 tensors. to_numpy(dtype=...) forces a
# numeric array even if some feature columns are stored as booleans.
X = torch.tensor(trainDataX.to_numpy(dtype="float32"), dtype=torch.float32)
Y = torch.tensor(trainDataY.to_numpy(dtype="float32"), dtype=torch.float32)

# The final layer has a single neuron, so forward() yields shape (N, 1);
# BCELoss requires the target to have the same shape as the prediction.
targets = Y.reshape(-1, 1)

losses = []
iterations = 100
for epoch in range(iterations):
    yHat = forward(X)
    # BCELoss expects (prediction, target), in that order.
    l = loss(yHat, targets)
    l.backward()

    # Plain gradient-descent step. layerWeights / layerBiases are Python lists
    # of tensors, so every tensor is updated and its gradient reset on its own.
    with torch.no_grad():
        for param in layerWeights + layerBiases:
            param -= learningRate * param.grad
            param.grad.zero_()

    # Keep a plain float so each epoch's autograd graph can be released.
    losses.append(l.item())
    print(epoch, losses[-1])


print(losses)