In [60]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


# Loading Data

In [61]:
import pandas as pd

df = pd.read_csv('/kaggle/input/titanic/train.csv')

## Display data

In [62]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Data Cleaning

There are some missing values in the csv file. Pandas will put a NaN in the missing locations.

In [63]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Next, we need to replace the NaN with something. A common used way is to replace them with mode values.

In [64]:
modes = df.mode().iloc[0]
modes

PassengerId                      1
Survived                       0.0
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [65]:
df.fillna(modes, inplace=True)

We can now check there's no missing values left:

In [66]:
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

The summary of the dataset:

In [67]:
import numpy as np

df.describe(include=(np.number))

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,28.56697,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.199572,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,24.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Print the summary of non-numeric columns in the dataset.

In [68]:
df.describe(include=[object])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,891,891
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,691,646


## Data Preprocessing

Obviously, we can not multiply non-numeric values by coefficients, so we need to replace those with numbers.

We do that by creating new columns containing dummy variables. A dummy variable is a column that contains a 1 where a particular column contains a particular value, or a 0 otherwise.

For instance, we could create a dummy variable for `Sex='male'`, which would be a new column containing 1 for rows where Sex is 'male', and 0 for rows where it isn't.

Pandas can create these automatically using `get_dummies`, which also remove the original columns. We'll create dummy variables for `Pclass`, even although it's numeric, since the numbers 1, 2, and 3 correspond to first, second, and third class cabins - not to counts or measures that make sense to multiply by. We'll also create dummies for `Sex` and `Embarked` since we'll want to use those as predictors in our model. On the other hand, `Cabin`, `Name`, and `Ticket` have too many unique values for it to make sense creating dummy variables for them.

In [69]:
df = pd.get_dummies(df, columns=["Sex","Pclass","Embarked"])
df.columns

Index(['PassengerId', 'Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

Let's look at some of the added columns and values.

In [70]:
added_cols = ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
df[added_cols] = df[added_cols].astype(int)
df[added_cols].head()

Unnamed: 0,Sex_male,Sex_female,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,1,0,0,0,1,0,0,1
1,0,1,1,0,0,1,0,0
2,0,1,0,0,1,0,0,1
3,0,1,1,0,0,0,0,1
4,1,0,0,0,1,0,0,1


The target variable (label) is `Survived` and the others are as model input.

In [71]:
import torch

Y = torch.tensor(df.Survived)

indep_cols = ['Age', 'SibSp', 'Parch', 'Fare'] + added_cols
X = torch.tensor(df[indep_cols].values, dtype=torch.float)

print('Input shape: ', X.shape)
print('Target shape', Y.shape)

Input shape:  torch.Size([891, 12])
Target shape torch.Size([891])


# Building a linear model

$$
Y = WX
$$

In [72]:
torch.manual_seed(0)

num_features = X.shape[1]
W = torch.rand(num_features)-0.5

print('W shape: ', W.shape)
print(W)

W shape:  torch.Size([12])
tensor([-0.0037,  0.2682, -0.4115, -0.3680, -0.1926,  0.1341, -0.0099,  0.3964,
        -0.0444,  0.1323, -0.1511, -0.0983])


In [73]:
pred = torch.matmul(W, X.T)

print('pred shape: ', pred.shape)
print(pred[:10])

pred shape:  torch.Size([891])
tensor([ -2.8171, -25.8476,  -3.0221, -19.3761,  -3.4284,  -3.5903, -19.5867,
         -7.7045,  -5.0294, -10.1865])


The model weights are generated randomly. So the current predictions are not going to be any use. To train the linear model, we need to define the loss firstly. Here, we simply use the absolute error as the loss.

In [74]:
loss = torch.abs(pred - Y).mean()

print('loss: ', loss)

loss:  tensor(12.4397)


Leverage PyTorch to calculate the gradients. We need to call the function `requires_grad_` before calculating loss.

In [75]:
W.requires_grad_()

tensor([-0.0037,  0.2682, -0.4115, -0.3680, -0.1926,  0.1341, -0.0099,  0.3964,
        -0.0444,  0.1323, -0.1511, -0.0983], requires_grad=True)

In [76]:
pred = torch.matmul(W, X.T)
loss = torch.abs(pred - Y).mean()

loss.backward()

print(W.grad)

tensor([-28.2437,  -0.5230,  -0.3816, -32.2042,  -0.6341,  -0.3524,  -0.2424,
         -0.1930,  -0.5511,  -0.1886,  -0.0864,  -0.7116])


Notice that the gradients will be accumulated by default. Therefore, if we don't want the gradients to be accumulated, we should clear the gradients by call the `zero_` function of the gradients object after updating the model parameters.

In [77]:
learning_rate = 0.1
with torch.no_grad():
    W.sub_(W.grad * learning_rate)
    W.grad.zero_()

# Model Training

Before model training, we should split the dataset into two parts: one for training (training set) and one for validation (validation set). Training set is used to train the model, and the validation set is used to evaluate the performance of the current model.

It is not resonable to evaluate the model according to the performance of the model on the training set since the model has seen the training set. It is better to use another dataset that the model has not seen. That's why we split the dataset into training set and validation set.

In [78]:
from fastai.data.transforms import RandomSplitter

trn_split, val_split = RandomSplitter(seed=0, valid_pct=0.2)(df)
trn_X,val_X = X[trn_split],X[val_split]
trn_Y,val_Y = Y[trn_split],Y[val_split]

vals, indices = trn_X.max(dim=0)
trn_X = trn_X / vals
vals, indices = val_X.max(dim=0)
val_X = val_X / vals

print('#Training set: ', len(trn_X))
print('#Validation set: ', len(val_X))

#Training set:  713
#Validation set:  178


Start training now! Let's define some functions:

In [79]:
def predict(W, x):
    return torch.matmul(W, x.T)

def loss_fn(pred, target):
    return torch.abs(pred - target).mean()

def optimize(W, learning_rate):
    W.sub_(W.grad * learning_rate)
    W.grad.zero_()

def one_epoch(W, x, y, learning_rate):
    pred = predict(W, x)
    loss = loss_fn(pred, y)
    loss.backward()
    with torch.no_grad():
        optimize(W, learning_rate)
    return loss.item()

def init_w(num_features):
    W = torch.rand(num_features)-0.5
    W.requires_grad_()
    return W

def train_model(X, Y, epochs=50, lr=0.0005):
    torch.manual_seed(0)
    W = init_w(X.shape[1])
    for i in range(epochs):
        loss = one_epoch(W, X, Y, learning_rate=lr)
    print('Final loss: ', loss)
    return W

Then we can train a model by calling the `train_model` function.

In [97]:
W = train_model(trn_X, trn_Y, 1000, 0.005)

Final loss:  0.2253539115190506


# Evaluation

Use the model to predict the results on the validation set.

In [81]:
pred = predict(W, val_X)
loss = loss_fn(pred, val_Y)

results = val_Y.bool() == (pred > 0.5)

print('Loss: ', loss)
print('Pred: ', pred[:10])
print('Target: ', val_Y[:10])
print('Accuracy: ', results.float().mean().item())

Loss:  tensor(0.2580, grad_fn=<MeanBackward0>)
Pred:  tensor([-0.0070, -0.0012, -0.0353, -0.0821,  0.9842, -0.1535, -0.1110,  1.0112,
         0.9232,  0.0073], grad_fn=<SliceBackward0>)
Target:  tensor([0, 0, 0, 0, 1, 1, 0, 0, 1, 1])
Accuracy:  0.7696629166603088


# Submitting to Kaggle

In [89]:
tst_df = pd.read_csv('/kaggle/input/titanic/test.csv')

In this case, it turns out that the test set is missing Fare for one passenger. We'll just fill it with 0 to avoid problems:

In [90]:
tst_df['Fare'] = tst_df.Fare.fillna(0)

Now we can just copy the same steps we did to our training set and do the same exact things on our test set to preprocess the data:

In [91]:
tst_df.fillna(modes, inplace=True)
tst_df = pd.get_dummies(tst_df, columns=["Sex","Pclass","Embarked"])
tst_df[added_cols] = tst_df[added_cols].astype(int)
tst_X = torch.tensor(tst_df[indep_cols].values, dtype=torch.float)
vals, indices = tst_X.max(dim=0)
tst_X = tst_X / vals

Let's calculate our predictions of which passengers survived in the test set:

In [92]:
tst_df['Survived'] = (predict(tst_X, W) > 0.5).int()

The sample submission on the Kaggle competition site shows that we're expected to upload a CSV with just `PassengerId` and `Survived`, so let's create that and save it:

In [93]:
sub_df = tst_df[['PassengerId','Survived']]
sub_df.to_csv('submission.csv', index=False)

We can check the first few rows of the file to make sure it looks reasonable:

In [94]:
!head submission.csv

PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1


When you click "Submit" in Kaggle, and wait for the notebook to run, you'll see that your results have been submitted to the competition.