# Preliminaries

We download the data:

In [21]:
! wget https://gitlab.com/andras.simonyi/10_days_AI_training_data/raw/65de5908dccf120762b305238e02610a8c18a3f9/titanic_train.csv

--2020-10-12 09:52:21--  https://gitlab.com/andras.simonyi/10_days_AI_training_data/raw/65de5908dccf120762b305238e02610a8c18a3f9/titanic_train.csv
Resolving gitlab.com (gitlab.com)... 172.65.251.78, 2606:4700:90:0:f22e:fbec:5bed:a9b9
Connecting to gitlab.com (gitlab.com)|172.65.251.78|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘titanic_train.csv.2’

titanic_train.csv.2     [ <=>                ]  59.76K  --.-KB/s    in 0.005s  

2020-10-12 09:52:21 (12.8 MB/s) - ‘titanic_train.csv.2’ saved [61194]



# A perceptron implementation in NumPy

We create a Perceptron class, which mimics scikit-learn estimators by providing a `fit` and a `predict` method:
+ The `predict` method returns a vector of predictions for an array of samples, while 
+ the `fit` method initializes the model parameters and trains the model using the perceptron learning rule.

In [22]:
import numpy as np
from numpy.random import permutation, seed
seed(12) # Fix the random seed

class Perceptron:
    """A simple implementation of the classic single-neuron perceptron model.

    Attrs:
        fitted (bool): Whether the model has been fitted.
        n_features (int): Number of input features.
        weights (numpy array): The model's weights.
        bias (float): The model's bias.
    """

    def __init__(self):
        """Create a perceptron model.

        Returns:
            A new perceptron instance.
        """
        self.fitted = False

    def fit(self, X, y, n_epochs=10, lr=0.01):
        """Fit the model to a data set.

        Args:
            X (numpy array of shape (n_samples, n_features)): Training data.
            y (numpy array of shape (n_samples,)): Target binary labels.
            n_epochs (int): Number of training epochs.
            lr (float): Learning rate.
        """
        n_samples, self.n_features = X.shape
        
        # Initialization
        # As we know, tha ONLY neural model that can be initialized to zeroes
        # is the Perceptron, if we use the perceptron learning rule.
        # Please initialize the weights and biases to zero!
        # Use Numpy!
        # Bear in mind, that the weights are a VECTOR, not a one dimensional matrix!
        # Bias is a scalar.
        self.weights = np.zeros(self.n_features)
        self.bias    = np.zeros(1)

        # Training
        # Implement the main loop, use the epoch parameter!
        for e in range(n_epochs):
            print("Starting epoch", e)

            # Random shuffle - in  a tricky way.
            # Generate a permutation mask with numpy 
            # which we will use to index into the data, thus realizing "shuffling".
            # Numpy has a permutation function. 
            # Please bear in mind, how many datapoints do we have. 
            # We have a variable for it above...
            perm = np.arange(n_samples)
            perm = np.random.permutation(perm)

            # Do the epoch!
            # Observe the trick, please!
            # We use the permutation mask to index into X and y,
            # and we generate and iterator of tuples.
            # Then we parse the intividual tuples (x,y pairs), and loot through them.
            for x, label in zip(X[perm], y[perm]):
                    #: you can uncomment this, if you have understood.
                    # Please convert the 1/0 label to 1/-1 label
                    # 1 should remain 1, 0 should become -1
                    # Use some easy mathematics, or whatever you feel like.
                    y_ = [-1 if label==0 else 1][0]
                    
                    # Implement the update rule!
                    val = np.dot(self.weights,x) + self.bias
                    # If the simple product of the output of our neuron and the true label is
                    # less than or equal to zero (negative), we will update.
                    # Remember, the activation is W times x plus the bias.
                    # Please bear in mind, that the "times" here is denoting the dot product!
                    if val * y_ <= 0 :
                        # So, we have an error. We should "update" the weights and the bias!
                        # We just add to the weight the multiplication of the input vector 
                        # and the true label, mutiplied by the learning rate 
                        # (we have a parameter for that).
                        self.weights += x * y_ * lr
                        # For the bias we have the "virtual input" of 1, so we only add true label 
                        # multiplied by the learning rate.
                        self.bias += y_* lr
                    
                    # Lo and behold, THAT'S IT!
                    # We have implemented a perceptron!
            
            self.fitted = True
            print("Finished training.")
        
    def predict(self, X):
        """Predict labels for samples.
        
        Args:
            X (numpy array of shape (n_samples, n_features)):  Samples.

        Returns:
            A numpy array of shape (n_samples,) containing the predicted labels.
        """
        if not self.fitted:
            raise ValueError("Perceptron model is not fitted")
        elif X.shape[1] != self.n_features:
            raise ValueError(f"Incorrect number of input features (expected {self.n_features})")
        else:
            # Please think through and explain the trainer, WHY this is a matmul!?

            # It's a MATMUL because we are using N-D array
            # We know, y = w * x so w(N X 1) array which is multiplied by x(N X d) where d is number of samples
            # We get 1 * d shaped matrix in result which is ony possible while using matmul
            #print(X.shape,self.weights.shape)
            activations = np.matmul(X, self.weights) + self.bias
            #print(activations.shape)
            signs = np.sign(activations).astype(int) 
            # Tricky conversion of signs to 0/1 labels
            return (signs + signs * signs) // 2

# Trying it out -- on the Titanic data set

Short description of the data set:

> The titanic [...] data frames describe the survival status of individual passengers on the Titanic.

> Non-obvious variables:

>- Pclass -- Passenger Class  (1 = 1st; 2 = 2nd; 3 = 3rd)
>- Survived -- Survival  (0 = No; 1 = Yes)
>- SibSp -- Number of Siblings/Spouses Aboard
>- Parch -- Number of Parents/Children Aboard
>- Fare (ticket price in British pound)
>- Embarked -- Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv("titanic_train.csv")

For the sake of simplicity, we divide our data only into a training and a validation part:

In [24]:
df_train, df_valid = train_test_split(df, test_size=0.1)
df_train.reset_index(inplace=True)
df_valid.reset_index(inplace=True)
print("train shape:", df_train.shape)
print("validation shape:", df_valid.shape)

train shape: (801, 13)
validation shape: (90, 13)


## Inspecting and cleaning the data

In [25]:
df_train.head()

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,715,716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S
1,319,320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corn...",female,40.0,1,1,16966,134.5,E34,C
2,829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,
3,79,80,1,3,"Dowdell, Miss. Elizabeth",female,30.0,0,0,364516,12.475,,S
4,484,485,1,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,B49,C


In [26]:
df_train.describe(include="all")

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,801.0,801.0,801.0,801.0,801,801,642.0,801.0,801.0,801.0,801.0,179,799
unique,,,,,801,2,,,,627.0,,133,3
top,,,,,"Bystrom, Mrs. (Karolina)",male,,,,1601.0,,B96 B98,S
freq,,,,,1,517,,,,7.0,,4,578
mean,439.566792,440.566792,0.383271,2.309613,,,29.634097,0.518102,0.373283,,32.386844,,
std,256.901656,256.901656,0.486487,0.834578,,,14.516042,1.120261,0.786915,,50.556727,,
min,0.0,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,216.0,217.0,0.0,2.0,,,20.0,0.0,0.0,,7.8958,,
50%,441.0,442.0,0.0,3.0,,,28.0,0.0,0.0,,14.4,,
75%,659.0,660.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [27]:
df_train.isna().sum()

index            0
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            159
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          622
Embarked         2
dtype: int64

Based on our inspection, we drop the Cabin, Ticket, PassengerId, Name and index columns, since they are unusable for the prediction task:

In [28]:
columns_to_drop = ["Cabin", "Ticket", "PassengerId", "Name", "index"]
df_train = df_train.drop(columns=columns_to_drop)
df_valid = df_valid.drop(columns=columns_to_drop)
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,19.0,0,0,7.65,S
1,1,1,female,40.0,1,1,134.5,C
2,1,1,female,62.0,0,0,80.0,
3,1,3,female,30.0,0,0,12.475,S
4,1,1,male,25.0,1,0,91.0792,C


We encode the the gender of passenger by numbers, and, as a primitive form of data imputation, replace missing age values with the mean age in the training data:

In [29]:
age_mean = df_train.Age.mean()
replacements = {"Sex": {"male": 1, "female":0}, "Age": {np.NaN: age_mean}}
df_train.replace(replacements, inplace=True)
df_valid.replace(replacements, inplace=True)
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,19.0,0,0,7.65,S
1,1,1,0,40.0,1,1,134.5,C
2,1,1,0,62.0,0,0,80.0,
3,1,3,0,30.0,0,0,12.475,S
4,1,1,1,25.0,1,0,91.0792,C


We drop the remaining rows with missing data:

In [30]:
print("Train and test length before dropping:", len(df_train), len(df_valid))
df_train.dropna(inplace=True)
df_train.reset_index()
df_valid.dropna(inplace=True)
df_valid.reset_index()
print("Train and test length after dropping:", len(df_train), len(df_valid))

Train and test length before dropping: 801 90
Train and test length after dropping: 799 90


We now balance the datasets

In [31]:
# Downsample majority class
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = df_train[df_train.Survived==0]
df_minority = df_train[df_train.Survived==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(df_minority),     # to match minority class
                                 random_state=0) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority]).reset_index(drop=True)
df_train = df_downsampled

Finally, we one-hot encode the Embarked column.

In [32]:
df_train = pd.get_dummies(df_train)
df_valid = pd.get_dummies(df_valid)
df_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,2,1,29.634097,0,0,0.0,0,0,1
1,0,2,0,38.0,0,0,13.0,0,0,1
2,0,3,0,29.634097,8,2,69.55,0,0,1
3,0,3,1,29.634097,0,0,8.05,0,0,1
4,0,2,1,29.0,1,0,21.0,0,0,1


## Fitting a perceptron

In [33]:
p = Perceptron()

input_cols = list(df_train.columns.values)[1:]

X_train = df_train[input_cols].to_numpy()
y_train = df_train.Survived.to_numpy()

p.fit(X_train, y_train, n_epochs=100, lr=0.01)

Starting epoch 0
Finished training.
Starting epoch 1
Finished training.
Starting epoch 2
Finished training.
Starting epoch 3
Finished training.
Starting epoch 4
Finished training.
Starting epoch 5
Finished training.
Starting epoch 6
Finished training.
Starting epoch 7
Finished training.
Starting epoch 8
Finished training.
Starting epoch 9
Finished training.
Starting epoch 10
Finished training.
Starting epoch 11
Finished training.
Starting epoch 12
Finished training.
Starting epoch 13
Finished training.
Starting epoch 14
Finished training.
Starting epoch 15
Finished training.
Starting epoch 16
Finished training.
Starting epoch 17
Finished training.
Starting epoch 18
Finished training.
Starting epoch 19
Finished training.
Starting epoch 20
Finished training.
Starting epoch 21
Finished training.
Starting epoch 22
Finished training.
Starting epoch 23
Finished training.
Starting epoch 24
Finished training.
Starting epoch 25
Finished training.
Starting epoch 26
Finished training.
Starting ep

In [34]:
y_train_predicted = p.predict(X_train)

X_valid = df_valid[input_cols].to_numpy()
y_valid = df_valid.Survived.to_numpy()

y_valid_predicted = p.predict(X_valid)

Let's see the metrics on our training and validation data:

In [35]:
from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_predicted, labels=[1], target_names=["Survivor"]))

              precision    recall  f1-score   support

    Survivor       0.80      0.76      0.78       305

   micro avg       0.80      0.76      0.78       305
   macro avg       0.80      0.76      0.78       305
weighted avg       0.80      0.76      0.78       305



In [36]:
print(classification_report(y_valid, y_valid_predicted, labels=[1], target_names=["Survivor"]))

              precision    recall  f1-score   support

    Survivor       0.69      0.71      0.70        35

   micro avg       0.69      0.71      0.70        35
   macro avg       0.69      0.71      0.70        35
weighted avg       0.69      0.71      0.70        35

