In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
%matplotlib inline

In [3]:
# Load the Titanic training CSV; the path is relative to the notebook's working directory.
TRAIN_CSV = "../Week_1/Titanic_Demo/all/train.csv"
data = pd.read_csv(TRAIN_CSV)

In [5]:
# Encode the two categorical columns as integer codes.
# Sex: "male" -> 0, every other value (including a missing one) -> 1.
data["Sex_cleaned"] = np.where(data["Sex"].eq("male"), 0, 1)

# Embarked: "S" -> 0, "C" -> 1, "Q" -> 2. Anything else, including NaN, falls
# through to the default code 3, so this column itself never contains NaN.
embarked = data["Embarked"]
data["Embarked_cleaned"] = np.select(
    [embarked == "S", embarked == "C", embarked == "Q"],
    [0, 1, 2],
    default=3,
)

In [6]:
# Keep only the target and the model features, then drop every row that has a
# missing value in any of the kept columns.
data=data[[
    "Survived",
    "Pclass",
    "Sex_cleaned",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked_cleaned"
]].dropna(axis=0, how='any')

# Split the cleaned rows 50/50 into a training frame and a test frame. Both
# frames still carry the "Survived" target column next to the features.
# The seed is fixed (instead of derived from the wall clock) so that
# Restart & Run All yields the same split and the same downstream numbers.
SPLIT_SEED = 42
X_train, X_test = train_test_split(data, test_size=0.5, random_state=SPLIT_SEED)

In [7]:
# Columns passed to the model as features; the "Survived" target is not among them.
used_features = [
    "Pclass", "Sex_cleaned", "Age", "SibSp", "Parch", "Fare", "Embarked_cleaned",
]

# Gaussian Naive Bayes classifier, created with its default arguments.
gnb = GaussianNB()

In [8]:
# Fit the classifier on the training split, then predict labels for the test split.
# Both calls receive DataFrames on purpose: fitting on a bare ndarray (`.values`)
# and then predicting on a DataFrame makes scikit-learn warn that the model was
# fitted without feature names.
gnb.fit(
    X_train[used_features],
    X_train["Survived"]
)
y_pred = gnb.predict(X_test[used_features])

In [13]:
# Report how many test rows were misclassified and the resulting accuracy.
n_test = X_test.shape[0]
n_mislabeled = (X_test["Survived"] != y_pred).sum()
accuracy_pct = 100*(1-n_mislabeled/n_test)
print(
    f'Number of mislabeled points out of a total {n_test} '
    f'points: {n_mislabeled}, '
    f'performance {accuracy_pct:05.2f}%'
)

Number of mislabeled points out of a total 357 points: 95, performance 73.39%


# Illustration with 1 feature

Let’s restrict the classification to the Fare feature only. First we compute the prior probabilities P(Survival = 1) and P(Survival = 0):

In [16]:
# Class priors from the training split: the mean of the "Survived" column is
# used as P(Survival = 1), and its complement as P(Survival = 0).
mean_survival = np.mean(X_train["Survived"])
mean_not_survival = 1 - mean_survival
print(
    f'Survival prob = {100*mean_survival:03.2f}%, '
    f'Not survival prob = {100*mean_not_survival:03.2f}%'
)

Survival prob = 36.69%, Not survival prob = 63.31%


Then, according to formula 3, we just need the probability density functions P(fare | Survival = 0) and P(fare | Survival = 1). Since we chose the Gaussian Naive Bayes classifier, we assume that both of these distributions are Gaussian.

$$
f(x \mid \mu, \sigma) = \frac{1}{\sqrt{2\pi\sigma^2}} e^{-\frac{(x - \mu)^2}{2\sigma^2}}
$$

Formula 5: Gaussian formula (σ: standard deviation / μ: mean)

Next we compute the mean and the standard deviation of the Fare values separately for each Survival class. We obtain the following results:

In [18]:
# Fare values of the training split, separated by the value of "Survived".
fare_survived = X_train.loc[X_train["Survived"] == 1, "Fare"]
fare_not_survived = X_train.loc[X_train["Survived"] == 0, "Fare"]

# Per-class mean and standard deviation of the fare. np.std is kept (rather
# than Series.std) so the ddof stays at NumPy's default of 0.
mean_fare_survived = np.mean(fare_survived)
std_fare_survived = np.std(fare_survived)
mean_fare_not_survived = np.mean(fare_not_survived)
std_fare_not_survived = np.std(fare_not_survived)

for stat_name, stat_value in (
    ("mean_fare_survived", mean_fare_survived),
    ("std_fare_survived", std_fare_survived),
    ("mean_fare_not_survived", mean_fare_not_survived),
    ("std_fare_not_survived", std_fare_not_survived),
):
    print(f"{stat_name} = {stat_value:03.2f}")

mean_fare_survived = 52.38
std_fare_survived = 67.33
mean_fare_not_survived = 22.86
std_fare_not_survived = 33.01


$$