In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [30]:
from sklearn.naive_bayes import GaussianNB

## Gaussian Naive Bayes: assumes the data is normally distributed. It is used when all features are continuous
## and cannot be represented in terms of their occurrences

## Multinomial Naive Bayes: is used when we have discrete data (e.g. movie ratings from 1 to 5, as each rating will have a certain frequency)

## Bernoulli Naive Bayes: assumes that all features are binary (e.g. 0: word does not exist in document,
##  1: word exists in document)

In [21]:
## Read the passenger records from the CSV file and preview the first three rows
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


Some of the features are not relevant to the target variable (whether the passenger survived or not). They are:

PassengerId, Name, SibSp, Parch, Ticket, Cabin, Embarked

So we drop the unwanted columns.

In [22]:
## Columns treated as not useful for predicting 'Survived'
unwanted_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']

## Rebind df instead of mutating it in place, and ignore columns that are already
## gone, so that re-running this cell does not raise a KeyError
df = df.drop(columns=unwanted_columns, errors='ignore')
df.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925


Separating Target and Input variables

In [23]:
## Feature matrix: every column except the label
inputs = df.drop(columns='Survived')
## Label vector: 1 = survived, 0 = did not survive
tgt = df['Survived']

Encoding categorical variable 'Sex' using pd.get_dummies

In [24]:
## One-hot encode only the 'Sex' column (named explicitly so that no other column
## is encoded by accident). dtype=int keeps the indicator columns as 0/1 on every
## pandas version; since pandas 2.0 the default dtype is bool (True/False).
## NOTE: with drop_first=False, Sex_male is always 1 - Sex_female, so the two
## indicator columns carry the same information.
inputs = pd.get_dummies(inputs, columns=['Sex'], drop_first=False, dtype=int)
inputs.head(10)

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1
5,3,,8.4583,0,1
6,1,54.0,51.8625,0,1
7,3,2.0,21.075,0,1
8,3,27.0,11.1333,1,0
9,2,14.0,30.0708,1,0


Checking for any Missing data

In [25]:
## Number of missing values in each feature column
inputs.isnull().sum()

Pclass          0
Age           177
Fare            0
Sex_female      0
Sex_male        0
dtype: int64

Imputing missing data in 'Age' with its mean

In [27]:
## Fill missing ages with the column mean.
## The filled column is assigned back explicitly: calling fillna(inplace=True) on
## the selection inputs['Age'] is chained assignment, which is deprecated in
## pandas 2.x and leaves `inputs` unchanged once Copy-on-Write is enabled.
## NOTE(review): the mean is computed over all rows, including those that later
## end up in the test split.
age_mean = inputs['Age'].mean()
inputs['Age'] = inputs['Age'].fillna(age_mean)
print(inputs.isna().sum())
print("\n")
print(inputs.head(10))

Pclass        0
Age           0
Fare          0
Sex_female    0
Sex_male      0
dtype: int64


   Pclass        Age     Fare  Sex_female  Sex_male
0       3  22.000000   7.2500           0         1
1       1  38.000000  71.2833           1         0
2       3  26.000000   7.9250           1         0
3       1  35.000000  53.1000           1         0
4       3  35.000000   8.0500           0         1
5       3  29.699118   8.4583           0         1
6       1  54.000000  51.8625           0         1
7       3   2.000000  21.0750           0         1
8       3  27.000000  11.1333           1         0
9       2  14.000000  30.0708           1         0


Splitting the data into training and testing data

In [40]:
## Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    tgt,
    test_size=0.2,
    random_state=11,
)

In [41]:
## Create an unfitted Gaussian Naive Bayes classifier (no arguments, so library defaults apply)
model = GaussianNB()

In [42]:
## Fit the classifier on the training split
model.fit(X=X_train, y=y_train)

GaussianNB()

In [43]:
## Score the fitted classifier on the held-out test split
model.score(X=X_test, y=y_test)

0.8268156424581006

In [45]:
## Predicted class labels for the first ten test rows (selected by position)
model.predict(X_test.iloc[:10])

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64)

In [46]:
## Actual labels for the same first ten test rows (selected by position)
y_test.iloc[:10]

431    1
821    1
629    0
626    0
665    0
582    0
250    0
780    1
728    0
259    1
Name: Survived, dtype: int64

Note that the 2nd value in y_test (record 821) is 1, i.e. the passenger survived.

However, the prediction for the same record is 0, i.e. the model predicted that the passenger did not survive.

In [48]:
## Probabilities of each class (0 and 1) for each of the first ten test points
model.predict_proba(X_test.iloc[:10])

array([[0.07694806, 0.92305194],
       [0.98688041, 0.01311959],
       [0.98708388, 0.01291612],
       [0.96236905, 0.03763095],
       [0.92010876, 0.07989124],
       [0.96368369, 0.03631631],
       [0.98705528, 0.01294472],
       [0.0593251 , 0.9406749 ],
       [0.96929682, 0.03070318],
       [0.03006244, 0.96993756]])