# Naive Bayes Tutorial Part 1: Predicting Survival on the Titanic

In [9]:
import pandas as pd

# Load the passenger table from the CSV file located next to this notebook.
df = pd.read_csv("tested.csv")
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
# Remove identifier / free-text columns that are not used as model features.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(columns=unused_columns, errors='ignore')
df.head(7)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,34.5,7.8292
1,1,3,female,47.0,7.0
2,0,2,male,62.0,9.6875
3,0,3,male,27.0,8.6625
4,1,3,female,22.0,12.2875
5,0,3,male,14.0,9.225
6,1,3,female,30.0,7.6292


In [11]:
# Sanity check: row count and number of columns left after the drop above.
df.shape

(418, 5)

In [12]:
# Separate the label from the predictors: `target` is the Survived column,
# `inputs` is every remaining column.
target = df['Survived']
inputs = df.drop(columns='Survived')

In [13]:
# Display the feature frame (pandas truncates the rendering to the first and last rows).
inputs

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0000
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875
...,...,...,...,...
413,3,male,,8.0500
414,1,female,39.0,108.9000
415,3,male,38.5,7.2500
416,3,male,,8.0500


In [14]:
# Count missing values per feature column.
inputs.isna().sum()

Pclass     0
Sex        0
Age       86
Fare       1
dtype: int64

In [15]:
# Encode Sex as an integer code so the column can be used as a numeric feature.
sex_codes = {'male': 1, 'female': 2}
# Guard against re-running the cell: once the column is numeric, mapping it
# again would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(inputs['Sex']):
    inputs['Sex'] = inputs['Sex'].map(sex_codes)
inputs

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,34.5,7.8292
1,3,2,47.0,7.0000
2,2,1,62.0,9.6875
3,3,1,27.0,8.6625
4,3,2,22.0,12.2875
...,...,...,...,...
413,3,1,,8.0500
414,1,2,39.0,108.9000
415,3,1,38.5,7.2500
416,3,1,,8.0500


In [16]:
# Look at the first ten Age values.
inputs['Age'].head(10)

0    34.5
1    47.0
2    62.0
3    27.0
4    22.0
5    14.0
6    30.0
7    26.0
8    18.0
9    21.0
Name: Age, dtype: float64

In [17]:
# Missing values per column right before imputation.
inputs.isna().sum()

Pclass     0
Sex        0
Age       86
Fare       1
dtype: int64

In [18]:

# Fill missing ages with the column mean.
# NOTE(review): the mean is computed over all rows, before the train/test
# split further down, so held-out rows contribute to the imputed value.
mean_age = inputs['Age'].mean()
inputs['Age'] = inputs['Age'].fillna(mean_age)
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,34.5,7.8292
1,3,2,47.0,7.0
2,2,1,62.0,9.6875
3,3,1,27.0,8.6625
4,3,2,22.0,12.2875


In [19]:

# Fill the missing fare with the column mean.
# NOTE(review): the mean is computed over all rows, before the train/test
# split further down, so held-out rows contribute to the imputed value.
mean_fare = inputs['Fare'].mean()
inputs['Fare'] = inputs['Fare'].fillna(mean_fare)
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,1,34.5,7.8292
1,3,2,47.0,7.0
2,2,1,62.0,9.6875
3,3,1,27.0,8.6625
4,3,2,22.0,12.2875


In [20]:
# Shape check: filling missing values should leave the row and column counts unchanged.
inputs.shape

(418, 4)

In [21]:
# Verify that no missing values remain after imputation.
inputs.isna().sum()

Pclass    0
Sex       0
Age       0
Fare      0
dtype: int64

In [22]:
# Inspect the label Series (the Survived column) that the model will predict.
target

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Fixing random_state makes the split
# (and therefore every score and prediction below) reproducible across kernel
# restarts; without it each run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)

In [24]:
# Size of the training split (the rows not held out by test_size=0.2).
X_train.shape

(334, 4)

In [25]:
# Size of the held-out test split.
X_test.shape

(84, 4)

In [26]:
# Display the held-out rows; the index labels show that the split shuffled the original row order.
X_test

Unnamed: 0,Pclass,Sex,Age,Fare
396,3,1,24.00000,7.2500
184,1,2,27.00000,247.5208
258,2,2,19.00000,13.0000
127,3,2,30.27259,23.2500
363,3,1,27.00000,8.6625
...,...,...,...,...
61,2,1,32.00000,13.5000
110,2,1,41.00000,15.0458
232,3,1,21.00000,6.4958
10,3,1,30.27259,7.8958


In [27]:
from sklearn.naive_bayes import GaussianNB
# GaussianNB models each feature as normally distributed within each class.
# NOTE(review): Pclass and Sex are discrete codes here, so that assumption is
# only approximate for those two columns.
model = GaussianNB()

In [28]:

# Fit the classifier on the training split only; the test split stays unseen until scoring.
model.fit(X_train,y_train)


In [29]:
# Mean accuracy on the held-out split.
# NOTE(review): a perfect 1.0 is suspicious. In every row displayed in this
# notebook Survived is 1 exactly when Sex is female, so the label may be fully
# determined by Sex in tested.csv — confirm against the full dataset before
# trusting this score as a measure of model quality.
model.score(X_test,y_test)

1.0

In [30]:

# First ten held-out feature rows, used for the spot check below.
X_test.head(10)

Unnamed: 0,Pclass,Sex,Age,Fare
396,3,1,24.0,7.25
184,1,2,27.0,247.5208
258,2,2,19.0,13.0
127,3,2,30.27259,23.25
363,3,1,27.0,8.6625
47,3,1,30.27259,7.75
270,1,1,46.0,75.2417
84,2,1,30.27259,10.7083
75,1,1,32.5,211.5
162,2,2,26.0,13.5


In [31]:
# True labels for the first ten held-out rows, to compare with the model's predictions.
y_test.head(10)

396    0
184    1
258    1
127    1
363    0
47     0
270    0
84     0
75     0
162    1
Name: Survived, dtype: int64

In [32]:
# Predicted labels for the first ten held-out rows.
model.predict(X_test.head(10))

array([0, 1, 1, 1, 0, 0, 0, 0, 0, 1], dtype=int64)