### Tutorial 14: Naive Bayes classifier algorithm, part 1

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Load the Titanic passenger table; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('tutorial14_titanic.csv')
# Bare expression: render the loaded frame as this cell's output.
df

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.2500,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1000,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",2,male,27.0,0,0,211536,13.0000,,S,0
887,888,"Graham, Miss. Margaret Edith",1,female,19.0,0,0,112053,30.0000,B42,S,1
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",3,female,,1,2,W./C. 6607,23.4500,,S,0
889,890,"Behr, Mr. Karl Howell",1,male,26.0,0,0,111369,30.0000,C148,C,1


In [42]:
# The Survived column is the label the classifier will learn to predict.
target = df['Survived']

# Everything listed here is removed from the feature frame (identifiers,
# free-text fields, family counts, and the label itself), which leaves
# Pclass, Sex, Age and Fare as model inputs.
# NOTE(review): the name `input` shadows Python's built-in input() for the
# rest of the notebook; it is kept because later cells refer to it.
dropped_columns = [
    'PassengerId', 'Name', 'SibSp', 'Parch',
    'Ticket', 'Cabin', 'Embarked', 'Survived',
]
input = df.drop(columns=dropped_columns)

In [43]:
# Peek at the first five labels.
target.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [44]:
# Peek at the first five rows of the feature frame.
input.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [45]:
# One-hot encode Sex: one indicator column per distinct value.
dummies = pd.get_dummies(input['Sex'])

In [46]:
# Peek at the first five rows of the indicator columns.
dummies.head(n=5)

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [47]:
# Append the indicator columns side by side with the existing features
# (rows are aligned on the shared index).
frames_to_join = [input, dummies]
input = pd.concat(frames_to_join, axis=1)

In [48]:
# Display the frame to check that the indicator columns were appended.
input

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.2500,False,True
1,1,female,38.0,71.2833,True,False
2,3,female,26.0,7.9250,True,False
3,1,female,35.0,53.1000,True,False
4,3,male,35.0,8.0500,False,True
...,...,...,...,...,...,...
886,2,male,27.0,13.0000,False,True
887,1,female,19.0,30.0000,True,False
888,3,female,,23.4500,True,False
889,1,male,26.0,30.0000,False,True


In [49]:
# The text Sex column is now redundant with the indicator columns, so remove it.
input = input.drop(columns=['Sex'])

In [50]:
# Display the frame to confirm the Sex column is gone.
input

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.2500,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.9250,True,False
3,1,35.0,53.1000,True,False
4,3,35.0,8.0500,False,True
...,...,...,...,...,...
886,2,27.0,13.0000,False,True
887,1,19.0,30.0000,True,False
888,3,,23.4500,True,False
889,1,26.0,30.0000,False,True


In [51]:
# Replace missing ages with the column mean rounded to a whole number.
# NOTE(review): the mean is taken over every row, before the train/test split
# performed later in the notebook, so test rows influence the fill value.
age_fill_value = input['Age'].mean().round()
input['Age'] = input['Age'].fillna(age_fill_value)

# Display the frame to confirm the gaps in Age were filled.
input

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.2500,False,True
1,1,38.0,71.2833,True,False
2,3,26.0,7.9250,True,False
3,1,35.0,53.1000,True,False
4,3,35.0,8.0500,False,True
...,...,...,...,...,...
886,2,27.0,13.0000,False,True
887,1,19.0,30.0000,True,False
888,3,30.0,23.4500,True,False
889,1,26.0,30.0000,False,True


In [70]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle: without it every run draws a different split,
# so the score and predictions below would change on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    input, target, test_size=0.2, random_state=42
)

In [71]:
# (rows, columns) of the training feature frame.
X_train.shape

(712, 5)

In [72]:
# (rows, columns) of the held-out test feature frame.
X_test.shape

(179, 5)

In [73]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier; no arguments are passed, so the library
# defaults are used.
model = GaussianNB()

In [74]:
# Train the classifier on the training split.
model.fit(X=X_train, y=y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [75]:
# Score the fitted model on the held-out test split.
model.score(X=X_test, y=y_test)

0.8268156424581006

In [76]:
# Predicted class for the first ten test rows (selected by position).
model.predict(X_test.iloc[:10])

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 0])

In [79]:
# Per-class probabilities for the same first ten test rows; each output row
# holds one probability per class.
model.predict_proba(X_test.iloc[:10])

array([[4.45046206e-02, 9.55495379e-01],
       [9.76333924e-01, 2.36660764e-02],
       [8.56879243e-01, 1.43120757e-01],
       [4.71478122e-03, 9.95285219e-01],
       [9.04945455e-06, 9.99990951e-01],
       [9.87029389e-01, 1.29706110e-02],
       [7.21145002e-01, 2.78854998e-01],
       [9.89473434e-01, 1.05265656e-02],
       [6.09969275e-35, 1.00000000e+00],
       [9.65305565e-01, 3.46944346e-02]])

In [80]:
# True labels for the first ten test rows, for comparison with the predictions.
y_test.iloc[:10]

327    1
20     0
487    0
329    1
297    0
67     0
96     0
497    0
737    1
582    0
Name: Survived, dtype: int64