# Naive Bayes

<img src=nb1.png height=400 width=600>
<img src=nb2.png height=400 width=600>
<img src=nb3.png height=400 width=600>
<img src=nb4.png height=400 width=600>
<img src=nb5.png height=400 width=600>
<img src=nb6.png height=400 width=600>
<img src=nb7.png height=400 width=600>
<img src=nb10.png height=400 width=600>

# Implementation with Titanic Dataset

<img src=nb8.png height=400 width=600>
<img src=nb9.png height=400 width=600>

In [138]:
import pandas as pd

In [139]:
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3.0,male,22.0,1.0,0.0,A/5 21171,7.25,,S,0.0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1.0,female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,1.0
2,3,"Heikkinen, Miss. Laina",3.0,female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,1.0
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1.0,female,35.0,1.0,0.0,113803,53.1,C123,S,1.0
4,5,"Allen, Mr. William Henry",3.0,male,35.0,0.0,0.0,373450,8.05,,S,0.0


In [140]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'],axis='columns',inplace=True)
df.head()          # Dropping Unnecessary columns

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3.0,male,22.0,7.25,0.0
1,1.0,female,38.0,71.2833,1.0
2,3.0,female,26.0,7.925,1.0
3,1.0,female,35.0,53.1,1.0
4,3.0,male,35.0,8.05,0.0


In [141]:
inputs = df.drop('Survived',axis='columns')
target = df[['Survived']]

In [142]:
dummies = pd.get_dummies(inputs.Sex)     # Using dummy columns for sex column
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [143]:
inputs = pd.concat([inputs,dummies],axis='columns')   # Concating this two dataframe
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3.0,male,22.0,7.25,0,1
1,1.0,female,38.0,71.2833,1,0
2,3.0,female,26.0,7.925,1,0


In [144]:
inputs.drop(['Sex','male'],axis='columns',inplace=True)   # Dropping sex column
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female
0,3.0,22.0,7.25,0
1,1.0,38.0,71.2833,1
2,3.0,26.0,7.925,1


In [145]:
inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs.Pclass = inputs.Pclass.fillna(inputs.Pclass.mean())
inputs.Fare = inputs.Fare.fillna(inputs.Fare.mean())
target.Survived = target.Survived.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [146]:
inputs.isna().sum()

Pclass    0
Age       0
Fare      0
female    0
dtype: int64

In [147]:
target.isna().sum()

Survived    0
dtype: int64

In [148]:
inputs.shape

(418, 4)

In [149]:
target.shape

(418, 1)

In [162]:
from sklearn.model_selection import train_test_split         # Splitting
X_train, X_test, y_train, y_test = train_test_split(inputs,target,test_size=0.2)

In [163]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()                  # Gaussian Naive Bayes Model

In [164]:
X_train.shape

(334, 4)

In [165]:
y_train.isnull().sum()

Survived    0
dtype: int64

In [166]:
model.fit(X_train,y_train)             # Training

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None, var_smoothing=1e-09)

In [167]:
model.score(X_test,y_test)           # Accuracy

0.7738095238095238

In [168]:
model.predict(X_test[0:10])         # Prediction

array([0., 1., 0., 1., 1., 0., 1., 1., 0., 1.])

In [169]:
model.predict_proba(X_test[:10])

array([[0.9618662 , 0.0381338 ],
       [0.22107928, 0.77892072],
       [0.96309482, 0.03690518],
       [0.22091319, 0.77908681],
       [0.40144006, 0.59855994],
       [0.86980065, 0.13019935],
       [0.00394054, 0.99605946],
       [0.27166419, 0.72833581],
       [0.93908757, 0.06091243],
       [0.33106106, 0.66893894]])

In [170]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.80597015, 0.73134328, 0.86567164, 0.8358209 , 0.75757576])