# Naive Bayes Classifier

## Titanic

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [4]:
df.drop(['PassengerId','Name','SibSp','Parch','Ticket','Cabin','Embarked'], axis='columns', inplace=True)

In [5]:
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [7]:
inputs=df.drop('Survived',axis='columns')
target=df.Survived

In [10]:
dummies=pd.get_dummies(inputs.Sex, dtype=int)
dummies

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [11]:
inputs=pd.concat([inputs,dummies], axis='columns')
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


In [12]:
inputs.drop('Sex', axis='columns', inplace=True)
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0


In [15]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [16]:
inputs.Age=inputs.Age.fillna(inputs.Age.mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [20]:
from sklearn.model_selection import train_test_split


In [21]:
X_train, X_test, Y_train, Y_test= train_test_split(inputs, target, test_size=0.2)

In [22]:
len(X_train)

712

In [23]:
len(X_test)

179

In [24]:
from sklearn.naive_bayes import GaussianNB

In [25]:
model=GaussianNB()

In [26]:
model.fit(X_train, Y_train)

In [27]:
model.score(X_test, Y_test)

0.8100558659217877

In [29]:
model.predict_proba(X_test)

array([[9.73844132e-01, 2.61558684e-02],
       [9.76721430e-01, 2.32785700e-02],
       [1.18142951e-03, 9.98818570e-01],
       [8.15924033e-01, 1.84075967e-01],
       [9.87023504e-01, 1.29764957e-02],
       [9.72094977e-01, 2.79050228e-02],
       [9.87779356e-01, 1.22206441e-02],
       [4.75944978e-02, 9.52405502e-01],
       [4.74770854e-02, 9.52522915e-01],
       [9.73927489e-01, 2.60725109e-02],
       [9.80495778e-01, 1.95042220e-02],
       [9.88954756e-01, 1.10452438e-02],
       [4.19048331e-02, 9.58095167e-01],
       [8.78325964e-01, 1.21674036e-01],
       [1.08718918e-07, 9.99999891e-01],
       [9.89165685e-01, 1.08343150e-02],
       [3.19198530e-04, 9.99680801e-01],
       [9.83635072e-01, 1.63649279e-02],
       [9.71957601e-01, 2.80423995e-02],
       [6.39758076e-03, 9.93602419e-01],
       [9.72925022e-01, 2.70749781e-02],
       [1.00308139e-05, 9.99989969e-01],
       [4.28589857e-02, 9.57141014e-01],
       [9.74046194e-01, 2.59538058e-02],
       [4.090271

## Spam Emails

### Data Preprocessing

In [30]:
df_emails=pd.read_csv('spam.csv')
df_emails.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
df_emails.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [35]:
df_emails['spam']=df_emails['Category'].apply(lambda x: 1 if x=='spam' else 0)
df_emails.head(5)

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [36]:
from sklearn.model_selection import train_test_split

In [37]:
X_train,X_test,Y_train,Y_test= train_test_split(df_emails.Message,df_emails.spam, test_size=0.2)

In [38]:
len(X_train)

4457

In [39]:
len(X_test)

1115

In [40]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
v=CountVectorizer()
X_train_count= v.fit_transform(X_train.values)
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### MultinomialNB

In [43]:
from sklearn.naive_bayes import MultinomialNB

In [44]:
model=MultinomialNB()
model.fit(X_train_count, Y_train)

In [45]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [46]:
X_test_count=v.transform(X_test)
model.score(X_test_count, Y_test)

0.9829596412556054

### Sklearn Pipeline

In [47]:
# It is use to skip the steps to convert message into matrix

In [49]:
from sklearn.pipeline import Pipeline

In [51]:
clf=Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [52]:
clf.fit(X_train, Y_train)

In [53]:
clf.score(X_test, Y_test)

0.9829596412556054

In [54]:
clf.predict(emails)

array([0, 1])

# Exercise

In [58]:
from sklearn.datasets import load_wine

In [59]:
wine=load_wine()

In [60]:
dir(wine)

['DESCR', 'data', 'feature_names', 'frame', 'target', 'target_names']

In [61]:
df_wine=pd.DataFrame(wine.data, columns=wine.feature_names)
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [63]:
df_wine['target']=wine.target
df_wine.head(10)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
5,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0,0
6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0,0
7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0,0
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0,0
9,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0,0


In [70]:
X_train,X_test,Y_train,Y_test= train_test_split(wine.data, wine.target, test_size=0.2)

In [71]:
len(X_train)

142

In [72]:
len(X_test)

36

In [73]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [82]:
model_gnb=GaussianNB()
model_gnb.fit(X_train, Y_train)

In [83]:
model_gnb.score(X_test, Y_test)

1.0

In [84]:
model_mnb= MultinomialNB()
model_mnb.fit(X_train, Y_train)

In [85]:
model_mnb.score(X_test, Y_test)

0.9444444444444444