# Naive Bayes in NLP

**Naive Bayes**
* Introduction: https://en.wikipedia.org/wiki/Naive_Bayes_classifier
* Library: http://scikit-learn.org/stable/modules/naive_bayes.html
---

In [8]:
import pandas as pd
import numpy as np

In [2]:
# Source: https://github.com/sjwhitworth/golearn/blob/master/examples/datasets/tennis.csv
df = pd.read_csv('https://raw.githubusercontent.com/sjwhitworth/golearn/master/examples/datasets/tennis.csv')
df

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


#### Compute prior probability

In [10]:
P_yes = (df['play'] == 'yes').sum() * 1. / df.shape[0]
P_no  = (df['play'] == 'no').sum()  * 1. / df.shape[0]

print( P_yes, P_no)


0.642857142857 0.357142857143


#### Compute prior probability

In [11]:
np_outlook = np.array([[((df.play == c).values & (df.outlook == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in ['sunny', 'rainy', 'overcast']])

s = np_outlook.sum(axis=1)
np_outlook = np_outlook * 1. / s.reshape((-1,1))

df_outlook = pd.DataFrame(np_outlook,
                          columns=['yes', 'no'])
df_outlook.index = ['sunny', 'rainy', 'overcast']
df_outlook

Unnamed: 0,yes,no
sunny,0.4,0.6
rainy,0.6,0.4
overcast,1.0,0.0


In [12]:
np_temp = np.array([[((df.play == c).values & (df.temp == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in ['hot', 'mild', 'cool']])

s = np_temp.sum(axis=1)
np_temp = np_temp * 1. / s.reshape((-1,1))

df_temp = pd.DataFrame(np_temp,
                          columns=['yes', 'no'])
df_temp.index = ['hot', 'mild', 'cool']
df_temp

Unnamed: 0,yes,no
hot,0.5,0.5
mild,0.666667,0.333333
cool,0.75,0.25


In [None]:
np_humidity = np.array([[((df.play == c).values & (df.humidity == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in ['high', 'normal']])

s = np_humidity.sum(axis=1)
np_humidity = np_humidity * 1. / s.reshape((-1,1))

df_humidity = pd.DataFrame(np_humidity,
                          columns=['yes', 'no'])
df_humidity.index = ['high', 'normal']
df_humidity

In [None]:
np_windy = np.array([[((df.play == c).values & (df.windy == v).values).sum()
                            for c in ['yes', 'no']]
                               for v in [True, False]])

s = np_windy.sum(axis=1)
np_windy = np_windy * 1. / s.reshape((-1,1))

df_windy = pd.DataFrame(np_windy,
                          columns=['yes', 'no'])
df_windy.index = ['True', 'False']
df_windy

#### Testing

In [14]:
x = ['sunny', 'hot', 'normal', False]

print (df_outlook.loc['sunny'])
print (df_temp.loc['hot'])

df_outlook.loc['sunny'] * df_temp.loc['hot']

yes    0.4
no     0.6
Name: sunny, dtype: float64
yes    0.5
no     0.5
Name: hot, dtype: float64


yes    0.2
no     0.3
dtype: float64

In [17]:
df_outlook.loc['sunny'] * df_temp.loc['hot'] * df_humidity.loc['normal'] * df_windy.loc['False']

NameError: name 'df_humidity' is not defined

##### Test 2

In [None]:
x = ['overcast', 'hot', 'normal', False]
df_outlook.loc['overcast'] * df_temp.loc['hot'] * df_humidity.loc['normal'] * df_windy.loc['False']

## Using **scikit-learn**

**Datasets**
Look at some datasets at: https://github.com/jbrownlee/Datasets/

We will download a Diabetes dataset from the list. More information about the each value in the dataset is available here: https://www.kaggle.com/uciml/pima-indians-diabetes-database/data

---

In [19]:
df1 = pd.read_csv('https://github.com/jbrownlee/Datasets/raw/master/pima-indians-diabetes.data.csv', header=None)
df1.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 
              'SkinThickness', 'Insulin', 'BMI', 
              'DiabetesPedigreeFunction', 'Age', 'Outcome']

In [20]:
df1

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [22]:
cols = df1.columns
X = df1[cols[:-1]]
y = df1[cols[-1]]

In [24]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

## Using Gaussian

$$P(x|c_k) = \frac{1}{\sqrt{2\pi\sigma_k^c}} e^{-\frac{{(x-\mu_k)}^2}{2\sigma_k^2}}$$

In [26]:
gNB = GaussianNB()

In [27]:
gNB.fit(X, y)

GaussianNB(priors=None)

In [28]:
gNB.score(X, y)

0.76302083333333337

In [29]:
gNB.class_prior_

array([ 0.65104167,  0.34895833])

In [30]:
gNB.sigma_

array([[  9.08520926e+00,   6.81995613e+02,   3.25622157e+02,
          2.21267117e+02,   9.75479675e+03,   5.90156156e+01,
          8.92863791e-02,   1.35861913e+02],
       [  1.39446557e+01,   1.01633298e+03,   4.60174481e+02,
          3.11405894e+02,   1.91629022e+04,   5.25538755e+01,
          1.38143783e-01,   1.19853711e+02]])

## Using Multinomial

In [31]:
MultinomialNB().fit(X,y).score(X,y)

0.6015625

## Let try with one more dataset

**Breast Cancer Dataset**
* Data: https://github.com/jbrownlee/Datasets/blob/master/breast-cancer.csv
* Description: https://github.com/jbrownlee/Datasets/blob/master/breast-cancer.names

In [32]:
from sklearn import datasets

In [34]:
df2 = pd.read_csv('https://github.com/jbrownlee/Datasets/raw/master/breast-cancer.csv', header=None)
df2.columns = ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat', 'Class']

In [None]:
print df2.shape
df2 = df2.dropna()
print df2.shape

In [36]:
df2

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'
5,'50-59','premeno','25-29','3-5','no','2','right','left_up','yes','no-recurrence-events'
6,'50-59','ge40','40-44','0-2','no','3','left','left_up','no','no-recurrence-events'
7,'40-49','premeno','10-14','0-2','no','2','left','left_up','no','no-recurrence-events'
8,'40-49','premeno','0-4','0-2','no','2','right','right_low','no','no-recurrence-events'
9,'40-49','ge40','40-44','15-17','yes','2','right','left_up','yes','no-recurrence-events'


In [37]:
cols2 = df2.columns
X2 = df2[cols2[:-1]]
y2 = df2[cols2[-1]]

In [38]:
mNB = MultinomialNB()
mNB.fit(X2, y2)

ValueError: could not convert string to float: "'no'"

In [39]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [40]:
le = LabelEncoder()
df2_ = df2.apply(le.fit_transform)

TypeError: ("'<' not supported between instances of 'str' and 'float'", 'occurred at index node-caps')

In [41]:
df2_

NameError: name 'df2_' is not defined

In [46]:
print (le.classes_)

["'0-2'" "'12-14'" "'15-17'" "'24-26'" "'3-5'" "'6-8'" "'9-11'"]


In [48]:
cols2 = df2_.columns
X2 = df2_[cols2[:-1]]
y2 = df2_[cols2[-1]]

NameError: name 'df2_' is not defined

In [50]:
GaussianNB().fit(X2, y2).score(X2, y2)

ValueError: could not convert string to float: "'no'"

In [52]:
MultinomialNB().fit(X2, y2).score(X2, y2)

ValueError: could not convert string to float: "'no'"

# Naive Bayes in Sentiment Analysis

Dataset source: https://github.com/ApoorvP02121996/Sentiment-Analysis---Movie-Reviews/blob/master/Naive%20Bayes/training_set.csv

In [56]:
df3 = pd.read_csv('https://github.com/ApoorvP02121996/Sentiment-Analysis---Movie-Reviews/raw/master/Naive%20Bayes/training_set.csv')
df3.columns = ['target', 'text']

print (df3.shape)
df3.head()

(6397, 2)


Unnamed: 0,target,text
0,1,there's an energy to y tu mam?tambi閚 . much of...
1,0,maybe you'll be lucky and there'll be a power...
2,0,could as easily have been called 'under siege ...
3,0,a close-to-solid espionage thriller with the m...
4,1,expect no major discoveries nor any stylish s...


In [57]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

In [58]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [59]:
df3.text = df3.text.apply(lambda e: e.decode(errors='ignore'))

AttributeError: 'str' object has no attribute 'decode'

In [62]:
train = df3[:4000]
test  = df3[4000:]

print (train.shape, test.shape)

(4000, 2) (2397, 2)


In [65]:
vectorizer.fit(df3.text)
train_features = vectorizer.transform(train.text)
test_features  = vectorizer.transform(test.text)

## Multinomial

In [70]:
mNB3_1 = MultinomialNB().fit(features, df3.target)
print (mNB3_1.score(train_features, train.target))
print (mNB3_1.score(test_features, test.target))

NameError: name 'features' is not defined

In [71]:
mNB3_2 = MultinomialNB(alpha=0.0000001).fit(train_features, train.target)
print (mNB3_2.score(train_features, train.target))
print (mNB3_2.score(test_features, test.target))

0.97725
0.698372966208


In [72]:
mNB3_3 = MultinomialNB(alpha=10).fit(train_features, train.target)
print (mNB3_3.score(train_features, train.target))
print (mNB3_3.score(test_features, test.target))

0.891
0.730496453901


## Gaussian

In [76]:
gNB3_1 = GaussianNB().fit(train_features.todense(), train.target)
print (gNB3_1.score(train_features.todense(), train.target))
print (gNB3_1.score(test_features.todense(), test.target))

0.9625
0.661243220693
