#  Naive Bayes Classifiers

In [24]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

## Naive Bayes
### Using Naive Bayes to predict spam

In [25]:
#Use Latin encoding as the Data has non UFT-8 Chars
data = pd.read_csv("spam.csv",encoding='latin-1')

In [26]:
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [27]:
#rename the columnns
data.columns = ['v1', 'v2', 'Rename_Unnamed: 2', 'Rename_Unnamed: 3', 'Unnamed: 4']

In [28]:
data.shape

(5572, 5)

In [29]:
#data.v2

In [30]:
data.head(5)

Unnamed: 0,v1,v2,Rename_Unnamed: 2,Rename_Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [31]:
data[data.v1=='spam'].count()

v1                   747
v2                   747
Rename_Unnamed: 2      5
Rename_Unnamed: 3      2
Unnamed: 4             0
dtype: int64

In [32]:
#data.v1 # it is spam or not
#data.v2 # it is text


In [33]:
X =  data.v2
y = data.v1

In [34]:
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10)

In [36]:
X_test.head()

4635                           K k pa Had your lunch aha.
2279                    Sorry, I'll call later in meeting
4545    Never try alone to take the weight of a tear t...
5084                                Hey happy birthday...
5298                 I.ll hand her my phone to chat wit u
Name: v2, dtype: object

In [37]:
y_test.describe()

count     1393
unique       2
top        ham
freq      1201
Name: v1, dtype: object

In [38]:
# TfidfVectorizer() is better than CountVectorizer() 
# TfidfVectorizer() has more feature than CountVectorizer()
vectorizer = TfidfVectorizer()

In [39]:
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed  = vectorizer.transform(X_test)
feature_names = vectorizer.get_feature_names()

In [40]:
len(feature_names)

7509

In [41]:
feature_names

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '0796xxxxxx',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810071',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700435505150p',
 '08700469649',
 '08700621170150p',
 '08701213186',
 '08

In [42]:
feature_names[2000:2005]

['credit', 'credited', 'credits', 'creep', 'creepy']

In [43]:
X_train_transformed.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
#### slim the data for training and testing
selector = SelectPercentile( percentile=10) # i need only 20% of data total =7510 , total of 10% = 751 
selector.fit(X_train_transformed,y_train)
X_train_transformed_per = selector.transform(X_train_transformed).toarray()
X_test_transformed_per  = selector.transform(X_test_transformed).toarray()

In [45]:
X_train_transformed_per.shape

(4179, 751)

In [61]:
X_test_transformed_per

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [62]:
X_test_transformed

<1393x7509 sparse matrix of type '<class 'numpy.float64'>'
	with 17124 stored elements in Compressed Sparse Row format>

In [46]:
#  it is good for confusion matrix , accuracry = 0.9798994974874372
# Confusion matrix :
# array([[1200,    1],
#        [  27,  165]])
clf = BernoulliNB()




In [47]:
# 
#  it is not good for confusion matrix , accuracry = 0.9676956209619526
#clf = GaussianNB()

# Confusion matrix :
# array([[1172,   29],
#       [  16,  176]])


In [48]:
# it is not good for efficiency , accuracy = 0.9382627422828428
# Confusion matrix :
#array([[1201,    0],
#       [  86,  106]])

#clf = MultinomialNB() 

In [49]:
clf.fit(X_train_transformed_per, y_train)
y_predict = clf.predict(X_test_transformed_per)

In [50]:
print(accuracy_score(y_test, y_predict))

0.9798994974874372


In [51]:
confusion_matrix(y_test, y_predict)


array([[1200,    1],
       [  27,  165]])

In [52]:
NewEmail = pd.Series(["HI.. we have meeting today.. please attend "])
NewEmail


0    HI.. we have meeting today.. please attend 
dtype: object

In [53]:
NewEmail_transformed = vectorizer.transform(NewEmail)
NewEmail_transformed  = selector.transform(NewEmail_transformed).toarray()
clf.predict(NewEmail_transformed)

array(['ham'], dtype='<U4')

In [54]:
clf_mul = MultinomialNB()
clf_mul.fit(X_train_transformed_per, y_train)
y_predict_mul = clf_mul.predict(X_test_transformed_per)

In [55]:
confusion_matrix(y_test, y_predict_mul)


array([[1201,    0],
       [  86,  106]])

In [56]:
accuracy_score(y_test, y_predict_mul)

0.9382627422828428

In [57]:
clf_ber = BernoulliNB()
clf_ber.fit(X_train_transformed_per, y_train)
y_predict_ber = clf_ber.predict(X_test_transformed_per)

In [58]:
accuracy_score(y_test, y_predict_ber)

0.9798994974874372

In [59]:
confusion_matrix(y_test, y_predict_ber)

array([[1200,    1],
       [  27,  165]])

In [60]:
pd.crosstab(y_test, y_predict_ber)

col_0,ham,spam
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1200,1
spam,27,165
