#  Naive Bayes Classifiers

In [29]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import CountVectorizer

## Naive Bayes
### Using Naive Bayes to predict spam

In [30]:
#Use Latin encoding as the Data has non UFT-8 Chars
data = pd.read_csv("abhilasha_dataset_updated.csv",encoding='latin-1')

In [31]:
data.columns

Index(['description', 'category'], dtype='object')

In [32]:
data.isnull().sum()

description    0
category       1
dtype: int64

In [33]:
data.dropna(subset=['category'], how='all', inplace = True)


In [64]:
data.isnull().sum()

description    0
category       0
dtype: int64

In [35]:
data.shape

(5435, 2)

In [65]:
data.head(1)

Unnamed: 0,description,category
0,Hi This is Dheeraj,ignore
1,Dheeraj joining in,ignore
2,Dheeraj here,ignore


In [66]:
X =  data.description
y = data.category

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=10)

In [68]:
y_test.describe()

count         1359
unique           4
top       decision
freq           481
Name: category, dtype: object

In [69]:
#X_test

In [70]:
# TfidfVectorizer() is better than CountVectorizer() 
# TfidfVectorizer() has more feature than CountVectorizer()
vectorizer = TfidfVectorizer()

In [71]:
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed  = vectorizer.transform(X_test)
feature_names = vectorizer.get_feature_names()

In [72]:
len(feature_names)

5394

In [44]:
X_train_transformed

<4076x5394 sparse matrix of type '<class 'numpy.float64'>'
	with 31569 stored elements in Compressed Sparse Row format>

In [93]:
feature_names[0:5394]

['10',
 '100',
 '15th',
 '1decision',
 '30',
 '5decision',
 '6th',
 '85',
 '99',
 'abide',
 'abilities',
 'ability',
 'able',
 'abort',
 'about',
 'aboutyourself',
 'aboutÿhow',
 'aboutÿinvestingÿin',
 'above',
 'abreast',
 'absence',
 'absent',
 'absenteeism',
 'absentees',
 'absolute',
 'absorbed',
 'abstractness',
 'abundance',
 'abundant',
 'abuse',
 'accelerating',
 'accentuate',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepts',
 'access',
 'accessible',
 'accessories',
 'accident',
 'acclamation',
 'accompanying',
 'accomplish',
 'accomplishing',
 'accomplishment',
 'accomplishments',
 'according',
 'account',
 'accountabilities',
 'accountability',
 'accountable',
 'accounting',
 'accuracy',
 'accurate',
 'accurately',
 'accuse',
 'accused',
 'accusing',
 'achievable',
 'achieve',
 'achieved',
 'achievement',
 'achievements',
 'achiever',
 'achievers',
 'achieving',
 'acquire',
 'acquired',
 'acquiring',
 'across',
 'act',
 'action',
 'actions',
 'activated',
 'a

In [94]:
X_train_transformed.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [96]:
#### slim the data for training and testing
selector = SelectPercentile( percentile=99) # i need only 20% of data total =7510 , total of 10% = 751 
selector.fit(X_train_transformed,y_train)
X_train_transformed_per = selector.transform(X_train_transformed).toarray()
X_test_transformed_per  = selector.transform(X_test_transformed).toarray()

  f = msb / msw


In [97]:
X_train_transformed_per.shape

(4076, 5340)

In [98]:
#  it is good for confusion matrix , accuracry = 0.9798994974874372
# Confusion matrix :
# array([[1200,    1],
#        [  27,  165]])
clf = BernoulliNB()




In [99]:
# 
#  it is not good for confusion matrix , accuracry = 0.9676956209619526
#clf = GaussianNB()

# Confusion matrix :
# array([[1172,   29],
#       [  16,  176]])


In [100]:
# it is not good for efficiency , accuracy = 0.9382627422828428
# Confusion matrix :
#array([[1201,    0],
#       [  86,  106]])

#clf = MultinomialNB() 

In [101]:
clf.fit(X_train_transformed_per, y_train)
y_predict = clf.predict(X_test_transformed_per)
print(accuracy_score(y_test, y_predict))

0.5364238410596026


In [102]:
confusion_matrix(y_test, y_predict)


array([[386,  12,   0,  12],
       [176, 217,   2,  86],
       [ 67,   8,   5,  62],
       [157,  44,   4, 121]])

In [103]:
clf_mul = MultinomialNB()
clf_mul.fit(X_train_transformed_per, y_train)
y_predict_mul = clf_mul.predict(X_test_transformed_per)
print(accuracy_score(y_test, y_predict_mul))

0.47167034584253126


In [104]:
confusion_matrix(y_test, y_predict_mul)


array([[ 95, 269,   1,  45],
       [ 13, 440,   0,  28],
       [  9,  84,   1,  48],
       [ 31, 190,   0, 105]])

In [105]:
accuracy_score(y_test, y_predict_mul)

0.47167034584253126

In [106]:
clf_ber = BernoulliNB()
clf_ber.fit(X_train_transformed_per, y_train)
y_predict_ber = clf_ber.predict(X_test_transformed_per)
print(accuracy_score(y_test, y_predict_ber))

0.5364238410596026


In [107]:
accuracy_score(y_test, y_predict_ber)

0.5364238410596026

In [108]:
pd.crosstab(y_test, y_predict_ber)

col_0,action point,decision,ignore,key point
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
action point,386,12,0,12
decision,176,217,2,86
ignore,67,8,5,62
key point,157,44,4,121
