# Text Classification using Naive Bayes Algorithm

#### The Naive Bayes classifier assumes that, given the class, the presence of a particular feature is unrelated to the presence of any other feature.

In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import string
import nltk
from nltk import word_tokenize
from  nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the labelled messages from spam.csv in the working directory.
data = pd.read_csv(filepath_or_buffer='spam.csv')

In [3]:
# Peek at the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Drop the redundant-looking unnamed columns (not needed for this project)
# and give the two remaining columns descriptive names.
to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data = (
    data
    # Pass the column labels directly. The previous form handed drop() the
    # sub-frame data[to_drop], which only worked because iterating a
    # DataFrame yields its column labels.
    .drop(columns=to_drop)
    # Chained (non in-place) rename so the whole step is a single expression.
    .rename(columns={"v1": "Category", "v2": "Text"})
)
data.head()

Unnamed: 0,Category,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.shape

(5572, 2)

In [6]:
data['Category'].value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

#### Tokenization is breaking complex data into smaller units called tokens. It can be done by splitting paragraphs into sentences and sentences into words. At this step, each message is split into words after its punctuation has been removed.

#### Stopwords are frequently occurring words (such as "few", "is", "an", etc.). These words matter for sentence structure, but contribute little to language processing in NLP, so they are removed.

In [7]:
def text_cleaning(a, stop_words=None):
    """Strip punctuation from a message and return its non-stopword tokens.

    Parameters
    ----------
    a : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        The whitespace-separated words of ``a`` after every character in
        ``string.punctuation`` has been removed, in their original order and
        casing, excluding words whose lower-cased form is a stopword.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    else:
        # Set membership keeps the per-word check cheap for any iterable.
        stop_words = frozenset(stop_words)

    without_punctuation = ''.join(
        char for char in a if char not in string.punctuation
    )

    return [
        word
        for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]


# Cache for the default stopword set, filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # The original evaluated stopwords.words('english') for every single
        # word of every message; load it once and reuse it instead.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


In [8]:
# Show the second column (the message text) once punctuation and stopwords
# have been stripped and each message has been split into words.
text_column = data.iloc[:, 1]
tokenized_text = text_column.apply(text_cleaning)
print(tokenized_text)

0       [Go, jurong, point, crazy, Available, bugis, n...
1                          [Ok, lar, Joking, wif, u, oni]
2       [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3           [U, dun, say, early, hor, U, c, already, say]
4       [Nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, U, �750, Pou...
5568                   [�, b, going, esplanade, fr, home]
5569                     [Pity, mood, Soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [Rofl, true, name]
Name: Text, Length: 5572, dtype: object


In [9]:
# Learn the bag-of-words vocabulary: every token returned by text_cleaning is
# assigned an integer column index so that messages can be turned into count vectors.
# (CountVectorizer is already imported in the first cell, so it is not re-imported here.)
bow_transformer = CountVectorizer(analyzer=text_cleaning).fit(data['Text'])

print(bow_transformer)

CountVectorizer(analyzer=<function text_cleaning at 0x00000123F9DC3EE0>)


In [10]:
# Encode every message as a sparse row of token counts over the fitted vocabulary.
messages = data['Text']
title_bow = bow_transformer.transform(messages)

print(title_bow)

  (0, 1096)	1
  (0, 1461)	1
  (0, 2027)	1
  (0, 4574)	1
  (0, 5135)	1
  (0, 5136)	1
  (0, 5685)	1
  (0, 6131)	1
  (0, 6815)	1
  (0, 6846)	1
  (0, 7456)	1
  (0, 7567)	1
  (0, 8231)	1
  (0, 8809)	1
  (0, 10845)	1
  (0, 11043)	1
  (1, 2407)	1
  (1, 3012)	1
  (1, 7600)	1
  (1, 8482)	1
  (1, 10582)	1
  (1, 10952)	1
  (2, 73)	1
  (2, 422)	1
  (2, 429)	1
  :	:
  (5568, 6604)	1
  (5568, 6791)	1
  (5568, 7065)	1
  (5568, 11235)	1
  (5569, 3169)	1
  (5569, 3655)	1
  (5569, 8147)	1
  (5569, 10087)	1
  (5570, 4430)	1
  (5570, 4973)	1
  (5570, 5169)	1
  (5570, 6196)	1
  (5570, 6612)	1
  (5570, 6710)	1
  (5570, 6892)	1
  (5570, 7190)	1
  (5570, 7297)	1
  (5570, 7698)	1
  (5570, 8314)	1
  (5570, 9804)	1
  (5570, 10669)	1
  (5570, 10886)	1
  (5571, 3370)	1
  (5571, 8243)	1
  (5571, 10532)	1


#### TF-IDF in NLP stands for Term Frequency – Inverse Document Frequency. In NLP, cleaned text needs to be converted into a numerical format; here each message is represented by a vector of per-word weights, and the whole corpus by a matrix. This is known as word vectorization (sometimes loosely called word embedding).

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the count matrix, then show the fitted estimator.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(title_bow)
print(tfidf_transformer)

# Re-weight the counts; the result holds a TF-IDF value for every (message, term) entry.
title_tfidf = tfidf_transformer.transform(title_bow)
print(title_tfidf)

TfidfTransformer()
  (0, 11043)	0.2302307806673824
  (0, 10845)	0.19070440364977176
  (0, 8809)	0.24700781968848798
  (0, 8231)	0.17135863098645995
  (0, 7567)	0.263992475219973
  (0, 7456)	0.31248959807463006
  (0, 6846)	0.18344049775392818
  (0, 6815)	0.15156099829325625
  (0, 6131)	0.18912594285931972
  (0, 5685)	0.2498079760885523
  (0, 5136)	0.26866384122613163
  (0, 5135)	0.2983050989432094
  (0, 4574)	0.31248959807463006
  (0, 2027)	0.24200168290449323
  (0, 1461)	0.31248959807463006
  (0, 1096)	0.28824103664730155
  (1, 10952)	0.4005697292204744
  (1, 10582)	0.20689436953946386
  (1, 8482)	0.504282830397047
  (1, 7600)	0.37669696082530857
  (1, 3012)	0.29116619142344646
  (1, 2407)	0.5619244500186726
  (2, 11003)	0.1909725859033143
  (2, 10964)	0.15964606600812142
  (2, 10570)	0.1399031851067661
  :	:
  (5568, 6791)	0.31367469776242124
  (5568, 6604)	0.47781076401785183
  (5568, 6267)	0.5575721048646767
  (5568, 4801)	0.3853122086093004
  (5569, 10087)	0.520467167163554
  (5569

In [12]:
# Dimensions of the TF-IDF matrix as (rows, columns).
n_rows, n_cols = title_tfidf.shape
print((n_rows, n_cols))

(5572, 11301)


In [13]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features.
# (MultinomialNB is already imported in the first cell, so it is not re-imported here.)
# Note: every row of the dataset is used for fitting; no rows are held out.
model = MultinomialNB().fit(title_tfidf, data['Category'])

In [14]:
# Predict a label for every row of the TF-IDF matrix (the same rows the model was fitted on).
predicted_val = model.predict(X=title_tfidf)
print(predicted_val)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [15]:
# Confusion matrix of the predictions (rows: true label, columns: predicted label).
# Caveat: predicted_val was produced for the same messages the model was fitted on,
# so this describes performance on the training data.

from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix(y_true=data['Category'], y_pred=predicted_val)

array([[4825,    0],
       [ 114,  633]], dtype=int64)

In [16]:
# Fraction of messages whose predicted label matches the true label
# (measured on the same messages the model was fitted on).
accuracy_score(y_true=data['Category'], y_pred=predicted_val)

0.9795405599425699

### References

###### https://www.youtube.com/watch?v=oq68P8Kv7nE
###### https://towardsdatascience.com/text-classification-using-naive-bayes-theory-a-working-example-2ef4b7eb7d5a