# **API: Scikit Learn, NLTK**

# **Algorithm: Naive Bayes**

# **Project: NLP Spam detection from messages**

# **Author: Parthib**

In [None]:
#import the required libraries
import pandas as pd
import string
#NOTE(review): this import is disabled; any code that references `stopwords`
#needs it (or an equivalent import) in scope.
#from nltk.corpus import stopwords
import nltk
#download the NLTK 'stopwords' corpus
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#Load the spam data collection: a tab-separated file whose column names
#('Response', 'Message') are supplied explicitly via `names`.
#NOTE(review): default CSV quote handling is in effect here; if any message
#contains an unbalanced double quote, adjacent rows may be merged. Consider
#quoting=csv.QUOTE_NONE and confirm the row count against the raw file.
SpamCollection_DF=pd.read_csv('SpamCollection',sep='\t',names=['Response','Message'])

In [None]:
#view the first 5 records (label and raw message text)
SpamCollection_DF.head()

Unnamed: 0,Response,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#Use describe() to view summary statistics (count, unique, top, freq) for each column
SpamCollection_DF.describe()

Unnamed: 0,Response,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [None]:
#Group by the Response label and describe the messages within each group
SpamCollection_DF.groupby('Response').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [None]:
#Create a new column storing the length of each Message (not of the Response label)
SpamCollection_DF['Length']=SpamCollection_DF['Message'].apply(len)

In [None]:
#View the first 5 records again, now including the Length column
SpamCollection_DF.head()

Unnamed: 0,Response,Message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [None]:
#define a function to get rid of punctuation and stopwords present in the messages
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built only once."""
    # Imported lazily so the corpus is only touched when the default is needed
    # (i.e. after nltk.download('stopwords') has been run).
    from nltk.corpus import stopwords
    return frozenset(stopwords.words('english'))


# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def message_text_process(mess, stop_words=None):
    """Split a message into tokens, dropping punctuation and stopwords.

    Args:
        mess: The message text to process.
        stop_words: Optional container of lowercase words to discard. When
            omitted, NLTK's English stopword list is used.

    Returns:
        A list of the whitespace-separated words of ``mess`` (original casing
        kept) with punctuation characters removed and every word whose
        lowercase form is in ``stop_words`` filtered out.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    # Remove punctuation characters first, so "don't" becomes "dont".
    no_punctuation = mess.translate(_PUNCTUATION_TABLE)
    # Membership is tested on the lowercased word; the returned token keeps its case.
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [None]:
#verify the function works by applying it to the first 5 messages
SpamCollection_DF['Message'].head(5).apply(message_text_process)


0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: Message, dtype: object

In [None]:
#start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#build the bag-of-words vocabulary: fit a CountVectorizer on all messages,
#using message_text_process as its analyzer
bag_of_words_transformer = CountVectorizer(analyzer=message_text_process).fit(SpamCollection_DF['Message'])

  import sys


In [None]:
#print the number of entries in the fitted vocabulary (the vocabulary_ attribute)
print(len(bag_of_words_transformer.vocabulary_))

11425


In [None]:
#transform every message into its bag-of-words counts using the fitted vectorizer
message_bagofwords = bag_of_words_transformer.transform(SpamCollection_DF['Message'])


  import sys


In [None]:
#Print the bag-of-words matrix of the messages
print(message_bagofwords)

  (0, 1110)	1
  (0, 1483)	1
  (0, 2060)	1
  (0, 4653)	1
  (0, 5217)	1
  (0, 5218)	1
  (0, 5769)	1
  (0, 6217)	1
  (0, 6906)	1
  (0, 6937)	1
  (0, 7555)	1
  (0, 7668)	1
  (0, 8336)	1
  (0, 8917)	1
  (0, 10965)	1
  (0, 11163)	1
  (1, 2451)	1
  (1, 3064)	1
  (1, 7701)	1
  (1, 8590)	1
  (1, 10698)	1
  (1, 11072)	1
  (2, 73)	1
  (2, 423)	1
  (2, 430)	1
  :	:
  (5568, 6691)	1
  (5568, 6882)	1
  (5568, 7159)	1
  (5568, 11418)	1
  (5569, 3228)	1
  (5569, 3721)	1
  (5569, 8252)	1
  (5569, 10199)	1
  (5570, 4508)	1
  (5570, 5055)	1
  (5570, 5251)	1
  (5570, 6282)	1
  (5570, 6699)	1
  (5570, 6799)	1
  (5570, 6984)	1
  (5570, 7287)	1
  (5570, 7394)	1
  (5570, 7800)	1
  (5570, 8420)	1
  (5570, 9915)	1
  (5570, 10787)	1
  (5570, 11006)	1
  (5571, 3431)	1
  (5571, 8348)	1
  (5571, 10648)	1


In [None]:
#fit a TF-IDF transformer on the bag-of-words counts of the messages
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer().fit(message_bagofwords)

In [None]:
#transform the bag-of-words counts with the fitted TF-IDF transformer and print the resulting shape
message_tfidf=tfidf_transformer.transform(message_bagofwords)
print(message_tfidf.shape)

(5572, 11425)


In [None]:
#choose a multinomial Naive Bayes model to detect spam and fit it on the TF-IDF data
from sklearn.naive_bayes import MultinomialNB
#the class is spelled `MultinomialNB`; the earlier `MultiNomialNB` spelling raised NameError
spam_detect_model = MultinomialNB().fit(message_tfidf,SpamCollection_DF['Response'])

NameError: ignored

In [None]:
#check the model's predicted vs expected label for a single message: index 4 (message #5)
message = SpamCollection_DF['Message'][4]
#vectorize that one message with the already-fitted bag-of-words and TF-IDF transformers
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf=tfidf_transformer.transform(bag_of_words_for_message)
#predict() returns one label per input row; [0] picks the label for this message
print('Predicted',spam_detect_model.predict(tfidf)[0])
print('Expected ',SpamCollection_DF['Response'][4])

NameError: ignored