# Importing Required Libraries

In [3]:
# importing file reading and storing libraries
import pandas as pd
import numpy as np

# importing text processing Libraries
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# importing the Naive Bayes estimator
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Load the spam collection: a tab-separated file read with explicit column
# names ('response' for the label, 'message' for the text).
# NOTE(review): with pandas' default quoting, a message containing an unbalanced
# double quote could absorb the lines that follow it -- confirm the row count
# matches the file's line count (consider quoting=csv.QUOTE_NONE if it does not).
dataset = pd.read_csv(
    'SpamCollection',
    sep='\t',
    names=['response', 'message'],
)
dataset.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Number of (rows, columns) in the loaded frame.
dataset.shape

(5572, 2)

In [6]:
# Pull out the label column ('response') and display it.
response = dataset['response']
response

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: response, Length: 5572, dtype: object

In [7]:
# Descriptive statistics of the remaining columns, computed separately for each response label.
dataset.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [8]:
# Measure each message with len() and store the result as a new 'length' column.
message_lengths = dataset['message'].apply(len)
dataset['length'] = message_lengths

In [9]:
# Preview the first rows again, now that the 'length' column has been added.
dataset.head()

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [10]:
# define a function to get rid of PUNCTUATIONS and any STOPWORDS present in the messages

def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built on first use and then reused."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # Build the set a single time instead of calling stopwords.words('english')
        # for every token; set membership is also a constant-time lookup.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def remove_punctuation_and_stopwords(msg, stop_words=None):
    """Strip punctuation from a message and drop its stopwords.

    Parameters
    ----------
    msg : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, the NLTK English
        stopword list is used.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the punctuation-free text, with
        their original casing, excluding every token whose lower-cased form
        is a stopword.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # Drop every character listed in string.punctuation, then re-assemble the text.
    no_punctuation = ''.join(ch for ch in msg if ch not in string.punctuation)

    # Compare case-insensitively, but keep each surviving token as written.
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]


In [11]:
# Sanity-check the cleaning function on the first five messages.
sample_messages = dataset['message'].head(5)
sample_messages.apply(remove_punctuation_and_stopwords)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [12]:
# Bag-of-words step: build a CountVectorizer that tokenizes with our cleaning
# function, then fit it on every message so it learns the vocabulary.
count_vectorizer = CountVectorizer(analyzer=remove_punctuation_and_stopwords)
bag_of_words_transformed = count_vectorizer.fit(dataset['message'])

In [13]:
# Number of entries in the fitted vectorizer's vocabulary_ mapping.
len(bag_of_words_transformed.vocabulary_)

11425

In [14]:
# Vectorize every message with the fitted bag-of-words vectorizer.
bag_of_words_message = bag_of_words_transformed.transform(dataset['message'])

In [15]:
# Fit a TF-IDF transformer on the bag-of-words matrix of all messages.
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit(bag_of_words_message)

In [16]:
# Apply the fitted TF-IDF transformer to the bag-of-words matrix, then show the shape of the result.
tfidf_message = tfidf.transform(bag_of_words_message)
tfidf_message.shape

(5572, 11425)

In [17]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features, using the
# 'response' column as the target.
# NOTE(review): the model is fitted on every row of `dataset`; no held-out
# split is made in this notebook, so the checks below run on training data.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(tfidf_message, dataset['response'])

MultinomialNB()

In [18]:
# Prepare two individual messages (row labels 2 and 5) for a spot check:
# each one is vectorized with the fitted bag-of-words vocabulary and then
# re-weighted with the fitted TF-IDF transformer.
message_2 = dataset.loc[2, 'message']
bag_of_words_for_message2 = bag_of_words_transformed.transform([message_2])
tfidf_message2 = tfidf.transform(bag_of_words_for_message2)

message_5 = dataset.loc[5, 'message']
bag_of_words_for_message5 = bag_of_words_transformed.transform([message_5])
tfidf_message5 = tfidf.transform(bag_of_words_for_message5)

In [19]:
# Compare the stored label of message 2 with the model's prediction for it.
expected_2 = dataset.loc[2, 'response']
predicted_2 = spam_detect_model.predict(tfidf_message2)[0]
print('Expected response for message 2 -->> ', expected_2)
print('Predicted response for message 2 -->> ', predicted_2)

Expected response for message 2 -->>  spam
Predicted response for message 2 -->>  spam


In [20]:
# Compare the stored label of message 5 with the model's prediction for it.
# The recorded output of this cell shows a mismatch (expected spam, predicted ham).
expected_5 = dataset.loc[5, 'response']
predicted_5 = spam_detect_model.predict(tfidf_message5)[0]
print('Expected response for message 5 -->> ', expected_5)
print('Predicted response for message 5 -->> ', predicted_5)

Expected response for message 5 -->>  spam
Predicted response for message 5 -->>  ham


# The End