In [2]:
import os
import string

import pandas as pd
from nltk.corpus import stopwords

In [3]:
# Get the spam data collection using pandas.
# The CSV location defaults to the original local path, but it can be overridden by
# setting the SPAM_CSV_PATH environment variable, so the notebook can run on another
# machine without editing this cell.
SPAM_CSV_PATH = os.environ.get(
    'SPAM_CSV_PATH',
    'C:/Users/sharv/Documents/Post Graduate Program In Data Science-PurdueSimpliLearn/Course2-Data Science With Python/Datasets/spam.csv',
)
colnames = ['response', 'message']
df_spam_collection = pd.read_csv(
    SPAM_CSV_PATH,
    names=colnames,         # use our own column names
    encoding="ISO-8859-1",
    skiprows=1,             # skip the first line of the file (presumably its header row)
    sep=",",
    usecols=colnames,       # keep only the two named columns
)

In [4]:
# preview the first rows of the loaded data (head() shows 5 rows by default)
df_spam_collection.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# view summary statistics for each column using the describe method:
# row count, number of unique values, most frequent value (top) and how often it occurs (freq)
df_spam_collection.describe()

Unnamed: 0,response,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
# view the same summary statistics per response class by grouping on 'response'
# before calling describe
df_spam_collection.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [7]:
# compute the character length of each message and add it as a new column ('feature').
# The vectorized .str.len() accessor replaces apply(len): it gives the same lengths for
# string messages without a Python-level call per row, and a missing message yields NaN
# instead of raising a TypeError.
df_spam_collection['length'] = df_spam_collection['message'].str.len()

In [8]:
# view the first 5 messages again, now with the new 'length' column
df_spam_collection.head()

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [9]:
# define a function to get rid of punctuation and stopwords present in the messages
def message_text_process(mess):
    """Split a message into words, dropping punctuation and English stopwords.

    Examples of stopwords are I, me, myself, we, our, ours, you, yours.
    Punctuation characters, like stopwords, carry little weight for text
    analysis, so both are removed.

    Parameters
    ----------
    mess : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated words of ``mess`` (original casing kept) after
        removing every character found in ``string.punctuation`` and every word
        whose lowercase form is in NLTK's English stopword list.
    """
    # drop punctuation characters and re-form the sentence
    no_punctuation = ''.join(char for char in mess if char not in string.punctuation)
    # Fetch the stopword list once per call and keep it in a set. Previously
    # stopwords.words('english') was called again for every single word and the
    # returned list was searched linearly.
    english_stopwords = set(stopwords.words('english'))
    # now eliminate any stopwords
    return [word for word in no_punctuation.split() if word.lower() not in english_stopwords]

In [10]:
# verify that the function is working by applying it to the first 5 messages
df_spam_collection['message'].head(5).apply(message_text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [11]:
# start text processing with vectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Use the CountVectorizer class to convert the collection of text documents to a matrix of token counts; this assigns a
# numeric value to each word present in the text.

In [13]:
# Convert the text into numerical feature vectors using the "bag of words" technique.
# Step 1: create the vectorizer, using our own function as the analyzer.
# Step 2: fit it on the message column (fit() returns the vectorizer itself, so fitting
# in a separate statement leaves the same fitted object in bag_of_words_transformer;
# the trailing semicolon keeps the cell from echoing it).
bag_of_words_transformer = CountVectorizer(analyzer=message_text_process)
bag_of_words_transformer.fit(df_spam_collection['message']);

In [14]:
# print the size of the bag-of-words vocabulary stored in the vocabulary_ attribute
print(len(bag_of_words_transformer.vocabulary_))

11304


In [15]:
# convert every message into its bag-of-words representation with the fitted
# vectorizer's transform method and store the result
message_bagofwords = bag_of_words_transformer.transform(df_spam_collection['message'])

In [22]:
# apply the tf-idf transformer and fit it on the bag of words (transformed version)
# tf-idf means term frequency times inverse document frequency.
# This is a common term-weighting scheme in information retrieval. First we fit a TfidfTransformer() instance on the
# bag of words, and then we use its transform method, which transforms a count matrix into a tf-idf representation.
from sklearn.feature_extraction.text import TfidfTransformer
# create the transformer first, then fit it on the bag-of-words counts
# (fit() returns the transformer itself; the semicolon suppresses the cell echo)
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(message_bagofwords);

In [23]:
# transform the bag-of-words counts into their tf-idf representation
# and print the shape of the result
message_tfidf = tfidf_transformer.transform(message_bagofwords)
print(message_tfidf.shape)

(5572, 11304)


In [24]:
# choose a naive Bayes model to detect the spam and fit the tf-idf data into it
# naive Bayes is one of the most widely used statistical models for classifying text.
from sklearn.naive_bayes import MultinomialNB
# create the classifier, then fit it on the tf-idf features and the response labels
# (fit() returns the model itself; the semicolon suppresses the cell echo)
spam_detect_model = MultinomialNB()
spam_detect_model.fit(message_tfidf, df_spam_collection['response']);

In [25]:
# Once the tf-idf features and the responses have been fitted into the model, we can
# predict the response for a given message.
# Check the predicted against the expected value, here for the message at index 2
# (the message at index 4 is checked the same way in the next cell):
#   1. get the message from the dataset,
#   2. transform it into a bag of words with the fitted vectorizer's transform method,
#   3. apply the tf-idf transformation with tfidf_transformer,
#   4. predict the response with the model's predict() method and compare it with the
#      actual response stored for that message in the dataset.
message = df_spam_collection.loc[2, 'message']
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_for_message)

print('predicted', spam_detect_model.predict(tfidf)[0])
print('expected', df_spam_collection.loc[2, 'response'])

predicted spam
expected spam


In [27]:
# same check for the message at index 4: bag of words -> tf-idf -> predict,
# then compare with the response stored in the dataset
message = df_spam_collection.loc[4, 'message']
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_for_message)

print('predicted', spam_detect_model.predict(tfidf)[0])
print('expected', df_spam_collection.loc[4, 'response'])

predicted ham
expected ham


In [None]:
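# As you can see, the predicted value matches the actual value for both messages. This is a quick sanity check that the
# text processing steps and the model are working together properly. Note that both messages were part of the data the
# model was fitted on, so this does not measure how well the model performs on unseen messages.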