In [2]:
# This program shows how to detect spam

import pandas as pd
import string
from nltk.corpus import stopwords

In [3]:
# Get the spam data collection using pandas
df_spamCollection = pd.read_csv('SMSSpamCollection.csv', sep=',', encoding = "ISO-8859-1")

# encoding = "ISO-8859-1" is used to encode the dataset to ISO-8859-1

In [5]:
df_spamCollection.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# From the output of the above cell, we can see that there are some unnamed columns and the label and text column name is 
# not intuitive so let's fix those in this step.

df_spamCollection = df_spamCollection.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df_spamCollection = df_spamCollection.rename(columns={"v2" : "message", "v1":"response"})
df_spamCollection.head()

Unnamed: 0,response,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df_spamCollection.describe()

Unnamed: 0,response,message
count,5574,5573
unique,3,5170
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
# View response using group by and describe methods
df_spamCollection.groupby('response').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
"ham""""""",1,1,Well there's still a bit left if you guys want...,1
spam,747,653,Please call our customer service representativ...,4


In [9]:
# Verify length of the messages and also add it as a new column (feature)
## df_spamCollection['length'] = df_spamCollection['message'].apply(len)  
# since its dtype is float we convert to str because float don't have attribute len, thus we use:
df_spamCollection['length'] = df_spamCollection['message'].str.len()

In [10]:
# View first 5 messages with lenth
df_spamCollection.head()

Unnamed: 0,response,message,length
0,ham,"Go until jurong point, crazy.. Available only ...",111.0
1,ham,Ok lar... Joking wif u oni...,29.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155.0
3,ham,U dun say so early hor... U c already then say...,49.0
4,ham,"Nah I don't think he goes to usf, he lives aro...",61.0


In [11]:
# Define a function to get rid of stopwords present in the messages
def message_text_process(mess):
    # Check characters to see if there are punctuations
    no_punctuation = [char for char in mess if char not in string.punctuation]
    # now from the sentence
    no_punctuation = ''.join(no_punctuation)
    # Eliminate any stopwords
    return [word for word in no_punctuation.split() if word.lower() not in stopwords.words('english')]

In [12]:
# Verify that function is working
df_spamCollection['message'].head(5).apply(message_text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [13]:
# Start text processing with vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Convert the text to a numerical feature vector by using the technique, bag of words, by applying the function and
# fit the data (message) into it.
# bag_of_words_transformer = CountVectorizer(analyzer=message_text_process).fit(df_spamCollection['message'])
# But bacause it gives error "ValueError: np.nan is an invalid document, expected byte or unicode string.", we use:
bag_of_words_transformer = CountVectorizer(analyzer=message_text_process).fit(df_spamCollection['message'].values.astype('U'))

In [24]:
# Print length of bag oof words stored in the vocabulary_ attribute
print(len(bag_of_words_transformer.vocabulary_))

11304


In [27]:
# Store bag of words for messages using transform method
# message_bagofwords = bag_of_words_transformer.transform(df_spamCollection['message'])
# to avoid error, we use:

message_bagofwords = bag_of_words_transformer.transform(df_spamCollection['message'].values.astype('U'))

In [28]:
# Apply tfidf transformer and fit the bag of words into it (transformed version)
# TF means Term Frequency, while TFIDF means Term Frequency Times Inverse Document Frequency.
# This is a common term weighing scheme in information retrieval.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(message_bagofwords)

In [29]:
# It actually transforms account matrix into a TF or TFIDF representation

In [30]:
# Print shape of the tfidf
message_tfidf = tfidf_transformer.transform(message_bagofwords)
print(message_tfidf.shape)

(5574, 11304)


In [31]:
# Choose Naive Bayes model to detect the spam and fit the tfidf data into it
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(message_tfidf, df_spamCollection['response'])

In [32]:
# Check model for the predicted and expected value, say for message #2 and message #5
message = df_spamCollection['message'][2]
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_for_message)

In [33]:
# Now try to predict the response using to predict method of the model and then check the actual value present for the 
# second message in the data set
print('predicted', spam_detect_model.predict(tfidf)[0])
print('expected',  df_spamCollection.response[4])

predicted spam
expected ham


In [34]:
# The predicted values does not match the actual value. 
# This proves that the text processing algorithm and the model are not working properly.

In [35]:
# Check model for the predicted and expected value for message #5
message5 = df_spamCollection['message'][5]
bag_of_words_for_message5 = bag_of_words_transformer.transform([message5])
tfidf5 = tfidf_transformer.transform(bag_of_words_for_message5)

In [36]:
# Now try to predict the response using to predict method of the model and then check the actual value present for the 
# fifth message in the data set
print('predicted', spam_detect_model.predict(tfidf5)[0])
print('expected',  df_spamCollection.response[4])

predicted ham
expected ham


In [37]:
# The predicted values matches the actual value. 
# This proves that the text processing algorithm and the model are working properly.