Import necessary libraries and packages

In [2]:
import pandas as pd
import re

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import WordNetLemmatizer

Dataset source: https://www.kaggle.com/uciml/sms-spam-collection-dataset

In [3]:
# encoding "latin-1" is used since the file cannot be encoded with the default encoder "utf-8"
spam = pd.read_csv("spam.csv", encoding="latin-1")

# randomize the dataset and remove the index after randomization
spam = spam.sample(frac = 1, random_state = 1).reset_index(drop = True)

display(spam)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,Convey my regards to him,,,
1,ham,"[Û_] anyway, many good evenings to u! s",,,
2,ham,My sort code is and acc no is . The bank is n...,,,
3,ham,Sorry i din lock my keypad.,,,
4,spam,"Hi babe its Chloe, how r u? I was smashed on s...",,,
...,...,...,...,...,...
5567,ham,Hey what's up charles sorry about the late reply.,,,
5568,ham,Oh oh... Den muz change plan liao... Go back h...,,,
5569,ham,Huh i cant thk of more oredi how many pages do...,,,
5570,ham,I have printed it oh. So &lt;#&gt; come upst...,,,


Since the dataset has 3 columns of unused values, remove them and rename the remaining columns into 'label' and 'message'.

In [4]:
# get first 2 columns
dataset = spam.iloc[:, :2]

# and rename them
dataset.columns = ["label", "message"]

display(dataset)

Unnamed: 0,label,message
0,ham,Convey my regards to him
1,ham,"[Û_] anyway, many good evenings to u! s"
2,ham,My sort code is and acc no is . The bank is n...
3,ham,Sorry i din lock my keypad.
4,spam,"Hi babe its Chloe, how r u? I was smashed on s..."
...,...,...
5567,ham,Hey what's up charles sorry about the late reply.
5568,ham,Oh oh... Den muz change plan liao... Go back h...
5569,ham,Huh i cant thk of more oredi how many pages do...
5570,ham,I have printed it oh. So &lt;#&gt; come upst...


In [5]:
dataset.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


#### Preprocessing

In [6]:
# Tokenization
dataset.loc[:, "message_tokenized"] = dataset.loc[:, "message"].apply(lambda text: word_tokenize(text))

# Casefolding to Lowercase
dataset.loc[:, "message_to_lower"] = dataset.loc[:, "message_tokenized"].apply(lambda text: [term.lower() for term in text])

# Special characters removal with regex
dataset.loc[:, "message_no_special"] = dataset.loc[:, "message_to_lower"].apply(
    lambda text: [re.sub(r'[^a-zA-Z\s]+', ' ', term) for term in text])

# Stopwords removal
dataset.loc[:, "message_no_stopwords"] = dataset.loc[:, "message_no_special"].apply(
    lambda text: [term for term in text if term not in stopwords.words('english')])

display(dataset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,label,message,message_tokenized,message_to_lower,message_no_special,message_no_stopwords
0,ham,Convey my regards to him,"[Convey, my, regards, to, him]","[convey, my, regards, to, him]","[convey, my, regards, to, him]","[convey, regards]"
1,ham,"[Û_] anyway, many good evenings to u! s","[[, Û_, ], anyway, ,, many, good, evenings, t...","[[, û_, ], anyway, ,, many, good, evenings, t...","[ , , , anyway, , many, good, evenings, to,...","[ , , , anyway, , many, good, evenings, u, ]"
2,ham,My sort code is and acc no is . The bank is n...,"[My, sort, code, is, and, acc, no, is, ., The,...","[my, sort, code, is, and, acc, no, is, ., the,...","[my, sort, code, is, and, acc, no, is, , the,...","[sort, code, acc, , bank, natwest, , reply, ..."
3,ham,Sorry i din lock my keypad.,"[Sorry, i, din, lock, my, keypad, .]","[sorry, i, din, lock, my, keypad, .]","[sorry, i, din, lock, my, keypad, ]","[sorry, din, lock, keypad, ]"
4,spam,"Hi babe its Chloe, how r u? I was smashed on s...","[Hi, babe, its, Chloe, ,, how, r, u, ?, I, was...","[hi, babe, its, chloe, ,, how, r, u, ?, i, was...","[hi, babe, its, chloe, , how, r, u, , i, was...","[hi, babe, chloe, , r, u, , smashed, saturda..."
...,...,...,...,...,...,...
5567,ham,Hey what's up charles sorry about the late reply.,"[Hey, what, 's, up, charles, sorry, about, the...","[hey, what, 's, up, charles, sorry, about, the...","[hey, what, s, up, charles, sorry, about, the...","[hey, s, charles, sorry, late, reply, ]"
5568,ham,Oh oh... Den muz change plan liao... Go back h...,"[Oh, oh, ..., Den, muz, change, plan, liao, .....","[oh, oh, ..., den, muz, change, plan, liao, .....","[oh, oh, , den, muz, change, plan, liao, , g...","[oh, oh, , den, muz, change, plan, liao, , g..."
5569,ham,Huh i cant thk of more oredi how many pages do...,"[Huh, i, cant, thk, of, more, oredi, how, many...","[huh, i, cant, thk, of, more, oredi, how, many...","[huh, i, cant, thk, of, more, oredi, how, many...","[huh, cant, thk, oredi, many, pages, ]"
5570,ham,I have printed it oh. So &lt;#&gt; come upst...,"[I, have, printed, it, oh, ., So, &, lt, ;, #,...","[i, have, printed, it, oh, ., so, &, lt, ;, #,...","[i, have, printed, it, oh, , so, , lt, , ,...","[printed, oh, , , lt, , , , gt, , come, ..."


In [7]:
# Stemming
stemmer = SnowballStemmer("english")

dataset.loc[:, "message_stemmed"] = dataset.loc[:, "message_no_stopwords"].apply(lambda text: " ".join([stemmer.stem(term) for term in text]))

# Lemmatization
lemma = WordNetLemmatizer()

dataset.loc[:, "message_lemmatized"] = dataset.loc[:, "message_no_stopwords"].apply(lambda text: " ".join([lemma.lemmatize(term) for term in text]))

display(dataset)

Unnamed: 0,label,message,message_tokenized,message_to_lower,message_no_special,message_no_stopwords,message_stemmed,message_lemmatized
0,ham,Convey my regards to him,"[Convey, my, regards, to, him]","[convey, my, regards, to, him]","[convey, my, regards, to, him]","[convey, regards]",convey regard,convey regard
1,ham,"[Û_] anyway, many good evenings to u! s","[[, Û_, ], anyway, ,, many, good, evenings, t...","[[, û_, ], anyway, ,, many, good, evenings, t...","[ , , , anyway, , many, good, evenings, to,...","[ , , , anyway, , many, good, evenings, u, ]",anyway mani good even u,anyway many good evening u
2,ham,My sort code is and acc no is . The bank is n...,"[My, sort, code, is, and, acc, no, is, ., The,...","[my, sort, code, is, and, acc, no, is, ., the,...","[my, sort, code, is, and, acc, no, is, , the,...","[sort, code, acc, , bank, natwest, , reply, ...",sort code acc bank natwest repli confirm ...,sort code acc bank natwest reply confirm ...
3,ham,Sorry i din lock my keypad.,"[Sorry, i, din, lock, my, keypad, .]","[sorry, i, din, lock, my, keypad, .]","[sorry, i, din, lock, my, keypad, ]","[sorry, din, lock, keypad, ]",sorri din lock keypad,sorry din lock keypad
4,spam,"Hi babe its Chloe, how r u? I was smashed on s...","[Hi, babe, its, Chloe, ,, how, r, u, ?, I, was...","[hi, babe, its, chloe, ,, how, r, u, ?, i, was...","[hi, babe, its, chloe, , how, r, u, , i, was...","[hi, babe, chloe, , r, u, , smashed, saturda...",hi babe chloe r u smash saturday night g...,hi babe chloe r u smashed saturday night ...
...,...,...,...,...,...,...,...,...
5567,ham,Hey what's up charles sorry about the late reply.,"[Hey, what, 's, up, charles, sorry, about, the...","[hey, what, 's, up, charles, sorry, about, the...","[hey, what, s, up, charles, sorry, about, the...","[hey, s, charles, sorry, late, reply, ]",hey s charl sorri late repli,hey s charles sorry late reply
5568,ham,Oh oh... Den muz change plan liao... Go back h...,"[Oh, oh, ..., Den, muz, change, plan, liao, .....","[oh, oh, ..., den, muz, change, plan, liao, .....","[oh, oh, , den, muz, change, plan, liao, , g...","[oh, oh, , den, muz, change, plan, liao, , g...",oh oh den muz chang plan liao go back yan ...,oh oh den muz change plan liao go back yan...
5569,ham,Huh i cant thk of more oredi how many pages do...,"[Huh, i, cant, thk, of, more, oredi, how, many...","[huh, i, cant, thk, of, more, oredi, how, many...","[huh, i, cant, thk, of, more, oredi, how, many...","[huh, cant, thk, oredi, many, pages, ]",huh cant thk oredi mani page,huh cant thk oredi many page
5570,ham,I have printed it oh. So &lt;#&gt; come upst...,"[I, have, printed, it, oh, ., So, &, lt, ;, #,...","[i, have, printed, it, oh, ., so, &, lt, ;, #,...","[i, have, printed, it, oh, , so, , lt, , ,...","[printed, oh, , , lt, , , , gt, , come, ...",print oh lt gt come upstair,printed oh lt gt come upstairs


From the results of Stemming and Lemmatization above, both results are able to generate accurate and inaccurate root form of each words. But Stemming is shown better in generating the root form such as verb-ing and past verbs into present verbs.

Therefore, in the next step, we are going to use stemmed messages as the training input for the model.

In [8]:
dataset = dataset[["label", "message_stemmed"]]
dataset.columns = ["label", "message"]
display(dataset)

Unnamed: 0,label,message
0,ham,convey regard
1,ham,anyway mani good even u
2,ham,sort code acc bank natwest repli confirm ...
3,ham,sorri din lock keypad
4,spam,hi babe chloe r u smash saturday night g...
...,...,...
5567,ham,hey s charl sorri late repli
5568,ham,oh oh den muz chang plan liao go back yan ...
5569,ham,huh cant thk oredi mani page
5570,ham,print oh lt gt come upstair


Initalize Model

In [9]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [10]:
feature_train, feature_test, target_train, target_test = train_test_split(dataset['message'], dataset['label'], test_size=0.2, random_state=1)
print(feature_train.shape)
print(feature_test.shape)
print(target_train.shape)
print(target_test.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [11]:
pipeline.fit(feature_train, target_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [12]:
predictions = pipeline.predict(feature_test)

print(classification_report(target_test, predictions))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       971
        spam       0.98      0.76      0.86       144

    accuracy                           0.97      1115
   macro avg       0.97      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



In [13]:
print(accuracy_score(target_test, predictions))

0.967713004484305
