In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

In [2]:
import pandas as pd
import numpy as np

# load the dataset of SMS messages
df = pd.read_table('SMSSPamCollection', header=None, encoding='utf-8')

In [3]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Class distribution
classes = df[0]
print(classes.value_counts())

0
ham     4825
spam     747
Name: count, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder

# Convert class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [6]:
# Store the SMS message data
text_messages = df[1]
print(text_messages[:20])

0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup fina...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
5     FreeMsg Hey there darling it's been 3 week's n...
6     Even my brother is not like to speak with me. ...
7     As per your request 'Melle Melle (Oru Minnamin...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
10    I'm gonna be home soon and i don't want to tal...
11    SIX chances to win CASH! From 100 to 20,000 po...
12    URGENT! You have won a 1 week FREE membership ...
13    I've been searching for the right words to tha...
14                  I HAVE A DATE ON SUNDAY WITH WILL!!
15    XXXMobileMovieClub: To use your credit, click ...
16                           Oh k...i'm watching here:)
17    Eh u remember how 2 spell his name... Yes 

In [7]:
# Regular expressions to replace email addresses, URLs, phone numbers and other sensitive info

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [8]:
# Remove punctuation as it's not useful
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [9]:
# Change words to lower case - ex. Hello, HELLO, hello are now the same word
processed = processed.str.lower()
print(processed[:20])

0     go until jurong point, crazy.. available only ...
1                         ok lar... joking wif u oni...
2     free entry in 2 a wkly comp to win fa cup fina...
3     u dun say so early hor... u c already then say...
4     nah i don't think he goes to usf, he lives aro...
5     freemsg hey there darling it's been 3 week's n...
6     even my brother is not like to speak with me. ...
7     as per your request 'melle melle (oru minnamin...
8     winner!! as a valued network customer you have...
9     had your mobile 11 months or more? u r entitle...
10    i'm gonna be home soon and i don't want to tal...
11    six chances to win cash! from 100 to 20,000 po...
12    urgent! you have won a 1 week free membership ...
13    i've been searching for the right words to tha...
14                  i have a date on sunday with will!!
15    xxxmobilemovieclub: to use your credit, click ...
16                           oh k...i'm watching here:)
17    eh u remember how 2 spell his name... yes 

In [10]:
from nltk.corpus import stopwords

# Remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [11]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [12]:
from nltk.tokenize import word_tokenize

# Create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [13]:
# Print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 8921
Most common words: [('.', 4759), (',', 1939), ('?', 1550), ('!', 1397), ('...', 1146), ('u', 1138), ('&', 922), (';', 768), (':', 722), ('i', 715), ('..', 697), ('call', 644), ("'", 535), (')', 499), ('2', 478)]


In [14]:
# Use the x most common words as features
word_features = list(all_words.keys())[:4500]

In [15]:
# The find_features function will determine which of the x word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Test
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
,
crazy
..
avail
bugi
n
great
world
la
e
buffet
...
cine
got
amor
wat


In [17]:
# Define a seed for reproducibility
seed = 1
np.random.seed(seed)

messages = list(zip(processed, Y))  # Convert to a list
np.random.shuffle(messages)

# Call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [19]:
# Train/Test
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

print(len(training))
print(len(testing))

4179
1393


In [20]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# Train the model on the training data
model.train(training)

# Test
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.42067480258436


In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 90.8829863603733
Decision Tree Accuracy: 95.83632447954056
Random Forest Accuracy: 97.27207465900933
Logistic Regression Accuracy: 98.06173725771716
SGD Classifier Accuracy: 98.27709978463747
Naive Bayes Accuracy: 97.84637473079684
SVM Linear Accuracy: 98.42067480258436


In [None]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

# Create a list of estimators (list of tuples)
models = list(zip(names, classifiers))  # Convert zip object to list

# Create a VotingClassifier
voting_clf = VotingClassifier(estimators=models, voting='hard', n_jobs=-1)

# Wrap the VotingClassifier in an SklearnClassifier
nltk_ensemble = SklearnClassifier(voting_clf)

# Train the VotingClassifier using the training data
nltk_ensemble.train(training)

# Evaluate the ensemble
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print("Voting Classifier Accuracy: {:.2f}%".format(accuracy))

In [None]:
# Make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [None]:
# Print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])