In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

print("Pyhton: {}".format(sys.version))
print("NLTK: {}".format(nltk.__version__))
print("Scikit-Learn: {}".format(sklearn.__version__))
print("Numpy: {}".format(numpy.__version__))

Pyhton: 3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) 
[GCC 7.3.0]
NLTK: 3.4.5
Scikit-Learn: 0.21.3
Numpy: 1.17.2


## 1. Load the Dataset

In [2]:
import pandas as pd
import numpy as np

# Link for data : https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

# Download the data from link above and load.
df = pd.read_table('SMSSpamCollection', header = None, encoding= 'utf-8')

In [3]:
# print useful information 
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# Check the class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Presprocess the Data

In [5]:
# convert class labels to binary values, 0 = ham, 1 = spam

from sklearn.preprocessing  import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [6]:
# store the SMS message data
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [7]:
# use regular expressions to replace email addresses, urls, phone numbers, usual numbers, sysmbols
#replace email addresses wit 'emailaddr'
#processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')
processed = text_messages.str.replace(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$', 'emailaddr')

# replace urls with 'webaddress'
#processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/|S*)?$','webaddress')
processed = processed.str.replace(r'^(http(s?)\:\/\/)*[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$','webaddress')

# replace money symbols with 'moneysymb'
#processed = processed.str.replace(r'£|\$', 'moneysymb')
processed = processed.str.replace(r'^([1-9]{1}[\d]{0,2}(\,[\d]{3})*(\.[\d]{0,2})?|[1-9]{1}[\d]{0,}(\.[\d]{0,2})?|0(\.[\d]{0,2})?|(\.[\d]{1,2})?)$', 'moneysymb')

# replace 10 digit phone numbers with 'phonenumber'
#processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')
processed = processed.str.replace(r'^([\+][0-9]{1,3}([ \.\-])?)?([\(]{1}[0-9]{3}[\)])?([0-9A-Z \.\-]{1,32})((x|ext|extension)?[0-9]{1,4}?)$', 'phonenumbr')


#replace normal numbers with 'numbr'
#processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')
processed = processed.str.replace(r'^\s*[+-]?\s*(?:\d{1,3}(?:(,?)\d{3})?(?:\1\d{3})*(\.\d*)?|\.\d+)\s*$', 'numbr')


In [8]:
# remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# replace whitespace between terms wit has single space
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')


In [9]:
# change words to lower case
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [10]:
# remove stop words from text messages

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [11]:
# remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [12]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u 750 pound prize 2 c...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [13]:
from nltk.tokenize import word_tokenize

# creating a bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)


In [14]:
# print the total number of words and the 15 most common words
print("Number of words: {}".format(len(all_words)))
print("Most common words: {}".format(all_words.most_common(15)))

Number of words: 7281
Most common words: [('u', 1207), ('call', 679), ('2', 533), ('go', 456), ('get', 452), ('ur', 391), ('4', 327), ('gt', 318), ('lt', 316), ('come', 304), ('free', 284), ('day', 276), ('know', 275), ('ok', 274), ('love', 266)]


In [15]:
# use the 1500 most common words as features
# zip * to extract keys in descending order of most common
k, v = zip(*all_words.most_common(1500)) 
word_features = list(k)

In [16]:
# define a find_features function
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

# Lets see an example
features = find_features(processed[2])
for key, value in features.items():
    if value == True:
        print(key)

2
free
text
txt
c
win
may
receiv
rate
question
appli
final
entri
wkli
comp
std
cup


In [17]:
processed[2]

'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri question std txt rate c appli 08452810075over18'

In [18]:
# find features for all messagaes
messages = zip(processed, Y)
messages = list(messages)           # 
# define a seed for reproducibility
seed = 1 
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS messages
features = [(find_features(text), label) for (text, label) in messages]

In [19]:
# split training and testing data sets using sklearn

from sklearn import model_selection

training, testing  = model_selection.train_test_split(features, test_size = 0.25, random_state = seed)

In [20]:
#print(len(training)


## 4. Scikit-Learn Classifiers with NLTK

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [22]:
# Define models to train
names = ['K Nearest NEighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifier = [
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    LogisticRegression(), 
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifier)
modelsp = set(models)              # set is helpful for printing the zip 
print(modelsp)

{('SVM Linear', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)), ('Logistic Regression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)), ('Naive Bayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)), ('SGD Classifier', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=100, n_iter_no_change=5, n_jobs=None, penalty='l2',
      

In [23]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in modelsp:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)* 100
    print("{}: Accuracy: {}".format(name, accuracy))

SVM Linear: Accuracy: 98.92318736539842




Logistic Regression: Accuracy: 98.77961234745155
Naive Bayes: Accuracy: 98.7078248384781
SGD Classifier: Accuracy: 98.49246231155779




Random Forest: Accuracy: 98.1335247666906
K Nearest NEighbors: Accuracy: 92.67767408470927
Decision Tree: Accuracy: 96.98492462311557


In [24]:
# ensemble method - Voting classifier
from sklearn.ensemble import VotingClassifier

classifiers = [
    KNeighborsClassifier(), 
    DecisionTreeClassifier(), 
    RandomForestClassifier(), 
    LogisticRegression(), 
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)
models = set(models)

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing)* 100
print("Enseble Method Accuracy: {}".format(accuracy))

Enseble Method Accuracy: 99.21033740129216


In [25]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [26]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
      confusion_matrix(labels, prediction),
      index = [['actual', 'actual'], ['ham', 'spam']],
      columns = [['predicted', 'predicted'], ['ham', 'spam']]
)


              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1217
           1       1.00      0.94      0.97       176

    accuracy                           0.99      1393
   macro avg       1.00      0.97      0.98      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1217,0
actual,spam,11,165
