In [1]:
import sys 
import pandas  
import numpy 
import sklearn 
import nltk
import string
print(f'Python : {sys.version})')
print(f'NLTK : {nltk.__version__}')
print(f'Scikit-Learn : {sklearn.__version__}')
print(f'Numpy : {numpy.__version__}')
print(f'Pandas : {pandas.__version__}')

Python : 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)])
NLTK : 3.4
Scikit-Learn : 0.20.3
Numpy : 1.16.2
Pandas : 0.24.2


## 1. Load the Dataset

In [2]:
import pandas as pd
import numpy as np 

#load the dataset 
df = pd.read_csv('SMSSpamCollection',header = None, encoding='utf-8',sep='\t')

In [3]:
#print useful information
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None


In [4]:
print(df.head())

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
#check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess the data

In [6]:
#covert class label to binary 

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(classes)

print(classes[:10])
print(y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [7]:
#store the text messages
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [8]:
#use regular expression email, urls , phone number , symobols 

#Replace 'email address' with 'emailaddr'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

#Replace 'url' with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

#Replace money symbols with 'moneysymb'
processed = processed.str.replace(r'£|\$','moneysymb')

#Replace '10 digit number' with 'phonenumber'
processed = processed.str.replace(r'^[^0-9]*(?:(\d)[^0-9]*){10}$','phonenumber')

#Replace normal number with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?','numbr')

In [9]:
#replace All puncutaion with ''
processed = processed.str.replace(r'[^\w\d\s]',' ')

#replace whitespaces between terms with single space
processed = processed.str.replace(r'\s+',' ')

#replace trailing and leading spaces with nothing
processed = processed.str.replace(r'^\s+|\s?$',' ')

In [10]:
#change all words to lower case
processed = processed.str.lower()
print(processed[:10])

0    go until jurong point crazy available only in ...
1                            ok lar joking wif u oni  
2    free entry in numbr a wkly comp to win fa cup ...
3        u dun say so early hor u c already then say  
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: 1, dtype: object


In [11]:
#remove stop words from text msgs

from nltk.corpus import stopwords
stopwords_english = set(stopwords.words('english'))

processed = processed.apply(lambda x:' '.join(term for term in x.split() if term not in stopwords_english))

In [12]:
#remove word stem using porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

In [13]:
print(processed[:10])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object


In [14]:
from nltk.tokenize import word_tokenize

#Create a bag of words model
all_words=[]
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words= nltk.FreqDist(all_words)


In [15]:
print(f'number of distinct words : {len(all_words)}')
all_words

number of distinct words : 6533


FreqDist({'numbr': 2602, 'u': 1207, 'call': 669, 'go': 454, 'get': 448, 'ur': 387, 'gt': 316, 'lt': 314, 'come': 303, 'moneysymbnumbr': 299, ...})

In [16]:
print(f'mostcommon words : {all_words.most_common(100)}')

mostcommon words : [('numbr', 2602), ('u', 1207), ('call', 669), ('go', 454), ('get', 448), ('ur', 387), ('gt', 316), ('lt', 314), ('come', 303), ('moneysymbnumbr', 299), ('ok', 292), ('free', 281), ('day', 276), ('know', 275), ('love', 266), ('like', 261), ('got', 252), ('time', 250), ('good', 247), ('want', 246), ('text', 230), ('send', 213), ('need', 190), ('one', 184), ('txt', 182), ('today', 181), ('take', 173), ('ü', 173), ('see', 173), ('home', 167), ('think', 166), ('stop', 165), ('repli', 162), ('lor', 162), ('r', 161), ('sorri', 160), ('still', 158), ('tell', 157), ('n', 155), ('mobil', 152), ('da', 151), ('back', 150), ('numbrp', 149), ('dont', 149), ('make', 148), ('k', 147), ('week', 141), ('say', 140), ('hi', 140), ('pleas', 139), ('phone', 139), ('work', 136), ('pl', 135), ('new', 135), ('later', 135), ('hope', 134), ('miss', 133), ('ask', 133), ('co', 129), ('meet', 128), ('messag', 125), ('msg', 124), ('night', 124), ('dear', 122), ('wait', 121), ('happi', 121), ('well

In [17]:
# use 1500 most common words as features
word_features = list(all_words.keys())[:1500]

In [18]:
#define a find features function
def find_features(message):
    words = word_tokenize(message)
    features ={}
    for word in word_features:
        features[word]=(word in words)
    return features


In [19]:
#find features for all messages
messages = []
for i in range(0,len(processed)):
    messages.append((processed[i],y[i]))

In [20]:
messages[:10]

[('go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
  0),
 ('ok lar joke wif u oni', 0),
 ('free entri numbr wkli comp win fa cup final tkt numbrst may numbr text fa numbr receiv entri question std txt rate c appli numbrovernumbr',
  1),
 ('u dun say earli hor u c alreadi say', 0),
 ('nah think goe usf live around though', 0),
 ('freemsg hey darl numbr week word back like fun still tb ok xxx std chg send moneysymbnumbr rcv',
  1),
 ('even brother like speak treat like aid patent', 0),
 ('per request mell mell oru minnaminungint nurungu vettam set callertun caller press numbr copi friend callertun',
  0),
 ('winner valu network custom select receivea moneysymbnumbr prize reward claim call numbr claim code klnumbr valid numbr hour',
  1),
 ('mobil numbr month u r entitl updat latest colour mobil camera free call mobil updat co free numbr',
  1)]

In [21]:
feature_set = [[find_features(message),category] for (message, category) in messages]

In [22]:
feature_set[0]

[{'go': True,
  'jurong': True,
  'point': True,
  'crazi': True,
  'avail': True,
  'bugi': True,
  'n': True,
  'great': True,
  'world': True,
  'la': True,
  'e': True,
  'buffet': True,
  'cine': True,
  'got': True,
  'amor': True,
  'wat': True,
  'ok': False,
  'lar': False,
  'joke': False,
  'wif': False,
  'u': False,
  'oni': False,
  'free': False,
  'entri': False,
  'numbr': False,
  'wkli': False,
  'comp': False,
  'win': False,
  'fa': False,
  'cup': False,
  'final': False,
  'tkt': False,
  'numbrst': False,
  'may': False,
  'text': False,
  'receiv': False,
  'question': False,
  'std': False,
  'txt': False,
  'rate': False,
  'c': False,
  'appli': False,
  'numbrovernumbr': False,
  'dun': False,
  'say': False,
  'earli': False,
  'hor': False,
  'alreadi': False,
  'nah': False,
  'think': False,
  'goe': False,
  'usf': False,
  'live': False,
  'around': False,
  'though': False,
  'freemsg': False,
  'hey': False,
  'darl': False,
  'week': False,
  'word

In [23]:
from random import shuffle 
shuffle(feature_set)

## 3.Creating training and test set 

In [24]:
print(len(feature_set))

5572


In [25]:
from sklearn.model_selection import train_test_split
training, testing = train_test_split(feature_set,test_size = 0.25,random_state=0)

In [26]:
print(f"Training :{len(training)}")
print(f"Testing :{len(testing)}")

Training :4179
Testing :1393


## 4.Deploying Scikit Machine Learning with NLTK

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [28]:
#Define Models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models =[]
for i in range(0,len(classifiers)):
    models.append((names[i],classifiers[i]))


In [29]:
# Wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for (name,model) in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model,testing)*100
    print(f'{name} : Accuracy : {accuracy}')

K Nearest Neighbors : Accuracy : 94.18521177315147
Decision Tree : Accuracy : 97.27207465900933




Random Forest : Accuracy : 97.05671213208902




Logistic Regression : Accuracy : 98.20531227566404




SGD Classifier : Accuracy : 97.84637473079684
Naive Bayes : Accuracy : 98.20531227566404
SVM Linear : Accuracy : 98.27709978463747


In [30]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models =[]
for i in range(0,len(classifiers)):
    models.append((names[i],classifiers[i]))
nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 98.27709978463747


In [33]:
#make class label prediction for testing set
txt_features,labels= zip(*testing)

prediction= nltk_ensemble.classify_many(txt_features)

In [34]:
print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1194
           1       0.98      0.90      0.94       199

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.98      0.95      0.97      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1190,4
actual,spam,19,180
