In [1]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np
df=pd.read_table('SMSSpamCollection',header=None,encoding='utf-8')
print(df.info())
print(df.head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 43.6+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
#check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [3]:
## 2. Preprocess the Data
#convert class labels into binary values: ham=0,spam=1
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(classes)
print(classes[:10])
print(y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [4]:
#store sms message data
text_messages =df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [5]:
#use regular expression to replace email,phoneno and symbols
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')
processed = processed.str.replace(r'^(http(s?)\:\/\/)*[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$','webaddress')
processed = processed.str.replace(r'\$?([0-9]{1,3},([0-9]{3},)*[0-9]{3}|[0-9]+)(.[0-9][0-9])?£$','moneysymb')
processed = processed.str.replace(r'^((\(\d{3}\) ?)|(\d{3}-))?\d{3}-\d{4}','phonenumbr')
processed = processed.str.replace(r'^\d{5}$|^\d{5}-\d{4}$','numbr')

#remove punctuations
processed = processed.str.replace(r'[^\w\d\s]',' ')
#multiple-whitespaces with single one
processed = processed.str.replace(r'\s+',' ')
processed = processed.str.replace(r'^\s+|\s+?$','')

In [6]:
#changing to lowecase
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [7]:
#remove stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [8]:
#remove word stems using  a Portar stemmer
ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [9]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u 750 pound prize 2 c...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk.tokenize import word_tokenize
#bag-of-words
all_words=[]
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words = nltk.FreqDist(all_words)

In [12]:
#print total number of words and most common words
print('total words:{}'.format(len(all_words)))
print('most common words:{}'.format(all_words.most_common(15)))

total words:7274
most common words:[('u', 1207), ('call', 679), ('2', 533), ('go', 456), ('get', 451), ('ur', 391), ('4', 327), ('gt', 318), ('lt', 316), ('come', 304), ('free', 284), ('day', 276), ('know', 275), ('ok', 274), ('love', 266)]


In [13]:
#use only 1500 most common words
word_features = list(all_words.keys())[:1500]

In [14]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features
    
#example here
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [15]:
#find features for all messages
messages = list(zip(processed,y))

#define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#call find_features for each sms messages
featuresets = [(find_features(text), label) for (text,label) in messages]

In [16]:
#split training and testing data sets using sklearn
from sklearn import model_selection
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [17]:
print('Training:{}'.format(len(training)))
print('Testing:{}'.format(len(testing)))

Training:4179
Testing:1393


## 4.Scikit-learn classifiers with nltk

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [19]:
#define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest','Logistic Regression', 'SGD Classifier','Naive Bayes','SVM Linear']
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel= 'linear')
]

models = list(zip(names, classifiers))
print(models)

[('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')), ('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')), ('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf

In [21]:
#wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    print('{}:Accuracy: {}'.format(name,accuracy))

K Nearest Neighbors:Accuracy: 91.74443646805456
Decision Tree:Accuracy: 97.20028715003589
Random Forest:Accuracy: 98.20531227566404




Logistic Regression:Accuracy: 98.20531227566404
SGD Classifier:Accuracy: 98.06173725771716
Naive Bayes:Accuracy: 97.98994974874373
SVM Linear:Accuracy: 98.49246231155779


In [23]:
from sklearn.ensemble import VotingClassifier
#define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest','Logistic Regression', 'SGD Classifier','Naive Bayes','SVM Linear']
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel= 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Ensemble Method Accuracy: {}'.format(accuracy))

Ensemble Method Accuracy: 98.56424982053123


In [24]:
#make class label prediction for testing set
txt_features,labels = list(zip(*testing))
prediction = nltk_ensemble.classify_many(txt_features)

In [28]:
#print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index=[['actual', 'actual'], ['ham', 'spam']],
    columns=[['predicted', 'predicted'], ['ham', 'spam']]
)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1198
           1       0.99      0.90      0.95       195

    accuracy                           0.99      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1197,1
actual,spam,19,176
