# 1. Load Dataset

In [3]:
import pandas as pd
import numpy as np

In [4]:
# load the dataset 
df = pd.read_table('SMSSpamCollection', header = None, encoding = 'utf-8')

In [5]:
#print useful informations
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 43.6+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
#check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# 2. Preprocessing of the data

In [9]:
from sklearn.preprocessing import LabelEncoder

#convert target value to binary values ham=0,spam=1
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

In [11]:
print(Y[:15])

[0 0 1 0 0 1 0 0 1 1 0 1 1 0 0]


In [12]:
#store the sms msg data
txt_msg = df[1]
print(txt_msg[:15])

0     Go until jurong point, crazy.. Available only ...
1                         Ok lar... Joking wif u oni...
2     Free entry in 2 a wkly comp to win FA Cup fina...
3     U dun say so early hor... U c already then say...
4     Nah I don't think he goes to usf, he lives aro...
5     FreeMsg Hey there darling it's been 3 week's n...
6     Even my brother is not like to speak with me. ...
7     As per your request 'Melle Melle (Oru Minnamin...
8     WINNER!! As a valued network customer you have...
9     Had your mobile 11 months or more? U R entitle...
10    I'm gonna be home soon and i don't want to tal...
11    SIX chances to win CASH! From 100 to 20,000 po...
12    URGENT! You have won a 1 week FREE membership ...
13    I've been searching for the right words to tha...
14                  I HAVE A DATE ON SUNDAY WITH WILL!!
Name: 1, dtype: object


In [17]:
#use regular expression to replace email addresses, urls,phone numbers,other numbers

#replace email address with 'email'
processed = txt_msg.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress')

#replace url with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

#replace money symbol with 'moneysym'
processed = processed.str.replace(r'\$', 'moneysymb')

#replace 10 digit ph number
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumber')

#replace numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?','number')

#rmove punctuation 
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#replace white space b/w terms with single space
processed = processed.str.replace(r'\s+', ' ')

#remove leading and trailing white space
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [18]:
#change the words to lower case
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [19]:
from nltk.corpus import stopwords

In [24]:
#remove stopwords from txt msgs
stp_wrd = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stp_wrd))

# 3. Generating features

In [34]:
import nltk
+nltk.download('punkt')
from nltk.tokenize import word_tokenize
#create bag of words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [35]:
#print total number of words and most commmon words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(10)))

Number of words: 7899
Most common words: [('number', 3051), ('u', 1207), ('call', 593), ('ur', 391), ('get', 390), ('gt', 318), ('lt', 316), ('ok', 293), ('free', 284), ('go', 283)]


In [36]:
#use the 1500 most common wordas as features
word_features = list(all_words.keys())[:1500]

In [45]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

In [46]:
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazy
available
bugis
n
great
world
la
e
buffet
cine
got
amore
wat


In [49]:
#lets do it for all the msgs
messages = list(zip(processed, Y))

#define a seed for reproduceability
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

#call find feature func for each sms message
featuresets = [(find_features(text), label) for (text,label) in messages]

In [50]:
#split the features sets into training and testing 
from sklearn import model_selection

training,testing = model_selection.train_test_split(featuresets, test_size=0.2, random_state=12)

In [51]:
print(len(training))
print(len(testing))

4457
1115


# 4. Apply the model

In [53]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [55]:
model = SklearnClassifier(SVC(kernel = 'linear'))
model.train(training)

accuracy = nltk.classify.accuracy(model,testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.65470852017937


In [56]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [58]:
#define models to train
names = ["K Nearest Neighbours", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier", "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = list(zip(names, classifiers))
print(models)

[('K Nearest Neighbours', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')), ('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')), ('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_lea

In [60]:
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print('{}: Accuracy: {}'.format(name, accuracy))

K Nearest Neighbours: Accuracy: 94.34977578475336
Decision Tree: Accuracy: 96.68161434977578




Random Forest: Accuracy: 98.38565022421525




Logistic Regression: Accuracy: 99.01345291479821
SGD Classifier: Accuracy: 98.83408071748879
Naive Bayes: Accuracy: 98.29596412556054
SVM Linear: Accuracy: 98.65470852017937


In [62]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbours", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier", "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = list(zip(name, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing)*100
print('Ensemble Method Accuracy: {}'.format(accuracy))

Ensemble Method Accuracy: 99.19282511210761


In [63]:
# make class label prediction for testing set
txt_features, labels = list(zip(*testing))
prediction = nltk_ensemble.classify_many(txt_features)

In [64]:
#classification report and confusion matrix
print(classification_report(labels,prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual','actual'],['ham','spam']],
    columns = [['predicted','predicted'],['ham','spam']])

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,966,0
actual,spam,9,140
