In [1]:
import pandas as pd
import numpy as np
import nltk

## Load the Dataset

In [2]:
df=pd.read_table("SMSSpamCollection",header=None,encoding='utf-8')
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
df.value_counts()

0     1                                                                                                                                                                                                                                    
ham   Sorry, I'll call later                                                                                                                                                                                                                   30
      I cant pick the phone right now. Pls send a message                                                                                                                                                                                      12
      Ok...                                                                                                                                                                                                                                    10
      Wen ur lovable bcums angry wid u

In [5]:
classes=df[0]
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

## Preprocessing the Data

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(classes)


print(classes[:10])
print(y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [7]:
text_messages = df[1]
text_messages[:10]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object

In [8]:
import re 
#replace email
processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')
#replace url
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')
#replace moneysym
processed=processed.str.replace(r'£|\$','moneysymbol')
#replace phonenumber
processed=processed.str.replace(r'^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$','phonenumber')
#replace normal number
processed=processed.str.replace(r'\d+(\.\d+)?','number')

  processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')
  processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddr')
  processed=processed.str.replace(r'£|\$','moneysymbol')
  processed=processed.str.replace(r'^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$','phonenumber')
  processed=processed.str.replace(r'\d+(\.\d+)?','number')


In [9]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')

#removce whitespace
processed=processed.str.replace(r'\s+',' ')

#remove leading and trailing whitespace
processed=processed.str.replace(r'^\s+|\s+?$',' ')


  processed=processed.str.replace(r'[^\w\d\s]',' ')
  processed=processed.str.replace(r'\s+',' ')
  processed=processed.str.replace(r'^\s+|\s+?$',' ')


In [10]:
#change word to lower case
processed=processed.str.lower()
processed

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in number a wkly comp to win fa cup...
3            u dun say so early hor u c already then say 
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                 will ü b going to esplanade fr home 
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object

In [11]:
#remove stopword fromtext message
from nltk.corpus import stopwords

stop_words=set(stopwords.words('english'))

processed=processed.apply(lambda x : ' '.join(term for term  in x.split() if term not in stop_words))

In [12]:
#remove word stems using Porter stemmer 
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x : ' '.join(ps.stem(term) for term in x.split()))

In [13]:
processed[:10]

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri number wkli comp win fa cup final t...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl number week word back like fu...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil number month u r entitl updat latest col...
Name: 1, dtype: object

In [14]:
from nltk.tokenize import word_tokenize

#creating bag of words
all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words=nltk.FreqDist(all_words)

In [15]:
print(f'Number of words',len(all_words))
print(f'Most Common Words',all_words.most_common(15))

Number of words 6574
Most Common Words [('number', 2758), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbolnumb', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [16]:
#use the most common words  as features
word_features=list(all_words.keys())[:1500]

In [17]:
#define a find_features functionj
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features 


features=find_features(processed[0])
for key,value in features.items():
    if value==True:
        print (key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [18]:
processed[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [19]:
message=list(zip(processed,y))
seed=1
np.random.seed=seed
np.random.shuffle(message)

featuresets=[(find_features(text),label) for (text,label) in message]

In [20]:
from sklearn import model_selection

In [21]:
training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [22]:
print(f'training data length:',len(training))
print(f'testing data length:',len(testing))

training data length: 4179
testing data length: 1393


## SKLearn  classifiers  with NLTK

In [23]:
from sklearn.neighbors import  KNeighborsClassifier

In [24]:
from sklearn.tree         import  DecisionTreeClassifier
from sklearn.ensemble     import  RandomForestClassifier
from sklearn.linear_model import  LogisticRegression,SGDClassifier
from sklearn.naive_bayes  import  MultinomialNB
from sklearn.svm          import  SVC
from sklearn.metrics      import  classification_report,accuracy_score,confusion_matrix

In [25]:
names=['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifier=[KNeighborsClassifier(),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            LogisticRegression(),
            SGDClassifier(max_iter=100),
            MultinomialNB(),
            SVC(kernel='linear')]
models=zip(names,classifier)
print(models)

<zip object at 0x0000021A98907600>


In [26]:
from nltk.classify.scikitlearn import SklearnClassifier

In [27]:
for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{}: Accuracy {}'.format(name,accuracy))

K Nearest Neighbors: Accuracy 93.96984924623115
Decision Tree: Accuracy 97.4156496769562
Random Forest: Accuracy 98.20531227566404
Logistic Regression: Accuracy 98.63603732950466
SGD Classifier: Accuracy 98.27709978463747
Naive Bayes: Accuracy 98.06173725771716
SVM Linear: Accuracy 98.49246231155779


In [28]:
#voting classifier
from sklearn.ensemble import VotingClassifier


names=['K Nearest Neighbors','Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifier=[KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(max_iter=100),MultinomialNB(),SVC(kernel='linear')]
models=zip(names,classifier)



In [29]:
nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=list(models),voting='hard',n_jobs=-1))


nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble, testing)*100
print('Ensemble Method Accuracy : {} '.format(accuracy))

Ensemble Method Accuracy : 98.42067480258436 


In [30]:
txt_features,labels=zip(*testing)

prediction=nltk_ensemble.classify_many(txt_features)

In [31]:
#print a classfication report and confusion matrix
print(classification_report(labels,prediction))

pd.DataFrame(confusion_matrix(labels,prediction),
            index=[['actual','actual'],['ham','spam']],
            columns=[['predicted','predicted'],['ham','spam']])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1210
           1       0.99      0.89      0.94       183

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1208,2
actual,spam,20,163
