In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier

#scikit learn
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
# Load the tab-separated SMS corpus.  With header=None pandas numbers the
# columns itself, so later cells address them as data[0] and data[1].
# NOTE(review): default quote handling may merge rows whose text contains
# unbalanced double quotes — confirm the row count against the dataset docs.
data = pd.read_table(
    "SMSSpamCollection",
    header=None,
    encoding="utf-8",
)

In [4]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Preprocessing

In [5]:
# Column 0 of the raw frame is used as the class label series.
classes=data[0]

In [6]:
# Preview the first few class labels.
classes.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: 0, dtype: object

In [7]:
# Turn the string class labels into integer codes for the classifiers.
encoder = LabelEncoder()
encoder.fit(classes)
y = encoder.transform(classes)

# Show the first ten encoded labels.
y[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [8]:
# Column 1 of the raw frame is used as the message text series.
text_msg=data[1]

In [9]:
# Preview the first few raw messages.
text_msg.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object

### Regular expression

In [10]:
# Normalise the raw messages with a sequence of regex substitutions.
# regex=True is passed explicitly on every call: pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently turn each of
# these steps into a no-op.
#
# NOTE(review): the email, URL and phone patterns are anchored with ^...$, so
# they only fire when the *entire* message is an address/number — confirm
# whether matching inside longer messages was intended.

# Replace email addresses with 'emailaddress'
processed = text_msg.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress', regex=True)

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress', regex=True)

# Replace money symbols with 'moneysymb' (£ can be typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb', regex=True)

# Replace 10 digit phone numbers (formats include parentheses, spaces,
# no spaces, dashes) with 'phonenumbr'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr', regex=True)

# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)

# Remove punctuation (each punctuation character becomes a space)
processed = processed.str.replace(r'[^\w\d\s]', ' ', regex=True)

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)

# Change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()

In [11]:
# Inspect the messages after the regex clean-up and lower-casing.
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [12]:
# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(message):
    """Return `message` with every stop-word token removed."""
    kept = [token for token in message.split() if token not in stop_words]
    return ' '.join(kept)


processed = processed.apply(_drop_stop_words)

In [13]:
# Preview the first five messages after stop-word removal.
processed[:5]

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: 1, dtype: object

In [14]:
# Reduce every remaining token to its Porter stem.
ps = PorterStemmer()


def _stem_message(message):
    """Return `message` with each whitespace-separated token stemmed."""
    return ' '.join(map(ps.stem, message.split()))


processed = processed.apply(_stem_message)

In [15]:
# Inspect the messages after stemming.
processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object

### Generating features

In [16]:
# Tokenise every cleaned message and count how often each token occurs
# across the whole corpus.
bag_of_words = nltk.FreqDist(
    token
    for message in processed
    for token in word_tokenize(message)
)

In [17]:
# The ten most frequent tokens with their counts.
bag_of_words.most_common(10)

[('numbr', 2648),
 ('u', 1207),
 ('call', 674),
 ('go', 456),
 ('get', 451),
 ('ur', 391),
 ('gt', 318),
 ('lt', 316),
 ('come', 304),
 ('moneysymbnumbr', 303)]

In [18]:
# Number of distinct tokens in the corpus.
len(bag_of_words)

6579

In [19]:
# Keep only the 1500 most frequent tokens as a token -> count mapping.
top_tokens = bag_of_words.most_common(1500)
bow = dict(top_tokens)

In [20]:
# The retained tokens form the feature vocabulary (dict iteration yields keys).
word_features = list(bow)

In [21]:
# Preview the first ten vocabulary terms.
word_features[:10]

['numbr', 'u', 'call', 'go', 'get', 'ur', 'gt', 'lt', 'come', 'moneysymbnumbr']

In [22]:
def find_features(message):
    """Build the boolean bag-of-words feature dict for one message.

    Parameters
    ----------
    message : str
        Message text. It is tokenised with ``word_tokenize`` exactly as
        given; no cleaning is applied here.

    Returns
    -------
    dict
        One entry per term in the module-level ``word_features`` list, in
        that order, mapping the term to ``True`` when it occurs among the
        message tokens and ``False`` otherwise.
    """
    # A set gives constant-time membership checks instead of rescanning the
    # token list once for every vocabulary term.
    tokens = set(word_tokenize(message))
    return {word: (word in tokens) for word in word_features}

# Example: list the vocabulary terms that are present in the first message.
features = find_features(processed[0])
for term, present in features.items():
    if present:
        print(term)

go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine


In [23]:
# Pair every cleaned message with its encoded label.
messages = zip(processed, y)

# Seed NumPy's global RNG for reproducibility.  This has to be a *call*:
# assigning to np.random.seed replaces the function with an int and leaves
# the generator unseeded.
seed = 1
np.random.seed(seed)

# Build one (feature dict, label) pair per SMS message.
featuresets = [(find_features(text), label) for (text, label) in messages]

### Split data into train and test sets

In [24]:
# Hold out 25% of the labelled feature sets for evaluation; the fixed
# random_state keeps the split the same on every run.
train, test = model_selection.train_test_split(
    featuresets,
    test_size=0.25,
    random_state=1,
)

In [25]:
# Report the size of the training and test partitions, one per line.
print(len(train), len(test), sep='\n')

4179
1393


In [26]:
# Candidate classifiers, compared on the held-out test set.
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression",
         "SGD Classifier", "Naive Bayes", "SVM Linear"]

# The stochastic estimators get an explicit random_state so the accuracies
# used for model selection are reproducible from run to run.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    LogisticRegression(),
    SGDClassifier(max_iter=100, random_state=seed),
    MultinomialNB(),
    SVC(kernel='linear'),
]

# Materialise the pairs: a bare zip is a one-shot iterator and would be
# empty if the loop below were run a second time.
models = list(zip(names, classifiers))

for name, model in models:
    # Wrap each scikit-learn estimator so it accepts NLTK-style
    # (feature dict, label) pairs.
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test) * 100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 94.54414931801867
Decision Tree Accuracy: 97.77458722182341
Random Forest Accuracy: 98.77961234745155
Logistic Regression Accuracy: 98.77961234745155
SGD Classifier Accuracy: 98.77961234745155
Naive Bayes Accuracy: 99.06676238334529
SVM Linear Accuracy: 98.77961234745155


Naive Bayes achieves the highest accuracy on the test set, so we select it as the final model.

In [27]:
# Unzip the held-out (feature dict, label) pairs into two parallel tuples.
txt_features, labels= zip(*test)

In [28]:
# Fit the selected model on the training partition; train() hands back the
# fitted classifier, which is then displayed as the cell output.
nb_classifier = SklearnClassifier(MultinomialNB()).train(train)
nb_classifier

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [29]:
# Predict a label for every held-out feature dict.
predict=nb_classifier.classify_many(txt_features)

### Classification report

In [30]:
# Per-class precision / recall / F1 of the predictions against the true labels.
print(classification_report(labels,predict))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1208
           1       0.97      0.96      0.96       185

    accuracy                           0.99      1393
   macro avg       0.98      0.98      0.98      1393
weighted avg       0.99      0.99      0.99      1393



### Confusion matrix

In [31]:
# Confusion matrix with the true labels passed first and the predictions second.
confusion_matrix(labels,predict)

array([[1202,    6],
       [   7,  178]], dtype=int64)

In [32]:
# Classify a single unseen message.
# NOTE(review): the raw sentence goes straight into find_features without the
# regex clean-up, lower-casing, stop-word removal and stemming applied to the
# training messages, so tokens such as 'Free' or 'tickets' may not line up
# with the vocabulary — consider running it through the same steps first.
example = find_features("Free entry in 2 a weekly competition to win world Cup final tickets.")

# classify_many takes a sequence of feature dicts, so the single dict is
# wrapped in a list rather than relying on how a bare mapping is handled.
nb_classifier.classify_many([example])

[1]

In [1]:
# Label encoding: 0 = ham (not spam), 1 = spam