In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

print('Python {}'.format(sys.version))
print('Nltk {}'.format(nltk.__version__))
print('Pandas {}'.format(pandas.__version__))
print('Sklearn {}'.format(sklearn.__version__))

Python 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
Nltk 3.8.1
Pandas 2.0.3
Sklearn 1.3.0


In [None]:
# importing libraries
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
df = pd.read_table('SMSSPamCollection', header=None, encoding='utf-8')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# evaluate the data, check the first 5 rows
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Evaluating the distribution of the classed in the dataset
# 0 = ham, 1 = spam
classes = df[0]
classes.value_counts()

0
ham     4825
spam     747
Name: count, dtype: int64

## Preprocess the Data

In [7]:
Lb = LabelEncoder()
Y = Lb.fit_transform(classes)
print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [None]:
# displaying the first 10 rows of the dataset feature
text_messages = df[1]
text_messages[:10]

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object

In [9]:
# use regular expressions to replace email addresses, URLs, phone numbers, other numbers

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress', regex=True)

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress', regex=True)

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb', regex=True)
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr', regex=True)
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr', regex=True)

In [10]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ', regex =True)

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)

In [11]:
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [None]:
# remove stopwords - common words that do not add value to the analysis
# such as 'the', 'is', 'in', 'and', etc.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))

processed = processed.apply(lambda x : ' '.join(word for word in x.split() if word not in stop_words))

In [13]:
print(processed)

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry numbr wkly comp win fa cup final tk...
3                     u dun say early hor u c already say
4                  nah think goes usf lives around though
                              ...                        
5567    numbrnd time tried numbr contact u u moneysymb...
5568                          ü b going esplanade fr home
5569                                pity mood suggestions
5570    guy bitching acted like interested buying some...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [None]:
# stemming - reduce words to their root form
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x : ' '.join(ps.stem(word) for word in x.split() ))

In [15]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


## Feature Engineering

In [None]:
# tokenize the text and create a bag of words
from nltk.tokenize import word_tokenize

#creating Bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for word in words:
        all_words.append(word)
    
all_words = nltk.FreqDist(all_words)

In [17]:
print('Number of words {}'.format(len(all_words)))
print('15 most common words: {}'.format(all_words.most_common(15)))

Number of words 6579
15 most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [None]:
# using the 2800 most words as features
word_features = list(all_words.keys())[:2800]

In [19]:
def find_features(message):
    words = word_tokenize(message)
    
    features = {}
    
    for word in word_features:
        features[word] = (word in words)
        
    return features


features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)  

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [20]:
messages = list(zip(processed, Y))

seed = 1
np.random.seed = seed
np.random.shuffle(messages)

featuresets = [(find_features(text), label) for (text, label) in messages]

In [21]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size=0.25, random_state=seed)

In [22]:
print(len(training))
print(len(testing))

4179
1393


## Sklearn classifier with Nltk

In [24]:
# define models to train

names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression', 'SGD Classifier', 'Naive Bayes', 'SVM Linear']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel= 'linear')
]

models = zip(names, classifiers)


In [25]:
# rap the model inside nltk sklearnclassifier
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing) * 100
    
    print('{}: Accuracy: {}'.format(name, accuracy))

K Nearest Neighbors: Accuracy: 92.60588657573582
Decision Tree: Accuracy: 97.63101220387652
Random Forest: Accuracy: 98.56424982053123
Logistic Regression: Accuracy: 99.21033740129216
SGD Classifier: Accuracy: 99.13854989231874
Naive Bayes: Accuracy: 98.49246231155779
SVM Linear: Accuracy: 99.06676238334529


In [26]:
nltk_ensemble = SklearnClassifier(RandomForestClassifier())
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Random Forest Classifier: Accuracy {}'.format(accuracy))

Random Forest Classifier: Accuracy 98.63603732950466


In [27]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)


predictions = nltk_ensemble.classify_many(txt_features)

In [28]:
predictions

[0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [30]:
def word_features(sentence):
    return {word: True for word in sentence.split()}

# Extract features for each sentence
txt_features1 = [word_features(sentence) for sentence in ['free entri numbr wkli comp win fa cup final tk', 'congrats win free numbr wkli','congrats, win free moneysymb numbr.', 
                                                          'The story about the novel was great', "Congratulations! You have been randomly selected to receive a $1,000 gift card","win moneysymb", 'free wkli entri tk']]

# Classify using the ensemble
predics = nltk_ensemble.classify_many(txt_features1)
print(predics)

[1, 0, 0, 0, 0, 0, 0, 0]
