In [1]:
import pandas as pd
import numpy as np
import nltk
# Download NLTK's "popular" data collection. The text-cleaning cell further
# down loads nltk.corpus.stopwords, which presumably ships with this
# collection -- TODO confirm.
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |   

True

In [4]:
# Specify the columns to read
usecols = [0, 1]

# Read the file as ISO-8859-1. That codec (also known as latin1) maps every
# possible byte value to a character, so decoding can never raise
# UnicodeDecodeError and no fallback encodings are needed.
data = pd.read_csv("sms_spam.csv", usecols=usecols, encoding='ISO-8859-1')

# Display the first few entries to verify
print(data.head(5))

     v1                                                 v2
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
# Inspect the column names exactly as they were loaded from the CSV
data.columns

Index(['v1', 'v2'], dtype='object')

In [7]:
# Check the class distribution: number of non-null messages for each value of 'v1'
data.groupby('v1').count()

Unnamed: 0_level_0,v2
v1,Unnamed: 1_level_1
ham,4825
spam,747


In [9]:
# Give the columns descriptive names. rename() maps by the existing names, so
# re-running this cell can never attach 'label'/'text' to the wrong columns,
# which a positional `data.columns = [...]` assignment does once the column
# order has changed.
data = data.rename(columns={'v1': 'label', 'v2': 'text'})

# Binary target: 1 for spam, 0 for ham. Comparing against the label value
# directly does not depend on the column order of a dummy-variable frame, and
# yields an integer array regardless of the installed pandas version.
y = (data['label'] == 'spam').astype(int).values
data.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Clean the messages with NLTK and regular expressions: keep letters only,
# lowercase, tokenize, remove English stop words, and stem what remains.

from nltk.stem import SnowballStemmer
from nltk import TweetTokenizer
stopwords = nltk.corpus.stopwords.words('english')
import re

tk = TweetTokenizer()
stemmer = SnowballStemmer('english')
# Build the stop-word set once: it does not change between tokens, so
# rebuilding it inside the loop for every word is wasted work.
stopword_set = set(stopwords)


def clean_message(text):
    """Return the stemmed, stop-word-free form of one raw message.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stems of the remaining lowercase tokens joined by single spaces.
    """
    # Replace every character that is not an ASCII letter with a space
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Remove the literal text "Subject" (case-sensitive, applied before lowercasing)
    letters_only = re.sub("Subject", "", letters_only)
    tokens = tk.tokenize(letters_only.lower())
    # Stop words are filtered on the unstemmed token, then the survivors are stemmed
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stopword_set)


# Iterate over the column values instead of indexing data['text'][i], so the
# cell does not rely on the frame having a default 0..n-1 index.
corpus = [clean_message(text) for text in data['text']]

In [11]:
# Wrap the cleaned messages in a single-column frame ('text') for inspection
# and visualisation
df = pd.DataFrame(corpus, columns=['text'])
print(df)

                                                   text
0     go jurong point crazi avail bugi n great world...
1                                 ok lar joke wif u oni
2     free entri wkli comp win fa cup final tkts st ...
3                   u dun say earli hor u c alreadi say
4                  nah think goe usf live around though
...                                                 ...
5567  nd time tri contact u u pound prize claim easi...
5568                              b go esplanad fr home
5569                                  piti mood suggest
5570  guy bitch act like interest buy someth els nex...
5571                                     rofl true name

[5572 rows x 1 columns]


In [12]:
# Attach the label column to the cleaned text; rows are matched on the index
# and the outer join keeps every row from both sides
data = df.join(data[["label"]], how="outer")
print(data)

                                                   text label
0     go jurong point crazi avail bugi n great world...   ham
1                                 ok lar joke wif u oni   ham
2     free entri wkli comp win fa cup final tkts st ...  spam
3                   u dun say earli hor u c alreadi say   ham
4                  nah think goe usf live around though   ham
...                                                 ...   ...
5567  nd time tri contact u u pound prize claim easi...  spam
5568                              b go esplanad fr home   ham
5569                                  piti mood suggest   ham
5570  guy bitch act like interest buy someth els nex...   ham
5571                                     rofl true name   ham

[5572 rows x 2 columns]


In [13]:
# Hold out 20% of the cleaned messages for testing and train on the other 80%;
# the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    y,
    test_size=0.20,
    random_state=0,
)

In [14]:
# Creating the Bag of Words model: each message becomes a vector of term counts.
# The vocabulary is learned from the training messages only and then reused
# to encode the test messages, so no test-set information reaches the model.
# .toarray() converts the vectorizer's sparse output into a dense NumPy array.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
BOW_train=cv.fit_transform(X_train).toarray() #fitting only on the train set
BOW_test=cv.transform(X_test).toarray()

In [15]:
#Print the number of words in the vocabulary (features)
print(len(cv.vocabulary_))
#Print the array shapes: one row per message, one column per vocabulary term
print(BOW_train.shape)
print(BOW_test.shape)


5490
(4457, 5490)
(1115, 5490)


In [16]:
# Peek at the first 1000 rows of the dense training matrix; NumPy abbreviates
# the printout with ellipses
print(BOW_train[:1000])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [17]:
#import the classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [18]:
# Multinomial Naive Bayes with default settings, trained on the bag-of-words counts
classifier_MNB = MultinomialNB()
classifier_MNB.fit(BOW_train, y_train)

In [19]:
# Test the model after training: predict labels for the held-out bag-of-words vectors
test_results_MNB = classifier_MNB.predict(BOW_test)

In [20]:
# Fraction of test messages whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, test_results_MNB)
# The space in the format spec reserves a sign position, hence the leading
# blank in front of the printed percentage
print(f'{accuracy *100: .2f}%')

 98.48%


In [21]:
# k-nearest-neighbours classifier with default settings, trained on the bag-of-words counts
classifier_KNN = KNeighborsClassifier()
classifier_KNN.fit(BOW_train, y_train)

In [22]:
# Test the model after training: predict labels for the held-out bag-of-words vectors
test_results_KNN = classifier_KNN.predict(BOW_test)

In [23]:
# Fraction of test messages the KNN model labelled correctly; note that the
# name `accuracy` is reused, so the previous model's score is overwritten
accuracy = metrics.accuracy_score(y_test, test_results_KNN)
print(f'{accuracy *100: .2f}%')

 91.03%


In [24]:
# Random forest of 100 trees, each limited to depth 60; n_jobs=-1 lets
# scikit-learn use all available cores and random_state fixes the seed so the
# result is repeatable. Trained on the bag-of-words counts.
classifier_RF = RandomForestClassifier(max_depth=60,n_estimators=100,n_jobs=-1,random_state=2)
classifier_RF.fit(BOW_train, y_train)

In [25]:
# Test the model after training: predict labels for the held-out bag-of-words vectors
test_results_RF = classifier_RF.predict(BOW_test)

In [26]:
# Fraction of test messages the random forest labelled correctly
accuracy = metrics.accuracy_score(y_test, test_results_RF)
print(f'{accuracy *100: .2f}%')

 97.13%


In [27]:
# Creating the TFIDF model: the same train/test messages are re-encoded with
# TF-IDF weights instead of raw counts. As with the bag-of-words model, the
# vectorizer is fitted on the training messages only and .toarray() converts
# its sparse output into a dense NumPy array.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
TFIDF_train=tfidf.fit_transform(X_train).toarray() #fitting only on the trainset
TFIDF_test=tfidf.transform(X_test).toarray()

In [28]:
# Number of distinct terms in the TF-IDF vocabulary
print(len(tfidf.vocabulary_))

5490


In [29]:
# Shapes of the TF-IDF matrices: one row per message, one column per vocabulary term
print(TFIDF_train.shape)
print(TFIDF_test.shape)

(4457, 5490)
(1115, 5490)


In [30]:
# Re-fit the existing MultinomialNB object on the TF-IDF features. This
# replaces the model that was fitted on the bag-of-words counts, so the
# bag-of-words version is no longer available after this cell runs.
classifier_MNB.fit(TFIDF_train, y_train)

In [31]:
# Test the model after training: predict labels for the held-out TF-IDF
# vectors (overwrites the earlier bag-of-words predictions stored under this name)
test_results_MNB = classifier_MNB.predict(TFIDF_test)

In [32]:
# Fraction of test messages labelled correctly by Multinomial NB; when the
# cells are run in order, test_results_MNB holds the TF-IDF predictions here
accuracy = metrics.accuracy_score(y_test, test_results_MNB)
print(f'{accuracy *100: .2f}%')

 96.05%


In [33]:
# Re-fit the existing KNN object on the TF-IDF features, replacing the
# bag-of-words fit
classifier_KNN.fit(TFIDF_train, y_train)

In [34]:
# Test the model after training: predict labels for the held-out TF-IDF
# vectors (overwrites the earlier bag-of-words predictions stored under this name)
test_results_KNN = classifier_KNN.predict(TFIDF_test)

In [35]:
# Fraction of test messages labelled correctly by KNN; when the cells are run
# in order, test_results_KNN holds the TF-IDF predictions here
accuracy = metrics.accuracy_score(y_test, test_results_KNN)
print(f'{accuracy *100: .2f}%')

 91.03%


In [36]:
# Re-fit the existing random forest object on the TF-IDF features, replacing
# the bag-of-words fit
classifier_RF.fit(TFIDF_train, y_train)

In [38]:
# Test the model after training: predict labels for the held-out TF-IDF
# vectors (overwrites the earlier bag-of-words predictions stored under this name)
test_results_RF = classifier_RF.predict(TFIDF_test)

In [39]:
# Fraction of test messages labelled correctly by the random forest; when the
# cells are run in order, test_results_RF holds the TF-IDF predictions here
accuracy = metrics.accuracy_score(y_test, test_results_RF)
print(f'{accuracy *100: .2f}%')

 97.04%
