In [11]:
# Data-handling libraries plus NLTK, which the text clean-up cell relies on.
import pandas as pd
import numpy as np
import nltk
# Fetch NLTK's "popular" data collection; packages already present are reported as up-to-date.
# NOTE(review): the cells visible in this notebook only read the stopwords corpus,
# so nltk.download("stopwords") is probably sufficient — confirm before narrowing.
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |   

True

In [12]:
# Load the labelled e-mail dataset and preview its first five rows
data = pd.read_csv(filepath_or_buffer="email_spam.csv")
data.head(n=5)

Unnamed: 0,email,label
0,"Subject: what up , , your cam babe what are yo...",spam
1,Subject: want to make more money ? order confi...,spam
2,Subject: food for thoughts [ join now - take a...,spam
3,Subject: your pharmacy ta would you want cheap...,spam
4,Subject: bigger breast just from a pill image ...,spam


In [13]:
# Show the column labels of the loaded frame to confirm what read_csv produced.
data.columns

Index(['email', 'label'], dtype='object')

In [14]:
# Class balance: how many rows carry each label value
data.groupby(by='label').count()

Unnamed: 0_level_0,email
label,Unnamed: 1_level_1
ham,8336
spam,9494


In [15]:
# Rename the two columns to the names the rest of the notebook uses.
data.columns = ['text', 'label']
# Binary target vector: 1 for spam, 0 for ham.
# The comparison names the positive class explicitly instead of picking the second
# dummy column by position, which depended on the alphabetical order of the label
# values and produced a pandas-version-dependent dtype.
y = (data['label'] == 'spam').astype(int).values
data.head(5)

Unnamed: 0,text,label
0,"Subject: what up , , your cam babe what are yo...",spam
1,Subject: want to make more money ? order confi...,spam
2,Subject: food for thoughts [ join now - take a...,spam
3,Subject: your pharmacy ta would you want cheap...,spam
4,Subject: bigger breast just from a pill image ...,spam


In [16]:
# Clean every message with regular expressions plus NLTK's tokenizer and stemmer:
# keep letters only, drop the "Subject" marker, lowercase, tokenize,
# remove English stop words and stem whatever is left.

import re

from nltk import TweetTokenizer
from nltk.stem import SnowballStemmer

stopwords = nltk.corpus.stopwords.words('english')
# Build the lookup set once; rebuilding it for every token made the loop needlessly slow.
stopword_set = set(stopwords)

tk = TweetTokenizer()
stemmer = SnowballStemmer('english')


def clean_message(raw_text):
    """Return one message as a space-joined string of stemmed, non-stop-word tokens.

    Parameters
    ----------
    raw_text : str
        The original message text.

    Returns
    -------
    str
        Lowercased, letters-only text with "Subject" removed, stop words
        dropped and every remaining token stemmed.
    """
    message = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Removed before lowercasing, so only the capitalised form is stripped.
    message = re.sub("Subject", "", message)
    message = message.lower()
    tokens = tk.tokenize(message)
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stopword_set)


# Iterate over the column values directly so the result does not depend on the
# frame having a default 0..n-1 index.
corpus = [clean_message(text) for text in data['text']]

In [17]:
# Single-column frame holding the cleaned messages, printed for a quick look
df = pd.DataFrame(data={'text': corpus})
print(df)

                                                    text
0      cam babe look look companion friendship love d...
1      want make money order confirm order ship janua...
2           food thought join take free tour click remov
3      pharmaci ta would want cheap perscript http ww...
4      bigger breast pill imag load cli k info ship s...
...                                                  ...
17825  consent need desk honor chairman local everyth...
17826  see page save thousand e help get e cheap zv s...
17827  go time aaron offload misplac shove withstand ...
17828  extra time ejacul within minut penetr must ord...
17829  one time check rock money paper space winter u...

[17830 rows x 1 columns]


In [18]:
# Put each cleaned message back next to its label; rows are matched on the index
label_column = data["label"]
data = df.join(label_column, how="outer")
print(data)

                                                    text label
0      cam babe look look companion friendship love d...  spam
1      want make money order confirm order ship janua...  spam
2           food thought join take free tour click remov  spam
3      pharmaci ta would want cheap perscript http ww...  spam
4      bigger breast pill imag load cli k info ship s...  spam
...                                                  ...   ...
17825  consent need desk honor chairman local everyth...  spam
17826  see page save thousand e help get e cheap zv s...  spam
17827  go time aaron offload misplac shove withstand ...  spam
17828  extra time ejacul within minut penetr must ord...  spam
17829  one time check rock money paper space winter u...  spam

[17830 rows x 2 columns]


In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(corpus, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training messages only,
# then the same vocabulary is used to count words in the test messages.
# Both results are sparse matrices.
cv = CountVectorizer()
BOW_train = cv.fit_transform(raw_documents=X_train)
BOW_test = cv.transform(raw_documents=X_test)


In [24]:
# Vocabulary size, i.e. the number of feature columns
print(len(cv.vocabulary_))
# Dimensions of the train and test matrices, in that order
for bow_matrix in (BOW_train, BOW_test):
    print(bow_matrix.shape)


48091
(14264, 48091)
(3566, 48091)


In [25]:
# Print the stored entries of the first 1000 training rows as "(row, column) count" lines.
# NOTE(review): this is a very long dump; a much smaller slice would show the same format.
print(BOW_train[:1000])

  (0, 17141)	1
  (0, 8464)	1
  (0, 13649)	1
  (0, 39553)	1
  (0, 12957)	2
  (0, 24847)	2
  (0, 25398)	1
  (0, 21712)	1
  (0, 20141)	1
  (0, 35738)	1
  (0, 42331)	1
  (0, 10453)	1
  (0, 15991)	1
  (0, 25158)	1
  (0, 38861)	1
  (0, 34588)	1
  (0, 25456)	2
  (0, 35011)	2
  (0, 20275)	1
  (0, 41933)	1
  (0, 33668)	1
  (0, 2906)	2
  (0, 44023)	1
  (0, 24172)	1
  (0, 14161)	1
  :	:
  (999, 36133)	1
  (999, 14471)	1
  (999, 40286)	1
  (999, 16855)	1
  (999, 27170)	1
  (999, 32312)	1
  (999, 38216)	1
  (999, 23145)	1
  (999, 18443)	1
  (999, 44524)	2
  (999, 32425)	1
  (999, 24144)	1
  (999, 32588)	1
  (999, 27047)	1
  (999, 22953)	1
  (999, 10862)	3
  (999, 11453)	1
  (999, 17204)	1
  (999, 19209)	1
  (999, 23194)	1
  (999, 1463)	1
  (999, 31286)	1
  (999, 5479)	1
  (999, 23096)	1
  (999, 29442)	1


In [26]:
#import the classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [27]:
# Multinomial Naive Bayes trained on the bag-of-words counts
classifier_MNB = MultinomialNB()
classifier_MNB.fit(X=BOW_train, y=y_train)

In [28]:
# Predict labels for the held-out bag-of-words matrix
test_results_MNB = classifier_MNB.predict(X=BOW_test)

In [29]:
# Fraction of test messages whose predicted label matches the true one, shown as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_results_MNB)
print(f'{accuracy *100: .2f}%')

 98.46%


In [30]:
# Fit a default k-nearest-neighbours classifier on the full bag-of-words matrix.
# NOTE(review): the next cell rebinds classifier_KNN to a new instance and fits it on
# PCA-reduced data before any prediction is made, so this fit is not used by the
# cells visible here — confirm whether it is still needed.
classifier_KNN = KNeighborsClassifier()
classifier_KNN.fit(BOW_train, y_train)

In [31]:
from sklearn.decomposition import PCA

# Project the bag-of-words counts onto 100 principal components so that KNN works
# in a far smaller space than the full vocabulary.
# random_state fixes the solver's random initialisation so reruns give the same
# projection, in line with the seeded train/test split and random forest.
# NOTE(review): BOW_train is a sparse matrix; whether PCA accepts sparse input depends
# on the installed scikit-learn version (TruncatedSVD is the usual alternative) — confirm.
pca = PCA(n_components=100, random_state=0)
BOW_train_reduced = pca.fit_transform(BOW_train)  # components are learned from training data only
BOW_test_reduced = pca.transform(BOW_test)        # test data is projected with those same components

# Train and test the KNN classifier on the reduced data
classifier_KNN = KNeighborsClassifier()
classifier_KNN.fit(BOW_train_reduced, y_train)
test_results_KNN = classifier_KNN.predict(BOW_test_reduced)

In [32]:
# Accuracy of the KNN predictions on the test set, shown as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_results_KNN)
print(f'{accuracy *100: .2f}%')

 93.52%


In [33]:
# Random forest on the bag-of-words counts: 100 trees, depth capped at 60, fixed seed
classifier_RF = RandomForestClassifier(
    n_estimators=100,
    max_depth=60,
    random_state=2,
    n_jobs=-1,
)
classifier_RF.fit(X=BOW_train, y=y_train)

In [34]:
# Predict labels for the held-out bag-of-words matrix with the random forest
test_results_RF = classifier_RF.predict(X=BOW_test)

In [35]:
# Accuracy of the random-forest predictions on the test set, shown as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_results_RF)
print(f'{accuracy *100: .2f}%')

 96.80%


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: fitted on the training messages only, then applied unchanged
# to the test messages. Both results are sparse matrices.
tfidf = TfidfVectorizer()
TFIDF_train = tfidf.fit_transform(raw_documents=X_train)
TFIDF_test = tfidf.transform(raw_documents=X_test)


In [38]:
# Number of distinct terms in the vocabulary learned by the TF-IDF vectorizer.
print(len(tfidf.vocabulary_))

48091


In [39]:
# Dimensions of the TF-IDF train and test matrices, in that order
for tfidf_matrix in (TFIDF_train, TFIDF_test):
    print(tfidf_matrix.shape)

(14264, 48091)
(3566, 48091)


In [40]:
# Refit the existing Multinomial Naive Bayes object on the TF-IDF features.
# NOTE(review): this reuses classifier_MNB, so the model fitted on bag-of-words counts
# earlier is no longer available under that name after this cell runs.
classifier_MNB.fit(TFIDF_train, y_train)

In [41]:
# Predict labels for the held-out TF-IDF matrix
test_results_MNB = classifier_MNB.predict(X=TFIDF_test)

In [42]:
# Accuracy of Naive Bayes on TF-IDF features, shown as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_results_MNB)
print(f'{accuracy *100: .2f}%')

 98.40%


In [43]:
# Refit the existing KNN object on the TF-IDF features.
# NOTE(review): the bag-of-words KNN run was fitted on a 100-component PCA projection,
# whereas this fit receives the full TF-IDF matrix with no reduction, so the two KNN
# accuracies are not a like-for-like comparison — confirm this is intended.
classifier_KNN.fit(TFIDF_train, y_train)

In [44]:
# Predict labels for the held-out TF-IDF matrix with KNN
test_results_KNN = classifier_KNN.predict(X=TFIDF_test)

In [45]:
# Accuracy of KNN on TF-IDF features, shown as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=test_results_KNN)
print(f'{accuracy *100: .2f}%')

 68.56%
