###                                                             Spam Email Classification

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
dataset = pd.read_csv('emails.csv')

In [5]:
dataset.columns

Index(['text', 'spam'], dtype='object')

In [6]:
dataset.shape

(5728, 2)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [12]:
#Checking for null values
dataset.isnull().sum()

text    0
spam    0
dtype: int64

In [13]:
#Checking for duplicates and removing them
dataset.drop_duplicates(inplace=True)
dataset.shape

(5695, 2)

In [48]:
dataset.head()

Unnamed: 0,text,spam
0,"[recollect, a, company, the, market, is, full,...",1
1,"[colza, attainder, and, penultimate, like, esm...",1
2,"[this, homeowner, you, have, been, pre, approv...",1
3,"[click, here, for, a, printable, version, of, ...",1
4,"[compatibility, ain, t, it, great, grow, old, ...",1


In [32]:
#Using Natural Language Processing to cleaning the text to make one corpus
# Cleaning the texts
import re      # Regular Expressions Library
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Every mail starts with Subject, hence remove this from text.
dataset['text']=dataset['text'].map(lambda text: text[1:])


# re.sub(pattern, repl, string, count=0, flags=0)   
# count = (maximum number of pattern occurrences to be replaced. if 0 replaces all.)
# substitutes/replaces string pattern.
# remove anything that is not a letter or number
dataset['text'] = dataset['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',str(text))).apply(lambda x: (x.lower()).split())

ps = PorterStemmer()
# Pandas.apply allow the users to pass a function and apply it on every single value of the Pandas series
corpus = dataset['text'].apply(lambda text_list:" ".join(list(map(lambda word:ps.stem(word),list(filter(lambda text:text not in set(stopwords.words('english')), text_list))))))

# Creating the Bag of words Model
# https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/
# https://www.geeksforgeeks.org/bag-of-words-bow-model-in-nlp/
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus.values).toarray()
y = dataset.iloc[:,1].values

In [47]:
dataset['text']

0       [recollect, a, company, the, market, is, full,...
1       [colza, attainder, and, penultimate, like, esm...
2       [this, homeowner, you, have, been, pre, approv...
3       [click, here, for, a, printable, version, of, ...
4       [compatibility, ain, t, it, great, grow, old, ...
                              ...                        
5723    [forwarded, by, shirley, crenshaw, hou, ect, o...
5724    [to, visit, lsu, shirley, will, fedex, the, re...
5725    [day, that, s, super, thank, you, so, very, mu...
5726    [5, 5290, to, set, it, up, vince, david, p, du...
5727    [fastest, model, just, got, faster, epis, anno...
Name: text, Length: 5695, dtype: object

In [35]:
# Splitting data into train and test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [36]:
# Fitting Naive Bayes Classifier to the training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha = 0.1, class_prior=None, fit_prior = True)
classifier.fit(X_train,y_train)

MultinomialNB(alpha=0.1)

In [37]:
# Predicting the test set results
y_pred = classifier.predict(X_test)

In [41]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[876,  13],
       [  4, 246]], dtype=int64)

In [42]:
# this function computes the subset accuracy
# normalizebool, default=True
# If False, return the number of correctly classified samples. Otherwise, return the fraction of correctly classified samples.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9850746268656716

In [43]:
accuracy_score(y_test,y_pred, normalize = False) # 1122 out of 1139

1122

In [45]:
# Applying K-Fold cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=X_train, y=y_train, cv=10)
print("Mean: ",accuracies.mean())
print("Std: ",accuracies.std())

Mean:  0.9850732600732602
Std:  0.0039054155825396274


In [None]:
# https://towardsdatascience.com/naive-bayes-classifier-how-to-successfully-use-it-in-python-ecf76a995069
# https://www.kaggle.com/balakishan77/spam-or-ham-email-classification#