# Spam Email Detection using Machine Learning Algorithms

# Importing Libraries

In [1]:
from email import message
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import string

# Loading and cleaning the dataset

In [2]:
df = pd.read_csv('spam.csv')
df.rename(columns={'v1': 'spam', 'v2': 'text'}, inplace=True)
df = df[['text', 'spam']]
df['spam'] = df['spam'].replace('ham', 0)
df['spam'] = df['spam'].replace('spam', 1)
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,text,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
#Cleaning text
def clean_text(text):
    #Removing the punctuation
    punct = [word for word in text if text not in string.punctuation]
    punct = ''.join(punct)

    #Removing the stopwords
    cleaned = [word for word in punct.split() if word.lower() not in stopwords.words('english')]
    return cleaned

# Converting text to a token matrix

In [4]:
vector = CountVectorizer(analyzer=clean_text)
messages = vector.fit_transform(df['text'])
count_array = messages.toarray()
pd.DataFrame(data=count_array,columns = vector.get_feature_names_out())

Unnamed: 0,!,!!,!!!,!!!!,!!''.,!1,!:-),!This,#,#150,...,åÒHarry,åÒIt's,åÔMORROW.,åÔrents,‰Û_,‰Û_.,‰Û_Thanks,‰ÛÏ,‰ÛÏHarry,‰ÛÒ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Performing a 10-fold cross validation to obtain the accuracy of each classifier

# Naive Bayes Classifier

In [5]:
naive_bayes_accuracy = np.mean(cross_val_score(MultinomialNB(), messages, df['spam'], scoring = "accuracy", cv = 10))
print("The 10-fold cross validation accuracy for the Naive Bayes Classifier is " + 
    str(100*round(naive_bayes_accuracy, 3)) + "%.")

The 10-fold cross validation accuracy for the Naive Bayes Classifier is 95.7%.


# Logistic Regression

In [6]:
LR_accuracy = np.mean(cross_val_score(LogisticRegression(), messages, df['spam'], scoring = "accuracy", cv = 10))
print("The 10-fold cross validation accuracy for Logistic Regression is " + 
    str(100*round(LR_accuracy, 3)) + "%.")

The 10-fold cross validation accuracy for Logistic Regression is 97.3%.


# Support Vector Machine

Linear Kernel

In [7]:
SVM_linear_accuracy = np.mean(cross_val_score(SVC(kernel = "linear"), messages, df['spam'], scoring = "accuracy", cv = 10))
print("The 10-fold cross validation accuracy for linear SVM is " + 
    str(100*round(SVM_linear_accuracy, 3)) + "%.")

The 10-fold cross validation accuracy for linear SVM is 97.7%.


Radial Kernel

In [8]:
SVM_radial_accuracy = np.mean(cross_val_score(SVC(kernel = "rbf"), messages, df['spam'], scoring = "accuracy", cv = 10))
print("The 10-fold cross validation accuracy for radial SVM is " + 
    str(100*round(SVM_radial_accuracy, 3)) + "%.")

The 10-fold cross validation accuracy for radial SVM is 95.89999999999999%.


Polynomial Kernel

In [9]:
SVM_polynomial_accuracy = np.mean(cross_val_score(SVC(kernel = "poly"), messages, df['spam'], scoring = "accuracy", cv = 10))
print("The 10-fold cross validation accuracy for polynomial SVM is " + 
    str(100*round(SVM_polynomial_accuracy, 3)) + "%.")

The 10-fold cross validation accuracy for polynomial SVM is 92.30000000000001%.


Sigmoid Kernel

In [10]:
SVM_sigmoid_accuracy = np.mean(cross_val_score(SVC(kernel = "sigmoid"), messages, df['spam'], scoring = "accuracy", cv = 10))
print("The 10-fold cross validation accuracy for sigmoid SVM is " + 
    str(100*round(SVM_sigmoid_accuracy, 3)) + "%.")

The 10-fold cross validation accuracy for sigmoid SVM is 97.6%.


# Summary

While most of the classifiers do very well, predicting the correct category 95-97 percent of the time, the Logistic regression, linear SVM, and sigmoid SVM, do the best job with ~ 97 percent CV accuracy each.