In [1]:
# Email Spam Classification Model
# -------------------------------
# This script implements two classification models to predict whether an email is spam or ham:
# 1. Naive Bayes
# 2. Support Vector Machine (SVM)
#
# The models are trained on a dataset (email.csv) with various features extracted from email content.
# The target variable indicates if the email is spam (1) or ham (0).
#
# Author: Sheheryar


In [2]:
# Importing libraries

import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Importing and reading dataset

df = pd.read_csv("email.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['Message'].isna().sum()

0

In [5]:
df['Message'].dtype

dtype('O')

In [6]:
df['Message'] = df['Message'].astype('string')

In [7]:
df['Message'].dtype

string[python]

In [8]:
# Function to clean text 
def clean_text(text):
    text = re.sub(r'[€$£]', '', text) # Remove currency symbols
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    text = text.lower()  # Convert to lowercase
    return text
df['Message'] = df['Message'].apply(clean_text)

In [9]:
df.head()

Unnamed: 0,Category,Message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [10]:
# Removing stop words and tokenization 

def tokenize_and_rem_stopwords(text):
    # Initialize stop words
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)  # Tokenize 
    filtered_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]  # Remove stopwords
    return filtered_tokens

df['token'] = df['Message'].apply(tokenize_and_rem_stopwords)

In [11]:
df.head()

Unnamed: 0,Category,Message,token
0,ham,go until jurong point crazy available only in ...,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,u dun say so early hor u c already then say,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,nah i dont think he goes to usf he lives aroun...,"[nah, dont, think, goes, usf, lives, around, t..."


In [12]:
# Create a Counter for each row
def token_counts(tokens):
    return dict(Counter(tokens))

# Apply the function to get token counts
token_counts_df = df['token'].apply(token_counts).apply(pd.Series).fillna(0).astype(int)

# Combine with the original DataFrame (optional)
df_new = pd.concat([df, token_counts_df], axis=1).drop(columns=['token'])

In [13]:
df_new.head()

Unnamed: 0,Category,Message,go,jurong,point,crazy,available,bugis,n,great,...,dental,nmde,dump,heap,lowes,salesman,pity,soany,suggestions,bitching
0,ham,go until jurong point crazy available only in ...,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,ham,ok lar joking wif u oni,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,free entry in 2 a wkly comp to win fa cup fina...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,u dun say so early hor u c already then say,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,nah i dont think he goes to usf he lives aroun...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_new.drop(columns='Message',inplace=True)

In [15]:
# Rename the column
df_new.rename(columns={'Category': 'Prediction'}, inplace=True)

df_new.head()


Unnamed: 0,Prediction,go,jurong,point,crazy,available,bugis,n,great,world,...,dental,nmde,dump,heap,lowes,salesman,pity,soany,suggestions,bitching
0,ham,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df_new['Prediction'].dtype

dtype('O')

In [17]:
df_new['Prediction'] = df_new['Prediction'].astype('string')
df_new['Prediction'].dtype

string[python]

In [18]:
# Map 'spam' to 1 and 'ham' to 0
df_new['Prediction'] = df_new['Prediction'].map({'spam': 1, 'ham': 0})
df_new.head()

Unnamed: 0,Prediction,go,jurong,point,crazy,available,bugis,n,great,world,...,dental,nmde,dump,heap,lowes,salesman,pity,soany,suggestions,bitching
0,0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
plt.figure(figsize = (8, 4))
sns.countplot(x='Prediction', data=df_new)
plt.title('Distribution of Spam and Ham emails')
plt.xlabel('Spam(1) or Ham(0)')
plt.ylabel('Count')
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Defining x and y for model training and testing
x = df_new.drop(columns='Prediction')
y = df_new['Prediction']

In [None]:
# Splitting dataset for training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 42)

In [None]:
# model training

nb = MultinomialNB()
nb.fit(x_train, y_train)

In [None]:
# testing model
y_pred_nb = nb.predict(x_test) 

In [None]:
sns.countplot(x=y_pred_nb)
plt.title('Distribution of Prediction using Navie Bayes')
plt.xlabel('Prediction Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Accuracy and classification report for Naive Bayes model

nb_acc = accuracy_score(y_test, y_pred_nb)
nb_class_rep = classification_report(y_test, y_pred_nb)

print('Naive Bayes Classifier: \nAccuracy:', nb_acc,'\nClassification Report:\n', nb_class_rep)

In [None]:
# training an SVM model

svm_class = SVC(kernel = 'linear', random_state = 42)
svm_class.fit(x_train, y_train)

In [None]:
# Testing

y_pred_svm = svm_class.predict(x_test)

In [None]:
sns.countplot(x=y_pred_svm)
plt.title('Distribution of Prediction using SVM')
plt.xlabel('Prediction Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Accuracy and classification report for SVM model

svm_acc = accuracy_score(y_test, y_pred_svm)
svm_class_rep = classification_report(y_test, y_pred_svm)

print('SVM Classifier: \nAccuracy:', svm_acc,'\nClassification Report:\n', svm_class_rep)

In [None]:
mod_pref = pd.DataFrame({
    'Model' : ['Naive Bayes', 'SVM'],
    'Accuracy' : [nb_acc, svm_acc]
})

sns.barplot(x = 'Model', y = 'Accuracy', data = mod_pref)
plt.title('Model Preformance Comparison')
plt.xlabel('Model')
plt.ylabel('Accurancy')
plt.ylim(0.9, 1.0)
plt.show()