In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK corpora used by the preprocessing step: the stop-word lists
# (read through `stopwords.words`) and WordNet (backing `WordNetLemmatizer`).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Irha
[nltk_data]     traders\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\Irha
[nltk_data]     traders\AppData\Roaming\nltk_data...


True

In [2]:
#1: Data Collection
# Load the e-mail dataset from a CSV file given by a relative path.
# The following cells read its 'text' and 'target' columns.
# NOTE(review): the displayed frame also contains an 'Unnamed: 0' column --
# presumably a row index that was saved with the CSV; if so, passing
# `index_col=0` would drop it. TODO confirm against the file.
df = pd.read_csv('email_dataset.csv')
#2: Pre-processing
# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache for the NLTK resources, so the stop-word list and the
# lemmatizer are created once rather than once per document.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, in order: lower-case, delete ASCII punctuation, split on
    whitespace, drop English stop words, lemmatize each remaining token.

    Note that punctuation is deleted (not replaced by a space), so a token
    such as ``a-b@c.d`` collapses into the single token ``abcd``.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            a missing cell in an object column) are treated as an empty
            document; any other non-string value is converted with ``str``.

    Returns:
        The cleaned document as a single string; ``''`` when nothing is left.
    """
    # Missing cells would otherwise crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    tokens = text.lower().translate(_PUNCT_TABLE).split()
    if not tokens:
        # Nothing to filter or lemmatize; skip loading the NLTK resources.
        return ''

    stop_words, lemmatizer = _text_resources()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )
# Overwrite the 'text' column with its cleaned form (one preprocess_text call per row).
df['text'] = df['text'].apply(preprocess_text)
# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,text,target
0,ilugadminlinuxie mon jul 29 112802 2002 return...,0
1,gort44excitecom mon jun 24 175421 2002 returnp...,1
2,forkadminxentcom mon jul 29 113957 2002 return...,1
3,dcm123btamailnetcn mon jun 24 174923 2002 retu...,1
4,ilugadminlinuxie mon aug 19 110247 2002 return...,0
...,...,...
5791,ilugadminlinuxie mon jul 22 181245 2002 return...,0
5792,forkadminxentcom mon oct 7 203702 2002 returnp...,0
5793,received hqpronsnet localhost 127001 hqpronsne...,1
5794,razorusersadminlistssourceforgenet thu sep 12 ...,0


In [4]:
#3: Feature extraction (hold-out split, then word counts re-weighted by TF-IDF)
X, y = df['text'], df['target']

# 30% of the rows are held out for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The vocabulary is learned from the training documents only; the test
# documents are encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# The TF-IDF weighting is likewise fitted on the training counts and then
# applied unchanged to the test counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [8]:
#4: Applying Spam Filter Algorithms
# Both models are fitted on the TF-IDF training matrix and then used to
# predict labels for the held-out TF-IDF test matrix.

#NAIVE BAYES MODEL
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_predictions = nb_model.predict(X_test_tfidf)

#DECISION TREE MODEL
# Tree construction is randomised (feature order / tie-breaking between equally
# good splits), so the seed is pinned to make the fitted tree -- and the metrics
# reported below -- reproducible across runs. 42 matches the train/test split seed.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)
dt_predictions = dt_model.predict(X_test_tfidf)

#Evaluating Accuracies
#evaluation function
def evaluate_model(y_actual, y_pred):
    """Score a set of predictions against the true labels.

    Precision, recall and F1 are computed with label ``1`` as the positive
    class.

    Args:
        y_actual: The true labels.
        y_pred: The predicted labels, aligned with ``y_actual``.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, conf_matrix)`` where
        ``conf_matrix`` is whatever ``sklearn.metrics.confusion_matrix``
        returns for the two label sequences.
    """
    return (
        accuracy_score(y_actual, y_pred),
        precision_score(y_actual, y_pred, pos_label=1),
        recall_score(y_actual, y_pred, pos_label=1),
        f1_score(y_actual, y_pred, pos_label=1),
        confusion_matrix(y_actual, y_pred),
    )

In [9]:
#Evaluating Algorithms and Comparing

#Evaluate Naive Bayes
# Score the Naive Bayes predictions on the held-out labels, then print one metric per line.
(
    nb_accuracy,
    nb_precision,
    nb_recall,
    nb_f1,
    nb_conf_matrix,
) = evaluate_model(y_test, nb_predictions)

print(
    f"Naive Bayes Accuracy: {nb_accuracy}",
    f"Naive Bayes Precision: {nb_precision}",
    f"Naive Bayes Recall: {nb_recall}",
    f"Naive Bayes F1 Score: {nb_f1}",
    f"Naive Bayes Confusion Matrix:\n{nb_conf_matrix}",
    sep="\n",
)

Naive Bayes Accuracy: 0.9252443933294997
Naive Bayes Precision: 1.0
Naive Bayes Recall: 0.7707231040564374
Naive Bayes F1 Score: 0.8705179282868526
Naive Bayes Confusion Matrix:
[[1172    0]
 [ 130  437]]


In [10]:
#Evaluating Decision Tree
# Score the Decision Tree predictions on the held-out labels and print one
# metric per line. The labels say "Decision Tree" (they previously read
# "Naive Bayes", copied from the cell above) so the two reports can be told apart.
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_conf_matrix = evaluate_model(y_test, dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")
print(f"Decision Tree Precision: {dt_precision}")
print(f"Decision Tree Recall: {dt_recall}")
print(f"Decision Tree F1 Score: {dt_f1}")
print(f"Decision Tree Confusion Matrix:\n{dt_conf_matrix}")

Naive Bayes Accuracy: 0.9764232317423807
Naive Bayes Precision: 0.9764492753623188
Naive Bayes Recall: 0.9506172839506173
Naive Bayes F1 Score: 0.9633601429848079
Naive Bayes Confusion Matrix:
[[1159   13]
 [  28  539]]
