Import libraries

In [1]:
import os

Specify the paths to the Enron email text folders:

In [2]:
# Folders enron1 through enron6 under "./Enron Dataset Tests", in numeric order.
paths = [f"./Enron Dataset Tests/enron{index}" for index in range(1, 7)]

Define the function used to pre-process emails:

After loading the datasets, pre-process them by lowercasing the text and removing newlines, punctuation, digits, extra whitespace, and English stop words.

In [3]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus into a fixed local directory.
# NOTE(review): "E:/nltk" is a machine-specific absolute path — presumably it is
# also on NLTK's data search path so stopwords.words() can find it; confirm, and
# consider making the directory configurable before sharing this notebook.
nltk.download("stopwords", download_dir="E:/nltk")

# Lazily-built cache so the stop-word corpus is read once instead of once per email.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_email(text, stop_words=None):
    """Normalise one email body into a space-separated string of kept tokens.

    The text is lowercased; every non-alphanumeric character (including
    newlines and underscores) becomes a space; digit runs are deleted; the
    result is split on whitespace and stop words are dropped.

    Args:
        text: Raw email text.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached).

    Returns:
        str: The remaining tokens joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Membership tests run once per token, so make them O(1).
        stop_words = set(stop_words)

    text = text.lower()
    # `\W` alone keeps "_" because it counts as a word character, so strip it
    # explicitly. Newlines are matched by `\W` as well.
    text = re.sub(r'[\W_]', ' ', text)
    # Digits are deleted rather than replaced by a space, so "abc123def" -> "abcdef".
    text = re.sub(r'\d+', '', text)

    # str.split() with no argument collapses whitespace runs and trims the ends.
    words = [word for word in text.split() if word not in stop_words]

    return ' '.join(words)

[nltk_data] Downloading package stopwords to E:/nltk...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import emoji

def handle_emoji(text):
    """Convert the emoji in each document to their textual shortcode form.

    Args:
        text: Iterable of document strings. A bare ``str`` would be iterated
            character by character, so wrap a single document in a list.

    Returns:
        list: One ``emoji.demojize`` result per input document, in input order.
    """
    # Hand the converted corpus back to the caller; a bare `return` here would
    # discard the work and yield None.
    return [emoji.demojize(doc) for doc in text]

In [5]:
import sys

# Put ../Handlers/ on the module search path so the `import preprocessing`
# that follows can resolve. The path is relative to the kernel's working
# directory. NOTE(review): presumably `traintest` lives there too — confirm.
sys.path.append("../Handlers/")

import preprocessing

[nltk_data] Downloading package punkt to E:/nltk...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Folder to load. The CSV-export cell further down only accepts the exact
# strings "./enron1" through "./enron6" and raises ValueError for anything else.
folder_path = "./enron2"

# load_emails returns two values, unpacked here as the email texts and their
# labels — presumably parallel sequences of equal length; verify in preprocessing.
texts, labels = preprocessing.load_emails(folder_path)
print(len(labels))

5857


Save the loaded Enron folder (enron1 through enron6) as a CSV file for later use

In [7]:
import pandas as pd

# One row per email: its label followed by its text.
df = pd.DataFrame({
    "label": labels,
    "text": texts
})

# Only these exact folder strings may be exported; anything else is rejected
# with the same ValueError as before. A tuple (not a set) keeps the membership
# test safe even if folder_path is unhashable.
EXPORTABLE_FOLDERS = tuple(f"./enron{number}" for number in range(1, 7))
if folder_path not in EXPORTABLE_FOLDERS:
    raise ValueError("No path exists")

# "./enronN" -> "./enronN.csv". escapechar lets pandas escape characters in
# the email text that would otherwise break the CSV.
df.to_csv(f"{folder_path}.csv", index=False, escapechar="\\")

Extract the URLs found in each email

In [8]:
# Flatten the URLs found in every email into a single list (duplicates are kept).
email_urls = [
    url
    for email_text in texts
    for url in preprocessing.extract_urls(email_text)
]

print(len(email_urls))

1687


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

# NLTK stop-word file ids are lowercase. "English" only resolves on
# case-insensitive filesystems (e.g. Windows) and fails elsewhere, so use the
# same "english" id as preprocess_email.
stop_words = stopwords.words("english")

# Build TF-IDF features over the raw email texts, dropping the stop words above.
# NOTE(review): the vectorizer is fitted on every email before any train/test
# split is visible here. If the traintest wrappers split internally, the IDF
# statistics include test documents — confirm whether that leakage matters.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X_tfidf = vectorizer.fit_transform(texts)
y_tfidf = labels

In [10]:
from traintest import SupportVectorMachine

# Train the project's SVM wrapper on the TF-IDF features and print its report.
# NOTE(review): the saved report covers 1172 of the 5857 loaded emails, so the
# train/test split presumably happens inside the traintest wrapper — confirm.
svm = SupportVectorMachine(X_tfidf, y_tfidf)
svm.train()
svm.evaluate()

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00       861
        spam       0.99      0.98      0.99       311

    accuracy                           0.99      1172
   macro avg       0.99      0.99      0.99      1172
weighted avg       0.99      0.99      0.99      1172

[[859   2]
 [  5 306]]


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words=stop_words)
X_count = count_vectorizer.fit_transform(texts)
y_count = labels

In [12]:
from traintest import NaiveBayes

# Naive Bayes is given raw term counts (CountVectorizer) instead of TF-IDF weights.
# NOTE(review): .toarray() materialises the full dense document-term matrix.
# If the wrapper accepts sparse input (verify in traintest), passing X_count
# directly would use far less memory.
nb = NaiveBayes(X_count.toarray(), y_count)
nb.train()
nb.evaluate()

              precision    recall  f1-score   support

         ham       0.96      0.98      0.97       861
        spam       0.95      0.89      0.92       311

    accuracy                           0.96      1172
   macro avg       0.96      0.94      0.95      1172
weighted avg       0.96      0.96      0.96      1172

[[847  14]
 [ 33 278]]


In [13]:
from traintest import RandomForest

# Random forest on the same TF-IDF features and labels used for the SVM.
# NOTE(review): no random seed is passed here — presumably the wrapper fixes
# one internally; confirm, otherwise results will vary between runs.
rf = RandomForest(X_tfidf, y_tfidf)
rf.train()
rf.evaluate()

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       861
        spam       1.00      0.94      0.97       311

    accuracy                           0.98      1172
   macro avg       0.99      0.97      0.98      1172
weighted avg       0.98      0.98      0.98      1172

[[860   1]
 [ 19 292]]


In [14]:
from traintest import DecisionTree

# Single decision tree on the TF-IDF features, for comparison with the forest.
# NOTE(review): no random seed is visible here either — verify in traintest.
dt = DecisionTree(X_tfidf, y_tfidf)
dt.train()
dt.evaluate()

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       861
        spam       0.94      0.95      0.94       311

    accuracy                           0.97      1172
   macro avg       0.96      0.96      0.96      1172
weighted avg       0.97      0.97      0.97      1172

[[841  20]
 [ 15 296]]


In [15]:
from traintest import KNearestNeighbors

# k-nearest-neighbours on the TF-IDF features.
# NOTE(review): k and the distance metric are not set here — presumably they
# are wrapper defaults; check traintest.
knn = KNearestNeighbors(X_tfidf, y_tfidf)
knn.train()
knn.evaluate()

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       861
        spam       0.99      0.92      0.95       311

    accuracy                           0.98      1172
   macro avg       0.98      0.96      0.97      1172
weighted avg       0.98      0.98      0.98      1172

[[857   4]
 [ 24 287]]


In [None]:
from traintest import LogisticRegressionModel

# Logistic regression on the same TF-IDF features, following the same
# construct -> train() -> evaluate() sequence as the other model cells.
lr = LogisticRegressionModel(X_tfidf, y_tfidf)
lr.train()
lr.evaluate()