Import libraries

In [1]:
import os

Specify the input folder and output CSV path for each Enron email dataset:

In [2]:
# One entry per Enron dataset: "input" is the folder that is scanned for
# "ham" and "spam" sub-folders, "output" is the CSV file written for it.
paths = [
    {"input": "./enron1", "output": "./enron1.csv"},
    {"input": "./enron2", "output": "./enron2.csv"},
    {"input": "./enron3", "output": "./enron3.csv"},
    {"input": "./enron4", "output": "./enron4.csv"},
    {"input": "./enron5", "output": "./enron5.csv"},
    {"input": "./enron6", "output": "./enron6.csv"},
]

In [3]:
from email import message_from_string
import os

def extract_enron_features(file, file_type="ham"):
    """Read one Enron email text file and return its subject, body and label.

    Args:
        file: Path of the email text file. It is decoded as latin-1.
        file_type: Either "ham" or "spam"; selects the returned label.

    Returns:
        A dict with three keys:
        "Subject" - the text that follows the "Subject: " prefix on the first
        line (its line terminator is kept as read), or "" when the first line
        does not start with that prefix or the file is empty;
        "Body" - every line after the first, concatenated unchanged;
        "Label" - 0 for "ham", 1 for "spam".

    Raises:
        ValueError: If file_type is neither "spam" nor "ham".
    """
    if file_type not in ["spam", "ham"]:
        raise ValueError("Value not supported for classification")

    subject_prefix = "Subject: "

    with open(file, "r", encoding="latin-1") as f:
        lines = f.readlines()

    # An empty file has no first line; treat it as an email without subject.
    first_line = lines[0] if lines else ""
    if first_line.startswith(subject_prefix):
        # Slice by the real prefix length so the separating space is dropped.
        subject = first_line[len(subject_prefix):]
    else:
        subject = ""

    # The first line is always treated as the header line and left out of the
    # body. readlines() keeps each line's terminator, so the lines are joined
    # with "" to avoid doubling every line break.
    body = "".join(lines[1:])

    return {
        "Subject": subject,
        "Body": body,
        "Label": 0 if file_type == "ham" else 1
    }

In [4]:
# Smoke-test the extractor on a single ham message of the first dataset.
# The file is located through the configured input folder instead of an
# absolute, machine-specific path (whose backslashes were also being parsed
# as string escape sequences).
specific_path = os.path.join(paths[0]["input"], "ham")

b = extract_enron_features(
    os.path.join(specific_path, "5149.2002-01-04.farmer.ham.txt")
)
b["Label"]

0

In [5]:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import pandas as pd

# Build one CSV per dataset: read every ham and spam file of the dataset in
# parallel, collect the extracted records and write them out when the CSV is
# missing or its shape no longer matches the extracted data.
for pat in paths:
    specific_path = pat["input"]

    all_data = []

    for t in ["ham", "spam"]:
        # Bind the label so executor.map only has to supply the file path.
        extract_features = partial(
            extract_enron_features,
            file_type=t
        )

        folder = os.path.join(specific_path, t)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the resulting CSV reproducible between runs.
        available_paths = [
            os.path.join(folder, p) for p in sorted(os.listdir(folder))
        ]

        with ThreadPoolExecutor(max_workers=6) as executor:
            extracted = list(executor.map(extract_features, available_paths))

        all_data.extend(extracted)

    if not all_data:
        # Nothing was extracted (both folders empty): leave any existing CSV
        # untouched instead of failing on all_data[0] or writing an empty file.
        continue

    df = pd.DataFrame(all_data)

    # Rewrite only when there is no CSV yet or its row/column counts differ
    # from the freshly extracted data.
    needs_write = True
    if os.path.exists(pat["output"]):
        old_df = pd.read_csv(pat["output"])
        needs_write = old_df.shape != df.shape

    if needs_write:
        df.to_csv(pat["output"], index=False, escapechar="\\")

After loading the datasets, pre-process them by removing extra whitespace, punctuation, digits, and newlines.

In [6]:
import sys

# Make modules located in ../Handlers/ importable from this notebook.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run in the same kernel.
if "../Handlers/" not in sys.path:
    sys.path.append("../Handlers/")

import preprocessing

[nltk_data] Downloading package punkt to E:/nltk...
[nltk_data]   Package punkt is already up-to-date!


Extract the URLs provided in each email:

In [7]:
# Load the six per-dataset CSV files into one DataFrame each.
enron1, enron2, enron3, enron4, enron5, enron6 = map(
    pd.read_csv,
    (
        "./enron1.csv",
        "./enron2.csv",
        "./enron3.csv",
        "./enron4.csv",
        "./enron5.csv",
        "./enron6.csv",
    ),
)

## URL patterns test

<code>url_patterns_tests = [</code>

<code style="margin-left: 1cm">"visit us on the web at http : / / www . excuria . com / jobop /",</code>

<code style="margin-left: 1cm">"visit us on the web at http : / / www . excuria . com / incomeop /",</code>

<code style="margin-left: 1cm">"distribution system go to http : / / www . excuria . com / remove /"</code>

<code>]</code>

<code>from spam_email_patterns import url_patterns, date_patterns, time_pattern</code>

<code>import re</code>

<code>matches = [re.finditer(url_patterns, url_patterns_test, re.IGNORECASE) for url_patterns_test in url_patterns_tests]</code>

<code>for mat in matches:</code>

<code style="margin-left: 1cm">for ite in mat:</code>

<code style="margin-left: 2cm">print(ite.group())</code>

## Date and time patterns test

<code>patterns_tests = "time : fri , 15 apr 2005 14 : 59 : 58 - 0800"</code>

<code>m = re.search(date_patterns, patterns_tests)</code>

<code>print(m)</code>

<code>n = re.search(time_pattern, patterns_tests)</code>

<code>print(n)</code>

## Money value test

<code>from spam_email_patterns import money_pattern</code>

<code>texts = ["$ 24 . 65", "45 . 23 $", "$ 24 . 65 $", "24 . 65"]</code>

<code>for text in texts:</code>

<code style="margin-left: 1cm">match = re.search(money_pattern, text)</code>

<code style="margin-left: 1cm">print(f"{text} => {'Match' if match else 'No match'}")</code>

In [8]:
import preprocessing
import itertools
from collections import Counter
import matplotlib.pyplot as plt

# Text cleaner applied by EnronPreprocess: preprocessing.preprocess_text with
# its remove_numbers option fixed to True.
preprocession = partial(preprocessing.preprocess_text, remove_numbers=True)

class EnronPreprocess:
    """Preprocess one column of an Enron DataFrame and plot its word counts.

    Call preprocess_data() first; the visualisation methods work on its
    result and return without doing anything when it has not been run yet.
    """

    def __init__(self, enron: pd.DataFrame):
        """Store the DataFrame whose text column will be preprocessed."""
        self.__enron = enron
        # Initialised up front so the visualisation methods can detect that
        # preprocess_data() has not run yet instead of raising AttributeError.
        self.__preprocessed_data = None
        self.__word_list = []
        self.__word_counter = Counter()

    def preprocess_data(self, target_property="Body"):
        """Apply the module-level `preprocession` callable to one column.

        Args:
            target_property: Name of the DataFrame column to preprocess.

        Returns:
            The Series produced by applying `preprocession` to that column;
            it is also kept on the instance for the visualisation methods.
        """
        self.__preprocessed_data = self.__enron[target_property].apply(preprocession)
        return self.__preprocessed_data

    def __get_counter(self):
        """Flatten the preprocessed rows into one word list and count it."""
        # Assumes every preprocessed row is an iterable of words - TODO
        # confirm against preprocessing.preprocess_text.
        self.__word_list = list(itertools.chain.from_iterable(self.__preprocessed_data))
        self.__word_counter = Counter(self.__word_list)

    def visualize_wordcloud(self, minimum_occurance=4):
        """Print vocabulary statistics and draw a word cloud.

        Args:
            minimum_occurance: Words counted fewer than this many times are
                reported as rare in the printed summary.

        Returns:
            None. Does nothing when preprocess_data() has not been called.
        """
        # The guard has to run before the counter is built: building it
        # iterates over the preprocessed data.
        if self.__preprocessed_data is None:
            return

        self.__get_counter()

        print(f"Total number of words: {len(self.__word_counter.keys())}")
        print(f"Total number of words that appear less than {minimum_occurance} times")
        print(len([key for key, value in self.__word_counter.items() if value < minimum_occurance]))

        preprocessing.visualize_wordcloud(self.__word_list)

    def visualize_bar_chart(
            self,
            most_com=15,
            title="Top words frequencies",
            xlabel="Words",
            ylabel="Frequency"):
        """Draw a bar chart of the most frequent words.

        Args:
            most_com: Number of most common words to plot.
            title: Figure title.
            xlabel: Label of the x axis.
            ylabel: Label of the y axis.

        Returns:
            None. Does nothing when preprocess_data() has not been called or
            when there are no words to plot.
        """
        if self.__preprocessed_data is None:
            return

        self.__get_counter()

        most_common = self.__word_counter.most_common(most_com)
        if not most_common:
            # zip(*[]) yields nothing to unpack, so there is nothing to draw.
            return

        words, counts = zip(*most_common)

        plt.figure(figsize=(10, 6))
        plt.bar(words, counts, color="skyblue")
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.tight_layout()
        plt.show()

In [9]:
# Wrap dataset 1 and run the shared text cleaner over its "Body" column.
enron1_preprocess = EnronPreprocess(enron=enron1)
enron1_preprocessed = enron1_preprocess.preprocess_data(target_property="Body")

In [10]:
# Wrap dataset 2 and run the shared text cleaner over its "Body" column.
enron2_preprocess = EnronPreprocess(enron=enron2)
enron2_preprocessed = enron2_preprocess.preprocess_data(target_property="Body")

In [11]:
# Wrap dataset 3 and run the shared text cleaner over its "Body" column.
enron3_preprocess = EnronPreprocess(enron=enron3)
enron3_preprocessed = enron3_preprocess.preprocess_data(target_property="Body")

In [12]:
# Wrap dataset 4 and run the shared text cleaner over its "Body" column.
enron4_preprocess = EnronPreprocess(enron=enron4)
enron4_preprocessed = enron4_preprocess.preprocess_data(target_property="Body")

In [13]:
# Wrap dataset 5 and run the shared text cleaner over its "Body" column.
enron5_preprocess = EnronPreprocess(enron=enron5)
enron5_preprocessed = enron5_preprocess.preprocess_data(target_property="Body")

In [14]:
# Wrap dataset 6 and run the shared text cleaner over its "Body" column.
enron6_preprocess = EnronPreprocess(enron=enron6)
enron6_preprocessed = enron6_preprocess.preprocess_data(target_property="Body")

In [15]:
# Stack the six datasets into a single frame with a fresh 0..n-1 index.
merged_enron = pd.concat(
    [enron1, enron2, enron3, enron4, enron5, enron6],
    ignore_index=True,
)

In [16]:
# Wrap the merged frame and run the shared text cleaner over its "Body" column.
merged_enron_preprocess = EnronPreprocess(enron=merged_enron)
merged_enron_preprocessed = merged_enron_preprocess.preprocess_data(target_property="Body")