# Email Preprocessing
The goal is to build a clean and efficient dataset for training and evaluating a spam/ham classification model.

In [1]:
from html.parser import HTMLParser
import email
import string
import nltk
import os

class HTMLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, data):
        self.fed.append(data)

    def getData(self):
        return ''.join(self.fed)

def stripTags(html):
    """Remove HTML tags from a string."""
    stripper = HTMLStripper()
    stripper.feed(html)
    return stripper.getData()


In [2]:
rawEmail = open("./trec07p/data/inmail.1").read()
print(rawEmail)

From RickyAmes@aol.com  Sun Apr  8 13:07:32 2007
Return-Path: <RickyAmes@aol.com>
Received: from 129.97.78.23 ([211.202.101.74])
	by speedy.uwaterloo.ca (8.12.8/8.12.5) with SMTP id l38H7G0I003017;
	Sun, 8 Apr 2007 13:07:21 -0400
Received: from 0.144.152.6 by 211.202.101.74; Sun, 08 Apr 2007 19:04:48 +0100
Message-ID: <WYADCKPDFWWTWTXNFVUE@yahoo.com>
From: "Tomas Jacobs" <RickyAmes@aol.com>
Reply-To: "Tomas Jacobs" <RickyAmes@aol.com>
To: the00@speedy.uwaterloo.ca
Subject: Generic Cialis, branded quality@ 
Date: Sun, 08 Apr 2007 21:00:48 +0300
X-Mailer: Microsoft Outlook Express 6.00.2600.0000
MIME-Version: 1.0
Content-Type: multipart/alternative;
	boundary="--8896484051606557286"
X-Priority: 3
X-MSMail-Priority: Normal
Status: RO
Content-Length: 988
Lines: 24

----8896484051606557286
Content-Type: text/html;
Content-Transfer-Encoding: 7Bit

<html>
<body bgcolor="#ffffff">
<div style="border-color: #00FFFF; border-right-width: 0px; border-bottom-width: 0px; margin-bottom: 0px;" align="

In [3]:
class EmailParser:
    def __init__(self):
        self.stemmer = nltk.PorterStemmer()
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.punctuation = list(string.punctuation)

    def parseEmail(self, emailPath):
        """Parse an email from a given path."""
        with open(emailPath, errors='ignore') as emailFile:
            message = email.message_from_file(emailFile)
        return None if not message else self.getEmailContent(message)

    def getEmailContent(self, message):
        """Extract subject, body, and content type from the email."""
        subject = self.tokenize(message['Subject']) if message['Subject'] else []
        body = self.getEmailBody(message.get_payload(), message.get_content_type())
        contentType = message.get_content_type()
        return {"subject": subject, "body": body, "contentType": contentType}

    def getEmailBody(self, payload, contentType):
        """Extract the body of the email."""
        body = []
        if isinstance(payload, str) and contentType == 'text/plain':
            return self.tokenize(payload)
        elif isinstance(payload, str) and contentType == 'text/html':
            return self.tokenize(stripTags(payload))
        elif isinstance(payload, list):
            for part in payload:
                body += self.getEmailBody(part.get_payload(), part.get_content_type())
        return body

    def tokenize(self, text):
        """Tokenize, clean punctuation, and apply stemming to the text."""
        for char in self.punctuation:
            text = text.replace(char, "")
        text = text.replace("\t", " ").replace("\n", " ")
        tokens = list(filter(None, text.split(" ")))
        return [self.stemmer.stem(word) for word in tokens if word not in self.stopwords]

In [4]:
import nltk
nltk.download('stopwords')

parser = EmailParser()
parsedEmail = parser.parseEmail("./trec07p/data/inmail.1")
print(parsedEmail)

{'subject': ['gener', 'ciali', 'brand', 'qualiti'], 'body': ['do', 'feel', 'pressur', 'perform', 'rise', 'occas', 'tri', 'viagra', 'anxieti', 'thing', 'past', 'back', 'old', 'self'], 'contentType': 'multipart/alternative'}


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sashvqz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
DATASET_PATH = "trec07p"

def parseIndex(indexPath, numElements):
    """Parse the index file and load email paths with labels."""
    parsedIndexes = []
    indexLines = open(indexPath).readlines()
    for i in range(numElements):
        mailData = indexLines[i].split(" ../")
        label = mailData[0]
        path = mailData[1].strip()
        parsedIndexes.append({"label": label, "emailPath": os.path.join(DATASET_PATH, path)})
    return parsedIndexes

def parseEmailWithIndex(emailIndex):
    """Parse an email given its index dictionary."""
    parser = EmailParser()
    parsedMail = parser.parseEmail(emailIndex["emailPath"])
    return parsedMail, emailIndex["label"]

In [6]:
exampleHtml = '<tr><td align="left"><a href="../../issues/51/16.html#article">Phrack World News</a></td>'
print(stripTags(exampleHtml))

Phrack World News


In [7]:
indexes = parseIndex("./trec07p/full/index", 10)
print(indexes)

[{'label': 'spam', 'emailPath': 'trec07p/data/inmail.1'}, {'label': 'ham', 'emailPath': 'trec07p/data/inmail.2'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.3'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.4'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.5'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.6'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.7'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.8'}, {'label': 'spam', 'emailPath': 'trec07p/data/inmail.9'}, {'label': 'ham', 'emailPath': 'trec07p/data/inmail.10'}]


In [9]:
index = parseIndex("./trec07p/full/index", 1)

import os
open(index[0]["emailPath"]).read()

emailContent, emailLabel = parser.parseEmail(index[0]["emailPath"]), index[0]["label"]
print("The email label is:", emailLabel)
print(emailContent)

The email label is: spam
{'subject': ['gener', 'ciali', 'brand', 'qualiti'], 'body': ['do', 'feel', 'pressur', 'perform', 'rise', 'occas', 'tri', 'viagra', 'anxieti', 'thing', 'past', 'back', 'old', 'self'], 'contentType': 'multipart/alternative'}


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
preparedEmail = [" ".join(emailContent['subject']) + " ".join(emailContent['body'])]

vectorizer = CountVectorizer()
X = vectorizer.fit(preparedEmail)

print("Email content:", preparedEmail, "\n")
print("Input features:", vectorizer.get_feature_names_out())

Email content: ['gener ciali brand qualitido feel pressur perform rise occas tri viagra anxieti thing past back old self'] 

Input features: ['anxieti' 'back' 'brand' 'ciali' 'feel' 'gener' 'occas' 'old' 'past'
 'perform' 'pressur' 'qualitido' 'rise' 'self' 'thing' 'tri' 'viagra']


In [11]:
X = vectorizer.transform(preparedEmail)
print("\nValues:\n", X.toarray())


Values:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


In [12]:
from sklearn.preprocessing import OneHotEncoder

preparedEmailTokens = [[token] for token in emailContent['subject'] + emailContent['body']]

encoder = OneHotEncoder(handle_unknown='ignore')
X = encoder.fit_transform(preparedEmailTokens)

print("Features:\n", encoder.get_feature_names_out())
print("\nValues:\n", X.toarray())

def createPreparedDataset(indexPath, numElements):
    features = []
    labels = []
    indexes = parseIndex(indexPath, numElements)
    for i in range(numElements):
        print("\rParsing email: {0}".format(i + 1), end='')
        emailContent, emailLabel = parser.parseEmail(indexes[i]["emailPath"]), indexes[i]["label"]
        features.append(" ".join(emailContent['subject']) + " ".join(emailContent['body']))
        labels.append(emailLabel)
    return features, labels

Features:
 ['x0_anxieti' 'x0_back' 'x0_brand' 'x0_ciali' 'x0_do' 'x0_feel' 'x0_gener'
 'x0_occas' 'x0_old' 'x0_past' 'x0_perform' 'x0_pressur' 'x0_qualiti'
 'x0_rise' 'x0_self' 'x0_thing' 'x0_tri' 'x0_viagra']

Values:
 [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

In [13]:
XTrain, yTrain = createPreparedDataset("./trec07p/full/index", 100)
vectorizer = CountVectorizer()

Parsing email: 100

In [14]:
XTrain = vectorizer.fit_transform(XTrain)
print(XTrain.toarray())
print("\nNumber of Features:", len(vectorizer.get_feature_names_out()))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Number of Features: 4911


In [15]:
import pandas as pd
pd.DataFrame(XTrain.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,0000,000000,00085,002,003,00450,009,01,01000u,0107,...,ӧanz,ӭѯ,ԡšݡ淶,լһʽ,չҵϣ,سŵþʊʊݾѯ,ڶҵţ,㶫иï26,饻jwk,쵼ã
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(XTrain, yTrain)

In [18]:
X, y = createPreparedDataset("./trec07p/full/index", 150)
XTest = X[100:]
yTest = y[100:]

XTest = vectorizer.transform(XTest)
yPred = model.predict(XTest)

Parsing email: 150

In [19]:
from sklearn.metrics import accuracy_score
print('Accuracy: {:.7f}'.format(accuracy_score(yTest, yPred)))

Accuracy: 0.9400000


In [20]:
X, y = createPreparedDataset(os.path.join(DATASET_PATH, "full/index"), 12000)
XTrain = X[:10000]
yTrain = y[:10000]
XTest = X[2000:]
yTest = y[2000:]

XTrain = vectorizer.fit_transform(XTrain)
XTest = vectorizer.transform(XTest)

Parsing email: 12000

In [21]:
model.fit(XTrain, yTrain)

In [22]:
yPred = model.predict(XTest)
print('Accuracy: {:.7f}'.format(accuracy_score(yTest, yPred)))

Accuracy: 0.9974000
