In [9]:
import os
import tarfile
import urllib.request
import certifi

In [14]:
import ssl


def _verified_https_context(*args, **kwargs):
    """Create a certificate-verifying TLS context that trusts certifi's CA bundle.

    Accepts the same arguments as ``ssl.create_default_context``; ``cafile``
    defaults to the bundle shipped with ``certifi`` unless the caller
    provides one.
    """
    import certifi  # local import keeps this cell self-contained

    kwargs.setdefault("cafile", certifi.where())
    return ssl.create_default_context(*args, **kwargs)


# Keep certificate verification ON. The previous version replaced this hook
# with ssl._create_unverified_context, which disables verification for every
# HTTPS request made by the process. Pointing it at certifi's CA bundle is the
# usual fix for "certificate verify failed" errors without giving up security.
ssl._create_default_https_context = _verified_https_context

In [15]:
# Base URL under which the corpus archives are published.
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
# Archive with the legitimate ("ham") emails.
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
# Archive with the spam emails.
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
# Local directory (relative to the working directory) where the archives are
# stored and unpacked.
SPAM_PATH = os.path.join("datasets", "spam")

In [16]:
def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    """Download (if missing) and unpack the ham and spam archives.

    Parameters
    ----------
    ham_url, spam_url : str
        URLs of the ``.tar.bz2`` archives to fetch.
    spam_path : str
        Directory that receives the archives (saved as ``ham.tar.bz2`` and
        ``spam.tar.bz2``) and their extracted contents. Created if needed.

    An archive already present in ``spam_path`` is not downloaded again, but
    it is still re-extracted on every call.
    """
    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            # Download to a temporary name first so that an interrupted
            # transfer never leaves a truncated file that looks complete.
            partial_path = path + ".part"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, path)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            if hasattr(tarfile, "data_filter"):
                # Reject absolute paths, ".." components and links that
                # escape the target directory (where Python supports it).
                tar_bz2_file.extractall(path=spam_path, filter="data")
            else:
                tar_bz2_file.extractall(path=spam_path)

In [17]:
fetch_spam_data()

Load all the emails

In [18]:
# Directories created by unpacking the two archives.
HAM_DIR= os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR= os.path.join(SPAM_PATH, "spam")
# Sorted listings, keeping only names longer than 20 characters.
# NOTE(review): presumably this filters out non-email helper files (such as
# an index file) that ship inside the archives -- confirm against the data.
ham_filenames= [name for name in sorted(os.listdir(HAM_DIR)) if len(name) >20]
spam_filenames= [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) >20]

In [20]:
len(ham_filenames)

2500

In [21]:
len(spam_filenames)

500

We can use Python's `email` module to parse these emails (it takes care of headers, encodings, etc.):

In [22]:
import email
import email.policy

In [24]:
def load_email(is_spam, filename, spam_path=SPAM_PATH):
    """Parse one email file from the corpus.

    Parameters
    ----------
    is_spam : bool
        Read from the ``spam`` sub-directory when true, ``easy_ham`` otherwise.
    filename : str
        Name of the file inside that sub-directory.
    spam_path : str
        Root directory containing the ``spam`` and ``easy_ham`` folders.

    Returns
    -------
    The message parsed with ``email.policy.default``.
    """
    # Import the parser explicitly: `import email` / `import email.policy`
    # do not guarantee that the `email.parser` submodule has been loaded.
    from email.parser import BytesParser

    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)

In [25]:
# Parse every listed file into a message object; both lists keep the sorted
# order of the filename listings.
ham_emails=[load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails=[load_email(is_spam=True, filename=name) for name in spam_filenames]

In [29]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


Some emails are multipart: they can contain attachments such as images or other files, and parts can themselves be nested. Let's explore the various email structures present in the dataset.

In [54]:
def get_email_structure(email):
    """Describe the MIME layout of ``email`` as a string.

    A plain string is returned unchanged. A message whose payload is a list of
    sub-parts yields ``"multipart(<part>, <part>, ...)"``, built recursively
    from the sub-parts. Any other message yields its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf part: its content type is the whole description.
        return email.get_content_type()
    described_parts = map(get_email_structure, parts)
    return "multipart(" + ", ".join(described_parts) + ")"


In [59]:
from collections import Counter
def structures_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure``.
    """
    return Counter(get_email_structure(message) for message in emails)


In [60]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [61]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

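Ham emails are mostly plain text, whereas a large share of spam emails contain HTML. The structure of an email therefore looks like useful information for the classifier.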

In [63]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [64]:

spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

Let's split the data into a training set and a test set.

In [67]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [68]:
X= np.array(ham_emails+spam_emails, dtype=object)


In [70]:
y= np.array([0]*len(ham_emails)+[1]*len(spam_emails))

In [71]:
y

array([0, 0, 0, ..., 1, 1, 1])

In [72]:
# Hold out 20% of the emails for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=.2, random_state=42)

Okay, let's start writing the preprocessing functions. First, we will need a function to convert HTML to plain text. Arguably the best way to do this would be to use the great BeautifulSoup library, but I would like to avoid adding another dependency to this project, so let's hack a quick & dirty solution using regular expressions (at the risk of un̨ho͞ly radiańcé destro҉ying all enli̍̈́̂̈́ghtenment). The following function first drops the `<head>` section, then converts all `<a>` tags to the word HYPERLINK, then gets rid of all remaining HTML tags, leaving only the plain text. For readability, it also replaces multiple newlines with single newlines, and finally it unescapes HTML entities (such as `&gt;` or `&nbsp;`):

In [73]:
import re
from html import unescape

In [82]:
def html_to_plain_text(html):
    """Crudely convert an HTML document to plain text using regular expressions.

    Steps, in order: drop the ``<head>`` section, replace every opening
    ``<a ...>`` tag with the word ``HYPERLINK``, strip all remaining tags,
    collapse runs of blank lines into one newline, and unescape HTML entities.

    Parameters
    ----------
    html : str
        The HTML source.

    Returns
    -------
    str
        The extracted text.
    """
    # Raw strings throughout: '\s' in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    # re.S lets '.' span newlines, since tags are often broken across lines.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.S)
    text = re.sub(r'(\s*\n)+', '\n', text)
    return unescape(text)

In [98]:
# Training-set spam emails whose structure is exactly "text/html".
html_spam_email= [email for email in X_train[y_train==1]
                 if get_email_structure(email)=="text/html"]
# Pick one of them as a running example and show the start of its raw HTML.
sample_html_spam= html_spam_email[4]
print(sample_html_spam.get_content().strip()[:1000], '...')

<HR>
<html>
<div bgcolor="#FFFFCC">

  <p align="center"><a
href="http://www.webbasedmailing.com"><img border="0"
src="http://www.webbasedmailing.com/Toners2goLogo.jpg"
width="349" height="96"></a></p>
<p align="center"><font size="6" face="Arial MT
Black"><i>Tremendous Savings</i>
on Toners,&nbsp;</font></p>
<p align="center"><font size="6" face="Arial MT
Black">
Inkjets, FAX, and Thermal Replenishables!!</font></p>
<p><a href="http://www.webbasedmailing.com">Toners 2 Go
</a>is your secret
weapon to lowering your cost for <a
href="http://www.webbasedmailing.com">High Quality,
Low-Cost</a> printer
supplies!&nbsp; We have been in the printer
replenishables business since 1992,
and pride ourselves on rapid response and outstanding
customer service.&nbsp;
What we sell are 100% compatible replacements for
Epson, Canon, Hewlett Packard,
Xerox, Okidata, Brother, and Lexmark; products that
meet and often exceed
original manufacturer's specifications.</p>
<p><i><font size="4">Check out these
p

See the conversion

In [99]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


   HYPERLINK
Tremendous Savings
on Toners, 
Inkjets, FAX, and Thermal Replenishables!!
 HYPERLINK Toners 2 Go
is your secret
weapon to lowering your cost for  HYPERLINK High Quality,
Low-Cost printer
supplies!  We have been in the printer
replenishables business since 1992,
and pride ourselves on rapid response and outstanding
customer service. 
What we sell are 100% compatible replacements for
Epson, Canon, Hewlett Packard,
Xerox, Okidata, Brother, and Lexmark; products that
meet and often exceed
original manufacturer's specifications.
Check out these
prices!
        Epson Stylus
Color inkjet cartridge
(SO20108):     Epson's Price:
$27.99    
Toners2Go price: $9.95!
         HP
LaserJet 4 Toner Cartridge
(92298A):           
HP's
Price:
$88.99           
Toners2Go
  price: $41.75!
 
Come visit us on the web to check out our hundreds
of similar bargains at  HYPERLINK Toners
2 Go!
  request to be excluded by visiting  HYPERLINK HERE
beverley
 ...


Let's create a function that takes an email as input and returns its content as plain text, whatever the email's format is:

In [102]:
def email_to_text(email):
    """Return the textual content of ``email``, preferring plain text over HTML.

    Every MIME part is visited in ``walk()`` order. The content of the first
    ``text/plain`` part is returned as is. If there is no such part, the last
    ``text/html`` part seen is converted with ``html_to_plain_text``.

    Returns
    -------
    str or None
        The text, or ``None`` when the email has no non-empty text part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Typically an unknown or broken charset declared by the sender:
            # fall back to the undecoded payload. Catching Exception rather
            # than using a bare `except` lets KeyboardInterrupt / SystemExit
            # propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [103]:
print(email_to_text(sample_html_spam)[:100], "...")


   HYPERLINK
Tremendous Savings
on Toners, 
Inkjets, FAX, and Thermal Replenishables!!
 HYPERLINK T ...


Let's throw in some stemming! For this to work, you need to install the Natural Language Toolkit (NLTK). It's as simple as running the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the --user option):

In [104]:
%pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 6.8 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.8.17-cp310-cp310-win_amd64.whl (263 kB)
     -------------------------------------- 263.0/263.0 kB 8.2 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ---------------------------------------- 96.6/96.6 kB 5.4 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
     ---------------------------------------- 78.5/78.5 kB 4.3 MB/s eta 0:00:00
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.3 nltk-3.7 regex-2022.8.17 tqdm-4.64.1
Note: you may need to restart the kernel to use updated packages.


In [107]:
# NLTK is optional: when it cannot be imported, `stemmer` is set to None so
# that code checking `stemmer is not None` can skip stemming.
try:
    import nltk
    stemmer = nltk.PorterStemmer()
    # Quick demonstration of what the stemmer does to related words.
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


We will also need a way to replace URLs with the word "URL". For this, we could use hard core regular expressions but we will just use the urlextract library. You can install it with the following command (don't forget to activate your virtualenv first; if you don't have one, you will likely need administrator rights, or use the --user option):

In [109]:
%pip install urlextract

Collecting urlextract
  Downloading urlextract-1.6.0-py3-none-any.whl (20 kB)
Collecting platformdirs
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Collecting uritools
  Downloading uritools-4.0.0-py3-none-any.whl (10 kB)
Collecting filelock
  Downloading filelock-3.8.0-py3-none-any.whl (10 kB)
Installing collected packages: uritools, platformdirs, filelock, urlextract
Successfully installed filelock-3.8.0 platformdirs-2.5.2 uritools-4.0.0 urlextract-1.6.0
Note: you may need to restart the kernel to use updated packages.


In [111]:
# urlextract is optional: when it cannot be imported, `url_extractor` is set
# to None so that code checking `url_extractor is not None` can skip URL
# replacement.
try:
    import urlextract
    url_extractor = urlextract.URLExtract()
    # Quick demonstration: both a bare domain and a full URL are detected.
    print(url_extractor.find_urls("It will detect google.com and https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454"))
except ImportError:
    # Name the missing module so the reader knows what to install.
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['google.com', 'https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454']


In [113]:
from sklearn.base import BaseEstimator, TransformerMixin

In [118]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn each email into a ``Counter`` mapping words to occurrence counts.

    Parameters
    ----------
    strip_headers : bool, default True
        Stored on the instance; ``transform`` does not read it.
    lower_case : bool, default True
        Lower-case the text before any other processing.
    remove_punctuation : bool, default True
        Replace each run of non-word characters with a single space.
    replace_urls : bool, default True
        Replace every detected URL with `` URL `` (only when the module-level
        ``url_extractor`` is available).
    replace_number : bool, default True
        Replace every number with ``NUMBER``.
    stemming : bool, default True
        Merge counts of words sharing a stem (only when the module-level
        ``stemmer`` is available).
    """

    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_number=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_number = replace_number
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a list holding one word-count ``Counter`` per email in ``X``."""
        return [self._count_words(message) for message in X]

    def _count_words(self, message):
        """Apply the enabled clean-up steps to one email and count its words."""
        text = email_to_text(message) or ""
        if self.lower_case:
            text = text.lower()
        if self.replace_urls and url_extractor is not None:
            # Longest URLs first -- presumably so that a short URL contained
            # in a longer one does not break the longer match.
            longest_first = sorted(url_extractor.find_urls(text), key=len, reverse=True)
            for url in longest_first:
                text = text.replace(url, " URL ")
        if self.replace_number:
            text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text, flags=re.M)
        counts = Counter(text.split())
        if not (self.stemming and stemmer is not None):
            return counts
        # Words with the same stem are merged and their counts added up.
        stemmed_counts = Counter()
        for word, occurrences in counts.items():
            stemmed_counts[stemmer.stem(word)] += occurrences
        return stemmed_counts
                

In [126]:
X_few= X_train[:3]
X_few_wordcounts= EmailToWordCounterTransformer().fit_transform(X_few)

In [127]:
X_few_wordcounts

[Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
 Counter({'some': 1,
          'interest': 1,
          'quot': 1,
          'url': 1,
          'thoma': 1,
          'jefferson': 2,
          'i': 2,
          'have': 2,
          'examin': 1,
          'all': 3,
          'the': 11,
          'known': 1,
          'superstit': 2,
          'of': 9,
          'word': 1,
          'and': 8,
          'do': 1,
          'not': 1,
          'find': 1,
          'in': 1,
          'our': 1,
          'particular': 1,
          'christian': 3,
          'one': 2,
          'redeem': 1,
          'featur': 1,
          'they': 1,
          'are': 1,
          'alik': 1,
          'found': 1,
          'on': 2,
          'fabl': 1,
          'mytholog': 1,
          'million': 1,
          'innoc': 1,
          'men': 1,
          'women': 1,
          'children': 1,
          'sinc': 1,
          'introduct': 1,
          'been': 2,
          'burnt': 1,
    

Now that we can convert emails into word counts, we can turn those word counts into vectors.

For this, we will create a transformer whose `fit()` method builds the vocabulary (an ordered list of the most common words) and whose `transform()` method uses that vocabulary to convert word counts into vectors.

In [122]:
from scipy.sparse import csr_matrix

In [123]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word-count mappings into a sparse count matrix.

    Parameters
    ----------
    vocabulary_size : int, default 1000
        Maximum number of words kept in the vocabulary.

    After ``fit``, ``vocabulary_`` maps each retained word to a column index
    starting at 1. Column 0 is used for every word outside the vocabulary.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Build ``vocabulary_`` from the most frequent words in ``X``.

        Each word contributes at most 10 per email to its total, so a single
        email that repeats a word many times cannot dominate the ranking.
        """
        capped_totals = Counter()
        for counts in X:
            capped_totals.update(
                {word: min(occurrences, 10) for word, occurrences in counts.items()}
            )
        top_words = capped_totals.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {
            word: rank for rank, (word, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``.

        Entry ``(i, j)`` holds the count, in email ``i``, of the word mapped
        to column ``j``; words missing from ``vocabulary_`` all go to column 0.
        """
        column_of = self.vocabulary_.get
        row_indices, col_indices, values = [], [], []
        for row, counts in enumerate(X):
            row_indices.extend([row] * len(counts))
            col_indices.extend(column_of(word, 0) for word in counts)
            values.extend(counts.values())
        return csr_matrix((values, (row_indices, col_indices)),
                          shape=(len(X), self.vocabulary_size + 1))
    

In [128]:
vocab_transformer= WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors= vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.intc'>'
	with 20 stored elements in Compressed Sparse Row format>

In [129]:
X_few_vectors.toarray()


array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],
       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]], dtype=int32)

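What does this matrix mean? Take the second row: the 99 in the first column means that this email contains 99 word occurrences that are not part of the vocabulary. The 11 next to it means that the first word of the vocabulary appears 11 times in this email, the 9 means that the second word appears 9 times, and so on. You can look at the vocabulary to see which words these are: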

In [130]:
vocab_transformer.vocabulary_

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'url': 5,
 'all': 6,
 'in': 7,
 'christian': 8,
 'on': 9,
 'by': 10}

We are now ready to train our first spam classifier! Let's transform the whole dataset:

In [132]:
from sklearn.pipeline import Pipeline

In [133]:
# Chain the two preprocessing steps: emails -> word counts -> sparse vectors.
preprocess_pipeline= Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer())
])

# Fitting on the training set only means the vocabulary is built from
# training emails alone.
X_train_transformed= preprocess_pipeline.fit_transform(X_train)

Now we can begin training our model

In [134]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [135]:
# 3-fold cross-validation on the training set, scored with the estimator's
# default scorer (accuracy for classifiers).
log_clf= LogisticRegression(random_state=42, solver='lbfgs', max_iter=1000)
score= cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished


In [136]:
score.mean()

0.985

An accuracy of 98.5% is nothing to scoff at. However, this dataset is really small (3,000 emails) and these are the "easy" ham examples, so we might not be so lucky with a larger or harder dataset. The procedure would be the same, though:
- Try other model and find the best base model
- Hyperparam tuning with GridSearchCV


In [137]:
from sklearn.metrics import precision_score, recall_score

# Only transform() here (no fitting), so the test emails are encoded with the
# vocabulary that was built from the training set.
X_test_transformed = preprocess_pipeline.transform(X_test)

# Fresh classifier trained on the whole training set.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Precision and recall for the positive class (label 1 = spam).
print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.88%
Recall: 97.89%
