In [130]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("downloads", "datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [131]:
fetch_spam_data()

In [132]:
SPAM_ROOT = os.path.join(SPAM_PATH, "spam")
HAM_ROOT = os.path.join(SPAM_PATH, "easy_ham")

spam_emails = [filename for filename in os.listdir(SPAM_ROOT) if filename != "cmds"]
ham_emails = [filename for filename in os.listdir(HAM_ROOT) if filename != "cmds"]

In [133]:
print(len(spam_emails), len(ham_emails))

500 2500


In [134]:
import email
import email.policy

def load_email(is_spam, filename, path=SPAM_PATH):
    dir = "spam" if is_spam else "easy_ham"
    with open(os.path.join(path, dir, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [135]:
ham_emails = [load_email(False, name) for name in ham_emails]
spam_emails = [load_email(True, name) for name in spam_emails]

In [136]:
print(spam_emails[0].get_content().strip())

Dear Homeowner,
 
Interest Rates are at their lowest point in 40 years!

We help you find the best rate for your situation by
matching your needs with hundreds of lenders!

Home Improvement, Refinance, Second Mortgage,
Home Equity Loans, and More! Even with less than
perfect credit!

This service is 100% FREE to home owners and new
home buyers without any obligation. 

Just fill out a quick, simple form and jump-start
your future plans today!


Visit http://61.145.116.186/user0201/index.asp?Afft=QM10






To unsubscribe, please visit:

http://61.145.116.186/light/watch.asp


In [137]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return f"multipart({' ,'.join([get_email_structure(sub_email) for sub_email in payload])}"
    else:
        return email.get_content_type()

In [138]:
from collections import Counter

def struct_counter(emails):
    structs = Counter()
    for email in emails:
        struct = get_email_structure(email)
        structs[struct] += 1
    return structs

In [139]:
struct_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain ,application/pgp-signature', 66),
 ('multipart(text/plain ,text/html', 8),
 ('multipart(text/plain ,text/plain', 4),
 ('multipart(text/plain', 3),
 ('multipart(text/plain ,application/octet-stream', 2),
 ('multipart(text/plain ,multipart(text/plain ,text/plain ,text/rfc822-headers',
  1),
 ('multipart(text/plain ,multipart(text/plain ,text/plain ,multipart(multipart(text/plain ,application/x-pkcs7-signature',
  1),
 ('multipart(text/plain ,application/x-java-applet', 1),
 ('multipart(text/plain ,text/enriched', 1),
 ('multipart(multipart(text/plain ,text/plain ,text/plain ,application/pgp-signature',
  1),
 ('multipart(text/plain ,multipart(text/plain', 1),
 ('multipart(text/plain ,application/ms-tnef ,text/plain', 1),
 ('multipart(text/plain ,video/mng', 1),
 ('multipart(text/plain ,application/x-pkcs7-signature', 1)]

In [140]:
struct_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain ,text/html', 45),
 ('multipart(text/html', 20),
 ('multipart(text/plain', 19),
 ('multipart(multipart(text/html', 5),
 ('multipart(text/plain ,image/jpeg', 3),
 ('multipart(text/html ,application/octet-stream', 2),
 ('multipart(multipart(text/plain ,text/html ,image/gif', 1),
 ('multipart/alternative', 1),
 ('multipart(text/plain ,application/octet-stream', 1),
 ('multipart(multipart(text/html ,application/octet-stream ,image/jpeg', 1),
 ('multipart(text/html ,text/plain', 1)]

In [141]:
for header, value in spam_emails[0].items():
    print(header + "\t:\t" + value)

Return-Path	:	<pamela4701@eudoramail.com>
Delivered-To	:	zzzz@localhost.spamassassin.taint.org
Received	:	from localhost (jalapeno [127.0.0.1])	by zzzzason.org (Postfix) with ESMTP id 5D14216F17	for <zzzz@localhost>; Mon,  9 Sep 2002 10:49:04 +0100 (IST)
Received	:	from jalapeno [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Mon, 09 Sep 2002 10:49:04 +0100 (IST)
Received	:	from smtp-ft1.fr.colt.net (smtp-ft1.fr.colt.net [213.41.78.25])    by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g899AfC06863 for    <webmaster@efi.ie>; Mon, 9 Sep 2002 10:10:41 +0100
Received	:	from mailsweeper.abc-arbitrage.com (mailhost2.abc-arbitrage.com    [213.41.18.43]) by smtp-ft1.fr.colt.net with ESMTP id g899AvS20929 for    <webmaster@efi.ie>; Mon, 9 Sep 2002 11:10:57 +0200
Received	:	from 210.214.94.76 (unverified) by mailsweeper.abc-arbitrage.com    (Content Technologies SMTPRS 4.2.10) with ESMTP id    <T5d3abf3ca1c0a8bf0537c@mailsweeper.abc-arbitrage.com>; M

In [142]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [143]:
from bs4 import BeautifulSoup

def clean_html(email):
    soup = BeautifulSoup(email, "html.parser")

    for data in soup("style", "script"):
        data.decompose()

    return " ".join(soup.stripped_strings)

In [144]:
html_spam_emails = [email for email in X_train[y_train == 1] if get_email_structure(email) == "text/html" ]

sample_html_email = html_spam_emails[0]
sample_html_email.get_content().strip()[:300]

'<HTML>\n<HEAD>\n</HEAD>\n<BODY>\n<FONT SIZE="4"><B> A man endowed with a 7 - 8" hammer is simply<br>\n better equipped than a man with a 5 - 6" hammer. <BR>\n<BR>Would you rather have<br>more than enough to get the job done or fall very short. It\'s totally up<br>to you. Our Methods are guaranteed to incre'

In [145]:
sample_html_email = clean_html(sample_html_email.get_content())
sample_html_email[:300]

'A man endowed with a 7 - 8" hammer is simply better equipped than a man with a 5 - 6" hammer. Would you rather have more than enough to get the job done or fall very short. It\'s totally up to you. Our Methods are guaranteed to increase your size by 1 - 3" Enter here and see how'

In [146]:
def email_to_text(email):

    for part in email.walk():
        c_type = part.get_content_type()

        if not c_type in ("text/plain", "text/html"):
            continue
    
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())

        if c_type == "text/plain":
            return content
        html = clean_html(content)
        
        if html:
            return html

In [147]:
print(email_to_text(html_spam_emails[0])[:200])

A man endowed with a 7 - 8" hammer is simply better equipped than a man with a 5 - 6" hammer. Would you rather have more than enough to get the job done or fall very short. It's totally up to you. Our


In [148]:
from sklearn.base import BaseEstimator, TransformerMixin
import urlextract
import re
import nltk

stemmer = nltk.PorterStemmer()
url_extractor = urlextract.URLExtract()

class EmailToWordCounter(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True, replace_urls=True, replace_numbers=True, stemming=True, ):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []

        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda x: len(x), reverse=True)
                for url in urls:
                    text = text.replace(url, "URL")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            
            word_counts = Counter(text.split())
            if self.stemming:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)



In [149]:
EmailToWordCounter().fit_transform([html_spam_emails[0]])

array([Counter({'number': 6, 'a': 4, 'to': 3, 'man': 2, 'with': 2, 'hammer': 2, 'than': 2, 'you': 2, 'endow': 1, 'is': 1, 'simpli': 1, 'better': 1, 'equip': 1, 'would': 1, 'rather': 1, 'have': 1, 'more': 1, 'enough': 1, 'get': 1, 'the': 1, 'job': 1, 'done': 1, 'or': 1, 'fall': 1, 'veri': 1, 'short': 1, 'it': 1, 's': 1, 'total': 1, 'up': 1, 'our': 1, 'method': 1, 'are': 1, 'guarante': 1, 'increas': 1, 'your': 1, 'size': 1, 'by': 1, 'enter': 1, 'here': 1, 'and': 1, 'see': 1, 'how': 1})],
      dtype=object)

In [150]:
from scipy.sparse import csr_matrix

class WordCounterToVector(BaseEstimator, TransformerMixin):
    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += count
        most_common = total_count.most_common()[:self.vocab_size]
        self.vocab_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []

        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocab_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocab_size + 1))

In [151]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounter()),
    ("wordcount_to_vector", WordCounterToVector()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [153]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)

In [154]:
score

array([0.98875, 0.99   , 0.98375])

In [155]:
score.mean()

0.9874999999999999