In [164]:
import tarfile
from pathlib import Path
import urllib.request


def fetch_spam_data():
    """Download and unpack the SpamAssassin ham and spam corpora if missing.

    Archives are saved under ``datasets/spam`` (relative to the current
    working directory) and extracted there. An archive is only fetched when
    the directory it is expected to unpack to is not already present.

    Returns:
        list[Path]: ``[datasets/spam/easy_ham, datasets/spam/spam]``.
    """
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    # The archive names must correspond to the directories that are checked
    # and returned below ("easy_ham" and "spam"); otherwise every run
    # re-downloads and the returned paths do not exist.
    # NOTE(review): assumes these archives unpack to easy_ham/ and spam/ --
    # confirm against the corpus README.
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(path) as tar_bz2_file:
                if hasattr(tarfile, "data_filter"):
                    # The archive comes from the network: use the safe
                    # extraction filter where this Python provides it.
                    tar_bz2_file.extractall(path=spam_path, filter="data")
                else:
                    tar_bz2_file.extractall(path=spam_path)
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [165]:
# Fetch the corpora (downloading only what is missing) and unpack the two
# returned directory paths: ham first, spam second.
ham_dir, spam_dir = fetch_spam_data()

In [166]:
# Keep only the entries whose file name is longer than 20 characters,
# listed in sorted order.
ham_filenames = sorted(
    entry for entry in ham_dir.iterdir() if len(entry.name) > 20
)
spam_filenames = sorted(
    entry for entry in spam_dir.iterdir() if len(entry.name) > 20
)

In [167]:
len(ham_filenames)

2500

In [168]:
len(spam_filenames)

500

In [169]:
import email
import email.policy

def load_email(filepath):
    """Parse the e-mail stored at ``filepath`` into a message object.

    Args:
        filepath: Path of the raw message file; it is opened in binary mode.

    Returns:
        The message produced by ``BytesParser`` configured with
        ``email.policy.default``.
    """
    # Imported locally so the function keeps working even if the global name
    # `email` gets rebound (e.g. used as a loop variable in a notebook cell),
    # and so it does not rely on `email.parser` having been imported as a
    # side effect of some other module.
    from email import policy
    from email.parser import BytesParser

    with open(filepath, "rb") as f:
        return BytesParser(policy=policy.default).parse(f)

In [170]:
# Parse every file into a message object, keeping the order of the file lists.
ham_emails = list(map(load_email, ham_filenames))
spam_emails = list(map(load_email, spam_filenames))

In [171]:
print(ham_emails[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [172]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a compact string.

    A plain string is returned unchanged. When the message payload is a
    list, the result is ``multipart(...)`` wrapping the comma-separated
    structures of the sub-parts (computed recursively); otherwise the
    result is the message's content type.
    """
    if isinstance(email, str):
        return email

    payload = email.get_payload()
    if not isinstance(payload, list):
        # Leaf part: its content type is its structure.
        return email.get_content_type()

    multipart = ", ".join(map(get_email_structure, payload))
    return f"multipart({multipart})"

In [173]:
from collections import Counter

def structures_counter(emails):
    """Tally how many messages share each structure string.

    Returns a ``Counter`` keyed by the result of ``get_email_structure``
    for each item of ``emails``.
    """
    return Counter(map(get_email_structure, emails))

In [174]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [175]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [176]:
# One plain dict of header name -> value per message.
ham_headers = [
    {name: value for name, value in message.items()} for message in ham_emails
]
spam_headers = [
    {name: value for name, value in message.items()} for message in spam_emails
]

In [177]:
ham_headers[0]["Subject"]

'Re: New Sequences Window'

In [178]:
from sklearn.model_selection import train_test_split

# Ham messages first, spam second, so labels[i] describes emails[i]
emails = [*ham_emails, *spam_emails]
# 0 marks ham, 1 marks spam
labels = [0 for _ in ham_emails] + [1 for _ in spam_emails]

# # Split the emails and labels into training and test sets
# emails_train, emails_test, labels_train, labels_test = train_test_split(emails, labels, test_size=0.2, random_state=42)

In [179]:
from html2text import html2text

def convert_html_to_plain_text(html_content):
    """Convert ``html_content`` by delegating to ``html2text`` and return its result."""
    plain_text = html2text(html_content)
    return plain_text

In [180]:
# Show the converted text of the first spam message whose content type is HTML.
# The loop variable is deliberately not called `email`: a top-level loop
# variable is a global, so that name would rebind the imported `email` module
# for the rest of the session.
for spam_email in spam_emails:
    if spam_email.get_content_type() == 'text/html':
        plain_text_content = convert_html_to_plain_text(spam_email.get_content())
        print(plain_text_content)
        break

* * *  
  
---  
  
* * *  
  
  
  
Save up to 70% on Life Insurance. Why Spend More Than You Have To?  Life Quote
Savings

__**  
_**__

**Ensuring your family's financial security is very important. Life Quote
Savings makes buying life insurance simple and affordable. We Provide FREE
Access to The Very Best Companies and The Lowest Rates.**  
---  
| **Life Quote Savings** is FAST, EASY and SAVES you money! Let us help you
get started with the best values in the country on new coverage. You can SAVE
hundreds or even thousands of dollars by requesting a FREE quote from
Lifequote Savings. Our service will take you less than 5 minutes to complete.
Shop and compare. SAVE up to 70% on all types of Life insurance!  
---  
  
  

**[Click Here For Your Free Quote!](http://website.e365.cc/savequote/)**

** Protecting your family is the best investment you'll ever make!  
**  
  
  
**  
  
  
  
  

  
**  
  
  
  

  

  
**  
  
If you are in receipt of this email in error and/or wish to

In [181]:
def email_to_text(email):
    """Extract readable text from a message, preferring plain text over HTML.

    Walks every part of the message. The content of the first ``text/plain``
    part encountered is returned immediately. If there is none, the last
    ``text/html`` part seen is converted with ``convert_html_to_plain_text``.

    Args:
        email: Message object exposing ``walk()``; its parts must provide
            ``get_content_type()``, ``get_content()`` and ``get_payload()``.

    Returns:
        str | None: The extracted text, or ``None`` when the message has no
        plain-text part and no non-empty HTML part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best effort (e.g. encoding issues): fall back to the raw payload.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return convert_html_to_plain_text(html)
    return None

In [182]:
print(email_to_text(spam_emails[0]))


* * *  
  
---  
  
* * *  
  
  
  
Save up to 70% on Life Insurance. Why Spend More Than You Have To?  Life Quote
Savings

__**  
_**__

**Ensuring your family's financial security is very important. Life Quote
Savings makes buying life insurance simple and affordable. We Provide FREE
Access to The Very Best Companies and The Lowest Rates.**  
---  
| **Life Quote Savings** is FAST, EASY and SAVES you money! Let us help you
get started with the best values in the country on new coverage. You can SAVE
hundreds or even thousands of dollars by requesting a FREE quote from
Lifequote Savings. Our service will take you less than 5 minutes to complete.
Shop and compare. SAVE up to 70% on all types of Life insurance!  
---  
  
  

**[Click Here For Your Free Quote!](http://website.e365.cc/savequote/)**

** Protecting your family is the best investment you'll ever make!  
**  
  
  
**  
  
  
  
  

  
**  
  
  
  

  

  
**  
  
If you are in receipt of this email in error and/or wish to

In [183]:
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from html import unescape
import re
import urllib.parse

class EmailToWordCountTransformer(BaseEstimator, TransformerMixin):
    """Turn parsed e-mail messages into per-message word counts.

    Each message is reduced to text, URLs and numbers are replaced by the
    placeholder tokens ``URL`` and ``NUMBER``, runs of non-word characters
    are collapsed to single spaces, and the lower-cased, whitespace-split
    words are counted.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a list with one ``Counter`` of words per message in ``X``."""
        X_transformed = []
        for email in X:
            text = self.get_plain_text_content(email)
            text = self.replace_urls(text)
            text = self.replace_numbers(text)
            text = self.replace_punctuation(text)
            word_counts = self.count_words(text)
            X_transformed.append(word_counts)
        return X_transformed

    def get_plain_text_content(self, email):
        """Return the text used to represent ``email``.

        For a multipart message, the payloads (as returned by
        ``get_payload()``) of its direct ``text/plain`` sub-parts are
        concatenated. For a single-part message the payload is returned
        whatever its content type.
        """
        parts = []
        if email.is_multipart():
            for part in email.iter_parts():
                ctype = part.get_content_type()
                if ctype == "text/plain":
                    parts.append(part.get_payload())
        else:
            parts.append(email.get_payload())
        return ''.join(parts)

    def replace_urls(self, text):
        """Replace tokens starting with ``http`` or ``www.`` by `` URL ``."""
        # The dot after "www" is escaped so it only matches a literal dot.
        return re.sub(r'http\S+|www\.\S+', ' URL ', text)

    def replace_numbers(self, text):
        """Replace each number by `` NUMBER ``.

        The fractional part and the exponent are independently optional, so
        ``3.14``, ``1e5`` and ``2.5e-3`` each become a single token.
        """
        return re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', ' NUMBER ', text)

    def replace_punctuation(self, text):
        """Collapse every run of non-word characters into a single space."""
        return re.sub(r'\W+', ' ', text)

    def count_words(self, text):
        """Lower-case ``text``, split it on whitespace and count the words."""
        text = text.lower()
        word_counts = Counter(text.split())
        return word_counts

# Test the transformer
transformer = EmailToWordCountTransformer()

# Transform the emails
emails_transformed = transformer.transform(emails)

# Print the word counts of the second email (index 1)
print(emails_transformed[1])

Counter({'the': 5, 'number': 4, 'a': 3, 'limestone': 3, 'of': 3, 'and': 3, 's': 3, 'to': 3, 'mount': 2, 'from': 2, 'for': 2, 'as': 2, 'granite': 2, 'ft': 2, 'is': 2, 'this': 2, 'it': 2, 'yahoo': 2, 'groups': 2, 'url': 2, 'unsubscribe': 2, 'martin': 1, 'posted': 1, 'tassos': 1, 'papadopoulos': 1, 'greek': 1, 'sculptor': 1, 'behind': 1, 'plan': 1, 'judged': 1, 'that': 1, 'kerdylio': 1, 'miles': 1, 'east': 1, 'salonika': 1, 'not': 1, 'far': 1, 'athos': 1, 'monastic': 1, 'community': 1, 'was': 1, 'ideal': 1, 'patriotic': 1, 'sculpture': 1, 'well': 1, 'alexander': 1, 'features': 1, 'high': 1, 'wide': 1, 'museum': 1, 'restored': 1, 'amphitheatre': 1, 'car': 1, 'park': 1, 'admiring': 1, 'crowds': 1, 'are': 1, 'planned': 1, 'so': 1, 'mountain': 1, 'or': 1, 'if': 1, 'll': 1, 'weather': 1, 'pretty': 1, 'fast': 1, 'sponsor': 1, 'dvds': 1, 'free': 1, 'p': 1, 'join': 1, 'now': 1, 'group': 1, 'send': 1, 'an': 1, 'email': 1, 'forteana': 1, 'egroups': 1, 'com': 1, 'your': 1, 'use': 1, 'subject': 1})


In [184]:
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word-count mappings into a sparse count matrix.

    Column 0 accumulates the counts of words missing from the fitted
    vocabulary; vocabulary words are assigned columns starting at 1.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn the vocabulary from the word counts in ``X``.

        Each word's count is capped at 10 per item of ``X`` before being
        added to the overall total; the ``vocabulary_size`` words with the
        highest totals are kept.
        """
        totals = Counter()
        for counts in X:
            totals.update({word: min(n, 10) for word, n in counts.items()})
        top_words = totals.most_common()[:self.vocabulary_size]
        self.most_common_ = top_words
        self.vocabulary_ = {
            word: position
            for position, (word, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Build a CSR matrix with ``len(X)`` rows and ``vocabulary_size + 1`` columns."""
        row_indices, col_indices, values = [], [], []
        lookup = self.vocabulary_
        for row_index, counts in enumerate(X):
            row_indices.extend([row_index] * len(counts))
            # Unknown words all map to column 0.
            col_indices.extend(lookup.get(word, 0) for word in counts)
            values.extend(counts.values())
        n_columns = self.vocabulary_size + 1
        return csr_matrix((values, (row_indices, col_indices)),
                          shape=(len(X), n_columns))

In [185]:
# Initialize the transformer
vectorizer = WordCounterToVectorTransformer(vocabulary_size=5000)

# Fit the transformer and transform the word counts
word_vectors = vectorizer.fit_transform(emails_transformed)

# Display the fitted vocabulary (word -> column index)
vectorizer.vocabulary_

{'number': 1,
 'the': 2,
 'to': 3,
 'a': 4,
 'and': 5,
 'of': 6,
 'i': 7,
 'in': 8,
 'is': 9,
 'url': 10,
 'it': 11,
 'that': 12,
 'for': 13,
 'you': 14,
 'this': 15,
 'on': 16,
 's': 17,
 'with': 18,
 't': 19,
 'be': 20,
 'from': 21,
 'have': 22,
 'not': 23,
 'are': 24,
 'or': 25,
 'as': 26,
 'your': 27,
 'if': 28,
 'at': 29,
 'but': 30,
 'by': 31,
 'can': 32,
 'list': 33,
 'all': 34,
 'an': 35,
 'we': 36,
 'my': 37,
 'was': 38,
 'they': 39,
 'so': 40,
 'will': 41,
 'do': 42,
 'there': 43,
 'd': 44,
 'com': 45,
 'more': 46,
 'one': 47,
 'has': 48,
 'get': 49,
 'b': 50,
 'no': 51,
 'just': 52,
 'about': 53,
 'out': 54,
 'font': 55,
 'net': 56,
 'what': 57,
 'p': 58,
 'up': 59,
 'like': 60,
 'which': 61,
 'size': 62,
 'would': 63,
 'only': 64,
 'use': 65,
 'time': 66,
 'br': 67,
 'new': 68,
 'now': 69,
 'any': 70,
 'some': 71,
 'who': 72,
 'email': 73,
 'm': 74,
 'their': 75,
 'me': 76,
 'people': 77,
 'don': 78,
 'when': 79,
 'our': 80,
 'tr': 81,
 'td': 82,
 'color': 83,
 'face': 84,


In [186]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Assemble the model: e-mails -> word counts -> sparse vectors -> classifier
pipeline = Pipeline(steps=[
    ('email_to_word_count', EmailToWordCountTransformer()),
    ('word_count_to_vector', WordCounterToVectorTransformer(vocabulary_size=5000)),
    ('log_reg', LogisticRegression(solver='liblinear')),
])

# Hold out 20% of the messages for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    emails, labels, test_size=0.2, random_state=42
)

# Fit on the training portion only
pipeline.fit(X_train, y_train)

# Report the pipeline's score on both portions
print("Training score: ", pipeline.score(X_train, y_train))
print("Test score: ", pipeline.score(X_test, y_test))

Training score:  0.9991666666666666
Test score:  0.9866666666666667


In [187]:
from sklearn.metrics import precision_score, recall_score

# Predictions for the held-out messages
y_pred = pipeline.predict(X_test)

# Compute both metrics first, then report them
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Precision: ", precision)
print("Recall: ", recall)

Precision:  0.9393939393939394
Recall:  0.9789473684210527
