In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# for handling and parsing email files
import email
import os
import re

from bs4 import BeautifulSoup
from collections import Counter

In [None]:
# Collect the file names of every ham and spam message.
# os.listdir returns entries in arbitrary order, so sort for a stable ordering.
ham_fnames = sorted(os.listdir("/kaggle/input/email-spam-dataset-extended/ham_zipped/main_ham"))
spam_fnames = sorted(os.listdir("/kaggle/input/email-spam-dataset-extended/spam_zipped/main_spam"))

In [None]:
def parse_email(fname, spam=False, directory=None):
    """Parse one raw email file into a message object.

    Parameters
    ----------
    fname : str
        File name of the email, relative to the chosen directory.
    spam : bool, default False
        Selects the built-in dataset directory: the spam folder when True,
        the ham folder otherwise. Ignored when ``directory`` is given.
    directory : str or path-like, optional
        Directory to read ``fname`` from. When None (the default) the
        Kaggle dataset folder selected by ``spam`` is used.

    Returns
    -------
    email.message.Message
        The message parsed from the file's bytes.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Import the class directly instead of going through the global name
    # ``email``: ``import email`` alone does not guarantee that the
    # ``email.parser`` submodule is loaded, and a notebook cell that rebinds
    # ``email`` (e.g. as a loop variable) would otherwise break this function
    # when it is called again.
    from email.parser import BytesParser

    if directory is None:
        if spam:
            directory = "/kaggle/input/email-spam-dataset-extended/spam_zipped/main_spam"
        else:
            directory = "/kaggle/input/email-spam-dataset-extended/ham_zipped/main_ham"

    # Binary mode: the parser works on bytes, so no text encoding is assumed here.
    with open(os.path.join(directory, fname), "rb") as fp:
        return BytesParser().parse(fp)
        
# Parse every listed file once so that later cells work on in-memory messages.
ham_emails = list(map(parse_email, ham_fnames))
spam_emails = [parse_email(spam_name, spam=True) for spam_name in spam_fnames]

In [None]:
# Take the first parsed ham message as a sample; leaving it as the cell's
# last expression makes the notebook display it.
test_email = ham_emails[0]
test_email

In [None]:
# First ham message made of several MIME parts (stays None if there is none)
multi_email = next((mail for mail in ham_emails if mail.is_multipart()), None)

# For a multipart message the payload is a list of email.message.Message parts
print(multi_email.get_payload())

# Payload of the first part, then the message's header (name, value) pairs
print(multi_email.get_payload()[0].get_payload())
print(multi_email.items())

In [None]:
def get_structure(email) -> str:
    """Describe the MIME layout of a message as a string.

    A message whose payload is not a list is described by its content type
    (e.g. ``text/plain``). A message whose payload is a list of sub-messages
    is described as ``multipart(...)``, where the parentheses hold the
    comma-separated structures of its parts, computed recursively.
    """
    parts = email.get_payload()

    # Leaf: the payload is the content itself, not a list of sub-messages.
    if not isinstance(parts, list):
        return email.get_content_type()

    child_structures = [get_structure(part) for part in parts]
    return "multipart({})".format(", ".join(child_structures))

In [None]:
def email_structure_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Each message is summarised with ``get_structure``; the result is a
    ``collections.Counter`` mapping structure string -> number of messages.
    """
    return Counter(get_structure(message) for message in emails)

# Structure frequencies for each class, kept separately so they can be compared.
ham_structs = email_structure_counter(ham_emails)
spam_structs = email_structure_counter(spam_emails)

In [None]:
# First spam message whose structure is a single text/html part
# (stays None if there is none).
#
# The iteration variable is deliberately not called `email`: a module-level
# `for email in ...` loop rebinds the imported `email` package to a message
# object for the rest of the notebook session, so any later `email.<submodule>`
# lookup fails. A generator expression also keeps its variable out of the
# notebook namespace.
html_email = next(
    (spam_mail for spam_mail in spam_emails if get_structure(spam_mail) == 'text/html'),
    None,
)

print(html_email.get_payload())

In [None]:
def html_to_text(email) -> str:
    """Return the visible text of an HTML message part as a single line.

    The part's payload is parsed with BeautifulSoup's ``html.parser``, the
    markup is dropped, every run of whitespace is collapsed to one space and
    the result is stripped.

    Returns the literal string ``"nothing"`` when the payload cannot be
    processed (best-effort: one bad message must not abort a bulk conversion).
    NOTE(review): this placeholder word ends up in the extracted text of the
    caller; confirm whether an empty string would be a better fallback.
    """
    try:
        soup = BeautifulSoup(email.get_payload(), "html.parser")
        # Remove "=\n" sequences -- presumably quoted-printable soft line
        # breaks left in the undecoded payload; verify against the dataset.
        plain = soup.text.replace("=\n", "")
        plain = re.sub(r"\s+", " ", plain)
        return plain.strip()
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running cell.
        return "nothing"

In [None]:
# Show the plain-text rendering of the sample HTML message.
print(html_to_text(html_email))

In [None]:
def email_to_text(email):
    """Convert a message to plain text.

    Every part yielded by ``email.walk()`` is inspected: ``text/plain`` parts
    contribute their payload as-is, ``text/html`` parts contribute the output
    of ``html_to_text``, and parts of any other content type are ignored.
    The pieces are concatenated in walk order with no separator; a message
    without text parts yields an empty string.
    """
    collected = ""
    for part in email.walk():
        kind = part.get_content_type()
        if kind == 'text/plain':
            collected += part.get_payload()
        elif kind == 'text/html':
            collected += html_to_text(part)
    return collected

# Spot-check the conversion on one message of each class (index 3 of each list).
print("Ham email in plain text:\n", email_to_text(ham_emails[3]))
print("Spam email in plain text:\n", email_to_text(spam_emails[3]))