In [43]:
import numpy as np
import pandas as pd
import email
from email import policy
import os
from bs4 import BeautifulSoup

In [26]:
root_dir = os.getcwd()

ham_folder = os.path.join(root_dir,'main_ham')
spam_folder = os.path.join(root_dir,'main_spam')
ham_files = os.listdir(ham_folder)
spam_files = os.listdir(spam_folder)

In [44]:
def clean_html(raw_html):
    '''
    To clean any html content in emails.
    :param raw_html: string of html tags and content
    :return: a clean string with no html tags or styling
    '''
    soup = BeautifulSoup(raw_html, "html.parser")

    # Remove all style and script tags
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Get text and strip extra whitespace
    cleaned_text = soup.get_text(separator=" ")
    cleaned_text = " ".join(cleaned_text.split())

    return cleaned_text

In [77]:
def extract_body(path):
    '''
    This function takes a path to an email file and returns the cleaned body of that email
    :param path: poth to the email file
    :return: body content after cleaning any html tags present
    '''
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        msg = email.message_from_file(f, policy=policy.default)

    body = ''

    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                try:
                    content = part.get_content()
                except LookupError:
                    # Unknown encoding fallback
                    content = part.get_payload(decode=True)
                    content = content.decode('utf-8', errors='ignore')
                body += content
            elif content_type == 'text/html':
                body += clean_html(part.get_content())
    else:
        if msg.get_content_type() == 'text/plain':
            try:
                content = msg.get_content()
            except LookupError:
                content = msg.get_payload(decode=True)
                content = content.decode('utf-8', errors='ignore')
            body += content
        elif msg.get_content_type() == 'text/html':
            try:
                content = msg.get_content()
            except LookupError:
                content = msg.get_payload(decode=True)
                content = content.decode('utf-8', errors='ignore')
            body += clean_html(content)

    return body

In [88]:
df = {}
for file in spam_files:
    df[extract_body(os.path.join(spam_folder, file))] = 'spam'
for file in ham_files:
    df[extract_body(os.path.join(ham_folder, file))] = 'ham'

In [110]:
data = pd.DataFrame.from_dict(df, orient='index', columns=['label']).reset_index(names='maill')

In [111]:
data.shape

(5994, 2)

In [112]:
data.head()

Unnamed: 0,maill,label
0,mv 1 00001.bfc8d64d12b325ff385cca8d07b84288\nm...,spam
1,Greetings!\n\nYou are receiving this letter be...,spam
2,Save up to 70% on Life Insurance. Why Spend Mo...,spam
3,"The Need For Safety Is Real In 2002, You Might...",spam
4,1) Fight The Risk of Cancer!\nhttp://www.adcli...,spam
