 Downloading the Data

In [1]:
import os
import tarfile
import urllib.request

In [2]:
Down_path_root = "http://spamassassin.apache.org/old/publiccorpus/"
spam_path = Down_path_root + "20030228_easy_ham.tar.bz2"
ham_path = Down_path_root + "20030228_spam.tar.bz2"
out_path = os.path.join("datasets","spam")

def fetch_data(ham_url = ham_path,spam_url = spam_path,output = out_path):
    if not os.path.isdir(output):
        os.makedirs(output)
    for filename,url in (("ham.tar.bz2",ham_url),("spam.tar.bz2",spam_url)):
        path = os.path.join(output,filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url,path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=output)
        tar_bz2_file.close()

In [2]:
fetch_data()

Loading the Data

In [3]:
hamDir = os.path.join(out_path,"easy_ham")
spamDir = os.path.join(out_path,"spam")
hamFileNames = [name for name in sorted(os.listdir(hamDir)) if len(name) > 20]
spamFileNames = [name for name in sorted(os.listdir(spamDir)) if len(name) > 20]

In [4]:
len(hamFileNames)

2500

In [5]:
len(spamFileNames)

500

using Python Email module for parsing

In [6]:
import email
import email.policy

def load_email(is_spam, filename, spamPath = out_path):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spamPath,directory,filename),"rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:
hamEmails = [load_email(is_spam = False, filename = name) for name in hamFileNames]
spamEmails = [load_email(is_spam = True, filename = name) for name in spamFileNames]

In [8]:
len(hamEmails)

2500

In [10]:
def get_email_structure(email):
    if isinstance(email,str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart(())".format(",".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [11]:
from collections import Counter

def structure_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [12]:
structure_counter(hamEmails).most_common()

[('text/plain', 2408), ('multipart(())', 92)]

In [13]:
structure_counter(spamEmails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(())', 98),
 ('multipart/alternative', 1)]

In [15]:
for header, value in spamEmails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [16]:
spamEmails[0]["Subject"]

'Life Insurance - Why Pay More?'

Splitting DataSet

In [17]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(hamEmails+spamEmails,dtype = object)
y = np.array([0] * len(hamEmails) + [1] * len(spamEmails))

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 42)