In [5]:
#downloading the dataset

import numpy as np
import tarfile 
from pathlib import Path
import urllib.request


def fetch_spam_data():
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "Spam"
    spam_path.mkdir(parents=True, exist_ok=True)

    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("sppam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            urllib.request.urlretrieve(url, path)
            tar_bz2_file = tarfile.open(path)
            tar_bz2_file.extractall(path=spam_path)
            tar_bz2_file.close()
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [6]:
ham_dir, spam_dir=fetch_spam_data()

Downloading datasets\Spam\spam.tar.bz2


In [7]:
#loading the dataset

ham_filenames = [f for f in sorted(ham_dir.iterdir()) if len(f.name)>20]
spam_filenames = [f for f in sorted(spam_dir.iterdir()) if len(f.name)>20]
print("Ham files :", len(ham_filenames))
print("Spam Files :" ,len(spam_filenames))

Ham files : 2500
Spam Files : 500


In [19]:
#dealing with emails


import email
import email.policy
 
def load_email(filepath):
    with open(filepath,"rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
        

In [20]:
#now load in the mails
ham_emails=[load_email(filepath) for filepath in ham_filenames]
spam_emails=[load_email(filepath) for filepath in spam_filenames]

trying to view the emails now

In [21]:
print(ham_emails[1])

Return-Path: <Steve_Burt@cursor-system.com>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id BE12E43C34
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:46:38 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:46:38 +0100 (IST)
Received: from n20.grp.scd.yahoo.com (n20.grp.scd.yahoo.com    [66.218.66.76])
 by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id    g7MBkTZ05087 for
 <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 12:46:29 +0100
X-Egroups-Return: =?utf-8?q?sentto-2242572-52726-1030016790-zzzz=3Dspamassas?=
 =?utf-8?q?sin=2Etaint=2Eorg=40returns=2Egroups=2Eyahoo=2Ecom?=
Received: from [66.218.67.196] by n20.grp.scd.yahoo.com with NNFMP;
    22 Aug 2002 11:46:30 -0000
X-Sender: steve.burt@cursor-system.com
X-Apparently-To: zzzzteana@yahoogroups.com
Received: (EGP: mail-8_1_0_1); 22 Aug 2002 11:4

In [22]:
print(ham_emails[1].get_content())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ 






In [23]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [24]:
print(spam_emails[1].get_content().strip())

1) Fight The Risk of Cancer!
http://www.adclick.ws/p.cfm?o=315&s=pk007

2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
http://www.adclick.ws/p.cfm?o=249&s=pk007

3) Get the Child Support You Deserve - Free Legal Advice
http://www.adclick.ws/p.cfm?o=245&s=pk002

4) Join the Web's Fastest Growing Singles Community
http://www.adclick.ws/p.cfm?o=259&s=pk007

5) Start Your Private Photo Album Online!
http://www.adclick.ws/p.cfm?o=283&s=pk007

Have a Wonderful Day,
Offer Manager
PrizeMama













If you wish to leave this list please use the link below.
http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258


-- 
Irish Linux Users' Group: ilug@linux.ie
http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.
List maintainer: listmaster@linux.ie


In [25]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        multipart = ", ".join([get_email_structure(sub_email) for sub_email in payload])
        return F"multipart({multipart})"
    else:
        return email.get_content_type()

from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures 

In [26]:
structures_counter(ham_emails).most_common

<bound method Counter.most_common of Counter({'text/plain': 2408, 'multipart(text/plain, application/pgp-signature)': 66, 'multipart(text/plain, text/html)': 8, 'multipart(text/plain, text/plain)': 4, 'multipart(text/plain)': 3, 'multipart(text/plain, application/octet-stream)': 2, 'multipart(text/plain, text/enriched)': 1, 'multipart(text/plain, application/ms-tnef, text/plain)': 1, 'multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)': 1, 'multipart(text/plain, video/mng)': 1, 'multipart(text/plain, multipart(text/plain))': 1, 'multipart(text/plain, application/x-pkcs7-signature)': 1, 'multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)': 1, 'multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))': 1, 'multipart(text/plain, application/x-java-applet)': 1})>

In [29]:
structures_counter(spam_emails).most_common

<bound method Counter.most_common of Counter({'text/plain': 218, 'text/html': 183, 'multipart(text/plain, text/html)': 45, 'multipart(text/html)': 20, 'multipart(text/plain)': 19, 'multipart(multipart(text/html))': 5, 'multipart(text/plain, image/jpeg)': 3, 'multipart(text/html, application/octet-stream)': 2, 'multipart(text/plain, application/octet-stream)': 1, 'multipart(text/html, text/plain)': 1, 'multipart(multipart(text/html), application/octet-stream, image/jpeg)': 1, 'multipart(multipart(text/plain, text/html), image/gif)': 1, 'multipart/alternative': 1})>

In [31]:
for header,value in spam_emails[1].items():
    print( header,":",value)

Return-Path : <ilug-admin@linux.ie>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id A7FD7454F6	for <zzzz@localhost>; Thu, 22 Aug 2002 08:27:38 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:27:38 +0100 (IST)
Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MCJiZ06043 for    <zzzz-ilug@jmason.org>; Thu, 22 Aug 2002 13:19:44 +0100
Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29323; Thu, 22 Aug 2002 13:18:52 +0100
Received : from email.qves.com ([67.104.83.251]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29282 for <ilug@linux.ie>; Thu,    22 Aug 2002 13:18:37 +0100
Received : from qvp0091 ([169.254.6.22]) by email.qves.com with Micros

In [38]:
#splitting the data

from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0]*len(ham_emails) + [1]*len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=369)

In [39]:
#now we use html unescape to get rid of the html tags, you can use beautiful soup instead

import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [40]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<html>
<body>
<p align="center"><a href="http://www.directmedium.com/bestrates/">
<img border="0" src="http://www.directmedium.com/images/mort19.gif" width="500" height="300"></a></p>
<p align="center">&nbsp;</p>
<p align="center">&nbsp;</p>
<p><font face="Arial" size="1">We are strongly against sending unsolicited emails
to those who do not wish to receive our special mailings.You have opted in to one or more of our affiliate
sites requesting to be notified of any special offers we may run from time to
time. If you do not wish to receive further mailings, please
<a href="http://www.directmedium.com/remove.htm">click this link </a>.
Please accept our apologies if you have been sent this email in error. We honor
all removal requests.</font></p>
</body>
</html>
22 ...


In [41]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


 HYPERLINK
 
 
We are strongly against sending unsolicited emails
to those who do not wish to receive our special mailings.You have opted in to one or more of our affiliate
sites requesting to be notified of any special offers we may run from time to
time. If you do not wish to receive further mailings, please
 HYPERLINK click this link .
Please accept our apologies if you have been sent this email in error. We honor
all removal requests.
22
 ...


In [42]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [43]:
print(email_to_text(sample_html_spam)[:100], "...")


 HYPERLINK
 
 
We are strongly against sending unsolicited emails
to those who do not wish to recei ...


In [44]:
#using the natural language toolkit to stem 


import nltk

stemmer = nltk.PorterStemmer()
for word in ("Computations", "Computation", "Computing", "Computed", "Compute",
             "Compulsive"):
    print(word, "=>", stemmer.stem(word))

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls
