# Apache Spam Classifier

### Obtain Data

In [1]:
import os
import tarfile
import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()
        
fetch_spam_data()

In [2]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [3]:
len(ham_filenames)

2500

In [4]:
len(spam_filenames)

500

In [5]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [6]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

### View The Data

In [7]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [8]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [9]:
from collections import Counter
def email_structure_counter(emails):
    email_structures = Counter()
    for email in emails:
        structure = email.get_content_type()
        email_structures[structure] += 1
    return email_structures

In [10]:
email_structure_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart/signed', 68),
 ('multipart/mixed', 10),
 ('multipart/alternative', 9),
 ('multipart/related', 3),
 ('multipart/report', 2)]

In [11]:
email_structure_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart/alternative', 47),
 ('multipart/mixed', 43),
 ('multipart/related', 9)]

In [12]:
for header, value in spam_emails[0].items():
    print(header, ' : ', value)

Return-Path  :  <12a1mailbot1@web.de>
Delivered-To  :  zzzz@localhost.spamassassin.taint.org
Received  :  from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received  :  from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received  :  from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From  :  12a1mailbot1@web.de
Received  :  from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To  :  dcek1a1@netsgo.com
Subject  :  Life Insurance - Why Pay More?
Date  :  Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version  :  1.0
Message-ID  :  <0103c1042001882DD_IT7@dd_it7>
Content-Type  :  text/html; charset="iso-8859-1"
Cont

### Prepare The Data

Preprocessing Checklist:
1. Convert HTML Emails into plain text --> html2text package
2. Convert Hyperlinks to 'URL' --> urlextract package
3. Stemming --> nltk package
4. Transform each email into a sparse vector that indicates presence or absence of each word in vocabulary

In [34]:
#!python3 -m pip install html2text urlextract
#import html2text
#import urlextract

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
def Convert_HTML_To_PlainText(email):
    h = html2text.HTML2Text()
    h.ignore_links = False
    for part in email.walk():
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if part.get_content_type() == 'text/html':
            return h.handle(content)
        elif part.get_content_type() == 'text/plain':
            return content
        else:
            return ""

In [16]:
spam_emails[0].get_content().strip()[:1000]

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content="text/html; charset=windows-1252" http-equiv=Content-Type>\n<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none \nstyle="COLOR: black; DISPLAY: none" width="100%">\n  <TBODY>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TD></TR>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT \ncolor=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Why Spend More Than You Have To?\n<CENTER><FONT co

In [17]:
plaintext = Convert_HTML_To_PlainText(spam_emails[0])
#print(plaintext)

##### The html2text package did well removing the html tags but I need to remove everything so it can easily be tokenized

In [18]:
import re
def remove_special_characters(email):
    text = re.sub(r'(\w)*(@)[\w.]*', 'EMAIL', email)
    text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
    text = re.sub(r'(\W)', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text, flags=re.M)
    return text

In [19]:
#MAKE SURE TO URL EXTRACT BEFORE REMOVING SPECIAL CHARACTERS
pt = remove_special_characters(plaintext)
#print(pt)

##### Having never used the urlextract package I just want to check and make sure it works correctly

In [20]:
url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls(plaintext))

['http://website.e365.cc/savequote/']


##### Looks good! Time to start working on the transformers

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToTextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, lower_case = True, replace_urls = True, remove_special_characters = True):
        self.lower_case = lower_case
        self.replace_urls = replace_urls
        self.remove_special_characters = remove_special_characters
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = Convert_HTML_To_PlainText(email)
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                urls = list(set(url_extractor.find_urls(text)))
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.remove_special_characters:
                text = remove_special_characters(text)
            X_transformed.append(text)
        return X_transformed

##### Lets test it

In [22]:
X_sample = X_train[:2]
X_sample_converted = EmailToTextTransformer().fit_transform(X_sample)
print(X_sample_converted[1])

 some interesting quotes URL thomas jefferson i have examined all the known superstitions of the word and i do not find in our particular superstition of christianity one redeeming feature they are all alike founded on fables and mythology millions of innocent men women and children since the introduction of christianity have been burnt tortured fined and imprisoned what has been the effect of this coercion to make one half the world fools and the other half hypocrites to support roguery and error all over the earth six historic americans by john e remsburg letter to william short jefferson again christianity has become the most perverted system that ever shone on man rogueries absurdities and untruths were perpetrated upon the teachings of jesus by a large band of dupes and importers led by paul the first great corrupter of the teaching of jesus 


##### To build the sparse matrix I am going to be using sklearns CountVectorizer

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

preprocess_pipeline = Pipeline([
    ('email_to_text', EmailToTextTransformer()),
    ('count_vectorizer', CountVectorizer()),
])

In [24]:
X_train_vectorized = preprocess_pipeline.fit_transform(X_train)

In [25]:
print(X_train_vectorized.shape)
print(y_train.shape)

(2400, 28396)
(2400,)


#### Train The Model

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(random_state=42, max_iter=200)
score = cross_val_score(log_clf, X_train_vectorized, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................... , score=0.948, total=   0.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] .................................... , score=0.951, total=   1.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


[CV] .................................... , score=0.960, total=   1.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.8s finished


0.9529166666666667

In [30]:
X_test_vectorized = preprocess_pipeline.transform(X_test)
log_clf = LogisticRegression(random_state=42, max_iter=200)
log_clf.fit(X_train_vectorized, y_train)
y_pred = log_clf.predict(X_test_vectorized)

In [32]:
from sklearn.metrics import classification_report
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       505
           1       0.99      0.77      0.86        95

    accuracy                           0.96       600
   macro avg       0.97      0.88      0.92       600
weighted avg       0.96      0.96      0.96       600



In [33]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[504   1]
 [ 22  73]]


##### Too many false negatives, using some other features such as the email address can probably improve the model