In [3]:
import numpy as np
import os

Let's take a look at the emails.

In [4]:
ham_dirs = ["easy_ham", "hard_ham"]
spam_dirs = ["spam", "spam_2"]

In [55]:
def readDirs(dirs):
    ret = []
    for directory in dirs:
        dirent = os.listdir(directory)
        for file in dirent:
            with open(os.path.join(directory, file), "rb") as f:
                content = f.read()
                ret += [content]
    print("Loaded {} files.".format(len(ret)))
    return ret
            

ham_mails = readDirs(ham_dirs)
spam_mails = readDirs(spam_dirs)

Loaded 2801 files.
Loaded 1899 files.


In [69]:
print(ham_mails[9].decode("utf-8"))

From rpm-list-admin@freshrpms.net  Thu Oct  3 12:25:27 2002
Return-Path: <rpm-zzzlist-admin@freshrpms.net>
Delivered-To: yyyy@localhost.example.com
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id 2707916F6D
	for <jm@localhost>; Thu,  3 Oct 2002 12:24:58 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Thu, 03 Oct 2002 12:24:58 +0100 (IST)
Received: from egwn.net (auth02.nl.egwn.net [193.172.5.4]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g93BDgK26115 for
    <jm-rpm@jmason.org>; Thu, 3 Oct 2002 12:13:42 +0100
Received: from auth02.nl.egwn.net (localhost [127.0.0.1]) by egwn.net
    (8.11.6/8.11.6/EGWN) with ESMTP id g93BA2f21504; Thu, 3 Oct 2002 13:10:02
    +0200
Received: from zeus.scania.co.za ([196.41.10.170]) by egwn.net
    (8.11.6/8.11.6/EGWN) with ESMTP id g93B8uf17854 for
    <rpm-list@freshrpms.net>; Thu, 3 Oct 2002 13:08:57 +0200
Received: from leen

As can be seen, those emails contain a lot of header information that will be irrelevant to classification.

In [95]:
import email
from email.parser import BytesParser
from email.policy import default

In [67]:
parser = BytesParser(policy=default).parsebytes(ham_mails[9])

In [70]:
print(email.message_from_bytes(ham_mails[9]).get_payload())

 >> > Well, I don't really find it consistent at all to use an rpm package
 >> > built against something that wasn't installed through rpm :-/
 >>
 >> Following that reasoning, I've been installing all my custom-built
 >> kernels through rpm recently. I find it annoying, though, that
 >> alsa-kernel, and similar packages, will only build for the currently
 >> running kernel.
 >>
 >> So I've attached a patch to specify an alternate kernel by setting the
 >> "TARGET_KERNEL" environment variable before running rpmbuild. You
 >> still need to have the rpm for the specified kernel installed, but at
 >> least it doesn't have to be currently running. It's kinda hackish, so
 >> if someone has a better way to do this, let me know.
 >
 >That idea looks good although it maybe needs to be tweaked a bit more (what
 >you sent doesn't support packages named "kernel-smp"). I'd also prefer a
 >cleaner way than the env variable, and preferrably not editing the spec...
 >probably "--define 'target 2.4.xx

Extracting the email body could not have been easier!

Let's prepare those emails into a dataset.

In [87]:
X_spam = np.matrix(spam_mails).T
X_ham = np.matrix(ham_mails).T

X_data = np.concatenate([X_spam, X_ham], axis=0)
y_data = np.concatenate((np.ones_like(X_spam, dtype=int), np.zeros_like(X_ham, dtype=int)), axis=0)

In [88]:
X_data.shape, y_data.shape

((4700, 1), (4700, 1))

Now, let's shuffle the data before splitting into two sets.

In [89]:
from sklearn.utils import shuffle

In [90]:
X_data, y_data = shuffle(X_data, y_data)

In [91]:
y_data[:10]

matrix([[1],
        [0],
        [0],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0]])

Looks valid. Now, let's split the sets.

In [92]:
from sklearn.model_selection import train_test_split

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, train_size=0.7)

Now the data must be prepared. First, I will build a dictionary of words and their frequencies in the texts. I will strip both the most frequent and the most rare words, except for HTML tags, email addresses, and URLs. Also, I'll replace numbers by a separate string.

In [288]:
from sklearn.base import TransformerMixin, BaseEstimator

class MailParseDecode(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.parser = lambda x : BytesParser(policy=default).parsebytes(x).get_payload()
        
    def __parse(self, by):
        out = self.parser(by)
        if type(out) != list:
            return out
        else:
            nout = ""
            for o in out:
                try:
                    nout += str(o) + " "
                except Exception:
                    pass
            return nout
        
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        cp = X.copy().astype(object)
        for i in range(len(cp)):
            cp[i,0] = self.__parse(X[i,0]).lower()
        return cp
    
mailParseDecode = MailParseDecode()

In [289]:
sample_mail_decoded = mailParseDecode.fit_transform(X_train[5,:])

In [290]:
import re

class MailParseReplaceKeywords(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tagRegex = re.compile(r'<.*?>')
        self.mailRegex = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
        self.urlRegex = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*")
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        cp = X.copy().astype(object)
        for i in range(len(cp)):
            tmp = self.tagRegex.sub(' TAG ', X[i,0])
            tmp = self.mailRegex.sub(' MAIL ', tmp)
            cp[i,0] = self.urlRegex.sub(' URL ', tmp)
        return cp
    
mailParseReplaceKeywords = MailParseReplaceKeywords()
sample_mail_replaced = mailParseReplaceKeywords.fit_transform(sample_mail_decoded)
sample_mail_replaced

matrix([[' TAG \n TAG \n\n TAG \n TAG \n TAG  TAG   TAG \n TAG  TAG \n TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n\t TAG \n\t\t\n     TAG   TAG  TAG  TAG  TAG \n\t\t TAG \n\t\t\t TAG  TAG \n\t TAG \n TAG \n TAG  \n TAG  \n   TAG  TAG see this lovely celb

The encoding looks correct. The next step in the pipeline would be stemming, removing punctuation, and stop words.

In [291]:
import nltk
from sklearn.feature_extraction.text import HashingVectorizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize
from scipy.sparse import lil_matrix

In [292]:
class MailParseTokenizeRemove(BaseEstimator, TransformerMixin):
    def __init__(self, lang="english", keep_tokens = ["TAG", "URL", "MAIL"]):
        self.lang = lang
        self.stemmer = LancasterStemmer()
        self.stopwords = set(stopwords.words(lang))
        self.keep_tokens = set(keep_tokens)
        self.vectorizer = HashingVectorizer(input="content", strip_accents = "ascii", lowercase=False)
        
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        cp = lil_matrix((X.shape[0], self.vectorizer.n_features))
        for i in range(len(X)):
            tokens = word_tokenize(X[i,0])
            output = []
            for token in tokens:
                if token in self.keep_tokens:
                    output.append(token)
                elif token in self.stopwords:
                    continue
                else:
                    output.append(self.stemmer.stem(token))
            cp[i,:] = self.vectorizer.fit_transform([' '.join(output)])
        return cp
    
mailParseTokenizeRemove = MailParseTokenizeRemove()

In [293]:
mailParseTokenizeRemove.fit_transform(sample_mail_replaced)

<1x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in List of Lists format>

Let's load the whole dataset.

In [294]:
from sklearn.pipeline import Pipeline

In [295]:
mail_pipeline = Pipeline([
    ("decode", MailParseDecode()),
    ("replace", MailParseReplaceKeywords()),
    ("vectorize", MailParseTokenizeRemove())
])

In [296]:
X_train_transformed = mail_pipeline.fit_transform(X_train)

In [297]:
X_train_transformed

<3290x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 425049 stored elements in List of Lists format>

Let's try a sample classifier.

In [301]:
from sklearn.ensemble import RandomForestClassifier

In [302]:
rfc = RandomForestClassifier()

In [305]:
X_train_transformed.shape

(3290, 1048576)

In [334]:
y_train = y_train.ravel().tolist()[0]
y_test = y_test.ravel().tolist()[0]

In [335]:
rfc.fit(X_train_transformed, y_train)

RandomForestClassifier()

In [338]:
from sklearn.metrics import accuracy_score
X_test_transformed = mail_pipeline.transform(X_test)

In [339]:
X_test_transformed

<1410x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 169369 stored elements in List of Lists format>

In [340]:
prediction = rfc.predict(X_test_transformed)

In [343]:
accuracy_score(prediction, y_test)

0.9617021276595744

It's quite a good score for a first try! I have used the test set here, but I will use cross validation to improve next models.

In [344]:
from sklearn.model_selection import GridSearchCV

In [345]:
rfc_params = {
    "n_estimators" : [50, 60, 70, 80, 90, 100, 110],
    "max_depth" : [50, 60, 70],
    "min_samples_split" : [2, 3, 4, 5, 6, 7]
}

In [346]:
rfc_search = GridSearchCV(rfc, rfc_params, cv=3, scoring="accuracy", n_jobs=8, verbose=2)

In [347]:
rfc_search.fit(X_train_transformed, y_train)

Fitting 3 folds for each of 126 candidates, totalling 378 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=8,
             param_grid={'max_depth': [50, 60, 70],
                         'min_samples_split': [2, 3, 4, 5, 6, 7],
                         'n_estimators': [50, 60, 70, 80, 90, 100, 110]},
             scoring='accuracy', verbose=2)

In [348]:
rfc_search.best_score_

0.9431628395957122

In [352]:
rfc_search.best_estimator_ #best_estimator_.n_estimators == 100

RandomForestClassifier(max_depth=70, min_samples_split=3)

In [359]:
rfc_params = {
    "n_estimators" : np.linspace(95,105,10, dtype=int),
    "max_depth" : [65, 75, 85, 95, 105],
    "min_samples_split" : [2, 3, 4]
}

In [360]:
rfc_search = GridSearchCV(rfc, rfc_params, cv=3, scoring="accuracy", n_jobs=8, verbose=2)

In [361]:
rfc_search.fit(X_train_transformed, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=8,
             param_grid={'max_depth': [65, 75, 85, 95, 105],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': array([ 95,  96,  97,  98,  99, 100, 101, 102, 103, 105])},
             scoring='accuracy', verbose=2)

In [362]:
rfc_search.best_score_

0.9559276904275541

In [363]:
rfc_search.best_estimator_

RandomForestClassifier(max_depth=105, min_samples_split=3, n_estimators=99)

In [364]:
rfc_params = {
    "n_estimators" : np.linspace(98,110,10, dtype=int),
    "min_samples_split" : [3],
    "max_depth" : [100, 105, 110, 115]
}
rfc_search = GridSearchCV(rfc, rfc_params, cv=3, scoring="accuracy", n_jobs=8, verbose=2)

In [365]:
rfc_search.fit(X_train_transformed, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=8,
             param_grid={'max_depth': [100, 105, 110, 115],
                         'min_samples_split': [3],
                         'n_estimators': array([ 98,  99, 100, 102, 103, 104, 106, 107, 108, 110])},
             scoring='accuracy', verbose=2)

In [366]:
rfc_search.best_score_

0.9547125316334973

In [367]:
rfc_search.best_estimator_

RandomForestClassifier(max_depth=115, min_samples_split=3, n_estimators=104)

Doesn't look like the classifier improves much anymore. Let's try fitting and evaluating on the whole dataset.

In [369]:
be = rfc_search.best_estimator_

In [370]:
be.fit(X_train_transformed, y_train)

RandomForestClassifier(max_depth=115, min_samples_split=3, n_estimators=104)

In [371]:
prediction = be.predict(X_test_transformed)

In [372]:
accuracy_score(prediction, y_test)

0.9453900709219858

A little less accuracy than I started with. Perhaps it would be best to just leave the max depth parameter unregularized. By any means, 96% accuracy is statisfying.