In [1]:
import os
import glob
import re

In [2]:
data_path = "./data/*/*"

In [3]:
def get_subject_data(path):
    print(path)

    data = []

    # regex for stripping out the leading "Subject:" and any spaces after it
    subject_regex = re.compile(r"^Subject:\s+")

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        is_spam = "ham" not in fn
        try:
            with open(fn,'r', encoding="utf-8") as file:
                for line in file:
                    if line.startswith("Subject:"):
                        subject = subject_regex.sub("", line).strip()
                        data.append((subject, is_spam))
        except UnicodeDecodeError as e:
            pass
    return data

In [4]:
data = get_subject_data(data_path)
corpus = set()
for text in data:
    corpus = corpus | set([characters.lower() for characters in text[0].split()])

corupus_dict = {}
for i, v in enumerate(corpus):
    corupus_dict[v] = i

corpus_index = []
for text in data:
    bag_of_word_index = []
    for word in [characters.lower() for characters in text[0].split()]:
        bag_of_word_index.append(corupus_dict[word])
    corpus_index.append(bag_of_word_index)

len(corpus), len(data)

./data/*/*


(4965, 3184)

In [5]:
import numpy as np
y_data = []
for text in data:
    y_data.append(text[1])
y_data = np.array(y_data)
y_data

array([False, False, False, ...,  True,  True,  True], dtype=bool)

In [6]:
x_data = np.zeros((len(data), len(corpus)))
for i, c_index in enumerate(corpus_index):
    x_data[i][c_index] = 1
x_data.shape

(3184, 4965)

In [7]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

crossvalidation = KFold(n=x_data.shape[0], n_folds=10,
 shuffle=True, random_state=1)




In [8]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()

In [9]:
 np.mean(cross_val_score(clf, x_data, y_data,
    scoring='accuracy', cv=crossvalidation, n_jobs=1))

0.89918968474596317

In [10]:
from sklearn.tree import DecisionTreeClassifier
for depth in range (5,15):
    tree_classifier = DecisionTreeClassifier(
        max_depth=depth, random_state=0)
    score = np.mean(cross_val_score(tree_classifier, x_data, y_data,
        scoring='accuracy', cv=crossvalidation, n_jobs=1))
    print("Depth ", depth, ":", score)

Depth  5 : 0.889443228643
Depth  6 : 0.893841801226
Depth  7 : 0.896041087518
Depth  8 : 0.89824234538
Depth  9 : 0.901069576704
Depth  10 : 0.905775714201
Depth  11 : 0.90609017961
Depth  12 : 0.908602945526
Depth  13 : 0.907975000493
Depth  14 : 0.910799274462


In [12]:
from sklearn import linear_model, datasets
logreg = linear_model.LogisticRegression(fit_intercept=True,max_iter=1000)
score = np.mean(cross_val_score(logreg, x_data, y_data,
        scoring='accuracy', cv=crossvalidation, n_jobs=1))
score

0.92965339800082825

In [17]:
from sklearn.ensemble import RandomForestClassifier
for depth in range (2,15):
    clf = RandomForestClassifier(max_depth=depth, random_state=0, n_estimators=10, n_jobs=4)
    score = np.mean(cross_val_score(tree_classifier, x_data, y_data,
        scoring='accuracy', cv=crossvalidation, n_jobs=1))
    print("Depth ", depth, ":", score)

Depth  2 : 0.910799274462
Depth  3 : 0.910799274462
Depth  4 : 0.910799274462
Depth  5 : 0.910799274462
Depth  6 : 0.910799274462
Depth  7 : 0.910799274462
Depth  8 : 0.910799274462
Depth  9 : 0.910799274462
Depth  10 : 0.910799274462
Depth  11 : 0.910799274462
Depth  12 : 0.910799274462
Depth  13 : 0.910799274462
Depth  14 : 0.910799274462
