In [1]:
import glob
import os
# Root directory of the corpus; the enron1/ folder is expected directly beneath it.
corpus_path = 'C:/Users/Rik/Documents/corpus/'
# Index-aligned accumulators filled by readFilesForFolder:
# raw email text and the label passed for the folder it came from.
emails = []
labels = []
def readFilesForFolder(folder_path, is_spam):
    """Load every ``*.txt`` file of one corpus sub-folder into the global lists.

    The text of each file is appended to ``emails`` and ``is_spam`` is appended
    to ``labels``, so both lists stay index-aligned.

    Args:
        folder_path: Sub-folder of ``<corpus_path>/enron1`` to read (e.g. 'spam').
        is_spam: Label recorded for every file found in that folder.
    """
    # os.path.join works whether or not corpus_path ends with a separator;
    # plain string concatenation silently produced a non-matching path otherwise.
    file_path = os.path.join(corpus_path, 'enron1', folder_path)
    # glob returns matches in arbitrary order; sorting keeps the document order
    # (and therefore any order-dependent, unshuffled split) stable across runs.
    for filename in sorted(glob.glob(os.path.join(file_path, '*.txt'))):
        with open(filename, 'r', encoding="ISO-8859-1") as infile:
            emails.append(infile.read())
            labels.append(is_spam)
# Load spam (label 1) first, then ham (label 0), printing the running size of
# both lists after each folder so their lengths can be compared.
for folder_name, spam_flag in (('spam', 1), ('ham', 0)):
    readFilesForFolder(folder_name, spam_flag)
    print(len(emails))
    print(len(labels))

1500
1500
5172
5172


Initialize spaCy (load the English language model used for tokenization and lemmatization)

In [2]:
import spacy
# Load the 'en_core_web_lg' spaCy pipeline once; clean_email reuses this shared object
# for every email instead of loading the model again.
nlp = spacy.load('en_core_web_lg')

In [3]:
def clean_email(email):
    """Return a normalized, space-separated string of the email's content words.

    The text is run through the module-level ``nlp`` pipeline. Tokens that are
    not purely alphabetic or that are stop words are dropped; every remaining
    token contributes its lower-cased lemma.
    """
    kept_lemmas = []
    for token in nlp(email):
        # Skip numbers, punctuation, mixed tokens and stop words.
        if not token.is_alpha or token.is_stop:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return " ".join(kept_lemmas)

In [4]:
# Normalize every raw email; the result stays index-aligned with `emails`.
cleaned_emails = list(map(clean_email, emails))

Initialize a stratified 10-fold splitter that divides the data into folds while preserving the spam/ham class proportions in each fold

In [5]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Number of cross-validation folds.
k = 10
# random_state is intentionally not passed: shuffle defaults to False, so a seed
# has no effect on the folds, and recent scikit-learn versions raise a ValueError
# when random_state is set without shuffle=True. The resulting folds are the same
# deterministic, unshuffled stratified split.
k_fold = StratifiedKFold(n_splits=k)
# NumPy copies of the data so the fold index arrays can select rows directly.
cleaned_emails_np = np.array(cleaned_emails)
labels_np = np.array(labels)

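max_features: keep only the n most frequent terms as the feature space of the CountVectorizer (None keeps every term)  
alpha: additive smoothing factor of the multinomial naive Bayes classifier  
fit_prior: whether to learn the class prior probabilities from the training data (if False, a uniform prior is used)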

In [42]:
# Candidate max_features values handed to CountVectorizer (None is one of the candidates).
max_features_option = [2000, 8000, None]
# Candidate alpha (smoothing) values handed to MultinomialNB.
smoothing_factor_option = [0.5, 1.0, 2.0, 4.0]
# Candidate fit_prior settings handed to MultinomialNB.
fit_prior_option = [True, False]

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
def run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option):
    """Grid-search a multinomial naive Bayes classifier with k-fold cross-validation.

    For every fold produced by the module-level ``k_fold`` splitter and every
    combination of the three option lists, a ``CountVectorizer`` +
    ``MultinomialNB`` model is fitted on the training part of the fold and
    scored with ROC AUC on the held-out part.

    Args:
        max_features_option: Candidate ``max_features`` values for the vectorizer.
        smoothing_factor_option: Candidate ``alpha`` values for the classifier.
        fit_prior_option: Candidate ``fit_prior`` values for the classifier.

    Returns:
        A list with one dict per (fold, parameter combination) holding the keys
        'max_features', 'smoothing', 'fit_prior' and 'auc'.
    """
    scores = []
    for train_idx, test_idx in k_fold.split(cleaned_emails, labels):
        train_docs = cleaned_emails_np[train_idx]
        test_docs = cleaned_emails_np[test_idx]
        train_labels = labels_np[train_idx]
        test_labels = labels_np[test_idx]
        for max_features in max_features_option:
            # The term matrices depend only on max_features, so they are built once
            # per value and reused for every classifier setting below.
            vectorizer = CountVectorizer(stop_words='english', max_features=max_features, max_df=0.5, min_df=2)
            train_matrix = vectorizer.fit_transform(train_docs)
            test_matrix = vectorizer.transform(test_docs)
            for alpha in smoothing_factor_option:
                for fit_prior in fit_prior_option:
                    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
                    model.fit(train_matrix, train_labels)
                    # Column 1 of predict_proba is used as the positive-class score.
                    positive_scores = model.predict_proba(test_matrix)[:, 1]
                    scores.append({
                        'max_features': max_features,
                        'smoothing': alpha,
                        'fit_prior': fit_prior,
                        'auc': roc_auc_score(test_labels, positive_scores),
                    })
    return scores
# Run the full grid search; auc_record holds one entry per fold and parameter combination.
auc_record = run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option)

In [44]:
import pandas as pd

def print_results(auc_record):
    """Print the mean AUC of every parameter combination, best first.

    Args:
        auc_record: List of dicts with the keys 'max_features', 'smoothing',
            'fit_prior' and 'auc', as returned by run_kfold_multinomial.

    Each printed line reads ``max_features smoothing fit_prior mean_auc``.
    Missing values (such as ``max_features=None``) show up as ``-1``.
    """
    grouping_columns = ['max_features', 'smoothing', 'fit_prior']
    # Missing values are replaced by the -1 sentinel before grouping
    # (presumably because groupby would otherwise skip groups with a NaN key).
    results_frame = pd.DataFrame(auc_record).fillna(-1)

    summary_rows = []
    for group_key, fold_rows in results_frame.groupby(grouping_columns):
        max_features, smoothing, fit_prior = group_key[0], group_key[1], group_key[2]
        summary_rows.append({
            'max_features': max_features,
            'smoothing': smoothing,
            'fit_prior': fit_prior,
            'auc': np.mean(fold_rows['auc']),
        })

    # Stable descending sort: combinations with equal AUC keep their group order.
    summary_rows.sort(key=lambda row: row['auc'], reverse=True)
    for row in summary_rows:
        print(row['max_features'], row['smoothing'], row['fit_prior'], row['auc'])
    
# Show the ranked mean AUC per parameter combination for the first grid search.
print_results(auc_record)

-1.0 4.0 False 0.9891564832365832
-1.0 4.0 True 0.9891174352367413
-1.0 0.5 False 0.9883294090352643
-1.0 0.5 True 0.9882840105832642
-1.0 2.0 True 0.9881240744580027
-1.0 2.0 False 0.9880967820755833
-1.0 1.0 False 0.9880398303913438
-1.0 1.0 True 0.98796088042491
8000.0 0.5 True 0.9850525683173398
8000.0 0.5 False 0.9850373622793509
8000.0 1.0 True 0.9840703708091458
8000.0 1.0 False 0.9840158551514435
8000.0 2.0 False 0.9833223304900682
8000.0 2.0 True 0.983278753504719
8000.0 4.0 True 0.9831593610551673
8000.0 4.0 False 0.9830566900643684
2000.0 0.5 True 0.9742725585436164
2000.0 0.5 False 0.9742134916873988
2000.0 1.0 True 0.9730513441337914
2000.0 1.0 False 0.9729806253208547
2000.0 2.0 True 0.9714262429411995
2000.0 2.0 False 0.9713636763021757
2000.0 4.0 True 0.9692736765983494
2000.0 4.0 False 0.9690602342731903


In [45]:
# Second, narrower search: fix max_features to None and try larger smoothing factors.
# NOTE: this rebinds the option lists defined for the first search.
max_features_option = [None]
smoothing_factor_option = [4.0, 10, 16, 20, 32]
fit_prior_option = [True, False]
print_results(run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option))

-1 20.0 False 0.9935004294514869
-1 20.0 True 0.9935004294514869
-1 16.0 True 0.9934130262014769
-1 16.0 False 0.9933749950637759
-1 32.0 False 0.9932311041345813
-1 32.0 True 0.9932301958693678
-1 10.0 False 0.9923001273545788
-1 10.0 True 0.9922819867314299
-1 4.0 False 0.9891564832365832
-1 4.0 True 0.9891174352367413
