Load corpus

In [1]:
import glob
import os
corpus_path = 'C:/Users/Rik/Documents/corpus/'
emails, labels = [], []
def read_files_for_folder(folder_path, is_spam):
    file_path = corpus_path + 'enron1/' + folder_path
    for filename in glob.glob(os.path.join(file_path, '*.txt')):
        with open(filename, 'r', encoding="ISO-8859-1") as infile:
            emails.append(infile.read())
            labels.append(is_spam)
read_files_for_folder('ham', 0)
print(len(emails))
print(len(labels))
read_files_for_folder('spam', 1)
print(len(emails))
print(len(labels))

3672
3672
5172
5172


Load spacy

In [2]:
import spacy
nlp = spacy.load('en_core_web_lg')

Clean emails

In [3]:
def clean_email(email):
    doc = nlp(email)
    return " ".join([token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop])

In [4]:
cleaned_emails = [
    clean_email(email) for email in emails
]

Initialize a 10-fold generator to divide data into chunks with preserved class fractions

In [5]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

k = 10
k_fold = StratifiedKFold(n_splits=k, random_state=42)
cleaned_emails_np = np.array(cleaned_emails)
labels_np = np.array(labels)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
def run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option, min_df_option, max_df_option):
    auc_record = []
    for train_indices, test_indices in k_fold.split(cleaned_emails, labels):
        X_train, X_test = cleaned_emails_np[train_indices], cleaned_emails_np[test_indices]
        Y_train, Y_test = labels_np[train_indices], labels_np[test_indices]
        for max_features in max_features_option:
            for min_df in min_df_option:
                for max_df in max_df_option:
                    cv = CountVectorizer(stop_words='english', max_features = max_features, max_df = max_df, min_df=min_df)
                    term_docs_train = cv.fit_transform(X_train)
                    term_docs_test = cv.transform(X_test)
                    for alpha in smoothing_factor_option:
                        for fit_prior in fit_prior_option:
                            clf = MultinomialNB(alpha = alpha, fit_prior = fit_prior)
                            clf.fit(term_docs_train, Y_train)
                            prediction_prob = clf.predict_proba(term_docs_test)
                            pos_prob = prediction_prob[:, 1]
                            auc = roc_auc_score(Y_test, pos_prob)
                            auc_record.append({
                                'max_features': max_features,
                                'min_df': min_df,
                                'max_df': max_df,
                                'smoothing': alpha,
                                'fit_prior': fit_prior,
                                'auc': auc
                            })
    return auc_record

In [7]:
max_features_option = [2000, 8000, None]
smoothing_factor_option = [0.5, 1.0, 2.0, 4.0]
fit_prior_option = [True, False]
min_df_option = [1, 2, 3, 4]
max_df_option = [0.4, 0.5, 0.6, 0.7]
auc_record = run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option, min_df_option, max_df_option)

In [18]:
import pandas as pd

def print_results(auc_record):
    print('max_features \t| min_df \t| max_df \t| smoothing \t| fit_prior \t| auc')
    for auc_run in sorted([
        {
            'max_features': key[0],
            'min_df': key[1],
            'max_df': key[2],
            'smoothing': key[3],
            'fit_prior': key[4],
            'auc': np.mean(group['auc'])
        }
        for key, group in pd.DataFrame(auc_record).fillna(-1).groupby(['max_features', 'min_df', 'max_df', 'smoothing', 'fit_prior'])
    ], key = lambda auc_run: auc_run['auc'], reverse = True):
        print(auc_run['max_features'], '\t\t|', auc_run['min_df'], '\t\t|', auc_run['max_df'], '\t\t|', auc_run['smoothing'], '\t\t|', auc_run['fit_prior'], '\t|', auc_run['auc'])
    
print_results(auc_record)

max_features 	| min_df 	| max_df 	| smoothing 	| fit_prior 	| auc
-1.0 		| 1 		| 0.4 		| 4.0 		| False 	| 0.9947884186312838
-1.0 		| 1 		| 0.5 		| 4.0 		| False 	| 0.9947884186312838
-1.0 		| 1 		| 0.6 		| 4.0 		| False 	| 0.9947884186312838
-1.0 		| 1 		| 0.7 		| 4.0 		| False 	| 0.9947884186312838
-1.0 		| 1 		| 0.4 		| 4.0 		| True 	| 0.9947657514907396
-1.0 		| 1 		| 0.5 		| 4.0 		| True 	| 0.9947657514907396
-1.0 		| 1 		| 0.6 		| 4.0 		| True 	| 0.9947657514907396
-1.0 		| 1 		| 0.7 		| 4.0 		| True 	| 0.9947657514907396
-1.0 		| 1 		| 0.4 		| 2.0 		| True 	| 0.9935810700746357
-1.0 		| 1 		| 0.5 		| 2.0 		| True 	| 0.9935810700746357
-1.0 		| 1 		| 0.6 		| 2.0 		| True 	| 0.9935810700746357
-1.0 		| 1 		| 0.7 		| 2.0 		| True 	| 0.9935810700746357
-1.0 		| 1 		| 0.4 		| 2.0 		| False 	| 0.9935301899458988
-1.0 		| 1 		| 0.5 		| 2.0 		| False 	| 0.9935301899458988
-1.0 		| 1 		| 0.6 		| 2.0 		| False 	| 0.9935301899458988
-1.0 		| 1 		| 0.7 		| 2.0 		| False 	| 0.993530189945898

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 False 	| 0.982653403032816
8000.0 		| 3 		| 0.7 		| 4.0 		| False 	| 0.982653403032816
-1.0 		| 4 		| 0.4 		| 1.0 		| True 	| 0.9824213906330213
-1.0 		| 4 		| 0.5 		| 1.0 		| True 	| 0.9824213906330213
-1.0 		| 4 		| 0.6 		| 1.0 		| True 	| 0.9824213906330213
-1.0 		| 4 		| 0.7 		| 1.0 		| True 	| 0.9824213906330213
8000.0 		| 4 		| 0.4 		| 1.0 		| True 	| 0.9824213906330213
8000.0 		| 4 		| 0.5 		| 1.0 		| True 	| 0.9824213906330213
8000.0 		| 4 		| 0.6 		| 1.0 		| True 	| 0.9824213906330213
8000.0 		| 4 		| 0.7 		| 1.0 		| True 	| 0.9824213906330213
-1.0 		| 4 		| 0.4 		| 1.0 		| False 	| 0.9822853754491963
-1.0 		| 4 		| 0.5 		| 1.0 		| False 	| 0.9822853754491963
-1.0 		| 4 		| 0.6 		| 1.0 		| False 	| 0.9822853754491963
-1.0 		| 4 		| 0.7 		| 1.0 		| False 	| 0.9822853754491963
8000.0 		| 4 		| 0.4 		| 1.0 		| False 	| 0.9822853754491963
8000.0 		| 4 		| 0.5 		| 1.0 		| False 	| 0.9822853754491963
8000.0 		| 4 		| 0.6 		| 1.0 		| False 	| 0.9822853754491963
8000.0 		| 4 		| 0.7 

2000.0 		| 2 		| 0.6 		| 2.0 		| True 	| 0.9714262429411995
2000.0

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [22]:
max_features_option = [None]
smoothing_factor_option = [4.0, 10, 16, 20, 32]
fit_prior_option = [True, False]
min_df_option = [1]
max_df_option = [0.1, 0.2, 0.3, 0.4]
auc_record = run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option, min_df_option, max_df_option)

In [23]:
print_results(auc_record)

max_features 	| min_df 	| max_df 	| smoothing 	| fit_prior 	| auc
-1 		| 1 		| 0.3 		| 4.0 		| False 	| 0.9947975333688742
-1 		| 1 		| 0.4 		| 4.0 		| False 	| 0.9947884186312838
-1 		| 1 		| 0.3 		| 4.0 		| True 	| 0.9947821150732536
-1 		| 1 		| 0.4 		| 4.0 		| True 	| 0.9947657514907396
-1 		| 1 		| 0.3 		| 10.0 		| False 	| 0.9947083703550131
-1 		| 1 		| 0.4 		| 10.0 		| False 	| 0.994707422600008
-1 		| 1 		| 0.3 		| 10.0 		| True 	| 0.9947038339651699
-1 		| 1 		| 0.4 		| 10.0 		| True 	| 0.9947001638826365
-1 		| 1 		| 0.2 		| 4.0 		| False 	| 0.9940122862615013
-1 		| 1 		| 0.2 		| 4.0 		| True 	| 0.9940104697310744
-1 		| 1 		| 0.1 		| 4.0 		| False 	| 0.9938135143940292
-1 		| 1 		| 0.1 		| 4.0 		| True 	| 0.993800808553489
-1 		| 1 		| 0.2 		| 10.0 		| False 	| 0.9935588718753703
-1 		| 1 		| 0.2 		| 10.0 		| True 	| 0.9935516156261107
-1 		| 1 		| 0.3 		| 16.0 		| False 	| 0.9932416084192235
-1 		| 1 		| 0.3 		| 16.0 		| True 	| 0.9932416084192235
-1 		| 1 		| 0.4 		| 16.