Load corpus

In [1]:
import glob
import os
corpus_path = 'C:/Users/Rik/Documents/corpus/'
emails, labels = [], []
def read_files_for_folder(folder_path, is_spam):
    file_path = corpus_path + 'enron1/' + folder_path
    for filename in glob.glob(os.path.join(file_path, '*.txt')):
        with open(filename, 'r', encoding="ISO-8859-1") as infile:
            emails.append(infile.read())
            labels.append(is_spam)
read_files_for_folder('ham', 0)
print(len(emails))
print(len(labels))
read_files_for_folder('spam', 1)
print(len(emails))
print(len(labels))

3672
3672
5172
5172


Load spacy

In [6]:
import spacy
nlp = spacy.load('en_core_web_lg')

Clean reviews

In [7]:
def clean_review(review):
    doc = nlp(review)
    return " ".join([token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop])

In [12]:
cleaned_reviews = [
    clean_review(review) for review in reviews
]

Initialize a 10-fold generator to divide data into chunks with preserved class fractions

In [13]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

k = 10
k_fold = StratifiedKFold(n_splits=k, random_state=42)
cleaned_reviews_np = np.array(cleaned_reviews)
labels_np = np.array(labels)

In [15]:
max_features_option = [2000, 8000, None]
smoothing_factor_option = [0.5, 1.0, 2.0, 4.0]
fit_prior_option = [True, False]
min_df_option = [1, 2, 3, 4]
max_df_option = [0.4, 0.5, 0.6, 0.7]

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
def run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option, min_df_option, max_df_option):
    auc_record = {}
    for train_indices, test_indices in k_fold.split(cleaned_reviews, labels):
        X_train, X_test = cleaned_reviews_np[train_indices], cleaned_reviews_np[test_indices]
        Y_train, Y_test = labels_np[train_indices], labels_np[test_indices]
        for max_features in max_features_option:
            if max_features not in auc_record:
                auc_record[max_features] = {}
            for min_df in min_df_option:
                if min_df not in auc_record[max_features]:
                    auc_record[max_features][min_df] = {}
                for max_df in max_df_option:
                    if min_df not in auc_record[max_features][min_df]:
                        auc_record[max_features][min_df][max_df] = {}
                    cv = CountVectorizer(stop_words='english', max_features = max_features, max_df = max_df, min_df=min_df)
                    term_docs_train = cv.fit_transform(X_train)
                    term_docs_test = cv.transform(X_test)
                    for alpha in smoothing_factor_option:
                        if alpha not in auc_record[max_features][min_df][max_df]:
                            auc_record[max_features][min_df][max_df][alpha] = {}
                        for fit_prior in fit_prior_option:
                            clf = MultinomialNB(alpha = alpha, fit_prior = fit_prior)
                            clf.fit(term_docs_train, Y_train)
                            prediction_prob = clf.predict_proba(term_docs_test)
                            pos_prob = prediction_prob[:, 1]
                            auc = roc_auc_score(Y_test, pos_prob)
                            auc_record[max_features][min_df][max_df][alpha][fit_prior] = auc + auc_record[max_features][min_df][max_df][alpha].get(fit_prior, 0.0)
    return auc_record
auc_record = run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option, min_df_option, max_df_option)

In [18]:
def print_results(auc_record):
    print('max features min_df max_df  smoothing fit prior auc')
    for max_features, max_feature_record in auc_record.items():
        for min_df, min_df_record in max_feature_record.items():
            for max_df, max_df_record in min_df_record.items():
                for smoothing, smoothing_record in max_df_record.items():
                    for fit_prior, auc in smoothing_record.items():
                        print('   {0}     {1}  {2}    {3}     {4}     {5:.5f}'.format(max_features, min_df, max_df, smoothing, fit_prior, auc/k))
print_results(auc_record)

max features min_df max_df  smoothing fit prior auc
   2000     1  0.4    0.5     True     0.08620
   2000     1  0.4    0.5     False     0.08620
   2000     1  0.4    1.0     True     0.08624
   2000     1  0.4    1.0     False     0.08624
   2000     1  0.4    2.0     True     0.08627
   2000     1  0.4    2.0     False     0.08627
   2000     1  0.4    4.0     True     0.08631
   2000     1  0.4    4.0     False     0.08631
   2000     1  0.5    0.5     True     0.08649
   2000     1  0.5    0.5     False     0.08649
   2000     1  0.5    1.0     True     0.08653
   2000     1  0.5    1.0     False     0.08653
   2000     1  0.5    2.0     True     0.08651
   2000     1  0.5    2.0     False     0.08651
   2000     1  0.5    4.0     True     0.08652
   2000     1  0.5    4.0     False     0.08652
   2000     1  0.6    0.5     True     0.08642
   2000     1  0.6    0.5     False     0.08642
   2000     1  0.6    1.0     True     0.08642
   2000     1  0.6    1.0     False     0.0864

   None     3  0.6    4.0     False     0.08823
   None     3  0.7    0.5     True     0.08783
   None     3  0.7    0.5     False     0.08783
   None     3  0.7    1.0     True     0.08787
   None     3  0.7    1.0     False     0.08787
   None     3  0.7    2.0     True     0.08810
   None     3  0.7    2.0     False     0.08810
   None     3  0.7    4.0     True     0.08823
   None     3  0.7    4.0     False     0.08823
   None     4  0.4    0.5     True     0.08796
   None     4  0.4    0.5     False     0.08796
   None     4  0.4    1.0     True     0.08793
   None     4  0.4    1.0     False     0.08793
   None     4  0.4    2.0     True     0.08806
   None     4  0.4    2.0     False     0.08806
   None     4  0.4    4.0     True     0.08818
   None     4  0.4    4.0     False     0.08818
   None     4  0.5    0.5     True     0.08849
   None     4  0.5    0.5     False     0.08849
   None     4  0.5    1.0     True     0.08847
   None     4  0.5    1.0     False     0.08847
  