In [2]:
import glob
import os
# Root directory holding the `enron1` corpus folder (absolute, machine-specific path;
# readFilesForFolder appends 'enron1/' and the sub-folder name to it).
corpus_path = 'C:/Users/Rik/Documents/corpus/'
# Parallel lists filled by readFilesForFolder: raw e-mail text and the label of each e-mail.
emails, labels = [], []
def readFilesForFolder(folder_path, is_spam):
    """Load every ``*.txt`` file of one enron1 sub-folder into the corpus lists.

    For each matching file, the file's full text is appended to the
    module-level ``emails`` list and ``is_spam`` is appended to the
    module-level ``labels`` list, so both lists stay index-aligned.

    Args:
        folder_path: Name of the sub-folder below ``<corpus_path>/enron1``
            to read (the notebook passes ``'spam'`` and ``'ham'``).
        is_spam: Label recorded for every file of the folder (the notebook
            passes 1 for spam and 0 for ham).
    """
    # os.path.join works whether or not corpus_path ends with a separator.
    file_path = os.path.join(corpus_path, 'enron1', folder_path)
    # glob.glob yields matches in arbitrary, filesystem-dependent order; sorting
    # keeps the corpus order (and the unshuffled CV folds built from it) reproducible.
    for filename in sorted(glob.glob(os.path.join(file_path, '*.txt'))):
        with open(filename, 'r', encoding="ISO-8859-1") as infile:
            emails.append(infile.read())
            labels.append(is_spam)
# Load the spam folder (label 1) and then the ham folder (label 0), printing the
# running sizes of both lists after each folder has been read.
for folder_name, spam_flag in (('spam', 1), ('ham', 0)):
    readFilesForFolder(folder_name, spam_flag)
    print(len(emails))
    print(len(labels))

from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
def is_letter_only(word):
    """Return True when ``word`` is non-empty and made up solely of alphabetic characters."""
    only_letters = word.isalpha()
    return only_letters
# Set of the words in the NLTK `names` corpus; not referenced by any other code visible in this notebook.
all_names = set(names.words())
# Shared WordNet lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()
def clean_text(docs):
    """Normalize a collection of documents.

    Each document is lowercased and split on whitespace; tokens that are not
    purely alphabetic are dropped, the remaining ones are lemmatized and
    re-joined with single spaces.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    def _clean_one(text):
        tokens = text.lower().split()
        kept = (lemmatizer.lemmatize(token) for token in tokens if is_letter_only(token))
        return ' '.join(kept)

    return [_clean_one(doc) for doc in docs]
# Cleaned version of every loaded e-mail, index-aligned with `emails` and `labels`.
cleaned_emails = clean_text(emails)

1500
1500
5172
5172


Initialize a stratified 10-fold cross-validation splitter, which divides the data into 10 chunks while preserving the class proportions in each chunk

In [3]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Number of cross-validation folds; print_results also divides the summed AUCs by it.
k = 10
# shuffle defaults to False, so folds are cut in dataset order and the split is
# already deterministic. random_state only has an effect together with
# shuffle=True, and scikit-learn >= 0.24 raises a ValueError when it is set
# without shuffling, so it is left out here.
k_fold = StratifiedKFold(n_splits=k)
# NumPy arrays so the index arrays produced by k_fold.split can select documents and labels.
cleaned_emails_np = np.array(cleaned_emails)
labels_np = np.array(labels)

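max_features: keep only the n most frequent terms as the feature space (None keeps all terms)  
alpha: additive smoothing factor of the naive Bayes model  
fit_prior: whether to learn the class priors from the training data (if False, a uniform prior is used)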

In [4]:
# Hyperparameter grid explored by run_kfold_multinomial.
max_features_option = [2000, 8000, None]  # passed to CountVectorizer(max_features=...)
smoothing_factor_option = [0.5, 1.0, 2.0, 4.0]  # passed to MultinomialNB(alpha=...)
fit_prior_option = [True, False]  # passed to MultinomialNB(fit_prior=...)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
def run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option):
    """Grid-search a bag-of-words + multinomial naive Bayes model with k-fold CV.

    For every fold produced by the module-level ``k_fold`` splitter, a
    CountVectorizer is fitted on the training documents for each
    ``max_features`` value, and a MultinomialNB is trained and scored
    (ROC AUC on the positive-class probability) for each ``alpha`` /
    ``fit_prior`` combination.

    Args:
        max_features_option: Iterable of ``max_features`` values for CountVectorizer.
        smoothing_factor_option: Iterable of ``alpha`` values for MultinomialNB.
        fit_prior_option: Iterable of ``fit_prior`` values for MultinomialNB.

    Returns:
        Nested dict ``{max_features: {alpha: {fit_prior: auc_sum}}}`` where
        ``auc_sum`` is the AUC *summed* over all folds (not yet averaged).
    """
    auc_totals = {}
    for train_idx, test_idx in k_fold.split(cleaned_emails, labels):
        docs_train = cleaned_emails_np[train_idx]
        docs_test = cleaned_emails_np[test_idx]
        y_train = labels_np[train_idx]
        y_test = labels_np[test_idx]
        for max_features in max_features_option:
            by_alpha = auc_totals.setdefault(max_features, {})
            # The vocabulary is learned from the training fold only and reused for the test fold.
            vectorizer = CountVectorizer(stop_words='english', max_features=max_features, max_df=0.5, min_df=2)
            counts_train = vectorizer.fit_transform(docs_train)
            counts_test = vectorizer.transform(docs_test)
            for alpha in smoothing_factor_option:
                by_prior = by_alpha.setdefault(alpha, {})
                for fit_prior in fit_prior_option:
                    model = MultinomialNB(alpha=alpha, fit_prior=fit_prior)
                    model.fit(counts_train, y_train)
                    # Column 1 holds the probability of the positive class.
                    positive_prob = model.predict_proba(counts_test)[:, 1]
                    fold_auc = roc_auc_score(y_test, positive_prob)
                    by_prior[fit_prior] = fold_auc + by_prior.get(fit_prior, 0.0)
    return auc_totals
# Run the grid search; each stored value is the AUC summed over all folds
# (print_results divides by k to report the average).
auc_record = run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option)

In [16]:
def print_results(auc_record):
    """Print one line per hyperparameter combination with its fold-averaged AUC.

    Args:
        auc_record: Nested dict ``{max_features: {smoothing: {fit_prior: auc_sum}}}``
            as returned by run_kfold_multinomial. Each ``auc_sum`` is divided
            by the module-level fold count ``k`` before printing.
    """
    print('max featurs  smoothing fit prior auc')
    row_format = '   {0}     {1}      {2}     {3:.5f}'
    rows = (
        (max_features, smoothing, fit_prior, auc_sum)
        for max_features, by_smoothing in auc_record.items()
        for smoothing, by_prior in by_smoothing.items()
        for fit_prior, auc_sum in by_prior.items()
    )
    for max_features, smoothing, fit_prior, auc_sum in rows:
        print(row_format.format(max_features, smoothing, fit_prior, auc_sum/k))
# Report the fold-averaged AUC of every combination from the grid search above.
print_results(auc_record)

max featurs  smoothing fit prior auc
   2000     0.5      True     0.97426
   2000     0.5      False     0.97421
   2000     1.0      True     0.97237
   2000     1.0      False     0.97238
   2000     2.0      True     0.97043
   2000     2.0      False     0.97057
   2000     4.0      True     0.96853
   2000     4.0      False     0.96843
   8000     0.5      True     0.98533
   8000     0.5      False     0.98530
   8000     1.0      True     0.98428
   8000     1.0      False     0.98430
   8000     2.0      True     0.98338
   8000     2.0      False     0.98337
   8000     4.0      True     0.98291
   8000     4.0      False     0.98296
   None     0.5      True     0.98890
   None     0.5      False     0.98884
   None     1.0      True     0.98899
   None     1.0      False     0.98904
   None     2.0      True     0.98906
   None     2.0      False     0.98915
   None     4.0      True     0.98965
   None     4.0      False     0.98969


In [18]:
# Second search round: no max_features cap and larger smoothing factors.
# NOTE: these assignments overwrite the option lists of the first grid search,
# so re-running the earlier search cell afterwards would use these values.
max_features_option = [None]
smoothing_factor_option = [4.0, 10, 16, 20, 32]
fit_prior_option = [True, False]
print_results(run_kfold_multinomial(max_features_option, smoothing_factor_option, fit_prior_option))

max featurs  smoothing fit prior auc
   None     4.0      True     0.98965
   None     4.0      False     0.98969
   None     10      True     0.99208
   None     10      False     0.99211
   None     16      True     0.99329
   None     16      False     0.99329
   None     20      True     0.99362
   None     20      False     0.99362
   None     32      True     0.99307
   None     32      False     0.99307
