In [None]:
# Download the preprocessed Enron-Spam "enron1" archive and save it in the
# working directory as enron1.tar.gz.
! curl http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz --output enron1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1760k  100 1760k    0     0   326k      0  0:00:05  0:00:05 --:--:--  404k


In [None]:
# Extract the enron1 member from the downloaded archive; the loading cell
# reads enron1/spam/ and enron1/ham/.
! tar -xf enron1.tar.gz enron1

In [None]:
import glob, os

def read_email_dir(directory, label):
  """Read every ``*.txt`` file directly inside ``directory``.

  Args:
    directory: folder to scan (sub-folders are not visited).
    label: value recorded once for each file that is read.

  Returns:
    ``(texts, labels)`` - two lists of equal length, with ``texts`` holding
    the file contents in sorted-path order and ``labels`` holding ``label``
    repeated once per file.
  """
  texts = []
  # glob returns paths in filesystem-dependent order; sorting makes the sample
  # order (and therefore any unshuffled fold split) reproducible across runs.
  for fname in sorted(glob.glob(os.path.join(directory, '*.txt'))):
    # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      texts.append(f.read())
  return texts, [label] * len(texts)


emails, labels = [], []
# NOTE(review): `parition` is never read in the visible cells (possibly a typo
# for "partition"); it is kept in case a later cell refers to it.
parition = 0

# Spam is the positive class (1) and ham the negative class (0).
for file_path, label_value in (('enron1/spam/', 1), ('enron1/ham/', 0)):
  dir_texts, dir_labels = read_email_dir(file_path, label_value)
  emails.extend(dir_texts)
  labels.extend(dir_labels)

print(f'number of emails = {len(emails)}\nnumber of labels = {len(labels)}')

number of emails = 5172
number of labels = 5172


In [None]:
def letters_only(word):
  """Tell whether ``word`` consists solely of alphabetic characters.

  The check is delegated to ``word.isalpha()``, so an empty string, or a
  string containing a digit, punctuation mark or whitespace, gives ``False``.
  """
  is_alphabetic = word.isalpha()
  return is_alphabetic

In [None]:
import nltk
# Fetch the NLTK data packages needed by the cells below: 'names' for the
# nltk.corpus.names import and 'wordnet' for the WordNetLemmatizer.
nltk.download('names')
nltk.download('wordnet')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Unzipping corpora/names.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import names
# clean_text() lowercases every token before testing membership in this set,
# whereas the corpus entries are capitalised (e.g. "Alice"), so a set of the
# raw entries never matched and no name was ever filtered out. Keep the
# original spellings and add their lowercase forms so both lookups succeed.
corpus_names = names.words()
all_names = set(corpus_names) | {name.lower() for name in corpus_names}

In [None]:
from nltk.stem import WordNetLemmatizer
# One shared lemmatizer instance; clean_text() calls lemmatizer.lemmatize()
# on every token it keeps.
lemmatizer = WordNetLemmatizer()

In [None]:
def clean_text(doc):
  """Normalise one raw document into a space-joined string of lemmas.

  Each whitespace-separated token is lowercased and kept only when it is
  purely alphabetic (``letters_only``), is not in ``all_names`` and is longer
  than two characters. Kept tokens are passed through
  ``lemmatizer.lemmatize`` and joined with single spaces.

  Args:
    doc: raw text of one document.

  Returns:
    The cleaned text; an empty string when no token survives.
  """
  cleaned_doc = []
  # split() with no argument breaks on any run of whitespace. Splitting on ' '
  # alone left words glued across line breaks and tabs ("offer\nnow"); such a
  # token fails the alphabetic test, so both words were silently discarded.
  for word in doc.split():
    word = word.lower()   # ABD -> abd

    # Drop numbers, punctuation, short tokens and names.
    # NOTE: the name lookup uses the lowercased token, so it can only match
    # entries that are stored in lowercase in all_names.
    if letters_only(word) and word not in all_names and len(word) > 2:
      cleaned_doc.append(lemmatizer.lemmatize(word))
  return ' '.join(cleaned_doc)

# Run clean_text over every loaded email, keeping the original order.
cleaned_emails = list(map(clean_text, emails))

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
k = 10   # number of cross-validation folds; also the divisor used when averaging the AUC sums
# Only n_splits is passed, so shuffling is left at the library default.
# NOTE(review): scikit-learn's documented default is shuffle=False, i.e. the
# folds preserve the class ratio but keep the input order; pass shuffle=True
# together with a random_state if shuffled folds are wanted - confirm.
k_fold = StratifiedKFold(n_splits = k) # stratified split into k folds

# convert to numpy array so the fold index arrays can be used for slicing
cleaned_emails_np = np.array(cleaned_emails)
labels_np = np.array(labels)

In [None]:
# Hyper-parameter grid explored by the cross-validation loop.
max_features_option = [2000, 4000, 8000]   # passed to TfidfVectorizer(max_features=...)
smoothing_factor_option = [0.5, 1.0, 1.5, 2.0]   # passed to MultinomialNB(alpha=...)
fit_prior_option = [True, False]   # passed to MultinomialNB(fit_prior=...)
# Nested dict: auc_record[max_features][smoothing_factor][fit_prior] -> AUC summed over folds
auc_record = {}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

# Start from an empty record so that re-running this cell does not pile a
# second round of fold scores on top of the sums left by a previous run
# (the totals are later divided by k, so stale sums would inflate the mean).
auc_record = {}

# For every fold, fit one TF-IDF vocabulary per max_features value and score
# each (smoothing_factor, fit_prior) Naive Bayes setting on the held-out fold.
# auc_record[max_features][smoothing_factor][fit_prior] holds the AUC summed
# over all folds.
for train_indices, test_indices in k_fold.split(cleaned_emails, labels):
  X_train, X_test = cleaned_emails_np[train_indices], cleaned_emails_np[test_indices]
  Y_train, Y_test = labels_np[train_indices], labels_np[test_indices]
  for max_features in max_features_option:
    by_smoothing = auc_record.setdefault(max_features, {})
    # The vectorizer depends only on max_features, so fit it once per fold on
    # the training split and reuse the matrices for every classifier setting.
    tv = TfidfVectorizer(sublinear_tf = True, max_df = 0.5, stop_words = "english", max_features = max_features)
    term_docs_train = tv.fit_transform(X_train)
    term_docs_test = tv.transform(X_test)
    for smoothing_factor in smoothing_factor_option:
      by_fit_prior = by_smoothing.setdefault(smoothing_factor, {})
      for fit_prior in fit_prior_option:
        clf = MultinomialNB(alpha=smoothing_factor, fit_prior=fit_prior)
        clf.fit(term_docs_train, Y_train)
        prediction_prob = clf.predict_proba(term_docs_test)
        pos_prob = prediction_prob[:,1]   # column 1 = probability of label 1 (spam)
        auc = roc_auc_score(Y_test, pos_prob)
        by_fit_prior[fit_prior] = by_fit_prior.get(fit_prior, 0.0) + auc

In [None]:
# Print one row per hyper-parameter combination with the AUC averaged over
# the k folds (auc_record stores per-combination sums).
print('max features / smoothing / fit prior / auc')
row_template = '{:10}{:13}{:12}    {:.4f}'
for max_features, by_smoothing in auc_record.items():
  for smoothing, by_fit_prior in by_smoothing.items():
    for fit_prior, auc_sum in by_fit_prior.items():
      mean_auc = auc_sum/k
      print(row_template.format(max_features, smoothing, fit_prior, mean_auc))

max features / smoothing / fit prior / auc
      2000          0.5           1    0.9821
      2000          0.5           0    0.9821
      2000          1.0           1    0.9808
      2000          1.0           0    0.9808
      2000          1.5           1    0.9802
      2000          1.5           0    0.9802
      2000          2.0           1    0.9799
      2000          2.0           0    0.9799
      4000          0.5           1    0.9873
      4000          0.5           0    0.9873
      4000          1.0           1    0.9867
      4000          1.0           0    0.9867
      4000          1.5           1    0.9867
      4000          1.5           0    0.9867
      4000          2.0           1    0.9870
      4000          2.0           0    0.9870
      8000          0.5           1    0.9921
      8000          0.5           0    0.9921
      8000          1.0           1    0.9925
      8000          1.0           0    0.9925
      8000          1.5           1  