In [1]:
# good pre-reads
# paper that came with enron data: http://www2.aueb.gr/users/ion/docs/ceas2006_paper.pdf

import os
import re
from typing import Any

import pandas as pd
import numpy as np
import urlextract

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
ham_filenames = [name for name in sorted(os.listdir('data/enron1/ham')) if len(name) > 15]
spam_filenames = [name for name in sorted(os.listdir('data/enron1/spam')) if len(name) > 15]

print('Amount of ham files:', len(ham_filenames))
print('Amount of spam files:', len(spam_filenames))    
print('Spam to Ham Ratio:',len(spam_filenames)/len(ham_filenames))

Amount of ham files: 3672
Amount of spam files: 1500
Spam to Ham Ratio: 0.4084967320261438


In [3]:
def load_file(is_spam, filename):
    directory = "data/enron1/spam" if is_spam else "data/enron1/ham"
    # http://python-notes.curiousefficiency.org/en/latest/python3/text_file_processing.html
    # latin-1 encoding because some of the spam files have retarded characters
    with open(os.path.join(directory, filename), encoding="latin-1") as f:
        lines = f.readlines()
        # like this one
        if filename.startswith("0754"):
            print(lines)
        return {
            'subject': lines[0],
            'body': lines[1:],
        }
    
ham_emails = [load_file(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_file(is_spam=True, filename=name) for name in spam_filenames]
    
    
testEmail = spam_emails[0]

print('\n\n')
print('what one test email looks like: ', testEmail)

['Subject: our recommendations make you big money\n', 'c . d . financial newsletter\n', 'alert :\n', 'strong buy\n', 'issued on dtoi\n', 'news alert * * * * * * * * 3 / 25 / 2004 4 : 00 pm est\n', '* dtomi receives request for quote for tandem axle\n', 'enclosed trailers *\n', '* dtoi up 50 % *\n', 'dtomi , inc .\n', 'symbol : dtoi otc . bb\n', 'price $ 1 . 25\n', 'shares out : 26 million\n', 'market capitalization : $ 24 million\n', 'significant revenue growth in 2004\n', 'rating : strong buy\n', '7 days trading target : $ 2 . 00\n', '30 day trading target : $ 3 . 50\n', '* * strong buy alert * * strong\n', 'buy alert * *\n', 'dtomi , inc . ( dtomi or the\n', 'company ) markets a unique air spring suspension system , air spring axlet\n', ', that allows small to medium - sized trailers to be lowered to near flush with\n', 'the ground , greatly improving the safety and ease with which heavy cargo can be\n', 'loaded and unloaded . an aggressive commercialization schedule may bring\n', 'p

In [4]:
snowball_stemmer = SnowballStemmer('english')

class EmailJsonToText(BaseEstimator, TransformerMixin):
    def __init__(self, lowercaseConversion = True, punctuationRemoval = True, 
                 urlReplacement = True, numberReplacement = False, stemming = False):
        self.lowercaseConversion = lowercaseConversion
        self.punctuationRemoval = punctuationRemoval
        self.urlReplacement = urlReplacement
        self.urlExtractor = urlextract.URLExtract()
        self.numberReplacement = numberReplacement
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_to_words = []
        for emailJson in X:
            text = ' '.join(emailJson['body'])
            
            if text is None:
                text = 'empty'
            if self.lowercaseConversion:
                text = text.lower()
                
            if self.urlReplacement:
                urls = self.urlExtractor.find_urls(text)
                for url in urls:
                   text = text.replace(url, 'URL')   
            
            # apparently removing numbers helped
            if self.numberReplacement:
                text = re.sub('\d', '%d', text)
                    
            if self.punctuationRemoval:
                text = text.replace('.','')
                text = text.replace(',','')
                text = text.replace('!','')
                text = text.replace('?','')
                
            if self.stemming:
                words = text.split(' ')
                
                stemmed_words = []
                for word in words:
                    stemmed_words.append(snowball_stemmer.stem(word))
                
                text = ' '.join(stemmed_words)
            
            X_to_words.append(text)
        return np.array(X_to_words)

In [5]:
X_few = ham_emails[:3]
X_few_text = EmailJsonToText().fit_transform(X_few)
vocab_transformer = CountVectorizer()
X_few_vectors = vocab_transformer.fit_transform(X_few_text)
X_few_vectors

<3x173 sparse matrix of type '<class 'numpy.int64'>'
	with 175 stored elements in Compressed Sparse Row format>

In [6]:
vocab_transformer.vocabulary_

{'gary': 101,
 'production': 139,
 'from': 98,
 'the': 157,
 'high': 111,
 'island': 118,
 'larger': 119,
 'block': 63,
 'commenced': 74,
 'on': 134,
 'saturday': 145,
 'at': 57,
 '00': 0,
 'about': 49,
 '500': 30,
 'gross': 107,
 'carlos': 70,
 'expects': 91,
 'between': 61,
 'and': 53,
 '10': 3,
 '000': 1,
 'for': 96,
 'tomorrow': 161,
 'vastar': 164,
 'owns': 136,
 '68': 36,
 'of': 133,
 'george': 103,
 '6992': 37,
 'forwarded': 97,
 'by': 66,
 'weissman': 168,
 'hou': 112,
 'ect': 85,
 '12': 6,
 '13': 7,
 '99': 48,
 '16': 9,
 'am': 51,
 'daren': 80,
 'farmer': 92,
 '38': 26,
 'to': 159,
 'rodriguez': 144,
 'cc': 71,
 'melissa': 122,
 'graves': 106,
 'subject': 152,
 'resources': 143,
 'inc': 116,
 'please': 137,
 'call': 67,
 'linda': 120,
 'get': 104,
 'everything': 90,
 'set': 147,
 'up': 163,
 'going': 105,
 'estimate': 89,
 'coming': 72,
 'with': 170,
 'increase': 117,
 'each': 84,
 'following': 95,
 'day': 82,
 'based': 59,
 'my': 125,
 'conversations': 77,
 'bill': 62,
 'fisc

In [7]:
X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))
print(vectorizer.fit_transform(X_few_text))
print(vectorizer.get_feature_names())

  (1, 224)	2
  (1, 312)	3
  (1, 249)	2
  (1, 275)	2
  (1, 277)	2
  (1, 146)	2
  (1, 169)	1
  (1, 326)	1
  (1, 0)	2
  (1, 91)	4
  (1, 240)	3
  (1, 157)	3
  (1, 204)	2
  (1, 25)	11
  (1, 3)	18
  (1, 349)	4
  (1, 356)	5
  (1, 305)	1
  (1, 106)	1
  (1, 230)	7
  (1, 108)	2
  (1, 217)	3
  (1, 365)	5
  (1, 251)	10
  (1, 188)	17
  :	:
  (1, 265)	1
  (1, 68)	1
  (1, 19)	1
  (1, 129)	1
  (1, 266)	1
  (1, 72)	1
  (1, 16)	1
  (1, 121)	1
  (1, 267)	1
  (1, 78)	1
  (1, 12)	1
  (1, 111)	1
  (1, 268)	1
  (1, 81)	1
  (1, 9)	1
  (1, 99)	1
  (2, 292)	1
  (2, 227)	1
  (2, 155)	1
  (2, 179)	1
  (2, 187)	1
  (2, 156)	1
  (2, 180)	1
  (2, 229)	1
  (2, 293)	1
['00', '00 500', '00 daren', '000', '000 108', '000 224', '000 332', '000 431', '000 521', '000 549', '000 602', '000 674', '000 703', '000 738', '000 793', '000 840', '000 847', '000 878', '000 908', '000 982', '000 gross', '000 increase', '09', '09 38', '09 44', '10', '10 00', '10 000', '10 16', '10 34', '10 38', '10 99', '108', '108 24', '11', '11 99'

In [9]:
email_pipeline = Pipeline([
    ("EmailJson to Words", EmailJsonToText()),
    ("Words to Count Vector", CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))),
])

In [10]:
X_augmented_train = email_pipeline.fit_transform(X_train)

In [11]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = model_selection.cross_val_score(log_clf, X_augmented_train, y_train, cv=3)
score.mean()

0.9719603577471597

In [12]:
X_augmented_test = email_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_augmented_train, y_train)

y_pred = log_clf.predict(X_augmented_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

# only added stopwords, added ngrams -> precision 91.53%, recall 98.25%

Precision: 91.53%
Recall: 98.25%


In [13]:
print(len(email_pipeline.named_steps['Words to Count Vector'].get_feature_names()))

246977


In [14]:
email_pipeline2 = Pipeline([
    ("EmailJson to Words", EmailJsonToText(stemming=True, numberReplacement=True)),
    ("Words to Count Vector", CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))),
])

X_augmented_train2 = email_pipeline2.fit_transform(X_train)
X_augmented_test2 = email_pipeline2.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_augmented_train2, y_train)

y_pred2 = log_clf.predict(X_augmented_test2)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred2)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred2)))

Precision: 91.15%
Recall: 97.20%


In [15]:
print(len(email_pipeline2.named_steps['Words to Count Vector'].get_feature_names()))
# with only stopword removal, with bi-gram support
# 246977 features and 91.53% precision & 98.25% recall

# with less non-sensical features, worse results??
# with no numbers, without stemming
# 224292 features and 91.78% precision & 97.55% recall

# with no numbers, with stemming
# 216467 features and 91.15% precision & 97.20% recall

# with no bi-gram support, with no numbers, with stemming
# 35171 features and 91.50% precision & 97.90% recall

216467


In [16]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))
tfidf_res = tfidf_vectorizer.fit_transform(X_few_text)
print(tfidf_res.shape)
print(tfidf_res)
# instead of counts, we have tfidf values for each word, based on the corpus (X_few_text in this case)
print(tfidf_res.todense())
print(tfidf_vectorizer.get_feature_names())

(3, 373)
  (1, 99)	0.018790630518289823
  (1, 9)	0.018790630518289823
  (1, 81)	0.018790630518289823
  (1, 268)	0.018790630518289823
  (1, 111)	0.018790630518289823
  (1, 12)	0.018790630518289823
  (1, 78)	0.018790630518289823
  (1, 267)	0.018790630518289823
  (1, 121)	0.018790630518289823
  (1, 16)	0.018790630518289823
  (1, 72)	0.018790630518289823
  (1, 266)	0.018790630518289823
  (1, 129)	0.018790630518289823
  (1, 19)	0.018790630518289823
  (1, 68)	0.018790630518289823
  (1, 265)	0.018790630518289823
  (1, 33)	0.018790630518289823
  (1, 4)	0.018790630518289823
  (1, 64)	0.018790630518289823
  (1, 264)	0.018790630518289823
  (1, 57)	0.018790630518289823
  (1, 5)	0.018790630518289823
  (1, 62)	0.018790630518289823
  (1, 263)	0.018790630518289823
  (1, 74)	0.018790630518289823
  :	:
  (1, 349)	0.0751625220731593
  (1, 3)	0.3382313493292168
  (1, 25)	0.20669693570118802
  (1, 204)	0.03758126103657965
  (1, 157)	0.05637189155486947
  (1, 240)	0.05637189155486947
  (1, 91)	0.07516252207

In [17]:
email_pipeline3 = Pipeline([
    ("EmailJson to Words", EmailJsonToText(stemming=True, numberReplacement=True)),
    ("Words to TF-IDF Vector", TfidfVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))),
])

X_augmented_train3 = email_pipeline3.fit_transform(X_train)
X_augmented_test3 = email_pipeline3.transform(X_test)

xgb_clf = XGBClassifier()
xgb_clf.fit(X_augmented_train3, y_train)

y_pred3 = xgb_clf.predict(X_augmented_test3)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred3)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred3)))

# https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
xgb_clf.save_model('enron1_trained_xgb_model.bin')

# generic xgboost guide
# https://github.com/dmlc/xgboost/tree/master/demo/guide-python

# interestingly logistic regression outperforms xgboost

Precision: 90.78%
Recall: 93.01%


In [18]:
print(y_train[0:10])
print(X_train[0])

[0 0 1 0 0 0 0 1 0 0]
{'subject': 'Subject: conoco , inc . katy tailgate contract 96001985 sitara 334995\n', 'body': ['bob , the referenced contract dated 1 / 1 / 96 as amended effective 4 / 1 / 2000 does include an evergreen provision . thus , please add may , 2001 to the existing sitara deal , 334995 , at the current price , 90 % of ifhsc , for a volume of 155 mmbtu / d .\n', 'don , please prepare and circulate a termination letter for 96001985 .\n', 'george x 3 - 6992\n', '- - - - - - - - - - - - - - - - - - - - - - forwarded by george weissman / hou / ect on 04 / 30 / 2001 02 : 11 pm - - - - - - - - - - - - - - - - - - - - - - - - - - -\n', 'john peyton\n', '04 / 30 / 2001 01 : 08 pm\n', 'to : george weissman / hou / ect @ ect\n', 'cc :\n', 'subject : conoco katy contract\n', "please send conoco ' s mike luchetti two month - to - month contracts for april and may volumes of around 150 - 200 mmbtu / day . this is for facility number 10077 - exxon plant hpl katy for gas that was bein

In [19]:
X_df = pd.Series(EmailJsonToText(stemming=True, numberReplacement=True).fit_transform(X_train))
# X_df = pd.Series(X_train)
y_df = pd.Series(y_train)
X_df.name = 'text'
X_df.shape

(4137,)

In [20]:
print(X_df.iloc[0])
print(y_df.iloc[2])
print(type(X_df[:10]))
print(X_df[:10].shape)
for X in X_df[:3]: 
    print(X)

# if X_df was a DataFrame instead of Series
# for index, row in X_df[:10].iterrows():
#     print(row)

bob  the referenc contract date %d / %d / %d%d as amend effect %d / %d / %d%d%d%d doe includ an evergreen provis  thus  pleas add may  %d%d%d%d to the exist sitara deal  %d%d%d%d%d%d  at the current price  %d%d % of ifhsc  for a volum of %d%d%d mmbtu / d 
 don  pleas prepar and circul a termin letter for %d%d%d%d%d%d%d%d 
 georg x %d - %d%d%d%d
 - - - - - - - - - - - - - - - - - - - - - - forward by georg weissman / hou / ect on %d%d / %d%d / %d%d%d%d %d%d : %d%d pm - - - - - - - - - - - - - - - - - - - - - - - - - - -
 john peyton
 %d%d / %d%d / %d%d%d%d %d%d : %d%d pm
 to : georg weissman / hou / ect @ ect
 cc :
 subject : conoco kati contract
 pleas send conoco ' s mike luchetti two month - to - month contract for april and may volum of around %d%d%d - %d%d%d mmbtu / day  this is for facil number %d%d%d%d%d - exxon plant hpl kati for gas that was be purchas under a contract that expir april %d  %d%d%d%d - contract no %d%d%d - %d%d%d%d%d - %d%d%d 
 thank you 
 john
1
<class 'pandas.c

In [22]:
print(X_df.name)
print((np.asarray(X_df.name)).astype(str))

len(X_df)

text
text


4137

In [23]:
print(pd.Series(X_few_text))
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))
tfidf_res = tfidf_vectorizer.fit_transform(pd.Series(X_few_text))
tfidf_res

0                                                     
1    gary  production from the high island larger b...
2                - calpine daily gas nomination 1  doc
dtype: object


<3x373 sparse matrix of type '<class 'numpy.float64'>'
	with 375 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn2pmml import sklearn2pmml, make_pmml_pipeline
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.feature_extraction.text import Splitter
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(k=5000)

pipeline = PMMLPipeline([
#     ("EmailJson to Words", EmailJsonToText(stemming=True, numberReplacement=True)),
    ("Words to TF-IDF Vector", TfidfVectorizer(
        stop_words=stopwords.words('english'),
        ngram_range=(1, 2),
        tokenizer=Splitter(),
        norm=None)),
    ("Selector", selector),
    ("Logistic Reg", LogisticRegression(solver="liblinear", multi_class="ovr", random_state=42))
])
# pipeline.fit(X_train, y_train)
pipeline.fit(X_df, y_df)

# pipeline = make_pmml_pipeline(
#     pipeline,
#     active_fields = ['text'],
# )
# pipeline.verify(X_train[0:10])
pipeline.verify(X_df[:10])

sklearn2pmml(pipeline, "LogRegPipeline.pmml", with_repr = True)

  self.stdout = io.open(c2pread, 'rb', bufsize)
  self.stderr = io.open(errread, 'rb', bufsize)


In [25]:
gridsearch_email_pipeline = Pipeline([
    ("email_json_to_words", EmailJsonToText()),
    ("word_vect", CountVectorizer(stop_words=stopwords.words('english'))),
    ('tfidf', TfidfTransformer()),
    ('logistic_reg', LogisticRegression(random_state=42)),
])

parameters = {
    'email_json_to_words__stemming': [True, False],
    'email_json_to_words__numberReplacement': (True, False),
    'word_vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'logistic_reg__solver': ['liblinear', 'lbfgs'],
}

gs_clf = model_selection.GridSearchCV(gridsearch_email_pipeline, parameters, cv=5, n_jobs=4)
gs_clf.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('email_json_to_words',
                                        EmailJsonToText()),
                                       ('word_vect',
                                        CountVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                           

In [26]:
gs_clf.cv_results_

{'mean_fit_time': array([1.25238079e+01, 1.46737866e+01, 1.36392612e+01, 1.34990186e+01,
        1.27570862e+01, 1.61789546e+01, 1.35043321e+01, 1.48130617e+01,
        3.56695910e+00, 4.76409750e+00, 4.02849107e+00, 5.54905529e+00,
        4.18416357e+00, 5.72632990e+00, 4.08213882e+00, 5.70046964e+00,
        1.16070148e+01, 2.54774218e+01, 1.89019210e+01, 1.45211731e+03,
        2.16980710e+03, 1.97213035e+03, 4.14298757e+03, 1.51739255e+04,
        3.88312813e+02, 3.71516781e+00, 2.73716388e+00, 3.71986012e+00,
        3.04679751e+00, 5.19847121e+00, 3.36701183e+00, 5.20884643e+00]),
 'std_fit_time': array([5.50209358e-01, 3.85515889e-01, 3.75966177e-01, 4.19094250e-01,
        1.18815987e+00, 6.66094943e-01, 4.19564581e-01, 6.57750470e-01,
        2.85355023e-01, 2.68869407e-01, 1.55597733e-01, 1.90529509e-01,
        1.78468120e-01, 3.31276981e-01, 1.80692661e-01, 1.59260243e-01,
        4.02278143e-01, 1.61805891e+01, 1.28907177e+01, 2.88000906e+03,
        2.99234199e+03, 6.806

In [27]:
gs_clf.best_params_

{'email_json_to_words__numberReplacement': False,
 'email_json_to_words__stemming': False,
 'logistic_reg__solver': 'liblinear',
 'tfidf__use_idf': True,
 'word_vect__ngram_range': (1, 1)}