In [1]:
import numpy as np
import pandas as pd
import os
import sys

In [2]:
HAM='ham'
SPAM= 'spam'
NEWLINE='\n'

sources = [('enron3/ham/', HAM),('enron3/spam/', SPAM)]
SKIP_FILES ={'cmds'}


In [3]:
def read_files(path):
    '''
    Read all files from path which is not in SKIP_FILES.
    Returns only the message body
    '''
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding='latin-1')
                    for line in f:
                        lines.append(line)
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

In [4]:
def build_data_frame(l, path, classification):
    rows = []
    index = []
    
    for i, (file_name, text) in enumerate(read_files(path)):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)
        
    data_frame = pd.DataFrame(rows, index=index)
    return data_frame, len(rows)

def load_data():
    data = pd.DataFrame({'text': [], 'class': []})
    l = 0
    
    for path, classification in sources:
        data_frame, nrows = build_data_frame(l, path, classification)
        data = data.append(data_frame)
        l += nrows
    data = data.reindex(np.random.permutation(data.index))
    
    return data

In [5]:
data=load_data()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [6]:
data.shape

(5512, 2)

In [7]:
data.head()

Unnamed: 0,class,text
enron3/spam/2214.2005-01-03.BG.spam.txt,spam,Subject: online pharmac $ y with easy to use s...
enron3/spam/5357.2005-07-16.BG.spam.txt,spam,Subject: f / r / e / e cable tv\n\nf / r / e /...
enron3/ham/3212.2001-10-19.kitchen.ham.txt,ham,Subject: fw : middle market group\n\nfyi\n\nch...
enron3/ham/2729.2001-09-26.kitchen.ham.txt,ham,Subject: re : fyi on espeak\n\ni have read the...
enron3/ham/5351.2002-01-30.kitchen.ham.txt,ham,Subject: revised - operational restart sub - c...


In [23]:
X_train[0]

'Subject: re : drift\n\nlouise - current status on drift expense allocations . . . . we have reviewed the $ 11 . 8 million in question and that amount is correct given the methodology being used by gary to allocate costs . when gary is back in the office on thursday , i will go through his allocations with him to determine if we need to change that methodology and thus change the allocation percentages . i will let you know the outcome of that meeting .\n\nalso , keep in mind that the $ 11 . 8 million represents a plan amount . actuals should be much less since we are currently well under plan headcount amounts and not all plan expenses have been made . let me know if you have any questions - i will keep you updated .\n\nregards ,\n\nbrent\n\n- - - - - original message - - - - -\n\nfrom : kitchen , louise\n\nsent : wednesday , march 21 , 2001 6 : 08 pm\n\nto : price , brent a . ; killen , faith\n\ncc : mcconnell , mike\n\nsubject : drift\n\nbrent ,\n\nmike and i have discussed the drif

In [39]:
tf_mod1 = TfidfVectorizer().fit(X_train)

allocation_matrix = tf_mod1.transform(['there was an allocation issue somewhere along the line'])

In [41]:
allocation_matrix.shape

(1, 48309)

In [40]:
print(allocation_matrix[:,:])

  (0, 46635)	0.21436795952281673
  (0, 43359)	0.22619037705149772
  (0, 43301)	0.10476406858355838
  (0, 40739)	0.5165627348819741
  (0, 27076)	0.33033331689133716
  (0, 24814)	0.3027199009612732
  (0, 5031)	0.19385141438744408
  (0, 4736)	0.38189319036333
  (0, 4686)	0.4908069923422742


In [9]:
data.describe()

Unnamed: 0,class,text
count,5512,5512
unique,2,5274
top,ham,Subject: enron mentions\n\nusa : wrapup 1 - cr...
freq,4012,3


In [20]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1,2))),
    ('nb', MultinomialNB())
])

pipeline1 = Pipeline([
    ('tfid', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

pipeline2 = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1,2))),
    ('cnb', ComplementNB())
])

pipeline3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
    
])

pipeline4 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('cnb', ComplementNB())
    
])

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'],test_size=.20)

In [13]:

pipeline.fit(X_train, y_train)

pipeline1.fit(X_train, y_train)

pipeline2.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('cnb', ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False))])

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, pipeline.predict(X_test)))



              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       823
        spam       1.00      0.91      0.95       280

   micro avg       0.98      0.98      0.98      1103
   macro avg       0.98      0.96      0.97      1103
weighted avg       0.98      0.98      0.98      1103



In [15]:
print(classification_report(y_test, pipeline1.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       823
        spam       0.99      0.95      0.97       280

   micro avg       0.98      0.98      0.98      1103
   macro avg       0.99      0.97      0.98      1103
weighted avg       0.98      0.98      0.98      1103



In [16]:
print(classification_report(y_test, pipeline2.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       823
        spam       1.00      0.91      0.95       280

   micro avg       0.98      0.98      0.98      1103
   macro avg       0.98      0.96      0.97      1103
weighted avg       0.98      0.98      0.98      1103



In [18]:
pipeline3.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
print(classification_report(y_test, pipeline3.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.85      1.00      0.92       823
        spam       1.00      0.47      0.64       280

   micro avg       0.87      0.87      0.87      1103
   macro avg       0.92      0.74      0.78      1103
weighted avg       0.89      0.87      0.85      1103



In [21]:
pipeline4.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...  vocabulary=None)), ('cnb', ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False))])

In [22]:
print(classification_report(y_test, pipeline4.predict(X_test)))

              precision    recall  f1-score   support

         ham       0.89      1.00      0.94       823
        spam       1.00      0.62      0.77       280

   micro avg       0.90      0.90      0.90      1103
   macro avg       0.94      0.81      0.85      1103
weighted avg       0.91      0.90      0.90      1103

