In [28]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# for handling and parsing email files
import email
import os
import re
import nltk

from bs4 import BeautifulSoup
from collections import Counter

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# load ham and spam files
ham_fnames = [name for name in sorted(os.listdir("Resources/main_ham"))]
spam_fnames = [name for name in sorted(os.listdir("Resources/main_spam"))]

In [4]:
# Creates a function that takes files in fnames variables and returns a parsed email
def parse_email(fname, spam=False):
    directory = "Resources/main_spam" if spam else "Resources/main_ham"
    with open(os.path.join(directory, fname), "rb") as fp:
        return email.parser.BytesParser().parse(fp)
        
ham_emails = [parse_email(name) for name in ham_fnames]
spam_emails = [parse_email(name, spam=True) for name in spam_fnames]

In [13]:
#To handle reply threads in the data
multi_email = None

for mail in ham_emails:
    if mail.is_multipart():
        multi_email = mail
        break

# Payload will be list of email.message.Message
print(multi_email.get_payload())

# Nested get payload
print(multi_email.get_payload()[0].get_payload())
print(multi_email.items())

[<email.message.Message object at 0x103e07790>, <email.message.Message object at 0x103e07340>]
> From:  Valdis.Kletnieks@vt.edu
> Date:  Wed, 21 Aug 2002 02:36:56 -0400
>
> --==_Exmh_778588528P
> Content-Type: text/plain; charset=us-ascii
> 
> On Tue, 20 Aug 2002 22:51:52 EDT, Valdis.Kletnieks@vt.edu said:
> 
> > Ever tried to get MH to *not* have a 'pseq' sequence?  I suspect everybod
> y's
> > looking at a big box that has unseen and pseq in it.  Might want to add
> > 'pseq' to the 'hide by default' list....
> 
> Was it intended that if you added a sequence to the 'never show' list that
> it not take effect till you stopped and restarted exmh?  I added 'pseq',
> then hit 'save' for Preferences - didn't take effect till I restarted.

No it wasn't, and at one point it worked fine.  I'll check and see why it 
stopped working.

Chris
-- 
Chris Garrigues                 http://www.DeepEddy.Com/~cwg/
virCIO                          http://www.virCIO.Com
716 Congress, Suite 200
Austin, TX  

In [12]:
# General purpose function to convert an email to plain text
def email_to_text(email):
    text_content = ""
    for part in email.walk():
        part_content_type = part.get_content_type()
        if part_content_type not in ['text/plain', 'text/html']:
            continue
        if part_content_type == 'text/plain':
            text_content += part.get_payload()
        else:
            text_content += html_to_text(part)
    return text_content

print("Ham email in plain text:\n", email_to_text(ham_emails[3]))
print("Spam email in plain text:\n", email_to_text(spam_emails[3]))

Ham email in plain text:
 > From:  Valdis.Kletnieks@vt.edu
> Date:  Wed, 21 Aug 2002 02:36:56 -0400
>
> --==_Exmh_778588528P
> Content-Type: text/plain; charset=us-ascii
> 
> On Tue, 20 Aug 2002 22:51:52 EDT, Valdis.Kletnieks@vt.edu said:
> 
> > Ever tried to get MH to *not* have a 'pseq' sequence?  I suspect everybod
> y's
> > looking at a big box that has unseen and pseq in it.  Might want to add
> > 'pseq' to the 'hide by default' list....
> 
> Was it intended that if you added a sequence to the 'never show' list that
> it not take effect till you stopped and restarted exmh?  I added 'pseq',
> then hit 'save' for Preferences - didn't take effect till I restarted.

No it wasn't, and at one point it worked fine.  I'll check and see why it 
stopped working.

Chris
-- 
Chris Garrigues                 http://www.DeepEddy.Com/~cwg/
virCIO                          http://www.virCIO.Com
716 Congress, Suite 200
Austin, TX  78701		+1 512 374 0500

  World War III:  The Wrong-Doers Vs. the Evi

In [19]:
# transforms the email to count the word usage in the message
class EmailToWordsCount(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, to_lowercase=True, remove_punc=True, do_stem=True):
        self.strip_headers = strip_headers
        self.to_lowercase = to_lowercase
        self.remove_punc = remove_punc
        self.do_stem = do_stem
        
        # To perform stemming
        self.stemmer = nltk.PorterStemmer()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_word_counts = []
        for email in X:
            # text of the email
            plain = email_to_text(email)
            if plain is None:
                plain = "nothing"
            
            if self.to_lowercase:
                plain = plain.lower()
            
            if self.remove_punc:
                plain = plain.replace(".", "")
                plain = plain.replace(",", "")
                plain = plain.replace("!", "")
                plain = plain.replace("?", "")
                plain = plain.replace(";", "")
                
            word_counts = Counter(plain.split())
            if self.do_stem:
                # Stem the word, and add their counts
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    root_word = self.stemmer.stem(word)
                    stemmed_word_counts[root_word] += count
                word_counts = stemmed_word_counts
            
            X_word_counts.append(word_counts)
        return np.array(X_word_counts)

In [20]:
# test 
X_few = spam_emails[1:3]
ewc = EmailToWordsCount()
X_few_counts = ewc.fit_transform(X_few)
print(X_few_counts)

[Counter({'you': 21, 'the': 21, 'and': 14, 'to': 14, 'thi': 12, 'is': 11, 'that': 10, 'of': 10, 'mlm': 8, 'receiv': 7, 'for': 7, 'inform': 6, 'most': 6, 'it': 6, 'be': 6, 'will': 6, 'email': 6, 'in': 5, 'a': 5, 'ha': 5, 'your': 5, 'letter': 4, 'then': 4, 'no': 4, 'peopl': 4, 'work': 4, 'use': 4, 'our': 4, 'list': 4, 'are': 3, 'have': 3, 'about': 3, 'if': 3, "you'v": 3, 'market': 3, 'one': 3, 'their': 3, 'tell': 3, 'there': 3, 'through': 3, 'send': 3, 'i': 3, 'up': 3, 'free': 3, 'or': 3, 'becaus': 2, 'an': 2, 'onlin': 2, 'pleas': 2, 'so': 2, 'been': 2, 'multi-level': 2, 'read': 2, 'ever': 2, 'inbox': 2, 'promis': 2, 'cannot': 2, 'who': 2, 'earn': 2, 'big': 2, 'not': 2, 'someon': 2, "haven't": 2, 'dream': 2, 'with': 2, 'whi': 2, 'sent': 2, 'unsolicit': 2, 'sign': 2, 'system': 2, 'which': 2, 'agre': 2, 'also': 2, 'such': 2, 'greet': 1, 'express': 1, 'interest': 1, 'busi': 1, 'opportun': 1, 'erron': 1, 'accept': 1, 'my': 1, 'sincer': 1, 'apolog': 1, 'one-tim': 1, 'mail': 1, 'remov': 1, 'ne

In [21]:
#Build a Numpy matrix with the vocabulary of words to consider and their usage counts
class WordCountVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    # train on list of word counts and build vocabulary
    def fit(self, X, y=None):
        total_word_counts = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_word_counts[word] += count
                
        # Build a vocabulary out of total most common
        self.most_common = total_word_counts.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: i for i, (word, count) in enumerate(self.most_common)}
    
        return self
    # Create the vector out of vocabulary
    def transform(self, X, y=None):
        X_new = np.zeros([X.shape[0], self.vocabulary_size + 1], dtype=int)
        
        # The vectors will contain additional column for counts of words
        # not captured in vocabulary
        for row, word_counts in enumerate(X):
            for word, count in word_counts.items():
                col = self.vocabulary_.get(word, self.vocabulary_size)
                X_new[row, col] += count
                
        return X_new

In [22]:
vectorizer = WordCountVectorizer(vocabulary_size=10)
X_few_vector = vectorizer.fit_transform(X_few_counts)

print("Vector of word counts:\n", X_few_vector)
print("Vocabulary generated:\n", vectorizer.vocabulary_)

Vector of word counts:
 [[ 21  21  14  14  12  11  10  10   5   7 368]
 [  7   5   6   5   2   3   3   0   4   2 135]]
Vocabulary generated:
 {'you': 0, 'the': 1, 'to': 2, 'and': 3, 'thi': 4, 'is': 5, 'of': 6, 'that': 7, 'in': 8, 'for': 9}


Note: The fianl column is the number of words not included in the vocabulary

In [29]:
email_to_cvector = Pipeline([
    ("emailToWords", EmailToWordsCount()), 
    ("wordCountVectorizer", WordCountVectorizer())
])

In [24]:
X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

  X = np.array(ham_emails + spam_emails)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=3301)

print("Training set size: ", X_train.shape, y_train.shape)
print("Testing set size: ", X_test.shape, y_test.shape)

Training set size:  (7479,) (7479,)
Testing set size:  (1870,) (1870,)


In [30]:
# prepare the training and testing set 
X_train_prepared = email_to_cvector.fit_transform(X_train)
X_train_prepared

array([[  7,   6,   0, ...,   0,   0,  57],
       [  9,   7,   0, ...,   0,   0,  42],
       [  1,   5,   4, ...,   0,   0,  60],
       ...,
       [ 14,   9,   0, ...,   0,   0, 134],
       [ 33,  20,   0, ...,   1,   0, 125],
       [  0,   3,   0, ...,   0,   0,  17]])

In [31]:
X_test_prepared = email_to_cvector.transform(X_test)
X_test_prepared

array([[  2,   0,   0, ...,   0,   0,  37],
       [ 23,  21,   0, ...,   0,   0, 168],
       [  2,   3,   0, ...,   0,   0,  19],
       ...,
       [ 16,   7,   0, ...,   0,   0,  63],
       [ 31,  41,  33, ...,   0,   0, 164],
       [  0,   4,   0, ...,   0,   0,  22]])