In [1]:
# Import libraries and tools
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# for handling and parsing email files
import email
import os
import re
import nltk

from bs4 import BeautifulSoup
from collections import Counter

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load files by creating lists of file names in resources files
ham_fnames = [name for name in sorted(os.listdir("Resources/main_ham"))]
spam_fnames = [name for name in sorted(os.listdir("Resources/main_spam"))]

#Return file names
print(ham_fnames[0])
print(spam_fnames[0])

00001.1a31cc283af0060967a233d26548a6ce
0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1


In [3]:
# Create a function to create lists containing email objects
def parse_email(fname, spam=False):
    directory = "Resources/main_spam" if spam else "Resources/main_ham"
    with open(os.path.join(directory, fname), "rb") as fp:
        return email.parser.BytesParser().parse(fp)
        
# Call the function on both sets of data  
ham_emails = [parse_email(name) for name in ham_fnames]
spam_emails = [parse_email(name, spam=True) for name in spam_fnames]

# Return object
ham_emails[0]

<email.message.Message at 0x21fa0c09540>

In [4]:
# Create a helper function that converts html emails to plain text
def html_to_text(email) -> str:
    try:
        soup = BeautifulSoup(email.get_payload(), "html.parser")
        plain = soup.text.replace("=\n", "")
        plain = re.sub(r"\s+", " ", plain)
        return plain.strip()
    except:
        return "nothing"

In [5]:
# General purpose function convert an email to plain text
def email_to_text(email):
    text_content = ""
    # Accounts for multi-part emails (reply threads ), checks for text and returns it as plain text 
    for part in email.walk():
        #Finds content type
        part_content_type = part.get_content_type()
        #Ignores non-text sections
        if part_content_type not in ['text/plain', 'text/html']:
            continue
        #If the section is plain text, strip it out of the email object
        if part_content_type == 'text/plain':
            text_content += part.get_payload()
        #If the section is html, run the html_to_text function
        else:
            text_content += html_to_text(part)
    return text_content

#Return email text
print(email_to_text(ham_emails[3]))
print(email_to_text(spam_emails[3]))

> From:  Valdis.Kletnieks@vt.edu
> Date:  Wed, 21 Aug 2002 02:36:56 -0400
>
> --==_Exmh_778588528P
> Content-Type: text/plain; charset=us-ascii
> 
> On Tue, 20 Aug 2002 22:51:52 EDT, Valdis.Kletnieks@vt.edu said:
> 
> > Ever tried to get MH to *not* have a 'pseq' sequence?  I suspect everybod
> y's
> > looking at a big box that has unseen and pseq in it.  Might want to add
> > 'pseq' to the 'hide by default' list....
> 
> Was it intended that if you added a sequence to the 'never show' list that
> it not take effect till you stopped and restarted exmh?  I added 'pseq',
> then hit 'save' for Preferences - didn't take effect till I restarted.

No it wasn't, and at one point it worked fine.  I'll check and see why it 
stopped working.

Chris
-- 
Chris Garrigues                 http://www.DeepEddy.Com/~cwg/
virCIO                          http://www.virCIO.Com
716 Congress, Suite 200
Austin, TX  78701		+1 512 374 0500

  World War III:  The Wrong-Doers Vs. the Evil-Doers.




The Need For 

In [6]:
# Transform the email to count the word usage in the message
class EmailToWordsCount(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, to_lowercase=True, remove_punc=True, do_stem=True):
        self.strip_headers = strip_headers
        self.to_lowercase = to_lowercase
        self.remove_punc = remove_punc
        self.do_stem = do_stem
        
        # To perform stemming
        self.stemmer = nltk.PorterStemmer()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_word_counts = []
        for email in X:
            # Text of the email
            plain = email_to_text(email)
            if plain is None:
                plain = "nothing"
            # Make all letters to lowercase
            if self.to_lowercase:
                plain = plain.lower()
            # Remove all punctuation
            if self.remove_punc:
                plain = plain.replace(".", "")
                plain = plain.replace(",", "")
                plain = plain.replace("!", "")
                plain = plain.replace("?", "")
                plain = plain.replace(";", "")
            # Reduce words to their stems   
            word_counts = Counter(plain.split())
            if self.do_stem:
                # Stem the word, and add their counts
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    root_word = self.stemmer.stem(word)
                    stemmed_word_counts[root_word] += count
                word_counts = stemmed_word_counts
            # Count the unique words in a message
            X_word_counts.append(word_counts)
        return np.array(X_word_counts)

In [7]:
#Example of stemming (also from Kaggle Notebook "Email Spam Classification [98%]" cited in ReadMe)
text = "Hello, today I am going to London for performing and dancing"
stemmer = nltk.PorterStemmer()

for word in text.split():
    stemmed_word = stemmer.stem(word)
    print(stemmed_word, end=" ")

hello, today i am go to london for perform and danc 

In [8]:
#Build a Numpy matrix with the vocabulary of words to consider and their usage counts
class WordCountVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    # Train on list of word counts and build vocabulary
    def fit(self, X, y=None):
        total_word_counts = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_word_counts[word] += count
                
        # Build a vocabulary out of total most common
        self.most_common = total_word_counts.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: i for i, (word, count) in enumerate(self.most_common)}
    
        return self
    # Create the vector out of vocabulary
    def transform(self, X, y=None):
        X_new = np.zeros([X.shape[0], self.vocabulary_size + 1], dtype=int)
        
        # The vectors will contain additional column for counts of words
        # not captured in vocabulary
        for row, word_counts in enumerate(X):
            for word, count in word_counts.items():
                col = self.vocabulary_.get(word, self.vocabulary_size)
                X_new[row, col] += count
                
        return X_new

In [9]:
#Call both processing functions
email_to_cvector = Pipeline([
    ("emailToWords", EmailToWordsCount()), 
    ("wordCountVectorizer", WordCountVectorizer())
])

In [10]:
#Create variables containing the data set and wether each message is spam or ham
X = np.array(ham_emails + spam_emails, dtype='object')
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

#Display variables
print(X)
print(y)

[<email.message.Message object at 0x0000021FA0C09540>
 <email.message.Message object at 0x0000021FA8C04820>
 <email.message.Message object at 0x0000021FA8C04850> ...
 <email.message.Message object at 0x0000021FAEA88EB0>
 <email.message.Message object at 0x0000021FAEA88EE0>
 <email.message.Message object at 0x0000021FAEA88F10>]
[0 0 0 ... 1 1 1]


In [11]:
# Checking data type
print(type(X))

<class 'numpy.ndarray'>


In [12]:
# Create training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=3301)
# Display sizes of sets
print("Training set size: ", X_train.shape, y_train.shape)
print("Testing set size: ", X_test.shape, y_test.shape)

Training set size:  (7479,) (7479,)
Testing set size:  (1870,) (1870,)


In [13]:
# Prepare the training set
X_train_prepared = email_to_cvector.fit_transform(X_train)
# Display prepared array
X_train_prepared

array([[  7,   6,   0, ...,   0,   0,  57],
       [  9,   7,   0, ...,   0,   0,  42],
       [  1,   5,   4, ...,   0,   0,  60],
       ...,
       [ 14,   9,   0, ...,   0,   0, 134],
       [ 33,  20,   0, ...,   1,   0, 125],
       [  0,   3,   0, ...,   0,   0,  17]])

In [14]:
# Prepare the testing set
X_test_prepared = email_to_cvector.transform(X_test)
# Display prepared array
X_test_prepared

array([[  2,   0,   0, ...,   0,   0,  37],
       [ 23,  21,   0, ...,   0,   0, 168],
       [  2,   3,   0, ...,   0,   0,  19],
       ...,
       [ 16,   7,   0, ...,   0,   0,  63],
       [ 31,  41,  33, ...,   0,   0, 164],
       [  0,   4,   0, ...,   0,   0,  22]])

In [15]:
# Create a function that 
def classification_models(model, X_train_prepared, X_test_prepared, y_train, y_test):
    # Trains the model on the training data
    model.fit(X_train_prepared, y_train)
    # Makes predictions on the testing data
    model_prediction = model.predict(X_test_prepared)
    # and prints both the classification report and confusion matrix
    print("Classification Report: ", classification_report(y_test, model_prediction))
    matrix = confusion_matrix(y_test, model_prediction)
    print("Confusion Matrix:", matrix)


In [16]:
# List of classification models to evaluate
models = [
    RandomForestClassifier(random_state=3301),
    LogisticRegression(solver="lbfgs", random_state=3301),
    DecisionTreeClassifier(random_state=3301)
]

# Create loop that goes through each model and evals its performance
for model in models:
    classification_models(model, X_train_prepared, X_test_prepared, y_train, y_test )

Classification Report:                precision    recall  f1-score   support

           0       0.99      0.99      0.99      1386
           1       0.98      0.96      0.97       484

    accuracy                           0.99      1870
   macro avg       0.98      0.98      0.98      1870
weighted avg       0.99      0.99      0.99      1870

Confusion Matrix: [[1378    8]
 [  19  465]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report:                precision    recall  f1-score   support

           0       0.99      0.99      0.99      1386
           1       0.96      0.96      0.96       484

    accuracy                           0.98      1870
   macro avg       0.98      0.97      0.97      1870
weighted avg       0.98      0.98      0.98      1870

Confusion Matrix: [[1369   17]
 [  19  465]]
Classification Report:                precision    recall  f1-score   support

           0       0.97      0.98      0.98      1386
           1       0.95      0.93      0.94       484

    accuracy                           0.97      1870
   macro avg       0.96      0.95      0.96      1870
weighted avg       0.97      0.97      0.97      1870

Confusion Matrix: [[1360   26]
 [  35  449]]


##### Note about Confusion Matrix

[[True Negative, False Positive][False Negative, True positive]]

True Positives (TP): These are the emails that the model correctly identified as spam. This is good because it means the spam filter is doing its job correctly.

True Negatives (TN): These are the emails that the model correctly identified as ham (non-spam). This is also good as it indicates that non-spam emails are being correctly identified.

False Positives (FP): These are the emails that are actually ham, but the model incorrectly classified them as spam. False positives can be annoying because they may lead to important emails being flagged as spam.

False Negatives (FN): These are the emails that are actually spam, but the model incorrectly classified them as ham. False negatives are a more serious issue because they allow spam to reach the inbox, defeating the purpose of the spam filter.

##### Explaining model outcomes:

1. Random Forest: 99% Accuracy, TP- 465, TN- 1378, FP- 8, FN- 19.

2. Logistic Regression: 98% Accuracy, TP- 465, TN- 1369, FP- 17, FN- 19.

3. Decision Tree: 97% Accuracy, TP- 449, TN- 1360, FP- 26, FN- 35.

The Random Forest Model performed the best and is the model we are choosing to optimize. 

In [17]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [18]:
# Create a parameter grid to sample from during fitting
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [19]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_prepared, y_train)


In [None]:
#  View the best params
rf_random.best_params_