In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
cd gdrive/MyDrive/MachineLearning/HandsOnMachineLearning/spam-classifier

/content/gdrive/MyDrive/MachineLearning/HandsOnMachineLearning/spam-classifier


In [4]:
import os


HAM_DIR = "easy_ham"
SPAM_DIR = "spam"
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

print(len(ham_filenames))
len((spam_filenames))

2500


500

In [5]:
import email
import email.policy
import numpy as np

def load_email(is_spam, filename):
    directory = "spam" if is_spam else "easy_ham"

    with open(os.path.join(directory, filename), "rb") as f:
        email_obj = email.parser.BytesParser(policy=email.policy.default).parse(f)

        if email_obj.is_multipart():
            content = ''
            msgs = email_obj.get_payload()
            for msg in msgs:
                content_type = msg.get_content_type()
                if content_type == 'text/plain' or content_type == 'text/html':
                    content += msg.get_content()
        else:
            content = email_obj.get_payload()

        content = content.strip()
        return content

ham_emails = np.array([load_email(is_spam=False, filename=name) for name in ham_filenames], dtype=object)
spam_emails = np.array([load_email(is_spam=True, filename=name) for name in spam_filenames], dtype=object)


In [10]:
import pandas as pd

ham_labels = np.zeros(len(ham_emails))
spam_labels = np.ones(len(spam_emails))

ham_set = np.c_[ham_emails, ham_labels]
spam_set = np.c_[spam_emails, spam_labels]
email_set = np.r_[ham_set, spam_set]

email_df = pd.DataFrame(email_set, columns=['text', 'spam'])
print(email_df)

                                                   text spam
0     Date:        Wed, 21 Aug 2002 10:54:46 -0500\n...  0.0
1     Martin A posted:\nTassos Papadopoulos, the Gre...  0.0
2     Man Threatens Explosion In Moscow \n\nThursday...  0.0
3     Klez: The Virus That Won't Die\n \nAlready the...  0.0
4     >  in adding cream to spaghetti carbonara, whi...  0.0
...                                                 ...  ...
2995  <html>\n<head>\n<title>Toy</title>\n</head>\n<...  1.0
2996  <html>\n<head>\n<title>Untitled Document</titl...  1.0
2997  <TABLE border=0 cellPadding=0 cellSpacing=10 h...  1.0
2998  <html>\n<head>\n</head>\n  <body background="h...  1.0
2999  <STYLE type="text/css">\n<!--\nP{\n  font-size...  1.0

[3000 rows x 2 columns]


In [34]:
from sklearn.model_selection import train_test_split


X = email_df['text']
y = email_df['spam'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.2,
                                                    random_state=42)

In [83]:
import re
import string
import nltk
from sklearn.base import BaseEstimator, TransformerMixin
#nltk.download('popular')


test = """ On Mon, 2 Sep 2002, Reza B'Far (eBuilt) wrote:

> With the increasing prevalence of web services (not that they are always a
> good thing), I doubt that parsing XML will be something that will remain at
> the Java application layer for long... Recent threads here on Fork
> indicating the move towards hardware parsing or this code even become part
> of the native implementation of Java on various platforms...

OK, so, you get the XML toss it through hardware and turn it back into a
struct/object whatever youhttps://colab.research.google.com/drive/10GLJWvPO04H933eqIlk2PluKQb6AU902#scrollTo=N2kTJHE18 call your binary data. I agree this is the way
things will go, as XML parsing has just too much overhead to survive in the
application layer. 111111111 So 123 wh1y turn it into XML in the first place? Becasue it
gives geeks something to do and sells XML hardware accelerators and way more
CPUs?

Is there anyone out there actually doing anything new that actually IMPROVES
things anymore, or are they all too scared of the fact that improvements put
people out of work and cut #1 is the creators... """


sno = nltk.stem.SnowballStemmer('english')


def transform_text(text, lowercase=False, remove_header=False,
                   remove_punctuation=False, replace_url=False,
                   replace_numbers=False, stemming=False):
    transformed_text = text

    if lowercase:
        transformed_text = transformed_text.lower()

    if remove_header:
        transformed_text = re.sub(r'^.*?^\s*$', '', transformed_text, count=1, flags=re.MULTILINE | re.DOTALL)

    if remove_punctuation:
        translator = str.maketrans('', '', string.punctuation)
        transformed_text = transformed_text.translate(translator)

    if replace_url:
        transformed_text = re.sub(r"(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
                                  'URL', transformed_text, count=0, flags=re.MULTILINE)
    if replace_numbers:
        transformed_text = re.sub(r'\d+', 'NUMBER', transformed_text, count=0)

    if stemming:
        split_text = nltk.word_tokenize(transformed_text)
        transformed_text = ' '.join([sno.stem(word) for word in split_text])

    return transformed_text


print(transform_text(test, remove_punctuation=True))


class ModifyEmails(BaseEstimator, TransformerMixin):
    def __init__(self, lowercase=False, remove_header=False,
            remove_punctuation=False, replace_url=False,
            replace_numbers=False, stemming=False):
        self.lowercase = lowercase
        self.remove_header = remove_header
        self.remove_punctuation = remove_punctuation
        self.replace_url = replace_url
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if any([self.lowercase, self.remove_header, self.remove_punctuation,
                self.replace_url, self.replace_numbers, self.stemming]):
            for text in X:
                text = transform_text(text, self.lowercase, self.remove_header,
                                      self.remove_punctuation, self.replace_url,
                                      self.replace_numbers, self.stemming)
        return X


 On Mon 2 Sep 2002 Reza BFar eBuilt wrote

 With the increasing prevalence of web services not that they are always a
 good thing I doubt that parsing XML will be something that will remain at
 the Java application layer for long Recent threads here on Fork
 indicating the move towards hardware parsing or this code even become part
 of the native implementation of Java on various platforms

OK so you get the XML toss it through hardware and turn it back into a
structobject whatever youhttpscolabresearchgooglecomdrive10GLJWvPO04H933eqIlk2PluKQb6AU902scrollToN2kTJHE18 call your binary data I agree this is the way
things will go as XML parsing has just too much overhead to survive in the
application layer 111111111 So 123 wh1y turn it into XML in the first place Becasue it
gives geeks something to do and sells XML hardware accelerators and way more
CPUs

Is there anyone out there actually doing anything new that actually IMPROVES
things anymore or are they all too scared of the fact that 

In [85]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer


transform_pipe = Pipeline([
    ('modify_text', ModifyEmails()),
    ('encode', CountVectorizer())
])
X_train_prepared = transform_pipe.fit_transform(X_train)
print(X_train_prepared.dtype)
print(y_train.dtype)

int64
int64


In [42]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


svc = SVC(random_state=42)
svc_scores = cross_val_score(svc, X_train_prepared, y_train,
                             scoring='roc_auc', cv=3)
print(svc_scores.mean())

0.9831374675519767


In [45]:
from sklearn.neighbors import KNeighborsClassifier


knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, X_train_prepared, y_train,
                             scoring='roc_auc', cv=3)
print(knn_scores.mean())

0.9121001890646133


In [43]:
from sklearn.ensemble import RandomForestClassifier


forest_clf = RandomForestClassifier(random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_prepared, y_train,
                                scoring='roc_auc', cv=3)
print(forest_scores.mean())

0.99486744055951


In [93]:
from sklearn.model_selection import GridSearchCV


full_pipe = Pipeline([
    ('transform', transform_pipe),
    ('predict', RandomForestClassifier())
])

param_grid = [{
    'predict__n_estimators': [110],
    'predict__criterion': ['entropy'],
    'predict__random_state': [42],
    'transform__modify_text__lowercase': [True],
    'transform__modify_text__remove_header': [True],
    'transform__modify_text__remove_punctuation': [True],
    'transform__modify_text__replace_url': [True],
    'transform__modify_text__replace_numbers': [True],
    'transform__modify_text__stemming': [True],
}]

grid_search = GridSearchCV(full_pipe, param_grid, scoring='roc_auc', cv=3,
                           verbose=2, return_train_score=True, n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
{'predict__criterion': 'entropy', 'predict__n_estimators': 110, 'predict__random_state': 42, 'transform__modify_text__lowercase': True, 'transform__modify_text__remove_header': True, 'transform__modify_text__remove_punctuation': True, 'transform__modify_text__replace_numbers': True, 'transform__modify_text__replace_url': True, 'transform__modify_text__stemming': True}
0.9950087514943639


In [95]:
from sklearn.metrics import roc_auc_score


y_pred = grid_search.best_estimator_.predict(X_test)
print(roc_auc_score(y_test, y_pred))

0.933
