In [134]:
# helpful pre-reads:
# https://www.kaggle.com/veleon/spam-classification
# https://stackabuse.com/text-classification-with-python-and-scikit-learn
# https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

# Steps taken below:
# Convert email into feature vector
# Create Test & Training Set
# Add Hyperparameters to:
# - Strip email headers
# - Convert to lowercase
# - Remove punctuation
# - Replace urls with "URL"
# - Replace numbers with "NUMBER"
# - Perform Stemming (trim word endings with nltk library)

import os
import re
import email
import email.policy
from typing import Any

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import urlextract

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.utils import class_weight
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

os.listdir('data')

['spam',
 'hard_ham',
 'spam_2',
 '.DS_Store',
 'enron1',
 'easy_ham',
 'spam_assassin_ham',
 'gmail_spam_examples',
 'glove.6B']

In [114]:
ham_filenames = [name for name in sorted(os.listdir('data/spam_assassin_ham')) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir('data/gmail_spam_examples/converted')) if len(name) > 3]

print('Amount of ham files:', len(ham_filenames))
print('Amount of spam files:', len(spam_filenames))    
print('Spam to Ham Ratio:',len(spam_filenames) / len(ham_filenames))

Amount of ham files: 2750
Amount of spam files: 668
Spam to Ham Ratio: 0.2429090909090909


In [115]:
def load_email(filename):
    directory = "data/spam_assassin_ham"
    with open(os.path.join(directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
    
def load_file(filename):
    directory = "data/gmail_spam_examples/converted"
    # http://python-notes.curiousefficiency.org/en/latest/python3/text_file_processing.html
    # latin-1 encoding because some of the spam files have retarded characters
    with open(os.path.join(directory, filename)) as f:
#         lines = f.readlines()
        return f.read()
        # like this one
#         if filename.startswith("0754"):
#             print(lines)
#         return {
#             'subject': lines[0],
#             'body': lines[1:],
#         }
    
ham_emails = [load_email(filename=name) for name in ham_filenames]
spam_emails = [load_file(filename=name) for name in spam_filenames]
    
    
testEmail = spam_emails[0]

# print('Header Field Names:', testEmail.keys())
# print('\n\n')
# print('Message Field Values:', testEmail.values())
# print('\n\n')
print('Object Type:', type(testEmail))
print(testEmail)

# if testEmail.is_multipart():
#     for payload in testEmail.get_payload():
#         # if payload.is_multipart(): ...
#         print('payload:', payload.get_payload())
# else:
#     print(testEmail.get_payload())
# print('Message Content:', testEmail.get_content())

Object Type: <class 'str'>
Subject: Get a cash offer for your home now



In [116]:
def html_to_plain(email):
    try:
        soup = BeautifulSoup(email, 'html.parser')
#         print(type(soup.text))
        return " ".join(soup.text.split())
#         return soup.text.replace('\n', '').replace('\t', '')
    except:
        return "something went wrong parsing html text"

In [117]:
html_to_plain(testEmail)

'Subject: Get a cash offer for your home now'

In [118]:
def get_email_structure(email):    
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()
    
def email_to_plain(email):
    struct = get_email_structure(email)
    for part in email.walk():
        partContentType = part.get_content_type()
        if partContentType not in ['text/plain','text/html']:
            continue
        try:
            partContent = part.get_content()
        except: # in case of encoding issues
            partContent = str(part.get_payload())
        if partContentType == 'text/plain':
            return partContent
        else:
            return html_to_plain(part)

print(email_to_plain(ham_emails[42]))
ham_emails[42]['subject']

On Thu, Aug 22, 2002 at 05:13:01PM +0100, Fergal Moran mentioned:
> In a nutshell - Solaris is Suns own flavour of UNIX.

 Though I'm sure that this nice person would like a bit more detail.

 Solaris is quite different to Linux, though these days you can make
solaris act a lot like linux with an extra CD of GNU tools Sun ship with
solaris. It is based on the SysV unix family, so it's quite similar to
other unixen like HPUX and SCO.

 Sun's hardware in general is more reliable, and a lot more expensive. One
of the main bonuses you get by buying Sun is that you are getting your
hardware and software from one company, so if you have a support contract,
they have to fix it. They can't fob you off with 'that's a software
problem, talk to the software vendor.' etc.

 If you are set on Linux, you most likely can do your own support. There
is then a world of different hardware options. You can run Linux on Sparc,
though some companies like RedHat don't maintain a sparc port anymore.

 You can

'Re: [ILUG] Sun Solaris..'

In [119]:
ham_emails = [(ham['subject'] or '') + '\n' + email_to_plain(ham) for ham in ham_emails]
spam_emails = [html_to_plain(spam) for spam in spam_emails]

In [120]:
snowball_stemmer = SnowballStemmer('english')

class EmailTextToWords(BaseEstimator, TransformerMixin):
    def __init__(self, lowercaseConversion = True, punctuationRemoval = True, 
                 urlReplacement = True, numberReplacement = False, stemming = False):
        self.lowercaseConversion = lowercaseConversion
        self.punctuationRemoval = punctuationRemoval
        self.urlReplacement = urlReplacement
        self.urlExtractor = urlextract.URLExtract()
        self.numberReplacement = numberReplacement
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_to_words = []
        for text in X:
            if text is None:
                text = 'empty'
            if self.lowercaseConversion:
                text = text.lower()
                
            if self.urlReplacement:
                urls = self.urlExtractor.find_urls(text)
                for url in urls:
                   text = text.replace(url, 'URL')   
            
            # apparently removing numbers helped
            if self.numberReplacement:
                text = re.sub('\d', '%d', text)
                    
            if self.punctuationRemoval:
                text = text.replace('.','')
                text = text.replace(',','')
                text = text.replace('!','')
                text = text.replace('?','')
                
            if self.stemming:
                words = text.split(' ')
                
                stemmed_words = []
                for word in words:
                    stemmed_words.append(snowball_stemmer.stem(word))
                
                text = ' '.join(stemmed_words)
            
            X_to_words.append(text)
        return np.array(X_to_words)

In [121]:
X_few = ham_emails[:3]
X_few_text = EmailTextToWords().fit_transform(X_few)
vocab_transformer = CountVectorizer()
X_few_vectors = vocab_transformer.fit_transform(X_few_text)
X_few_vectors

<3x582 sparse matrix of type '<class 'numpy.int64'>'
	with 651 stored elements in Compressed Sparse Row format>

In [122]:
vocab_transformer.vocabulary_

{'re': 421,
 'new': 364,
 'sequences': 463,
 'window': 568,
 'date': 143,
 'wed': 559,
 '21': 21,
 'aug': 73,
 '2002': 20,
 '10': 6,
 '54': 32,
 '46': 27,
 '0500': 5,
 'from': 216,
 'chris': 112,
 'garrigues': 224,
 'cwg': 140,
 'dated': 144,
 '103037728706fa6d': 8,
 'deepeddycom': 151,
 'message': 341,
 'id': 268,
 '10299452874797tmda': 7,
 'deepeddyvirciocom': 152,
 'can': 99,
 'reproduce': 435,
 'this': 517,
 'error': 183,
 'for': 211,
 'me': 336,
 'it': 286,
 'is': 283,
 'very': 548,
 'repeatable': 431,
 'like': 307,
 'every': 184,
 'time': 519,
 'without': 570,
 'fail': 196,
 'the': 510,
 'debug': 149,
 'log': 317,
 'of': 374,
 'pick': 399,
 'happening': 245,
 '18': 14,
 '19': 16,
 '03': 3,
 'pick_it': 400,
 'exec': 188,
 'inbox': 274,
 'list': 310,
 'lbrace': 300,
 'subject': 492,
 'ftp': 219,
 'rbrace': 420,
 '4852': 30,
 'sequence': 462,
 'mercury': 340,
 '04': 4,
 'ftoc_pickmsgs': 218,
 'hit': 257,
 'marking': 332,
 'hits': 258,
 'tkerror': 523,
 'syntax': 497,
 'in': 273,
 'e

In [123]:
vectorizer = CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))
print(vectorizer.fit_transform(X_few_text))
print(vectorizer.get_feature_names())

  (0, 771)	1
  (0, 987)	1
  (0, 1201)	1
  (0, 286)	1
  (0, 1189)	1
  (0, 56)	1
  (0, 148)	1
  (0, 49)	2
  (0, 15)	1
  (0, 80)	1
  (0, 69)	1
  (0, 13)	1
  (0, 219)	1
  (0, 462)	1
  (0, 280)	1
  (0, 288)	1
  (0, 23)	1
  (0, 304)	1
  (0, 718)	1
  (0, 557)	1
  (0, 21)	1
  (0, 306)	1
  (0, 916)	1
  (0, 369)	2
  (0, 908)	1
  :	:
  (2, 273)	1
  (2, 854)	1
  (2, 760)	1
  (2, 638)	1
  (2, 494)	1
  (2, 640)	1
  (2, 1188)	1
  (2, 866)	1
  (2, 405)	1
  (2, 1218)	2
  (2, 504)	1
  (2, 1013)	1
  (2, 341)	1
  (2, 442)	1
  (2, 590)	1
  (2, 1134)	1
  (2, 502)	1
  (2, 978)	1
  (2, 360)	1
  (2, 440)	1
  (2, 1133)	1
  (2, 353)	1
  (2, 1158)	1
  (2, 505)	1
  (2, 1045)	1
['01', '01 02_13', '02', '02 2002', '02_13', '02_13 22', '03', '03 exec', '03 pick_it', '04', '04 ftoc_pickmsgs', '04 marking', '04 tkerror', '0500', '0500 chris', '10', '10 2000', '10 54', '10 miles', '10 times', '10 try', '10299452874797tmda', '10299452874797tmda deepeddyvirciocom', '103037728706fa6d', '103037728706fa6d deepeddycom', '104'

In [124]:
X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [125]:
email_pipeline = Pipeline([
    ("EmailText to Words", EmailTextToWords()),
    ("Words to Count Vector", CountVectorizer(stop_words=stopwords.words('english'), ngram_range=(1, 2))),
])

X_augmented_train = email_pipeline.fit_transform(X_train)

In [126]:
log_clf = LogisticRegression(solver="liblinear", random_state=42)
score = model_selection.cross_val_score(log_clf, X_augmented_train, y_train, cv=3)
score.mean()

0.9945147354298663

In [128]:
X_augmented_test = email_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="liblinear", random_state=42)
log_clf.fit(X_augmented_train, y_train)

y_pred = log_clf.predict(X_augmented_test)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

# just hard_ham got:
# Precision: 97.12%
# Recall: 99.26%

# from joblib import dump, load
# dump(log_clf, 'filename.joblib')

Precision: 97.44%
Recall: 100.00%


In [129]:
# pre-read for rnn stuff: https://www.analyticsvidhya.com/blog/2020/03/pretrained-word-embeddings-nlp

# load the whole embedding into memory
embeddings_index = dict()
f = open('data/glove.6B/glove.6B.300d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Loaded %s word vectors.' % len(embeddings_index)) # 400,000

Loaded 400000 word vectors.


In [131]:
X_train_for_rnn = EmailTextToWords().fit_transform(X_train)
print(X_train_for_rnn.shape)
X_train_for_rnn[0]

(2734,)


'i\'m on tv tonight\nurl: URL\ndate: not supplied\n\ntonight on the style network\'s tv show "area" my house will be featured \nundergoing a hawaiiana makeover watch it and meet carla my daughter and me \nit\'ll play monday at 9:30 pm et (if someone can tape it for me i\'d appreciate \nit because my cable service doesn\'t get the style channel i\'ll send you a new \nt-shirt iron on of a girl and her pet slug email mark@wellcom) link[1] \ndiscuss[2]\n\n[1] URL\n[2] URL\n\n\n'

In [145]:
X_test_for_rnn = EmailTextToWords().fit_transform(X_test)
X_test_for_rnn.shape

(684,)

In [146]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenize the sentences
tokenizer = Tokenizer()

# preparing vocabulary
tokenizer.fit_on_texts(list(X_train_for_rnn))

# converting text into integer sequences
x_train_seq  = tokenizer.texts_to_sequences(X_train_for_rnn) 
x_test_seq = tokenizer.texts_to_sequences(X_test_for_rnn)

# padding to prepare sequences of same length
x_train_seq  = pad_sequences(x_train_seq, maxlen=400)
x_test_seq = pad_sequences(x_test_seq, maxlen=400)

x_train_seq[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [147]:
size_of_vocabulary=len(tokenizer.word_index) + 1
size_of_vocabulary

37238

In [143]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((size_of_vocabulary, 300)) # 300 b/c our glove is 300 dimensions

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.04656   ,  0.21318001, -0.0074364 , ...,  0.0090611 ,
        -0.20988999,  0.053913  ],
       [-0.25756001, -0.057132  , -0.67189997, ..., -0.16043   ,
         0.046744  , -0.070621  ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.15554   , -0.16915999,  0.24866   , ...,  0.36546001,
         0.53972   ,  0.40272999],
       [ 0.21274   , -0.21021999, -0.23616999, ...,  0.32126001,
         0.19767   , -0.059769  ]])

In [148]:
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *

model = Sequential()

# embedding layer - https://keras.io/api/layers/core_layers/embedding/
model.add(Embedding(size_of_vocabulary, 300, weights=[embedding_matrix], input_length=400, trainable=False)) 

# lstm layer - https://keras.io/api/layers/recurrent_layers/lstm/
model.add(LSTM(128, return_sequences=True, dropout=0.2))

# global Maxpooling
model.add(GlobalMaxPooling1D())

# dense Layer
model.add(Dense(64, activation='relu')) 
model.add(Dense(1, activation='sigmoid')) 

# add loss function, metrics, optimizer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"]) 

# adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)  
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)  

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 300)          11171400  
_________________________________________________________________
lstm_1 (LSTM)                (None, 400, 128)          219648    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 11,399,369
Trainable params: 227,969
Non-trainable params: 11,171,400
_________________________________________________________________
None


In [149]:
history = model.fit(
    np.array(x_train_seq),
    np.array(y_train),
    batch_size=128,
    epochs=10,
    validation_data=(np.array(x_test_seq), np.array(y_test)),
    verbose=1,
    callbacks=[es,mc])

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.89620, saving model to best_model.h5
Epoch 2/10
Epoch 00002: val_acc improved from 0.89620 to 0.98538, saving model to best_model.h5
Epoch 3/10
Epoch 00003: val_acc improved from 0.98538 to 0.99561, saving model to best_model.h5
Epoch 4/10
Epoch 00004: val_acc did not improve from 0.99561
Epoch 5/10
Epoch 00005: val_acc improved from 0.99561 to 0.99854, saving model to best_model.h5
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.99854
Epoch 7/10
Epoch 00007: val_acc improved from 0.99854 to 1.00000, saving model to best_model.h5
Epoch 8/10
Epoch 00008: val_acc did not improve from 1.00000
Epoch 9/10
Epoch 00009: val_acc did not improve from 1.00000
Epoch 10/10
Epoch 00010: val_acc did not improve from 1.00000


In [152]:
# load best model
from tensorflow.keras.models import load_model
model = load_model('best_model.h5')

# evaluation 
_, val_acc = model.evaluate(x_test_seq, y_test, batch_size=128)
val_acc



1.0

In [None]:
nazario_spam_filenames = [name for name in sorted(os.listdir('data/phishing-2019')) if len(name) > 3]
