In [1]:
import time
import pickle
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
# Allocate GPU memory on demand instead of reserving all of it up front.
# The setting is applied to every visible GPU rather than only the first one,
# because TensorFlow rejects memory-growth settings that differ across devices.
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, enable=True)
except RuntimeError as err:
    # Memory growth can only be configured before the GPUs are initialised,
    # which is no longer the case when this cell is re-run in a live kernel.
    print(err)

import tqdm
import numpy as np


from os import walk 
from os.path import join
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup 

from sklearn import svm
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, ConfusionMatrixDisplay

import gensim
import matplotlib.pyplot as plt
import seaborn as sns

from bs4 import BeautifulSoup
from wordcloud import WordCloud
from PIL import Image

from collections import Counter

%matplotlib inline 




from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Recall, Precision

In [2]:
# --- Corpus locations: one folder per spam / ham batch ----------------------
SPAM_1_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/spam_1'
SPAM_2_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/spam_2'
HAM_1_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/ham_1'
HAM_2_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/ham_2'

# --- Class labels -----------------------------------------------------------
SPAM_CAT, HAM_CAT = 1, 0

# --- Model / training hyper-parameters --------------------------------------
SEQUENCE_LENGTH = 100  # words per sample; every sequence has this length
EMBEDDING_SIZE = 100   # dimensionality of the GloVe embedding vectors
TEST_SIZE = 0.25       # fraction of the data reserved for testing

BATCH_SIZE = 64
EPOCHS = 10            # number of training epochs

# --- Label name <-> integer code lookups -------------------------------------
label2int = {"ham": HAM_CAT, "spam": SPAM_CAT}
int2label = {code: name for name, code in label2int.items()}

# --- Word-cloud assets (font and mask images) --------------------------------
CUSTOM_FONT = 'UniversityProject_SpamFilter/wordcloud_resources/OpenSansCondensed-Bold.ttf'
WORD_CLOUD = 'UniversityProject_SpamFilter/wordcloud_resources/word_cloud.png'
THUMBS_UP = 'UniversityProject_SpamFilter/wordcloud_resources/thumbs-up.png'
THUMBS_DOWN = 'UniversityProject_SpamFilter/wordcloud_resources/thumbs-down.png'
X_ICON = 'UniversityProject_SpamFilter/wordcloud_resources/x-icon.png'

In [3]:
def email_body_generator(path):
    """Yield ``(file_name, email_body)`` for every file found under ``path``.

    The directory tree is traversed with ``os.walk``. Each file is decoded as
    latin-1, and every line after the first blank line is treated as the
    email body. A file that contains no blank line yields an empty body.

    Args:
        path: Root directory that is walked recursively.

    Yields:
        tuple[str, str]: The bare file name (without its directory) and the
        extracted body text.
    """
    for root, _dirnames, filenames in walk(path):
        for file_name in filenames:
            filepath = join(root, file_name)
            body_lines = []
            is_body = False

            # The context manager guarantees the handle is closed even if
            # reading fails part-way through the file.
            with open(filepath, encoding='latin-1') as stream:
                for line in stream:
                    if is_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First blank line reached: everything after it is body.
                        is_body = True

            # NOTE: each collected line keeps its own trailing newline, so
            # joining with '\n' doubles the line breaks. This is kept as-is so
            # downstream results (e.g. message lengths) stay unchanged.
            yield file_name, '\n'.join(body_lines)

In [4]:
def dataframe_from_directory(path, classification):
    """Load every email under ``path`` into a DataFrame labelled with one class.

    Args:
        path: Directory handed to ``email_body_generator``.
        classification: Value stored in the ``CATEGORY`` column of every row.

    Returns:
        pandas.DataFrame: One row per email, indexed by file name, with the
        columns ``MESSAGE`` (body text) and ``CATEGORY``.
    """
    emails = list(email_body_generator(path))

    records = [
        {'MESSAGE': body, 'CATEGORY': classification}
        for _, body in emails
    ]
    file_names = [name for name, _ in emails]

    return pd.DataFrame(records, index=file_names)

In [5]:
# Build one frame per class from its two corpus folders, then stack them.
# pd.concat is used throughout because DataFrame.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
spam_emails = pd.concat([
    dataframe_from_directory(SPAM_1_FILEPATH, SPAM_CAT),
    dataframe_from_directory(SPAM_2_FILEPATH, SPAM_CAT),
])
ham_emails = pd.concat([
    dataframe_from_directory(HAM_1_FILEPATH, HAM_CAT),
    dataframe_from_directory(HAM_2_FILEPATH, HAM_CAT),
])

# Spam rows first, ham rows second; the index still holds the file names.
data = pd.concat([spam_emails, ham_emails])

In [6]:
# Index labels (file names) of the emails whose extracted body is empty
data.loc[data['MESSAGE'].str.len() == 0].index

Index(['cmds', 'cmds', 'cmds'], dtype='object')

In [7]:
# Remove the 'cmds' entries, which have an empty message body. Rebinding
# `data` (instead of mutating in place) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError.
data = data.drop(index='cmds', errors='ignore')

In [8]:
# Store the character count of every message body next to its text
data['length'] = data['MESSAGE'].map(len)
data

Unnamed: 0,MESSAGE,CATEGORY,length
00249.5f45607c1bffe89f60ba1ec9f878039a,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1,612
00373.ebe8670ac56b04125c25100a36ab0510,ATTENTION: This is a MUST for ALL Computer Use...,1,1298
00214.1367039e50dc6b7adb0f2aa8aba83216,This is a multi-part message in MIME format.\n...,1,6691
00210.050ffd105bd4e006771ee63cabc59978,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1,1141
00033.9babb58d9298daa2963d4f514193d7d6,This is the bottom line. If you can GIVE AWAY...,1,1795
...,...,...,...
00609.dd49926ce94a1ea328cce9b62825bc97,"I'm one of the 30,000 but it's not working ver...",0,953
00957.e0b56b117f3ec5f85e432a9d2a47801f,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,0,257
01127.841233b48eceb74a825417d8d918abf8,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",0,393
01178.5c977dff972cd6eef64d4173b90307f0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",0,1434


In [9]:
# Replace the file-name index with a sequential integer Doc_ID, keeping the
# original file names in their own column.
documents_ids = range(len(data))
data = (
    data
    .assign(Doc_ID=documents_ids, File_Name=data.index)
    .set_index('Doc_ID')
)

In [10]:
# Stop-word lists shipped with each library
sklearn_stopwords = set(text.ENGLISH_STOP_WORDS)
gensim_stopwords = set(gensim.parsing.preprocessing.STOPWORDS)
nltk_stop_words = stopwords.words('english')

# Merge the three library lists into a single set
gensim_and_sklearn = sklearn_stopwords | gensim_stopwords
libary_stopwords = gensim_and_sklearn | set(nltk_stop_words)

# Extra tokens to discard (markup remnants and similar noise) plus the
# library stop words
all_stop_words = {
    'url', 'http', '\n', '[html', 'html', 'tr', 'td', 'https', 'br',
    'ign', 'err', 'mpt', '[', ']',
} | libary_stopwords

# Add Title-Case and UPPER-CASE variants of every stop word as well
capital_sw = list(map(str.title, all_stop_words))
uppercase_sw = list(map(str.upper, all_stop_words))

all_stop_words.update(capital_sw, uppercase_sw)

In [11]:
def clean_msg_nohtml(message, stop_words=set(all_stop_words)):
    """Strip HTML from an email body and return its filtered word tokens.

    The text extracted by BeautifulSoup is split with NLTK's
    ``word_tokenize``. Tokens keep their original casing; a token is retained
    only if it is not in ``stop_words`` and consists purely of alphabetic
    characters.

    Args:
        message: Raw email body, possibly containing HTML markup.
        stop_words: Tokens to discard. Defaults to a copy of
            ``all_stop_words`` taken when the function is defined.

    Returns:
        list[str]: The retained tokens in their original order.
    """
    plain_text = BeautifulSoup(message, 'html.parser').get_text()

    return [
        token
        for token in word_tokenize(plain_text)
        if token not in stop_words and token.isalpha()
    ]

In [12]:
# Clean and tokenise every message: one list of words per email
nested_list = data['MESSAGE'].apply(clean_msg_nohtml)

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


In [13]:
# Preview the cleaned token lists (one list of words per Doc_ID)
nested_list

Doc_ID
0       [Dear, Homeowner, Rates, lowest, point, years,...
1       [ATTENTION, Users, Package, Deal, Norton, Syst...
2       [message, MIME, format, dare, Try, better, ann...
3       [IMPORTANT, INFORMATION, new, domain, names, f...
4       [line, AWAY, CD, FREE, people, like, month, le...
                              ...                        
5791    [working, week, TES, updates, servers, syncing...
5792    [Damien, Morton, quoted, approves, able, feeli...
5793    [Mon, che, wrote, thats, correct, lines, added...
5794    [time, Manfred, wrote, like, install, RPM, tri...
5795    [run, Pick, use, New, FTOC, button, messages, ...
Name: MESSAGE, Length: 5796, dtype: object

In [14]:
# Document ids belonging to each class
docs_id_SPAM = data.loc[data['CATEGORY'] == SPAM_CAT].index
docs_id_HAM = data.loc[data['CATEGORY'] == HAM_CAT].index

# Token lists of each class
nested_list_HAM = nested_list.loc[docs_id_HAM]
nested_list_SPAM = nested_list.loc[docs_id_SPAM]

# Flatten the per-email token lists and count how often each distinct word
# occurs in the non-spam messages
flat_list_HAM = [word for tokens in nested_list_HAM for word in tokens]
HAM_words = pd.Series(flat_list_HAM).value_counts()

# Same word-frequency table for the spam messages
flat_list_SPAM = [word for tokens in nested_list_SPAM for word in tokens]
SPAM_words = pd.Series(flat_list_SPAM).value_counts()

In [15]:
# Sanity check: value_counts() returned a pandas Series
type(SPAM_words)

pandas.core.series.Series

In [16]:
# Word frequencies across the spam emails, most frequent first
SPAM_words

email          1955
list           1312
money          1185
business       1181
information    1178
               ... 
sNo               1
DEFENSE           1
carpet            1
Eluxmedia         1
Birthday          1
Length: 24303, dtype: int64

In [None]:
# Hold out 30% of the emails for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the TEST_SIZE constant (0.25) is not used here - confirm
# which ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    data['MESSAGE'],
    data['CATEGORY'],
    test_size=0.3,
    random_state=43,
)