# Imports

In [1]:
from os import walk 
from os.path import join
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score
from nltk.corpus import stopwords
from sklearn.metrics import classification_report


import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Constants 

In [2]:
# Directories of the SpamAssassin corpus, one per labelled sub-collection
SPAM_1_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/spam_1'
SPAM_2_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/spam_2'
HAM_1_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/ham_1'
HAM_2_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/ham_2'

# Integer class labels stored in the CATEGORY column
SPAM_CAT = 1
HAM_CAT = 0

# Files written / read by the notebook
DATA_JSON_FILE = 'UniversityProject_SpamFilter/01_Processing/Email_Text_Data.json'
VALIDATION_FILEPATH = 'UniversityProject_SpamFilter/validation_emails.json'
# Stray trailing space removed from the literal so the path ends in a clean '.csv' extension
VALIDATION_DATA = 'UniversityProject_SpamFilter/Validation_Data.csv'

# Extract Email Body

In [3]:
def email_body_generator(path):
    """Yield ``(file_name, email_body)`` for every file found under ``path``.

    The directory tree rooted at ``path`` is traversed with ``os.walk``. In
    each file, everything after the first blank line is treated as the body;
    the lines before it are skipped.

    Parameters
    ----------
    path : str
        Root directory to search.

    Yields
    ------
    tuple of (str, str)
        The bare file name (without its directory) and the extracted body.
        The body is an empty string when the file has no blank line or
        nothing after it.
    """
    # walk yields a (root, dirnames, filenames) tuple for each directory
    for root, _dirnames, filenames in walk(path):
        for file_name in filenames:
            filepath = join(root, file_name)
            is_body = False
            lines = []

            # The with-block guarantees the file is closed even if reading raises.
            with open(filepath, encoding='latin-1') as stream:
                for line in stream:
                    if is_body:
                        lines.append(line)
                    elif line == '\n':
                        # first blank line: everything after it is collected
                        is_body = True

            # Collected lines keep their own trailing '\n', so joining on '\n'
            # leaves an empty line between consecutive body lines. Kept as-is
            # so previously saved data stays identical.
            email_body = '\n'.join(lines)

            yield file_name, email_body

In [4]:
def dataframe_from_directory(path, classification):
    """Build a DataFrame with one row per email file found under ``path``.

    Each row holds the email body in ``MESSAGE`` and the supplied
    ``classification`` in ``CATEGORY``; the index is the email's file name.
    """
    emails = list(email_body_generator(path))

    file_names = [name for name, _ in emails]
    records = [
        {'MESSAGE': body, 'CATEGORY': classification}
        for _, body in emails
    ]

    return pd.DataFrame(records, index=file_names)

In [5]:
# Load both spam collections and stack them into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps the file-name index of each part.
spam_emails = pd.concat([
    dataframe_from_directory(SPAM_1_FILEPATH, SPAM_CAT),
    dataframe_from_directory(SPAM_2_FILEPATH, SPAM_CAT),
])


In [6]:
# Load both ham (non-spam) collections and stack them into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps the file-name index of each part.
ham_emails = pd.concat([
    dataframe_from_directory(HAM_1_FILEPATH, HAM_CAT),
    dataframe_from_directory(HAM_2_FILEPATH, HAM_CAT),
])


In [7]:
# Combine spam and ham into a single dataset (spam rows first; the file-name index is kept)
data = pd.concat([spam_emails, ham_emails])

In [8]:
# Index labels (file names) of the emails whose extracted body is empty
data[data.MESSAGE.str.len() == 0].index

Index(['cmds', 'cmds', 'cmds'], dtype='object')

In [9]:
# Remove the rows indexed 'cmds' (the entries reported above as having an empty body).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(index=['cmds'], errors='ignore')

## Add Document IDs for Tracking

In [10]:
# Give every email a sequential integer Doc_ID as its index and keep the
# original file name in a regular File_Name column.
documents_ids = range(0, len(data.index))

# Guarded so that re-running the cell cannot overwrite File_Name with the
# integer index installed by a previous run.
if 'File_Name' not in data.columns:
    data['Doc_ID'] = documents_ids
    data['File_Name'] = data.index
    data = data.set_index('Doc_ID')

data.head()

Unnamed: 0_level_0,MESSAGE,CATEGORY,File_Name
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1,00249.5f45607c1bffe89f60ba1ec9f878039a
1,ATTENTION: This is a MUST for ALL Computer Use...,1,00373.ebe8670ac56b04125c25100a36ab0510
2,This is a multi-part message in MIME format.\n...,1,00214.1367039e50dc6b7adb0f2aa8aba83216
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1,00210.050ffd105bd4e006771ee63cabc59978
4,This is the bottom line. If you can GIVE AWAY...,1,00033.9babb58d9298daa2963d4f514193d7d6


## Saving to File 

In [11]:
# Persist the labelled dataset to DATA_JSON_FILE
data.to_json(DATA_JSON_FILE)

In [12]:
# Load the dataset back from the JSON file written in the previous step
data = pd.read_json(DATA_JSON_FILE)

In [13]:
# Order the rows by their index after loading
data = data.sort_index()

In [14]:
# NLTK's English stop words, extended with Title-case and UPPER-case variants of each word
SW = stopwords.words('english')

capital_sw = list(map(str.title, SW))
uppercase_sw = list(map(str.upper, SW))

SW += capital_sw + uppercase_sw

# De-duplicated collection handed to the vectorizer
all_stop_words = set(SW)

In [15]:
#create vectorizer
# stop_words must be 'english', a list or None: recent scikit-learn releases
# validate the parameter type and reject a set. sorted() yields a list with a
# deterministic order; the vectorizer treats it as the same collection of words.
vectorizer = CountVectorizer(stop_words=sorted(all_stop_words))

In [16]:
# Build the document-term matrix: learn the vocabulary from every message and
# count token occurrences per document in a single scikit-learn call.

all_features = vectorizer.fit_transform(data.MESSAGE)

In [17]:
# (number of documents, number of distinct tokens in the learned vocabulary)
all_features.shape

(5796, 102858)

In [18]:
# Hold out 30% of the documents for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    data.CATEGORY,
    test_size=0.3,
    random_state=88,
)

In [19]:
# Multinomial Naive Bayes with scikit-learn's default settings
classifier = MultinomialNB()

In [20]:
 classifier.fit(X_train, y_train)

MultinomialNB()

In [21]:
# Number of test documents whose predicted label equals the true label
nr_correct = (y_test == classifier.predict(X_test)).sum()

In [22]:
# Predicted labels for the held-out test set, kept for the reports further down
predictions = classifier.predict(X_test)

In [23]:
# Report the count of correct test predictions
print(f'{nr_correct} documents classified correctly')

1658 documents classified correctly


In [24]:
# Every test document that was not counted as correct
nr_incorrect = y_test.size - nr_correct

In [25]:
# Report the count of wrong test predictions
print(f'{nr_incorrect} documents classified incorrectly')

81 documents classified incorrectly


In [26]:
# Error rate on the test split; accuracy is its complement
fraction_wrong = nr_incorrect / (nr_incorrect + nr_correct)
print(f' The (testing) accuracy of the model is {1 - fraction_wrong:.3%}')

 The (testing) accuracy of the model is 95.342%


In [27]:
# Mean accuracy on the test split as computed by scikit-learn (cross-check of the manual figure)
classifier.score(X_test, y_test)

0.953421506612996

In [28]:
# Recall Score
# Reuses the test-set predictions computed earlier instead of calling predict() again
recall_score(y_test, predictions)

0.8610108303249098

In [29]:
# Precision
# Reuses the test-set predictions computed earlier instead of calling predict() again
precision_score(y_test, predictions)

0.9916839916839917

In [30]:
# F1 Score
# Reuses the test-set predictions computed earlier instead of calling predict() again
f1_score(y_test, predictions)

0.9217391304347826

In [31]:
# Per-class precision, recall, F1 and support in a single table
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1185
           1       0.99      0.86      0.92       554

    accuracy                           0.95      1739
   macro avg       0.97      0.93      0.94      1739
weighted avg       0.96      0.95      0.95      1739



In [32]:
# Tally right and wrong test predictions from the stored `predictions` array.
# NOTE(review): this appears to repeat the nr_correct / nr_incorrect counts computed earlier.
correct_doc = (y_test == predictions).sum()
numbs_doc_wrong = X_test.shape[0] - correct_doc

print('Docs classified correctly', correct_doc)
print('Docs classified incorrectly', numbs_doc_wrong)

Docs classified correctly 1658
Docs classified incorrectly 81


## Visualising the results 

In [33]:
#Chart styling info 
yaxis_label = 'P(X | Spam)'
xaxis_label = 'P(X | Nonspam)'

linedata = np.linspace(start=-14000, stop = 1, num=1000)