# Imports

In [1]:
from os import walk 
from os.path import join

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt 
import sys

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

from bs4 import BeautifulSoup

%matplotlib inline 

# Constants

In [2]:
EXAMPLE_FILE = 'UniversityProject_SpamFilter/01_Processing/practice_email.txt'
SPAM_1_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/spam_1'
SPAM_2_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/spam_2'
HAM_1_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/ham_1'
HAM_2_FILEPATH = 'UniversityProject_SpamFilter/01_Processing/spam_assassin_corpus/ham_2'

SPAM_CAT = 1
HAM_CAT = 0

TRAINING_DATA_FILE = 'UniversityProject_SpamFilter/02_Training/Training_Data.txt'
TEST_DATA_FILE = 'UniversityProject_SpamFilter/02_Training/Test_Data.txt'

DATA_JSON_FILE = 'UniversityProject_SpamFilter/01_Processing/Email_Text_Data.json'
WORD_ID_FILE = 'UniversityProject_SpamFilter/01_Processing/Word_By_ID.csv'


TOKEN_SPAM_PROB_FILE = 'UniversityProject_SpamFilter/03_Testing/Prob_Spam.txt'
TOKEN_HAM_PROB_FILE = 'UniversityProject_SpamFilter/03_Testing/Prob_Ham.txt'
TOKEN_ALL_PROB_FILE = 'UniversityProject_SpamFilter/03_Testing/Prob_All_Tokens.txt'

TEST_FEATURE_MATRIX = 'UniversityProject_SpamFilter/03_Testing/Test_Features.txt'
TEST_TARGET_FILE = 'UniversityProject_SpamFilter/03_Testing/Test_Target.txt'


VOCAB_SIZE=2500

# Extract Email Body

In [3]:
def email_body_generator(path): 
                                    #walk provides a tuple 
    for root, dirnames, filenames, in walk(path):
        for file_name in filenames:
            
            filepath = join(root, file_name)
            stream = open(filepath, encoding='latin-1')
            is_body= False
            lines = []
            
            #extracts email body 
            for line in stream: 
                if is_body: 
                    lines.append(line)
                elif line == '\n':
                    is_body = True

            stream.close()

            email_body = '\n'.join(lines)
            
            #loops over the file in the directory and returns the file name and associated email body 
            yield file_name, email_body

In [4]:
def dataframe_from_directory(path, classification):
    rows = []
    row_names = []
    
    for file_name, email_body in email_body_generator(path):
        rows.append({'MESSAGE': email_body, 'CATEGORY': classification})
        row_names.append(file_name) 
    
    return pd.DataFrame(rows, index=row_names) 

In [5]:
spam_emails = dataframe_from_directory(SPAM_1_FILEPATH, SPAM_CAT)
spam_emails = spam_emails.append(dataframe_from_directory(SPAM_2_FILEPATH, SPAM_CAT))
spam_emails.head()

Unnamed: 0,MESSAGE,CATEGORY
00249.5f45607c1bffe89f60ba1ec9f878039a,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1
00373.ebe8670ac56b04125c25100a36ab0510,ATTENTION: This is a MUST for ALL Computer Use...,1
00214.1367039e50dc6b7adb0f2aa8aba83216,This is a multi-part message in MIME format.\n...,1
00210.050ffd105bd4e006771ee63cabc59978,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1
00033.9babb58d9298daa2963d4f514193d7d6,This is the bottom line. If you can GIVE AWAY...,1


In [6]:
ham_emails = dataframe_from_directory(HAM_1_FILEPATH, HAM_CAT)
ham_emails = ham_emails.append(dataframe_from_directory(HAM_2_FILEPATH, HAM_CAT))
ham_emails.head()

Unnamed: 0,MESSAGE,CATEGORY
00769.25bf9a767b5db0ed93f03c1637281663,"\n\nIn a message dated 9/24/2002 11:24:58 AM, ...",0
01074.8590d61ac0aeeadb58dc2f2ba776c406,"Hiya, I always seem to get errors when I do an...",0
02479.14365bcad3a60fcf24c5c1813f6291fb,\n\nI don't know how one can expect better and...,0
01417.ce7b07a2114218dbac682b599785820d,Tim Peters wrote:\n\n> I've run no experiments...,0
00357.d559b71616f64ba1d1c1e61a03644fd4,"At 10:34 AM -0700 on 8/28/02, Jim Whitehead wr...",0


In [7]:
data = pd.concat([spam_emails, ham_emails])

In [97]:
data.shape

(5796, 3)

# Data Cleaning: Checking for Missing Values¶

In [8]:
data.MESSAGE.str.len() == 0

00249.5f45607c1bffe89f60ba1ec9f878039a    False
00373.ebe8670ac56b04125c25100a36ab0510    False
00214.1367039e50dc6b7adb0f2aa8aba83216    False
00210.050ffd105bd4e006771ee63cabc59978    False
00033.9babb58d9298daa2963d4f514193d7d6    False
                                          ...  
00609.dd49926ce94a1ea328cce9b62825bc97    False
00957.e0b56b117f3ec5f85e432a9d2a47801f    False
01127.841233b48eceb74a825417d8d918abf8    False
01178.5c977dff972cd6eef64d4173b90307f0    False
00747.352d424267d36975a7b40b85ffd0885e    False
Name: MESSAGE, Length: 5799, dtype: bool

In [9]:
data[data.MESSAGE.str.len() == 0].index

Index(['cmds', 'cmds', 'cmds'], dtype='object')

In [10]:
data.drop(['cmds'], inplace = True)

## Add Document ID's to track 

In [11]:
documents_ids = range(0, len(data.index))
data['Doc_ID'] = documents_ids
data['File_Name'] = data.index
data = data.set_index('Doc_ID')
data.head() 

Unnamed: 0_level_0,MESSAGE,CATEGORY,File_Name
Doc_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",1,00249.5f45607c1bffe89f60ba1ec9f878039a
1,ATTENTION: This is a MUST for ALL Computer Use...,1,00373.ebe8670ac56b04125c25100a36ab0510
2,This is a multi-part message in MIME format.\n...,1,00214.1367039e50dc6b7adb0f2aa8aba83216
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,1,00210.050ffd105bd4e006771ee63cabc59978
4,This is the bottom line. If you can GIVE AWAY...,1,00033.9babb58d9298daa2963d4f514193d7d6


## Saving to File 

In [12]:
data.to_json(DATA_JSON_FILE)

## Remove HTML Tags

In [13]:
soup = BeautifulSoup(data.at[2, 'MESSAGE'], 'html.parser')
#print(soup.prettify())

## Process Emails

In [14]:
def msg_nohtml(message, stemmer = PorterStemmer(), 
                  stop_words = set(stopwords.words('english'))): 
    
    # Remove HTML tags 
    soup = BeautifulSoup(message, 'html.parser')
    cleaned_text = soup.get_text()
    
    #Converts to lower case and splots up  the individual words --> evaluate if tolower() is needed? 
    words = word_tokenize(cleaned_text.lower())
    
    filtered_words = []
    
    for word in words: 
        #isalpha returns True for alphabetic letters (Aa-Zz)
        if word not in stop_words and word.isalpha():
            filtered_words.append(stemmer.stem(word))
    
    
    return filtered_words

# Apply Cleaning and Tokenisation to all Messages

### Slicing Dataframes and Series & Creating Subsets

In [15]:
%%time

nested_list = data.MESSAGE.apply(msg_nohtml)

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


CPU times: user 29.2 s, sys: 72.1 ms, total: 29.3 s
Wall time: 29.3 s


In [16]:
docs_id_spam = data[data.CATEGORY ==1].index
docs_id_ham = data[data.CATEGORY ==0].index

In [17]:
nested_list_spam = nested_list.loc[docs_id_spam]

In [18]:
nested_list_ham = nested_list.loc[docs_id_ham]

In [19]:
flat_list_ham = [item for sublist in nested_list_ham for item in sublist]
#total number of unique words in the ham messages
normal_words = pd.Series(flat_list_ham).value_counts() 
#total number of words in ham messages
normal_words.shape[0] 

20815

In [20]:
flat_list_spam = [item for sublist in nested_list_spam for item in sublist]
#total number of unique words in the spam messages
spammy_words = pd.Series(flat_list_spam).value_counts() 
#total number of words in spam messages
spammy_words.shape[0] 

13242

# Generate Vocabularly & Dictionary --> generate for the STEMMED list of words

In [21]:
stemmed_nested_list = data.MESSAGE.apply(msg_nohtml)
flat_stemmed_list = [item for sublist in stemmed_nested_list for item in sublist]

" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.


In [22]:
unique_words = pd.Series(flat_stemmed_list).value_counts()
frequent_words = unique_words[0:VOCAB_SIZE]
print('Number of unique words: ', unique_words.shape[0])

Number of unique words:  27334


### Create Vocab dataframe with a WORD_ID 

In [23]:
word_ids = list(range(0, VOCAB_SIZE))
vocab = pd.DataFrame({'VOCAB_WORD': frequent_words.index.values}, index=word_ids)
vocab.index.name = 'WORD_ID'

In [24]:
# Save Vocab to CSV file
vocab.to_csv(WORD_ID_FILE, index_label = vocab.index.name, header = vocab.VOCAB_WORD.name)

### Check - Certain word part of Vocab

In [25]:
'hello' in set(vocab.VOCAB_WORD)

True

# Generate Features and a Sparse Matrix 

Creating a dataFrame with one Word per Column

In [26]:
stemmed_nested_list

Doc_ID
0       [dear, homeown, interest, rate, lowest, point,...
1       [attent, must, comput, user, packag, deal, nor...
2       [messag, mime, format, dare, tri, find, better...
3       [import, inform, new, domain, name, final, ava...
4       [bottom, line, give, away, cd, free, peopl, li...
                              ...                        
5791    [one, work, well, week, te, updat, server, syn...
5792    [damien, morton, quot, approv, html, abl, say,...
5793    [mon, che, wrote, that, correct, line, ad, rep...
5794    [upon, time, manfr, wrote, would, like, instal...
5795    [run, pick, use, new, ftoc, button, show, mess...
Name: MESSAGE, Length: 5796, dtype: object

In [27]:
# Panda List = stemmed_nested_list  
# Convert stemm_nested_list from a series of list to a nested list
word_columns_df = pd.DataFrame.from_records(stemmed_nested_list.tolist())
word_columns_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
0,dear,homeown,interest,rate,lowest,point,year,help,find,best,...,,,,,,,,,,
1,attent,must,comput,user,packag,deal,norton,systemwork,softwar,suit,...,,,,,,,,,,
2,messag,mime,format,dare,tri,find,better,annuiti,guarante,year,...,,,,,,,,,,
3,import,inform,new,domain,name,final,avail,gener,public,discount,...,,,,,,,,,,
4,bottom,line,give,away,cd,free,peopl,like,one,month,...,,,,,,,,,,


In [28]:
word_columns_df.shape

(5796, 7671)

# Splitting the Data into a Training and Testing Dataset

In [29]:
X_train, X_test, y_train, y_test = train_test_split(word_columns_df, 
                                                    data.CATEGORY, 
                                                    test_size=0.3, 
                                                    random_state =42)

In [30]:
print('# of training samples', X_train.shape[0])
print('Fraction of training samples', X_train.shape[0]/word_columns_df.shape[0])

# of training samples 4057
Fraction of training samples 0.6999654934437544


In [31]:
X_train.index.name = X_test.index.name = 'DOC_ID'
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,7661,7662,7663,7664,7665,7666,7667,7668,7669,7670
DOC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4844,thu,jul,rodent,unusu,size,wrote,appli,one,three,order,...,,,,,,,,,,
4727,well,would,vaniti,list,forward,least,littl,pure,data,week,...,,,,,,,,,,
5022,world,wide,word,issu,saturday,august,sent,saturday,subscrib,least,...,,,,,,,,,,
3504,believ,spamassassin,maintain,scheme,wherebi,corpu,distribut,ie,sever,peopl,...,,,,,,,,,,
3921,sorri,think,send,realiz,list,sender,usual,anyawi,ask,harri,...,,,,,,,,,,


In [32]:
y_train.head()

Doc_ID
4844    0
4727    0
5022    0
3504    0
3921    0
Name: CATEGORY, dtype: int64

In [33]:
word_index = pd.Index(vocab.VOCAB_WORD)

In [34]:
def make_sparse_matrix(df, indexed_words, labels):
    """
    Returns Sparse Matrix as dataframe
    
    df: a dataframe with words in the columns with a document is as an index (X_train or X_test)
    indexed_words: index of words ordered by word_id
    labels: CATEGORY as a series (y_train or y_test)
    """
    
    
    rows = df.shape[0]
    cols = df.shape[1]
    word_set = set(indexed_words)
    
    dictionary_list = []
    
    for i in range(rows):
        for j in range(cols):
            
            word = df.iat[i,j]
            if word in word_set: 
                doc_id = df.index[i]
                word_id = indexed_words.get_loc(word)
                category = labels.at[doc_id]
                
                item = {'LABEL' : category, 'DOC_ID': doc_id, 
                       'OCCURANCE' :1, 'WORD_ID': word_id}
                
                dictionary_list.append(item)
    
    return pd.DataFrame(dictionary_list)

In [35]:
%%time 
sparse_train_df = make_sparse_matrix(X_train, word_index, y_train)

CPU times: user 8min, sys: 566 ms, total: 8min
Wall time: 8min 1s


In [36]:
sparse_train_df[:5]

Unnamed: 0,LABEL,DOC_ID,OCCURANCE,WORD_ID
0,0,4844,1,392
1,0,4844,1,492
2,0,4844,1,2356
3,0,4844,1,497
4,0,4844,1,37


In [37]:
sparse_train_df.shape

(444958, 4)

In [38]:
## combine occurrences with the Pandas groupby() --> similar to pivot table in excel 
trained_grouped = sparse_train_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()

In [39]:
trained_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,OCCURANCE
DOC_ID,WORD_ID,LABEL,Unnamed: 3_level_1
0,0,1,2
0,7,1,1
0,16,1,1
0,18,1,1
0,24,1,1


In [40]:
data.MESSAGE[0] ## --> shows the html is actually links embedded into the email body 

'Dear Homeowner,\n\n \n\nInterest Rates are at their lowest point in 40 years!\n\n\n\nWe help you find the best rate for your situation by\n\nmatching your needs with hundreds of lenders!\n\n\n\nHome Improvement, Refinance, Second Mortgage,\n\nHome Equity Loans, and More! Even with less than\n\nperfect credit!\n\n\n\nThis service is 100% FREE to home owners and new\n\nhome buyers without any obligation. \n\n\n\nJust fill out a quick, simple form and jump-start\n\nyour future plans today!\n\n\n\n\n\nVisit http://61.145.116.186/user0201/index.asp?Afft=QM10\n\n\n\n\n\n\n\n\n\n\n\n\n\nTo unsubscribe, please visit:\n\n\n\nhttp://61.145.116.186/light/watch.asp\n\n\n\n\n'

In [41]:
trained_grouped = trained_grouped.reset_index()
trained_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURANCE
0,0,0,1,2
1,0,7,1,1
2,0,16,1,1
3,0,18,1,1
4,0,24,1,1


In [42]:
trained_grouped.shape

(265568, 4)

# Save Training Data as .txt File

In [43]:
np.savetxt(TRAINING_DATA_FILE, trained_grouped, fmt='%d')

# Create sparse matrix for the test data. Group the occurrences of the same word & save data as .txt file

In [44]:
X_test.shape

(1739, 7671)

In [45]:
%%time
sparse_test_df = make_sparse_matrix(X_test, word_index, y_test)

CPU times: user 3min 22s, sys: 291 ms, total: 3min 23s
Wall time: 3min 23s


In [46]:
sparse_test_df.shape

(187651, 4)

In [47]:
test_grouped = sparse_test_df.groupby(['DOC_ID', 'WORD_ID', 'LABEL']).sum()

In [48]:
test_grouped = test_grouped.reset_index()
test_grouped.head()

Unnamed: 0,DOC_ID,WORD_ID,LABEL,OCCURANCE
0,8,7,1,1
1,8,8,1,1
2,8,12,1,1
3,8,21,1,2
4,8,26,1,1


In [49]:
test_grouped.shape

(110579, 4)

In [50]:
np.savetxt(TEST_DATA_FILE, test_grouped, fmt = '%d')

# Pre Processing Subtleties

In [51]:
train_doc_ids = set(trained_grouped.DOC_ID)
test_doc_ids = set(test_grouped.DOC_ID)

In [52]:
len(test_doc_ids)

1723

In [53]:
len(X_test)

1739

In [54]:
X_test.index

Int64Index([4675, 4220, 2484, 2418, 5110, 1161, 3296,  101, 3337,  401,
            ...
            3674, 4885, 5120, 1338, 2671, 5295, 5740, 5069, 1293, 4354],
           dtype='int64', name='DOC_ID', length=1739)

In [55]:
# set(X_test.index.values)

In [56]:
#set(X_test.index.values) - test_doc_ids

In [57]:
data.loc[14]

MESSAGE      ------=_NextPart_000_00A3_65E24E1C.A3468E63\n\...
CATEGORY                                                     1
File_Name               00095.17594a58d6736a8f6a1990b0b92090cd
Name: 14, dtype: object

# Load the text into a Numpy Array

In [58]:
sparse_training_data = np.loadtxt(TRAINING_DATA_FILE, delimiter = ' ', dtype=int)
sparse_testing_data = np.loadtxt(TEST_DATA_FILE, delimiter =' ', dtype = int) 

In [59]:
sparse_testing_data[:5]

array([[ 8,  7,  1,  1],
       [ 8,  8,  1,  1],
       [ 8, 12,  1,  1],
       [ 8, 21,  1,  2],
       [ 8, 26,  1,  1]])

In [60]:
sparse_training_data[:5]

array([[ 0,  0,  1,  2],
       [ 0,  7,  1,  1],
       [ 0, 16,  1,  1],
       [ 0, 18,  1,  1],
       [ 0, 24,  1,  1]])

# Create a Full Matrix from a Sparse Matrix

In [61]:
def make_full_matrix(sparse_matrix, nr_words, doc_idx=0, word_idx=1, cat_idx=2, freq_idx=3):
    """
    Form a full matrix from a sparse matrix. Return a pandas dataframe. 
    Keyword arguments:
    sparse_matrix -- numpy array
    nr_words -- size of the vocabulary. Total number of tokens. 
    doc_idx -- position of the document id in the sparse matrix. Default: 1st column
    word_idx -- position of the word id in the sparse matrix. Default: 2nd column
    cat_idx -- position of the label (spam is 1, nonspam is 0). Default: 3rd column
    freq_idx -- position of occurrence of word in sparse matrix. Default: 4th column
    """
    column_names = ['DOC_ID'] + ['CATEGORY'] + list(range(0, VOCAB_SIZE))
    doc_id_names = np.unique(sparse_matrix[:, 0])
    full_matrix = pd.DataFrame(index=doc_id_names, columns=column_names)
    full_matrix.fillna(value=0, inplace=True)
    
    for i in range(sparse_matrix.shape[0]):
        doc_nr = sparse_matrix[i][doc_idx]
        word_id = sparse_matrix[i][word_idx]
        label = sparse_matrix[i][cat_idx]
        occurrence = sparse_matrix[i][freq_idx]
        
        full_matrix.at[doc_nr, 'DOC_ID'] = doc_nr
        full_matrix.at[doc_nr, 'CATEGORY'] = label
        full_matrix.at[doc_nr, word_id] = occurrence
    
    full_matrix.set_index('DOC_ID', inplace=True)
    return full_matrix

In [62]:
%%time
full_training_data = make_full_matrix(sparse_training_data, VOCAB_SIZE)

CPU times: user 7.39 s, sys: 87 ms, total: 7.48 s
Wall time: 7.5 s


# Training the NC Model - Calculating the Probability of Spam

In [63]:
prob_spam = full_training_data.CATEGORY.sum()/ full_training_data.CATEGORY.size
print('Probability of spam is: ', prob_spam)

Probability of spam is:  0.3111609367214748


In [64]:
full_training_features = full_training_data.loc[:, full_training_data.columns != 'CATEGORY'] 
email_lengths = full_training_features.sum(axis=1)
total_wordcount = email_lengths.sum()

In [65]:
spam_lengths = email_lengths[full_training_data.CATEGORY ==1]
spam_wordcount = spam_lengths.sum()
ham_lengths = email_lengths[full_training_data.CATEGORY == 0]
ham_wordcount = ham_lengths.sum()

In [66]:
##check
spam_wordcount + ham_wordcount - total_wordcount

0

In [67]:
full_training_features.shape

(4014, 2500)

In [68]:
train_spam_tokens = full_training_features.loc[full_training_data.CATEGORY ==1]
summed_spam_tokens = train_spam_tokens.sum(axis = 0) +1 ##-->Laplase smoothing technique 
train_ham_tokens = full_training_features.loc[full_training_data.CATEGORY ==0]
summed_ham_tokens = train_ham_tokens.sum(axis=0)+1

In [69]:
summed_ham_tokens.tail()

2495    17
2496    23
2497    33
2498    21
2499    21
dtype: int64

In [70]:
train_ham_tokens[2499].sum()+1

21

# P(Token | Spam) - Probability that a Token Occurs given the Email is Spam¶

In [71]:
prob_tokens_spam = summed_spam_tokens / (spam_wordcount + VOCAB_SIZE)

# P(Token | Ham) - Probability that a Token Occurs given the Email is Nonspam

In [72]:
prob_tokens_ham = summed_ham_tokens / (ham_wordcount + VOCAB_SIZE)

# P(Token) - Probability that Token Occurs

In [98]:
prob_tokens_all = full_training_features.sum(axis=0) / total_wordcount
prob_tokens_all

0       0.015851
1       0.007792
2       0.007616
3       0.006828
4       0.006639
          ...   
2495    0.000063
2496    0.000049
2497    0.000081
2498    0.000061
2499    0.000063
Length: 2500, dtype: float64

# Save Trained Model 

In [74]:
np.savetxt(TOKEN_SPAM_PROB_FILE, prob_tokens_spam)
np.savetxt(TOKEN_HAM_PROB_FILE, prob_tokens_ham)
np.savetxt(TOKEN_ALL_PROB_FILE, prob_tokens_all)

# Prepare Test Data

In [75]:
sparse_testing_data.shape

(110579, 4)

In [76]:
%%time

full_test_data = make_full_matrix(sparse_testing_data, nr_words=VOCAB_SIZE)

CPU times: user 3.27 s, sys: 21.5 ms, total: 3.29 s
Wall time: 3.29 s


In [77]:
X_test = full_test_data.loc[:, full_test_data.columns != 'CATEGORY']
y_test = full_test_data.CATEGORY

In [78]:
np.savetxt(TEST_TARGET_FILE, y_test)
np.savetxt(TEST_FEATURE_MATRIX, X_test)

# Calculating Joint Probability 

# Set the Prior 

$$P(Spam \, | \, X) = \frac{P(X\, | \, Spam \,) \, P(Spam)} {P(X)} $$

In [79]:
prob_spam

0.3111609367214748

### In this calc, going to combine the joint prob and the conditional probability.
### Calc the probability that the email is spam given the tokens using the Dot Product

In [80]:
#Loading the Data: 
#Features

X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter = ' ')
#Target
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter = ' ')
#Tken Probabilities
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter = ' ')
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter = ' ')
prob_all_tokens = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter = ' ')

In [81]:
joint_log_ham= X_test.dot(np.log(prob_token_ham) - np.log(prob_all_tokens)) + np.log(1-prob_spam)

In [82]:
joint_log_spam = X_test.dot(np.log(prob_token_spam) - np.log(prob_all_tokens)) + np.log(prob_spam)

# Making predictions
### Checking for higher joint probaility 

$$P(Spam \, | \, X) \, > \, P(Ham \, | \, X)$$ 
<center> **OR** </center> 
$$P(Spam \, | \, X) \, < \, P(Ham \, | \, X)$$

In [100]:
prediction = joint_log_spam > joint_log_ham
prediction

array([ True,  True,  True, ..., False, False, False])

### Simplify

$$P(X \, | \, Spam) \, P(Spam) ≠ \frac{P(X\, | \, Spam) \, P(Spam)} {P(X)} $$

In [102]:
joint_log_spam = X_test.dot(np.log(prob_token_spam)) + np.log(prob_spam)
joint_log_ham= X_test.dot(np.log(prob_token_ham)) + np.log(1-prob_spam)



# Metrics and Evaluation 
## Accuracy of the model 

In [85]:
correct_doc = (y_test == prediction).sum()
numbs_doc_wrong = X_test.shape[0] - correct_doc

print('Docs classified correctly', correct_doc)
print('Docs classified incorrectly', numbs_doc_wrong)

Docs classified correctly 1673
Docs classified incorrectly 50


In [86]:
## Accuracy 
correct_doc/len(X_test)

0.9709808473592572

In [87]:
fraction_wrong = numbs_doc_wrong /len(X_test)
print('Fraction classified incorrectly is {:.2%}'.format(fraction_wrong))
print('Fraction classified correclt is {:.2%}'.format(1-fraction_wrong))

Fraction classified incorrectly is 2.90%
Fraction classified correclt is 97.10%


In [88]:
np.unique(prediction, return_counts = True)

(array([False,  True]), array([1167,  556]))

In [89]:
true_pos = (y_test == 1) & (prediction == 1)

In [90]:
false_pos = (y_test == 0) & (prediction == 1) 

In [91]:
false_neg = (y_test == 1) & (prediction == 0)

# Recall Score- the sentitivity 
recalls is the (True Positions) /(( true Positive) + (False Neg)) 

In [92]:
recall_score = true_pos.sum() / (true_pos.sum() + false_neg.sum())
print('Recall score is {:.3}'.format(recall_score))

Recall score is 0.93


# Precision metric 
precision = (True Positive) / (True Pos + False Pos) 

In [93]:
precision_score = true_pos.sum() / (true_pos.sum() + false_pos.sum())

In [94]:
print('Precision score is {:.3}'.format(precision_score))

Precision score is 0.984


# F score or F1 SCore - the harmonic mean 
2 x ((Precision x Recall) / Precision + Recall )

In [95]:
f1_score = 2 * (precision_score *recall_score)/ (precision_score +recall_score)
print('F1 Score is {:.3}'.format(f1_score))

F1 Score is 0.956
