# Winters, Alexander (V00970263)

# Problem 3. Naive Bayes

### Sources:

https://gtraskas.github.io/post/spamit/ ---> To load the data

https://www.kdnuggets.com/2020/07/spam-filter-python-naive-bayes-scratch.html

https://www.youtube.com/watch?v=O2L2Uv9pdDA

https://github.com/Soft-Squad/concordance/blob/main/concord4.py

In [1]:
import numpy as np
np.random.seed(1337)

In [2]:
import pandas as pd
import time

In [3]:
# Necessary to download file, extract and format
import urllib.request
import os
import tarfile
import pickle
# SSL Certificate verificaiton failed workaround
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [4]:
# Download file
url = 'http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/enron5.tar.gz'

# Get user's download folder path
user_downloads = os.path.join(os.path.expanduser('~') + '\Downloads')
enron_dir = os.path.join(user_downloads, 'Enron5-Emails')

if not os.path.exists(enron_dir):
    os.makedirs(enron_dir)
path = os.path.join(enron_dir, 'enron5.tar.gz')
if not os.path.exists(path):
    urllib.request.urlretrieve(url, path)

In [5]:
# Extract Emails
def extract_emails(fname):
    rows = []
    tfile = tarfile.open(fname, 'r:gz')
    for member in tfile.getmembers():
        if 'ham' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'LABEL': 'ham', 'MSG': row})
        if 'spam' in member.name:
            f = tfile.extractfile(member)
            if f is not None:
                row = f.read()
                rows.append({'LABEL': 'spam', 'MSG': row})
    tfile.close()
    return pd.DataFrame(rows)
    

In [6]:
# Make sure the pickle file is deleted to run again
if not os.path.exists(user_downloads + '/emails.pickle'):
    df = pd.DataFrame({'LABEL': [], 'MSG': []})
    unzipped_file = extract_emails(os.path.join(enron_dir, 'enron5.tar.gz'))
    df = pd.concat([df, unzipped_file])
    df.to_pickle(user_downloads + '/emails.pickle')
        

with open(user_downloads + '/emails.pickle', 'rb') as f:
    emails_df = pickle.load(f) 

# Translate bytes objects into strings.
df['MSG'] = df['MSG'].apply(lambda x: x.decode('latin-1'))

# Reset pandas df index.
df = df.reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,LABEL,MSG
0,ham,Subject: leadership development pilot\r\nsally...
1,ham,Subject: re : additional responsibility\r\ncon...
2,ham,Subject: global risk management operations\r\n...
3,ham,Subject: global risk management operations\r\n...
4,ham,Subject: global risk management operations\r\n...


In [8]:
print(df.shape)
print(df['LABEL'].value_counts(normalize=True))

(5175, 2)
spam    0.710145
ham     0.289855
Name: LABEL, dtype: float64


In [9]:
# Shuffle data for 70/30 split into training and validation sets
shuffled_dataset = df.sample(frac=1).reset_index(drop=True)
split = int(df.shape[0] * 0.7)

train_set = shuffled_dataset.iloc[:split].reset_index(drop=True)
# Remove anything that is not a word
train_set['MSG'] = train_set['MSG'].str.lower().replace('[^\sa-zA-Z]+', ' ', regex=True)
train_set['MSG'] = train_set['MSG'].str.split()

test_set = shuffled_dataset.iloc[split:].reset_index(drop=True)
test_set['MSG'] = test_set['MSG'].str.lower().replace('[^\sa-zA-Z]+', ' ', regex=True)
test_set['MSG'] = test_set['MSG'].str.split()

In [10]:
train_X = train_set['MSG']
train_y = train_set['LABEL']

test_X = test_set['MSG']
test_y = test_set['LABEL']

In [11]:
def fit(train_X, train_y):
    # Remove duplicate words
    unique_words = []
    for key, email in train_X.iteritems():
        for word in email:
            unique_words.append(word)
    unique_words = sorted(list(set(unique_words)))
    
    email_word_counts = {unique_word: [0] * len(train_X) for unique_word in unique_words}
    # Find count of each word in email
    for index, email in enumerate(train_X):
        for word in email:
            email_word_counts[word][index] += 1
    
    # Store word counts in a df
    word_counts = pd.DataFrame(email_word_counts)
    
    clean_df = pd.concat([train_X, word_counts], axis=1)
    
    # Get all spam and ham emails
    spam_emails = clean_df.iloc[np.where(train_y == 'spam')[0]]
    ham_emails = clean_df.iloc[np.where(train_y == 'ham')[0]]
    
    # Get the prob of an email being spam and ham
    p_spam = len(spam_emails) / len(clean_df)
    p_ham = len(ham_emails) / len(clean_df)
    
    p_wi_spam = {unique_word: 0 for unique_word in unique_words}
    p_wi_ham = {unique_word: 0 for unique_word in unique_words}
    
    # Find conditional prob. of each word
    for word in unique_words:
        # P(wi|spam) and P(wi|ham)
        p_wi_spam[word] = spam_emails[word].sum() / len(spam_emails)
        p_wi_ham[word] = ham_emails[word].sum() / len(ham_emails)
    return p_wi_spam, p_wi_ham, p_spam, p_ham

In [12]:
def predict(test_X, test_y, p_wi_spam, p_wi_ham, p_spam, p_ham):
    correct = 0
    
    for index, email in test_X.iteritems():
        test_p_wi_spam = []
        test_p_wi_ham = []
        
        true_value = test_y.loc[index]
        y_pred = ''
        
        for word in email:
            if word in p_wi_spam:
                test_p_wi_spam.append(p_wi_spam[word])
            if word in p_wi_ham:
                test_p_wi_ham.append(p_wi_ham[word])
        
        # Get prob. of a given email
        p_w_spam = p_spam * np.prod(test_p_wi_spam)
        p_w_ham = p_ham * np.prod(test_p_wi_ham)
        
        # Check which prob. is higher and provide a label
        if p_w_spam > p_w_ham:
            y_pred = 'spam'
        elif p_w_ham > p_w_spam: 
            y_pred = 'ham'
            
        if y_pred == true_value:
            correct += 1
        
    acc = (correct / len(test_X)) * 100
    err = 100. - acc
    return acc, err

In [13]:
bayes_time = time.time()
p_wi_spam, p_wi_ham, p_spam, p_ham = fit(train_X, train_y)

train_acc, train_err = predict(train_X, train_y, p_wi_spam, p_wi_ham, p_spam, p_ham)
test_acc, test_err = predict(test_X, test_y, p_wi_spam, p_wi_ham, p_spam, p_ham)

print("Naive Bayes\n")
print("Training Accuracy: {:.2f}% Training Error: {:.2f}%".format(train_acc, train_err))
print("Test Accuracy: {:.2f}% Test Error: {:.2f}%".format(test_acc, test_err))
print("Total Time: {:.2f}s".format(time.time() - bayes_time))

Naive Bayes

Training Accuracy: 88.85% Training Error: 11.15%
Test Accuracy: 68.32% Test Error: 31.68%
Total Time: 75.17s


It is moderately accurate on test data with ~68% accuracy. I think if we added an alpha value to each word_count, we might see better results due to some sneaky emails.