Dataset: trec06p
Programming Language: Python 3.9.13

In [1]:
import sys
sys.path.append("../../")
import os
os.chdir("../..")

In [2]:
import numpy as np
import pandas as pd

from src.utils.unzip import unzip_no_pass

Part of the machine learning process is processing data to a format suitable for analysis and training. In this part, we start by loading the zip file contents and creation of an initial dataframe.

In [3]:
import os
import zipfile

def unzip_no_pass(path, out_path):
    '''
    Extracts all contents of a zip file without a password.
    
    Args:
        path:
            (str) path to zip file to extract.
        out_path:
            (str) path to directory to store the extracted contents.
            
    Returns:
        None
    '''
    # make sure out path directory existsge
    os.makedirs(out_path, exist_ok=True)
    
    with zipfile.ZipFile(path, 'r') as file:
        file.extractall(out_path)

In [4]:
import os
import pandas as pd

from src.utils.unzip import unzip_no_pass

# create dataframe from spam filtering dataset
def create_dataframe(path_to_zip, 
                     out_path="./data/spam_filter", 
                     unzip=False,
                     save_ext="csv"
                    ):
    
    assert save_ext in ["csv", "parquet"], "Extensions allowed are csv and parquet."
    
    path_to_zip = os.path.normpath(path_to_zip)
    out_path = os.path.normpath(out_path)
    
    # unzip file first
    if unzip:
        unzip_no_pass(path=path_to_zip, out_path=out_path)
    
    filename = os.path.basename(path_to_zip).split(".")[0]
    
    #start of processing unzipped files
    df = pd.read_csv(os.path.join(out_path, filename, "labels"), sep="/", header=None)
    
    # clean dataframe
    # initial columns are 0, 1, 2, 3
    df[0] = df[0].str.replace(" ..", "") # labels has extra artifacts
    df = df.drop(columns=1) # drop unnecessary column
    df = df.rename(columns = {0 : "Class", 2 : "Folder", 3: "File"}) # rename
    
    # replace spam and ham values
    df["Class"] = df["Class"].replace({"ham" : 0, "spam" : 1})
    
    # combine with text / email data
    for i in range(len(df)):
        folder_id = "{0:0=3d}".format(df.at[i,'Folder'])
        file_id = "{0:0=3d}".format(df.at[i,'File'])
        
        path = os.path.normpath(os.path.join(out_path, filename, "data", folder_id, file_id))
        
        # there will be an error if utf-8 is used as encoding
        df.at[i, "Email"] = open(path, encoding="latin1").read()
        
    df["Email"] = df["Email"].str.lower()

    if save_ext == "csv":
        df.to_csv(os.path.join(out_path, f"{filename}.{save_ext}"), index=False)
    elif save_ext == "parquet":
        df.to_parquet(os.path.join(out_path, f"{filename}.{save_ext}"))

Run creation of dataframe.

In [5]:
create_dataframe(path_to_zip="./data/trec06p-cs280.zip", unzip=False)

  df["Class"] = df["Class"].replace({"ham" : 0, "spam" : 1})


Loading processed data.

In [6]:
df = pd.read_csv("./data/spam_filter/trec06p-cs280.csv")

In this work, no class balancing techniques will be implemented.

In [7]:
df["Class"].value_counts(normalize=True)

Class
1    0.658664
0    0.341336
Name: proportion, dtype: float64

The above probabilities are the spam and ham priori respectively.

In [8]:
df

Unnamed: 0,Class,Folder,File,Email
0,0,0,0,received: from rodan.uu.net by aramis.rutgers....
1,1,0,1,received: from unknown (helo groucho.cs.psu.ed...
2,1,0,2,received:\n\tfrom 24-151-178-89.dhcp.kgpt.tn.c...
3,0,0,3,received: from psuvax1.cs.psu.edu ([130.203.2....
4,1,0,4,received: from 201-1-198-159.dsl.telesp.net.br...
...,...,...,...,...
37817,1,126,17,received: from ?211.200.1.51? (unknown [211.20...
37818,1,126,18,received:\n\tfrom dsl.dynamic212156187251.ttne...
37819,1,126,19,received: from wonder.hananet.net (unknown [21...
37820,1,126,20,received: from mail.csonline.com (unknown [61....


The dataset is composed of 66% spam and 34% ham. 

We now construct the vocabulary. In constructing the vocabulary, we limit our definition of a word to a sequence of alphabetic characters delimited by a white space, comma, and period.

In [9]:
import re

def create_vocab(arr, pattern=r'\b[a-zA-Z]+\b'):
    # empty array to store vocabulary
    vocab = []
    
    for i in range(len(arr)):
        vocab += re.findall(pattern, arr[i])
        
    return vocab # list of unique words

In [10]:
vocab = create_vocab(df["Email"].values)
len(vocab)

18280478

In [11]:
vocab

['received',
 'from',
 'rodan',
 'uu',
 'net',
 'by',
 'aramis',
 'rutgers',
 'edu',
 'id',
 'mon',
 'jul',
 'edt',
 'received',
 'from',
 'uu',
 'net',
 'by',
 'rodan',
 'uu',
 'net',
 'with',
 'smtp',
 'uunet',
 'mail',
 'drop',
 'id',
 'mon',
 'jul',
 'received',
 'from',
 'uunet',
 'uu',
 'net',
 'via',
 'localhost',
 'uu',
 'net',
 'by',
 'uu',
 'net',
 'with',
 'smtp',
 'uunet',
 'internet',
 'primary',
 'id',
 'mon',
 'jul',
 'received',
 'from',
 'sarto',
 'uucp',
 'by',
 'uunet',
 'uu',
 'net',
 'with',
 'uucp',
 'rmail',
 'queueing',
 'rmail',
 'id',
 'mon',
 'jul',
 'edt',
 'newsgroups',
 'soc',
 'religion',
 'christian',
 'path',
 'jhpb',
 'from',
 'jhpb',
 'sarto',
 'budd',
 'lake',
 'nj',
 'us',
 'joseph',
 'h',
 'buehler',
 'subject',
 'new',
 'catholic',
 'mailing',
 'list',
 'now',
 'up',
 'and',
 'running',
 'message',
 'id',
 'jhpb',
 'sarto',
 'budd',
 'lake',
 'nj',
 'us',
 'sender',
 'jhpb',
 'sarto',
 'budd',
 'lake',
 'nj',
 'us',
 'joseph',
 'h',
 'buehler',
 '

We split the dataset using sklearn.

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [14]:
from collections import defaultdict
import re

class NaiveBayesClassifier:
    def __init__(self):
        self.spam_word_count = defaultdict(int)
        self.ham_word_count = defaultdict(int)
        self.spam_total_words = 0
        self.ham_total_words = 0
        self.spam_priori = 0
        self.ham_priori = 0
    
    def preprocess(self, text):
        # our dataframe is already in lowercase, but for future purposes we add this
        text = text.lower()
        words = re.findall(r'\b[a-zA-Z]+\b', text)
        return words
    
    def train(self, X, y):
        # filter Email data (X) with indices where Class (y) is equal to 1 or spam
        spam_messages = X[y == 1]
        # filter Email data (X) with indices where Class (y) is equal to 0 or ham
        ham_messages = X[y == 0]
        
        self.spam_priori = len(spam_messages) / len(X)
        self.ham_priori = len(ham_messages) / len(X)
        
        for message in spam_messages:
            words = self.preprocess(message)
            for word in words:
                # update dictionary with word and add value of 1 in value
                self.spam_word_count[word] += 1
                self.spam_total_words += 1
        
        for message in ham_messages:
            words = self.preprocess(message)
            for word in words:
                # update dictionary with word and add value of 1 in value
                self.ham_word_count[word] += 1
                self.ham_total_words += 1
    
    def predict(self, X, alpha=0):
        predictions = []
        for message in X:
            words = self.preprocess(message)
            spam_prob = np.log(self.spam_priori)
            ham_prob = np.log(self.ham_priori)
            
            for word in words:
                spam_prob += np.log((self.spam_word_count[word] + alpha) / (self.spam_total_words + len(self.spam_word_count) * alpha))
                ham_prob += np.log((self.ham_word_count[word] + alpha) / (self.ham_total_words + len(self.ham_word_count) * alpha))
            
            if spam_prob > ham_prob:
                predictions.append(1)
            else:
                predictions.append(0)
        
        return np.array(predictions)

In [15]:
from sklearn.metrics import confusion_matrix

def accuracy_score(y_true, y_pred):
    
    return np.mean(y_pred == y_true)

def precision_score(y_true, y_pred):
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    return tp / (tp + fp)

def recall_score(y_true, y_pred):
    
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    return tp / (tp + fn)

In [16]:
nb_classifier = NaiveBayesClassifier()

nb_classifier.train(train_df["Email"].values, train_df["Class"].values)

Performance metrics with different alpha parameters.

In [18]:
for i in [2, 1, 0.5, 0.1, 0.005]:
    
    y_pred = nb_classifier.predict(test_df["Email"].values, alpha=i)
    
    print("alpha-{} Accuracy: {:.4f}".format(i, accuracy_score(test_df["Class"].values, y_pred)))
    print("alpha-{} Precision: {:.4f}".format(i, precision_score(test_df["Class"].values, y_pred)))
    print("alpha-{} Recall: {:.4f}".format(i, recall_score(test_df["Class"].values, y_pred)))
    print("*"*20)

alpha-2 Accuracy: 0.9543
alpha-2 Precision: 0.9896
alpha-2 Recall: 0.9412
********************
alpha-1 Accuracy: 0.9561
alpha-1 Precision: 0.9914
alpha-1 Recall: 0.9422
********************
alpha-0.5 Accuracy: 0.9604
alpha-0.5 Precision: 0.9924
alpha-0.5 Recall: 0.9478
********************
alpha-0.1 Accuracy: 0.9684
alpha-0.1 Precision: 0.9938
alpha-0.1 Recall: 0.9584
********************
alpha-0.005 Accuracy: 0.9766
alpha-0.005 Precision: 0.9948
alpha-0.005 Recall: 0.9698
********************
