# Cordero, Palmsdale
https://github.com/Pawmsdayl/CMSC197-ML/tree/main/Assignment3

In [61]:
# libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import os

from email import policy
from email.parser import BytesParser
import re
import html
import string

from collections import Counter

# Preprocessing

In [36]:
# getting the list of files

# train set: folders 0-70
train_folders : list[str] = []
for folder in range(71):
    train_folders.append('data\\' + str(folder).rjust(3, '0'))

train_files : list[str] = []
for folder in train_folders:
    for file in os.listdir(folder):
        train_files.append(os.path.join(folder, file))

# test set: folders 71-126
test_folders : list[str] = []
for folder in range(71, 127):
    test_folders.append('data\\' + str(folder).rjust(3, '0'))

test_files : list[str] = []
for folder in test_folders:
    for file in os.listdir(folder):
        test_files.append(os.path.join(folder, file))

In [37]:
# getting the list of stop words

with open('stop_words.txt', 'r') as file:
    stop_words = set()
    for line in file:
        if line.strip():
            stop_words.add(line.strip().lower())    

In [38]:
# functions for preprocessing

def decode_with_fallback(payload, encoding : str) -> str:
    """
    Decode email using encoding. 
    If error occurs, fallback to utf-8 and latin-1.
    """
    if encoding is None:
        # default encoding
        encoding = 'utf-8'                      
    
    try:
        return payload.decode(encoding)
    except (LookupError, UnicodeDecodeError):
        try:
            # fallback to default encoding
            encoding = 'utf-8'                  
            return payload.decode(encoding)
        except UnicodeDecodeError:
            # fallback to latin-1 encoding, ignore errors
            encoding = 'latin-1'                
            return payload.decode('latin-1')    

def get_tokens_cleaned(email_path : str, exclude_stop_words : bool = True) -> list[str]:
    """
    Extracts the body of an email and returns a list of cleaned tokens.
    """
    
    with open(email_path, 'rb') as file:
        msg = BytesParser(policy=policy.default).parse(file)
    
    body = ""
    
    if msg.is_multipart():
        for part in msg.iter_parts():
            if part.get_content_type() in ['text/plain', 'text/html']:
                payload = part.get_payload(decode=True)
                encoding = part.get_content_charset()
                body += decode_with_fallback(payload, encoding)
    else:
        payload = msg.get_payload(decode=True)
        encoding = msg.get_content_charset()
        body = decode_with_fallback(payload, encoding)   
    
    # remove HTML tags
    body_no_html = re.sub(r'<.*?>', '', body)
    
    # decode HTML entities, removes "&nbsp;" and such
    body_decoded = html.unescape(body_no_html)
    
    # convert to lowercase
    body_lowercase = body_decoded.lower()
    
    # remove punctuation, numbers, and special characters
    body_only_letters = body_lowercase.translate(str.maketrans('', '', string.punctuation + '0123456789~!@#$%^&*(){}[]\\/|<>;:'))
    
    # remove contractions
    body_no_contraction = re.sub(r"(\w+)('ll|'ve|'re|'d|'m|'s|n't)", r"\1", body_only_letters)    
    
    # remove newlines and extra spaces
    body_trimmed = re.sub(r'\s+', ' ', body_no_contraction).strip()
    
    # tokenize body
    tokens_dirty = body_trimmed.split()
    
    if not exclude_stop_words:
        return tokens_dirty
    
    # remove stop words
    tokens_cleaned : list[str] = []
    for token in tokens_dirty:
        if token not in stop_words:
            tokens_cleaned.append(token)
    
    return tokens_cleaned

def create_word_count(files : list[str], exclude_stop_words : bool = True) -> list[tuple[str, int]]:
    """
    Returns a Counter of the top 10000 common words from a list of email files.
    """
    
    word_count = Counter()
    
    for file in files:
        tokens_cleaned = get_tokens_cleaned(file, exclude_stop_words)
        word_count.update(tokens_cleaned)

    return word_count.most_common(10000)

def create_dictionary(word_count : list[tuple[str, int]], min_count : int = 0) -> list[str]:
    """
    Returns a the words that appear at least min_count times.
    """
    
    dictionary : list[str] = []
    for word, count in word_count:
        if count >= min_count:
            dictionary.append(word)
    
    return dictionary

def get_token_lists(files : list[str], dictionary : list[str]) -> list[list[str]]:
    """
    Returns a list of clean and dictionary-filtered token lists, corresponding to each file.
    """
    
    token_lists : list[list[str]] = []
    
    for file in files:
        tokens_cleaned = get_tokens_cleaned(file)
        token_lists.append([token for token in tokens_cleaned if token in dictionary])
    
    return token_lists

# Creating the feature matrices

In [39]:
# getting the labels

labels : list[str] = []

with open('labels', 'r') as file:
    for line in file:
        labels.append(line[:4].strip())

train_labels = labels[:len(train_files)]
test_labels = labels[len(train_files):]

In [40]:
# functions for creating dataframes

def create_labeled_data(token_lists : list[list[str]], labels : list[str]) -> list[tuple[str, list[str]]]:
    """
    Returns a list of tuples, each containing a label and a token list.
    """
    
    data_zip = zip(labels, token_lists)
    data_list = list(data_zip)
    
    return data_list

def create_feature_matrices(data : list[tuple[str, list[str]]], dictionary : list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Returns two dataframes, one for spam and one for ham, with columns corresponding to dictionary words.
    """
    
    row_skeleton = {word: 0 for word in dictionary}
    spam_rows : list[dict] = []
    ham_rows : list[dict] = []
    
    for datapoint in data:
        row = row_skeleton.copy()
    
        for token in datapoint[1]:
            if token in dictionary:
                row[token] = 1
        
        if datapoint[0] == 'spam':
            spam_rows.append(row)
        else:
            ham_rows.append(row)
    
    spam_df = pd.DataFrame(spam_rows)
    ham_df = pd.DataFrame(ham_rows)
    
    return spam_df, ham_df

# Computing the Priors

In [41]:
# getting the priors

cat_count = Counter()

cat_count.update(labels)

for cat, count in cat_count.items():
    if cat == 'spam':
        spam_count = count
    else:
        ham_count = count

prior_spam = spam_count / len(labels)
prior_ham = ham_count / len(labels)

# Computing  the  Likelihood  of  each  word

In [42]:
# functions for getting the probabilities

def prob_given_cat(lam : float, dictionary_size : int, cat_word_count : int, word_in_cat_count : int) -> float:
    """
    Returns the probability of a word given a category (spam/ham).
    """
    
    prob = (word_in_cat_count + lam) / (cat_word_count + (lam * dictionary_size))
    
    return prob

def sum_occurences(df : pd.DataFrame) -> int:
    """
    Returns the sum of all occurences in a dataframe.
    """
    
    return df.sum().sum()

# Classifying the emails

In [43]:
# functions for predicting the category


def predict_category(tokens : list[str], lam : float, dictionary : list[str], spam_df : pd.DataFrame, ham_df : pd.DataFrame, spam_word_count : int, ham_word_count : int) -> str:
    """
    Predicts the category (spam/ham) of an email.
    """
    
    
    dictionary_size = len(dictionary)
    
    spam_prob = np.log(prior_spam)
    ham_prob = np.log(prior_ham)
    
    for token in tokens:
        if token in dictionary:
            word_in_spam_count = spam_df[token].sum()
            spam_prob += np.log(prob_given_cat(
                lam, dictionary_size, spam_word_count, word_in_spam_count
            ))
            
            word_in_ham_count = ham_df[token].sum()
            ham_prob += np.log(prob_given_cat(
                lam, dictionary_size, ham_word_count, word_in_ham_count
            ))
    
    if spam_prob > ham_prob:
        return 'spam'
    else:
        return 'ham'

# Testing the Classifier

In [44]:
# creating the model

cat_word_count = create_word_count(train_files)
dictionary = create_dictionary(cat_word_count)

train_token_lists = get_token_lists(train_files, dictionary)
train_data = create_labeled_data(train_token_lists, train_labels)
spam_df, ham_df = create_feature_matrices(train_data, dictionary)

spam_word_count = sum_occurences(spam_df)
ham_word_count = sum_occurences(ham_df)

In [45]:
# testing the model

test_token_lists = get_token_lists(test_files, dictionary)

lam = 1
predictions : list[str] = []

for file in test_token_lists:
    predictions.append(predict_category(file, lam, dictionary, spam_df, ham_df, spam_word_count, ham_word_count))

# Performance Evaluation

In [46]:
# functions for evaluating the model

def accuracy(predictions : list[str], labels : list[str]) -> float:
    """
    Returns the accuracy of the model.
    """
    
    correct = 0
    for pred, label in zip(predictions, labels):
        if pred == label:
            correct += 1
    
    return correct / len(labels)

def recall(predictions : list[str], labels : list[str], category : str) -> float:
    """
    Returns the recall of the model.
    """
    
    true_positives = 0
    false_negatives = 0
    
    for pred, label in zip(predictions, labels):
        if pred == category and label == category:
            true_positives += 1
        if pred != category and label == category:
            false_negatives += 1
    
    return true_positives / (true_positives + false_negatives)

def precision(predictions : list[str], labels : list[str], category : str) -> float:
    """
    Returns the precision of the model.
    """
    
    true_positives = 0
    false_positives = 0
    
    for pred, label in zip(predictions, labels):
        if pred == category and label == category:
            true_positives += 1
        if pred == category and label != category:
            false_positives += 1
    
    return true_positives / (true_positives + false_positives)

In [47]:
# evaluating the model

orig_accuracy = accuracy(predictions, test_labels)
orig_recall_spam = recall(predictions, test_labels, 'spam')
orig_recall_ham = recall(predictions, test_labels, 'ham')
orig_precision_spam = precision(predictions, test_labels, 'spam')
orig_precision_ham = precision(predictions, test_labels, 'ham')


print('accuracy: ', orig_accuracy)
print('recall spam: ', orig_recall_spam)
print('recall ham: ', orig_recall_ham)
print('precision spam: ', orig_precision_spam)
print('precision ham: ', orig_precision_ham)

accuracy:  0.9288221764919501
recall spam:  0.9245621912887292
recall ham:  0.9376276220530908
precision spam:  0.9683943185024927
precision ham:  0.8574096078764216


# 1. 

### What  is  the  effect  of  removing  stop  words  in  terms  of  precision,  recall,  and accuracy?  Show a plot or a table of these results.

In [48]:
# creating the model with stop words (sw)

sw_word_count = create_word_count(train_files, False)
sw_dictionary = create_dictionary(sw_word_count)

sw_train_token_lists = get_token_lists(train_files, sw_dictionary)
sw_train_data = create_labeled_data(sw_train_token_lists, train_labels)
sw_spam_df, sw_ham_df = create_feature_matrices(sw_train_data, sw_dictionary)

sw_spam_word_count = sum_occurences(sw_spam_df)
sw_ham_word_count = sum_occurences(sw_ham_df)

In [49]:
# testing the model with stop words (sw)

sw_test_token_lists = get_token_lists(test_files, sw_dictionary)

lam = 1
sw_predictions : list[str] = []

for file in sw_test_token_lists:
    sw_predictions.append(predict_category(file, lam, sw_dictionary, sw_spam_df, sw_ham_df, sw_spam_word_count, sw_ham_word_count))

In [50]:
# evaluating the model with stop words (sw)

sw_accuracy = accuracy(sw_predictions, test_labels)
sw_recall_spam = recall(sw_predictions, test_labels, 'spam')
sw_recall_ham = recall(sw_predictions, test_labels, 'ham')
sw_precision_spam = precision(sw_predictions, test_labels, 'spam')
sw_precision_ham = precision(sw_predictions, test_labels, 'ham')

In [51]:
# dataframe for results

results_sw = pd.DataFrame({
    'metric': ['accuracy', 'recall spam', 'recall ham', 'precision spam', 'precision ham'],
    'orig w/o sw': [orig_accuracy, orig_recall_spam, orig_recall_ham, orig_precision_spam, orig_precision_ham],
    'w/ stop words': [sw_accuracy, sw_recall_spam, sw_recall_ham, sw_precision_spam, sw_precision_ham],
    'difference (orig-sw)': [orig_accuracy - sw_accuracy, orig_recall_spam - sw_recall_spam, orig_recall_ham - sw_recall_ham, orig_precision_spam - sw_precision_spam, orig_precision_ham - sw_precision_ham]
})

results_sw

Unnamed: 0,metric,orig w/o sw,w/ stop words,difference (orig-sw)
0,accuracy,0.928822,0.929609,-0.000787
1,recall spam,0.924562,0.925909,-0.001347
2,recall ham,0.937628,0.937256,0.000371
3,precision spam,0.968394,0.968257,0.000137
4,precision ham,0.85741,0.859551,-0.002141


Usually, including stop words would lower all its metrics. This is because we are using useless words that have no relevant information. 

# 2.

### Experiment  on  the  number  of  words  used  for  training.    Filter  the  dictionary  to include only words occurring more than k times (1000 words, then k > 100, and k = 50 times).  For example, the word “offer” appears 150 times, that means that it will be included in the dictionary. 

In [52]:
# creating the models with different min_count (mc) values

min_counts = [50, 100, 1000]

mc_dictionaries = []

for count in min_counts:
    dictionary = create_dictionary(cat_word_count, count)
    mc_dictionaries.append(dictionary)

mc_train_data = []
mc_spam_df = []
mc_ham_df = []
mc_spam_word_count = []
mc_ham_word_count = []

for dictionary in mc_dictionaries:
    train_token_lists = get_token_lists(train_files, dictionary)
    train_data = create_labeled_data(train_token_lists, train_labels)
    spam_df, ham_df = create_feature_matrices(train_data, dictionary)
    spam_word_count = sum_occurences(spam_df)
    ham_word_count = sum_occurences(ham_df)
    
    mc_train_data.append(train_data)
    mc_spam_df.append(spam_df)
    mc_ham_df.append(ham_df)
    mc_spam_word_count.append(spam_word_count)
    mc_ham_word_count.append(ham_word_count)

In [53]:
# testing the models with different min_count values

mc_predictions = []

lam = 1

for i in range(len(min_counts)):
    test_token_lists = get_token_lists(test_files, mc_dictionaries[i])
    
    predictions = []
    
    for file in test_token_lists:
        predictions.append(predict_category(file, lam, mc_dictionaries[i], mc_spam_df[i], mc_ham_df[i], mc_spam_word_count[i], mc_ham_word_count[i]))
    
    mc_predictions.append(predictions)

In [54]:
# evaluating the models with different min_count values

mc_accuracies = []
mc_recalls_spam = []
mc_recalls_ham = []
mc_precisions_spam = []
mc_precisions_ham = []


for i in range(len(min_counts)):
    accuracy_value = accuracy(mc_predictions[i], test_labels)
    recall_spam_value = recall(mc_predictions[i], test_labels, 'spam')
    recall_ham_value = recall(mc_predictions[i], test_labels, 'ham')
    precision_spam_value = precision(mc_predictions[i], test_labels, 'spam')
    precision_ham_value = precision(mc_predictions[i], test_labels, 'ham')
    
    mc_accuracies.append(accuracy_value)
    mc_recalls_spam.append(recall_spam_value)
    mc_recalls_ham.append(recall_ham_value)
    mc_precisions_spam.append(precision_spam_value)
    mc_precisions_ham.append(precision_ham_value)

In [55]:
# dataframe for results

results_mc = pd.DataFrame({
    'metric': ['accuracy', 'recall spam', 'recall ham', 'precision spam', 'precision ham'],
    'min_count = 0': [orig_accuracy, orig_recall_spam, orig_recall_ham, orig_precision_spam, orig_precision_ham],
    'min_count = 50': [mc_accuracies[0], mc_recalls_spam[0], mc_recalls_ham[0], mc_precisions_spam[0], mc_precisions_ham[0]],
    'min_count = 100': [mc_accuracies[1], mc_recalls_spam[1], mc_recalls_ham[1], mc_precisions_spam[1], mc_precisions_ham[1]],
    'min_count = 1000': [mc_accuracies[2], mc_recalls_spam[2], mc_recalls_ham[2], mc_precisions_spam[2], mc_precisions_ham[2]],
})

results_mc

Unnamed: 0,metric,min_count = 0,min_count = 50,min_count = 100,min_count = 1000
0,accuracy,0.928822,0.928822,0.929427,0.886515
1,recall spam,0.924562,0.92564,0.928514,0.898608
2,recall ham,0.937628,0.9354,0.931316,0.861518
3,precision spam,0.968394,0.967339,0.96545,0.930618
4,precision ham,0.85741,0.858872,0.863066,0.804333


As the min_count increases, all metrics decrease. This is because of the loss of information as we have less words to use as basis. The only benefit of this is faster processing time because of the decreased data needed.

# 3.

### Discuss the results of the different parameters used for Lambda smoothing.  Test it  on  5  varying  values  of  the  λ  (e.g.  λ  =  2.0,  1.0,  0.5,  0.1,  0.005). Evaluate performance metrics for each.

In [57]:
# testing the original model with different lambda (lam) values

lam_predictions = []

lams = [0.1, 0.5, 1, 2, 10]

for lam in lams:
    predictions = []
    
    for file in test_token_lists:
        predictions.append(predict_category(file, lam, dictionary, spam_df, ham_df, spam_word_count, ham_word_count))
    lam_predictions.append(predictions)

In [58]:
# dataframe for results

results_lam = pd.DataFrame({
    'metric': ['accuracy', 'recall spam', 'recall ham', 'precision spam', 'precision ham'],
    'lambda = 0.1': [accuracy(lam_predictions[0], test_labels), recall(lam_predictions[0], test_labels, 'spam'), recall(lam_predictions[0], test_labels, 'ham'), precision(lam_predictions[0], test_labels, 'spam'), precision(lam_predictions[0], test_labels, 'ham')],
    'lambda = 0.5': [accuracy(lam_predictions[1], test_labels), recall(lam_predictions[1], test_labels, 'spam'), recall(lam_predictions[1], test_labels, 'ham'), precision(lam_predictions[1], test_labels, 'spam'), precision(lam_predictions[1], test_labels, 'ham')],
    'lambda = 1': [accuracy(lam_predictions[2], test_labels), recall(lam_predictions[2], test_labels, 'spam'), recall(lam_predictions[2], test_labels, 'ham'), precision(lam_predictions[2], test_labels, 'spam'), precision(lam_predictions[2], test_labels, 'ham')],    
    'lambda = 2': [accuracy(lam_predictions[3], test_labels), recall(lam_predictions[3], test_labels, 'spam'), recall(lam_predictions[3], test_labels, 'ham'), precision(lam_predictions[3], test_labels, 'spam'), precision(lam_predictions[3], test_labels, 'ham')],
    'lambda = 10': [accuracy(lam_predictions[4], test_labels), recall(lam_predictions[4], test_labels, 'spam'), recall(lam_predictions[4], test_labels, 'ham'), precision(lam_predictions[4], test_labels, 'spam'), precision(lam_predictions[4], test_labels, 'ham')],
})

results_lam

Unnamed: 0,metric,lambda = 0.1,lambda = 0.5,lambda = 1,lambda = 2,lambda = 10
0,accuracy,0.886152,0.886333,0.886515,0.886697,0.887544
1,recall spam,0.898698,0.898698,0.898608,0.898788,0.899057
2,recall ham,0.860219,0.860776,0.861518,0.861704,0.863746
3,precision spam,0.930019,0.930278,0.930618,0.930717,0.931689
4,precision ham,0.804235,0.804337,0.804333,0.804646,0.805435


The negligible difference could be because the test data has only a few number of words that the model was not trained on. Lambda really kicks in when there are a lot of this. 

# 4.

### What are your recommendations to further improve the model?

The instructions were quite limited in preprocessing, hence I took the initative to improve on it. This included filtering away HTML tags and enitities, numbers, and common English contractions. Proper preprocessing ensures that the tokens that the model will take into account are actually useful, which aligns with the trash-in trash-out principle. The instruction also used one hot encoding, wherein we only check for the presence of the word in the file. By doing this process, we lose some possibly valuable information. Hence, I also recommend using count of the word, instead of its presence. Reintroducing this data may give better insight to the model for predictions. 