In [1]:
# libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import os

from email import policy
from email.parser import BytesParser
import re
import html
import string

from collections import Counter

# Preprocessing

In [2]:
# getting the list of files

# train set: folders 0-70
train_folders : list[str] = []
for folder in range(71):
    train_folders.append('data\\' + str(folder).rjust(3, '0'))

train_files : list[str] = []
for folder in train_folders:
    for file in os.listdir(folder):
        train_files.append(os.path.join(folder, file))

# test set: folders 71-126
test_folders : list[str] = []
for folder in range(71, 127):
    test_folders.append('data\\' + str(folder).rjust(3, '0'))

test_files : list[str] = []
for folder in test_folders:
    for file in os.listdir(folder):
        test_files.append(os.path.join(folder, file))

print(train_files)
print(test_files)

['data\\000\\000', 'data\\000\\001', 'data\\000\\002', 'data\\000\\003', 'data\\000\\004', 'data\\000\\005', 'data\\000\\006', 'data\\000\\007', 'data\\000\\008', 'data\\000\\009', 'data\\000\\010', 'data\\000\\011', 'data\\000\\012', 'data\\000\\013', 'data\\000\\014', 'data\\000\\015', 'data\\000\\016', 'data\\000\\017', 'data\\000\\018', 'data\\000\\019', 'data\\000\\020', 'data\\000\\021', 'data\\000\\022', 'data\\000\\023', 'data\\000\\024', 'data\\000\\025', 'data\\000\\026', 'data\\000\\027', 'data\\000\\028', 'data\\000\\029', 'data\\000\\030', 'data\\000\\031', 'data\\000\\032', 'data\\000\\033', 'data\\000\\034', 'data\\000\\035', 'data\\000\\036', 'data\\000\\037', 'data\\000\\038', 'data\\000\\039', 'data\\000\\040', 'data\\000\\041', 'data\\000\\042', 'data\\000\\043', 'data\\000\\044', 'data\\000\\045', 'data\\000\\046', 'data\\000\\047', 'data\\000\\048', 'data\\000\\049', 'data\\000\\050', 'data\\000\\051', 'data\\000\\052', 'data\\000\\053', 'data\\000\\054', 'data\\00

In [3]:
# getting the list of stop words

with open('stop_words.txt', 'r') as file:
    stop_words = set()
    for line in file:
        if line.strip():
            stop_words.add(line.strip().lower())    
print(stop_words)

{'various', 'was', 'cause', 'nor', 'tries', 'trying', 'affecting', 'here', 'others', 'what', 'these', 'showns', 'us', 'suggest', 'approximately', "won't", 'ours', 'contain', 'words', 'does', 'vol', 'g', 'already', 'due', 'k', 'lately', 'six', 'ah', 'except', 'thereby', 'last', 'thats', 'nay', 'regarding', 'five', 'possible', 'together', 'theirs', 'further', 'ff', 'shall', 'get', 'actually', 'as', 'there', 'need', 'otherwise', 'hid', 're', 'therein', 'below', 'abst', 'became', 'latter', 'know', 'still', 'and', 'no', 'non', 'took', 'have', 'merely', 'sup', "didn't", 'than', "i've", 'take', 'whence', 'through', 'keeps', 'nine', 'ref', 'recently', 'now', 'meanwhile', 'lets', 'edu', 'own', 'four', 'value', 'shed', 'hed', 'hi', 'thru', 'who', 'downwards', 'between', 'research', 'been', 'everybody', 'theres', 'might', 'shows', "we've", 'seeming', 'follows', 'meantime', 'c', 'immediate', 'whither', 'she', 'should', 'whole', 'o', 'say', 'unless', 'thoughh', 'briefly', 'those', 'almost', "that'v

In [4]:
# functions for preprocessing

def decode_with_fallback(payload, encoding : str) -> str:
    """
    Decode email using encoding. 
    If error occurs, fallback to utf-8 and latin-1.
    """
    if encoding is None:
        # default encoding
        encoding = 'utf-8'                      
    
    try:
        return payload.decode(encoding)
    except (LookupError, UnicodeDecodeError):
        try:
            # fallback to default encoding
            encoding = 'utf-8'                  
            return payload.decode(encoding)
        except UnicodeDecodeError:
            # fallback to latin-1 encoding, ignore errors
            encoding = 'latin-1'                
            return payload.decode('latin-1')    

def get_tokens_cleaned(email_path : str) -> list[str]:
    """
    Extracts the body of an email and returns a list of cleaned tokens.
    """
    
    with open(email_path, 'rb') as file:
        msg = BytesParser(policy=policy.default).parse(file)
    
    body = ""
    
    if msg.is_multipart():
        for part in msg.iter_parts():
            if part.get_content_type() in ['text/plain', 'text/html']:
                payload = part.get_payload(decode=True)
                encoding = part.get_content_charset()
                body += decode_with_fallback(payload, encoding)
    else:
        payload = msg.get_payload(decode=True)
        encoding = msg.get_content_charset()
        body = decode_with_fallback(payload, encoding)   
    
    # remove HTML tags
    body_no_html = re.sub(r'<.*?>', '', body)
    
    # decode HTML entities, removes "&nbsp;" and such
    body_decoded = html.unescape(body_no_html)
    
    # convert to lowercase
    body_lowercase = body_decoded.lower()
    
    # remove punctuation, numbers, and special characters
    body_only_letters = body_lowercase.translate(str.maketrans('', '', string.punctuation + '0123456789~!@#$%^&*(){}[]\\/|<>,.?;:'))
    
    # remove contractions
    body_no_contraction = re.sub(r"(\w+)('ll|'ve|'re|'d|'m|'s|n't)", r"\1", body_only_letters)    
    
    # remove newlines and extra spaces
    body_trimmed = re.sub(r'\s+', ' ', body_no_contraction).strip()
    
    # tokenize body
    tokens_dirty = body_trimmed.split()
    
    # remove stop words
    tokens_cleaned : list[str] = []
    for token in tokens_dirty:
        if token not in stop_words:
            tokens_cleaned.append(token)
    
    return tokens_cleaned


In [5]:
# getting dictionary of common words, and list of token lists

train_token_lists : list[list[str]] = []
word_count = Counter()

for token_list in train_files:
    tokens_cleaned = get_tokens_cleaned(token_list)
    train_token_lists.append(tokens_cleaned)
    word_count.update(tokens_cleaned)

dictionary : list[str] = []
for word, _ in word_count.most_common(10000):
    dictionary.append(word)

print(dictionary)



# Creating the feature matrices

In [6]:
# getting the labels

labels : list[str] = []

with open('labels', 'r') as file:
    for line in file:
        labels.append(line[:4].strip())

print(labels)

['ham', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'spam', 'spa

In [16]:
# adding the labels

train_zip = zip(labels, train_token_lists)
train_data = list(train_zip)

In [20]:
# creating feature matrices

row_skeleton = {}
spam_rows : list[dict] = []
ham_rows : list[dict] = []

for word in dictionary:
    row_skeleton[word] = 0

for datapoint in train_data:
    row = row_skeleton.copy()
    
    for token in datapoint[1]:
        if token in dictionary:
            row[token] = 1
    
    if datapoint[0] == 'spam':
        spam_rows.append(row)
    else:
        ham_rows.append(row)

spam_df = pd.DataFrame(spam_rows)
ham_df = pd.DataFrame(ham_rows)

In [25]:
spam_df

Unnamed: 0,px,will,float,styleborder,price,producttable,и,left,adobe,info,...,でもあえて無料で勝負したい！！という方に絶対おすすめのサイトです。,【大人のセフレ探し】,直メールし放題が売り！の完全無料にて利用出来るサイト。,地域・趣味・目的別に相手が検索可能なためピンポイントで近くの女性と知り合える！！,【丁目の奥様】,様々なイベントや個室・ツーショット部屋など多彩な機能を持つビッグサイト。登録者数が多く出会い率が非常に高いのが魅力☆,人妻・熟所と出会いを求めるなら断然ココ！！,httpcyoumecomcco,特別募集企画に関するご案内,■全国のｍ男さん企画
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13772,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13774,1,0,0,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
13775,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
ham_df

Unnamed: 0,px,will,float,styleborder,price,producttable,и,left,adobe,info,...,でもあえて無料で勝負したい！！という方に絶対おすすめのサイトです。,【大人のセフレ探し】,直メールし放題が売り！の完全無料にて利用出来るサイト。,地域・趣味・目的別に相手が検索可能なためピンポイントで近くの女性と知り合える！！,【丁目の奥様】,様々なイベントや個室・ツーショット部屋など多彩な機能を持つビッグサイト。登録者数が多く出会い率が非常に高いのが魅力☆,人妻・熟所と出会いを求めるなら断然ココ！！,httpcyoumecomcco,特別募集企画に関するご案内,■全国のｍ男さん企画
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7518,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7519,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7520,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7521,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
