In [2]:
import tarfile
import os

tar_files = ['20021010_easy_ham.tar.bz2', '20021010_hard_ham.tar.bz2', '20021010_spam.tar.bz2']
extract_dir = './Dataset_ham_spam'

# for each file
for tar_file in tar_files:
    with tarfile.open(tar_file) as file:
        # extract each tar file into the dataset directory
        file.extractall(extract_dir)
        print(f"Extracted {tar_file} to {extract_dir}")


  file.extractall(extract_dir)


Extracted 20021010_easy_ham.tar.bz2 to ./Dataset_ham_spam
Extracted 20021010_hard_ham.tar.bz2 to ./Dataset_ham_spam
Extracted 20021010_spam.tar.bz2 to ./Dataset_ham_spam


In [3]:
easy_ham_dir = os.path.join(extract_dir, 'easy_ham')
hard_ham_dir = os.path.join(extract_dir, 'hard_ham')
spam_dir = os.path.join(extract_dir, 'spam')

# function to read and display a limited number of files from a directory
def extract_data_from_dir(directory):
    files = os.listdir(directory)
    files = [os.path.join(directory, f) for f in files if os.path.isfile(os.path.join(directory, f))]
    files_list = []
    print(f"\nDisplaying {len(files)} files from {directory}:\n")
    for file_path in files:  # limit the number of files displayed
        with open(file_path, 'r', encoding='latin1') as file:  # latin1 encoding for email data
            files_list.append(file.read())
    return files_list
#prepare data for train-test
easy_ham_data = extract_data_from_dir(easy_ham_dir)
hard_ham_data = extract_data_from_dir(hard_ham_dir)
spam_data = extract_data_from_dir(spam_dir)
nr_easy_ham_files = int(len(easy_ham_data))
nr_hard_ham_files = int(len(hard_ham_data))
nr_spam_files = int(len(spam_data))


Displaying 2551 files from ./Dataset_ham_spam/easy_ham:


Displaying 250 files from ./Dataset_ham_spam/hard_ham:


Displaying 501 files from ./Dataset_ham_spam/spam:



In [72]:
num_mails_to_print = 10 #printed 10 emails only to read through
for i, data in enumerate(easy_ham_data):
    print(data + ' \n')
    print('\n\n' + '--'*20)
    if i >= num_mails_to_print:
        break

From rssfeeds@jmason.org  Sun Oct  6 22:54:41 2002
Return-Path: <rssfeeds@example.com>
Delivered-To: yyyy@localhost.example.com
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id 0324116F79
	for <jm@localhost>; Sun,  6 Oct 2002 22:53:00 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Sun, 06 Oct 2002 22:53:00 +0100 (IST)
Received: from dogma.slashnull.org (localhost [127.0.0.1]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g9680EK15204 for
    <jm@jmason.org>; Sun, 6 Oct 2002 09:00:14 +0100
Message-Id: <200210060800.g9680EK15204@dogma.slashnull.org>
To: yyyy@example.com
From: boingboing <rssfeeds@example.com>
Subject: Curling videogame hits bigtime
Date: Sun, 06 Oct 2002 08:00:13 -0000
Content-Type: text/plain; encoding=utf-8
X-Spam-Status: No, hits=-899.1 required=5.0
	tests=AWL,T_NONSENSE_FROM_40_50
	version=2.50-cvs
X-Spam-Level: 

URL: http://boingboing.net/#85

In [None]:
num_mails_to_print = 10
for i, data in enumerate(hard_ham_data):
    print(data + ' \n')
    print('\n\n' + '--'*20)
    if i >= num_mails_to_print:
        break

In [None]:
num_mails_to_print = 10
for i, data in enumerate(spam_data):
    print(data + ' \n')
    print('\n\n' + '--'*20)
    if i >= num_mails_to_print:
        break

In [41]:
#We can see that in the easy ham folder, there are more 'normal' emails, and that the hard
#ham emails have html code in them which makes us assume that they are ads and are spam.
#The spam level indicator in some of the mails indicated that as well.
#The emails in the spam folder has mails that look like scams since they are trying desperatly
#to sell something.

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

#split each data set so that we train 80% of it and test on 20% of it
easy_ham_train, easy_ham_test = train_test_split(easy_ham_data, test_size=0.2, random_state=42)
hard_ham_train, hard_ham_test = train_test_split(hard_ham_data, test_size=0.2, random_state=42)
spam_train, spam_test = train_test_split(spam_data, test_size=0.2, random_state=42)

print(f"Easy Ham: {len(easy_ham_train)} train, {len(easy_ham_test)} test")
print(f"Hard Ham: {len(hard_ham_train)} train, {len(hard_ham_test)} test")
print(f"Spam: {len(spam_train)} train, {len(spam_test)} test")
#combine the training sets and test sets into a single list
X_train = easy_ham_train + hard_ham_train + spam_train
#create training labels, grouping the data to their each respective sets (easy, hard, spam)
y_train = [0] * len(easy_ham_train) + [1] * len(hard_ham_train) + [2] * len(spam_train)
X_test = easy_ham_test + hard_ham_test + spam_test
y_test = [0] * len(easy_ham_test) + [1] * len(hard_ham_test) + [2] * len(spam_test)

Easy Ham: 2040 train, 511 test
Hard Ham: 200 train, 50 test
Spam: 400 train, 101 test


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000, stop_words='english') #had to limit since all of the words crashed my notebook lol

X_train_vec = vectorizer.fit_transform(X_train) #i used fit_transform to learn the vocab 
#and transform it to a matrix
X_test_vec = vectorizer.transform(X_test) #and i used transform on the test to transform it to a matrix.
#learning the vocab would not fit on a test

print(f"Training data shape: {X_train_vec.shape}")
print(f"Testing data shape: {X_test_vec.shape}")

Training data shape: (2640, 5000)
Testing data shape: (662, 5000)


In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

print(f"Training data shape: {X_train_vec.shape}")
print(f"Testing data shape: {X_test_vec.shape}")

clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

accuracy = clf.score(X_test_vec, y_test)
print("Accuracy:", accuracy)

Training data shape: (2640, 5000)
Testing data shape: (662, 5000)
Accuracy: 0.9561933534743202
