In [71]:
import tarfile
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
)

In [72]:
# extract tar files
tar_files = [
    "20021010_easy_ham.tar.bz2",
    "20021010_hard_ham.tar.bz2",
    "20021010_spam.tar.bz2",
]
extract_dir = "./Dataset_ham_spam"

In [73]:
# for each file
for tar_file in tar_files:
    with tarfile.open(tar_file) as file:
        # extract each tar file into the dataset directory
        file.extractall(extract_dir, filter="tar")
        print(f"Extracted {tar_file} to {extract_dir}")

Extracted 20021010_easy_ham.tar.bz2 to ./Dataset_ham_spam
Extracted 20021010_hard_ham.tar.bz2 to ./Dataset_ham_spam
Extracted 20021010_spam.tar.bz2 to ./Dataset_ham_spam


In [74]:
# set directories
easy_ham_dir = os.path.join(extract_dir, "easy_ham")
hard_ham_dir = os.path.join(extract_dir, "hard_ham")
spam_dir = os.path.join(extract_dir, "spam")

In [75]:
# function to read and display a limited number of files from a directory
def extract_data_from_dir(directory):
    files = os.listdir(directory)
    files = [
        os.path.join(directory, f)
        for f in files
        if os.path.isfile(os.path.join(directory, f))
    ]
    files_list = []
    print(f"Reading {len(files)} files from {directory}")
    for file_path in files:  # limit the number of files displayed
        with open(
            file_path, "r", encoding="latin1"
        ) as file:  # latin1 encoding for email data
            files_list.append(file.read())
    return files_list

In [76]:
# prepare data for train-test
easy_ham_data = extract_data_from_dir(easy_ham_dir)
hard_ham_data = extract_data_from_dir(hard_ham_dir)
spam_data = extract_data_from_dir(spam_dir)

nr_easy_ham_files = int(len(easy_ham_data))
nr_hard_ham_files = int(len(hard_ham_data))
nr_spam_files = int(len(spam_data))

Reading 2551 files from ./Dataset_ham_spam/easy_ham
Reading 250 files from ./Dataset_ham_spam/hard_ham
Reading 501 files from ./Dataset_ham_spam/spam


In [77]:
def print_emails(full_data, limit=10):
    for i, data in enumerate(full_data):
        print(data)
        print("==" * 20)
        if i >= limit:
            break

In [78]:
# print easy ham emails
print_emails(easy_ham_data)

From quinlan@pathname.com  Tue Sep 17 23:30:38 2002
Return-Path: <quinlan@pathname.com>
Delivered-To: yyyy@localhost.example.com
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id EF15F16F03
	for <jm@localhost>; Tue, 17 Sep 2002 23:30:37 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Tue, 17 Sep 2002 23:30:38 +0100 (IST)
Received: from proton.pathname.com
    (adsl-216-103-211-240.dsl.snfc21.pacbell.net [216.103.211.240]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8HKVLC25673 for
    <jm@jmason.org>; Tue, 17 Sep 2002 21:31:21 +0100
Received: from quinlan by proton.pathname.com with local (Exim 3.35 #1
    (Debian)) id 17rP0f-0004fo-00; Tue, 17 Sep 2002 13:31:49 -0700
To: yyyy@example.com (Justin Mason)
Cc: spamassassin-devel@example.sourceforge.net
Subject: Re: [SAdev] Re: [SAtalk] SpamAssassin and unconfirmed.dsbl.org
References: <20020917142054.5C4E916F16@exa

In [79]:
# print hard ham email
print_emails(hard_ham_data)

Return-Path: <Online#3.19822.84-7a8Hl0_lygzZARRR.1.b@newsletter.online.com>
Received: from acmta3.cnet.com (abv-sfo1-acmta3.cnet.com [206.16.1.162])
	by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g6BKuYJ02668
	for <qqqqqqqqqq-zdnet@example.com>; Thu, 11 Jul 2002 21:56:35 +0100
Received: from abv-sfo1-ac-agent5 (206.16.0.240) by acmta3.cnet.com (PowerMTA(TM) v1.5); Thu, 11 Jul 2002 13:49:18 -0700 (envelope-from <Online#3.19822.84-7a8Hl0_lygzZARRR.1.b@newsletter.online.com>)
Message-ID: <7804199.1026420992990.JavaMail.root@abv-sfo1-ac-agent5>
Date: Thu, 11 Jul 2002 13:56:32 -0700 (PDT)
From: CNET Message Boards <Online#3.19822.84-7a8Hl0_lygzZARRR.1@newsletter.online.com>
To: qqqqqqqqqq-zdnet@example.com
Subject: CNET: Why I don't use firewalls
Mime-Version: 1.0
Content-Type: text/html; charset=ISO-8859-1
Content-Transfer-Encoding: 7bit
X-Mailer: Accucast (http://www.accucast.com)
X-Mailer-Version: 2.8.4-2

<html>
<head>
	<title>Message Boards Dispatch</title>
</head>
<body bgcolor

In [80]:
# print spam email
print_emails(spam_data)

Received: from qrq.cc.ntu.edu.tw (giga.tw.freebsd.org [203.133.92.249])
	by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g6J7RhJ02577
	for <postmaster_neto.net@spamtraps.taint.org>; Fri, 19 Jul 2002 08:27:45 +0100
Received: from ccsun37.cc.ntu.edu.tw (ccsun37.cc.ntu.edu.tw [140.112.8.37])
	by qrq.cc.ntu.edu.tw (8.12.5/8.12.5) with ESMTP id g6J7Renr016990
	for <meilan@neto.net>; Fri, 19 Jul 2002 15:27:40 +0800 (CST)
	(envelope-from master@ibd.pe.kr)
Received: from localhost (localhost [127.0.0.1])
	by ccsun37.cc.ntu.edu.tw (Postfix) with ESMTP id 199BDDE082
	for <meilan@neto.net>; Fri, 19 Jul 2002 15:23:07 +0800 (CST)
Received: from bigfoot.com (unknown [64.15.239.140])
	by ccsun37.cc.ntu.edu.tw (Postfix) with SMTP id 7A551DE087
	for <meilan@neto.net>; Fri, 19 Jul 2002 15:23:00 +0800 (CST)
Received: from localhost ([61.104.101.163])
	by BFLITEMAIL4A.bigfoot.com (LiteMail v3.02(BFLITEMAIL4A)) with SMTP id 18Jul2002_BFLITEMAIL4A_36057_169370901;
	Thu, 18 Jul 2002 14:46:11 -0400 EST
R

We can see that in the easy ham folder, there are more 'normal' emails, and that the hard
ham emails have html code in them which makes us assume that they are ads and are spam.
The spam level indicator in some of the mails indicated that as well.
The emails in the spam folder has mails that look like scams since they are trying desperatly
to sell something.

In [81]:
# set label:
# 0-easy and hard ham
# 1-spam
# problem 3
set_one = easy_ham_data + spam_data
set_one_labels = [0] * len(easy_ham_data) + [1] * len(spam_data)

# problem 4
set_two = hard_ham_data + spam_data
set_two_labels = [0] * len(hard_ham_data) + [1] * len(spam_data)

In [82]:
print(f"Size of set one: ", len(set_one))
print(f"Size of set two: ", len(set_two))

Size of set one:  3052
Size of set two:  751


In [83]:
# split each data set so that we train 80% of it and test on 20% of it
set_one_train, set_one_test, set_one_train_label, set_one_test_label = train_test_split(
    set_one, set_one_labels, test_size=0.2, random_state=42
)

set_two_train, set_two_test, set_two_train_label, set_two_test_label = train_test_split(
    set_two, set_two_labels, test_size=0.2, random_state=42
)

In [84]:
print(
    f"Set One (Easy ham + Spam): {len(set_one_train)} train, {len(set_one_test)} test"
)
print(
    f"Set One (Hard ham + Spam): {len(set_two_train)} train, {len(set_two_test)} test"
)

Set One (Easy ham + Spam): 2441 train, 611 test
Set One (Hard ham + Spam): 600 train, 151 test


In [85]:
def vectorize_data(taining_data, testing_data):
    vectorizer = CountVectorizer()

    # fit_transform learns the document vocab and return a matrix
    training_matrix = vectorizer.fit_transform(taining_data)

    # use transform to get the matrix of test data
    testing_matrix = vectorizer.transform(testing_data)

    print(f"Training data shape: {training_matrix.shape}")
    print(f"Testing data shape: {testing_matrix.shape}")
    return training_matrix, testing_matrix

In [86]:
# Multinomial Naive Bayes classifier
def classification_multinomialNB(training_matrix, training_label, testing_matrix):
    mnb = MultinomialNB()
    # pass the training data against the classification label
    mnb.fit(training_matrix, training_label)

    # Make predictions
    mnb_predection_matrix = mnb.predict(testing_matrix)
    return mnb_predection_matrix

In [87]:
# Bernoulli Naive Bayes classifier
def classification_bernoulliNB(training_matrix, training_label, testing_matrix):
    # Train a Bernoulli Naive Bayes classifier
    bnb = BernoulliNB()
    bnb.fit(training_matrix, training_label)

    # Make predictions
    bnb_predection_matrix = bnb.predict(testing_matrix)
    return bnb_predection_matrix

In [88]:
def metric(expected_label, predection_matrix):
    # Evaluate the model
    print(
        "Accuracy score",
        accuracy_score(expected_label, predection_matrix),
    )
    print(
        "Precision score",
        precision_score(expected_label, predection_matrix),
    )
    print(
        "Recall score",
        recall_score(expected_label, predection_matrix),
    )
    print(
        "classification report",
        classification_report(expected_label, predection_matrix),
     )
    print("confusion matrix", confusion_matrix(expected_label, predection_matrix))

In [89]:
# problem 3
set_one_training_matrix, set_one_testing_matrix = vectorize_data(
    set_one_train, set_one_test
)
set_one_mnb_predection_matrix = classification_multinomialNB(
    set_one_training_matrix, set_one_train_label, set_one_testing_matrix
)

set_one_bnb_predection_matrix = classification_bernoulliNB(
    set_one_training_matrix, set_one_train_label, set_one_testing_matrix
)

Training data shape: (2441, 78758)
Testing data shape: (611, 78758)


In [93]:
# problem 3
print("classification_multinomialNB")
metric(set_one_test_label, set_one_mnb_predection_matrix)
print("classification_bernoulliNB")
metric(set_one_test_label, set_one_bnb_predection_matrix)

classification_multinomialNB
Accuracy score 0.972176759410802
Precision score 1.0
Recall score 0.8152173913043478
classification report               precision    recall  f1-score   support

           0       0.97      1.00      0.98       519
           1       1.00      0.82      0.90        92

    accuracy                           0.97       611
   macro avg       0.98      0.91      0.94       611
weighted avg       0.97      0.97      0.97       611

confusion matrix [[519   0]
 [ 17  75]]
classification_bernoulliNB
Accuracy score 0.9214402618657938
Precision score 1.0
Recall score 0.4782608695652174
classification report               precision    recall  f1-score   support

           0       0.92      1.00      0.96       519
           1       1.00      0.48      0.65        92

    accuracy                           0.92       611
   macro avg       0.96      0.74      0.80       611
weighted avg       0.93      0.92      0.91       611

confusion matrix [[519   0]
 [ 48  

In [91]:
# problem 4
set_two_training_matrix, set_two_testing_matrix = vectorize_data(
    set_two_train, set_two_test
)
set_two_mnb_predection_matrix = classification_multinomialNB(
    set_two_training_matrix, set_two_train_label, set_two_testing_matrix
)
set_two_bnb_predection_matrix = classification_bernoulliNB(
    set_two_training_matrix, set_two_train_label, set_two_testing_matrix
)

Training data shape: (600, 60456)
Testing data shape: (151, 60456)


In [92]:
# problem 4
print("classification_multinomialNB")
metric(set_two_test_label, set_two_mnb_predection_matrix)
print("classification_bernoulliNB")
metric(set_two_test_label, set_two_bnb_predection_matrix)

classification_multinomialNB
Accuracy score 0.9470198675496688
Precision score 0.9333333333333333
Recall score 0.98989898989899
classification report               precision    recall  f1-score   support

           0       0.98      0.87      0.92        52
           1       0.93      0.99      0.96        99

    accuracy                           0.95       151
   macro avg       0.96      0.93      0.94       151
weighted avg       0.95      0.95      0.95       151

confusion matrix [[45  7]
 [ 1 98]]
classification_bernoulliNB
Accuracy score 0.8874172185430463
Precision score 0.8596491228070176
Recall score 0.98989898989899
classification report               precision    recall  f1-score   support

           0       0.97      0.69      0.81        52
           1       0.86      0.99      0.92        99

    accuracy                           0.89       151
   macro avg       0.92      0.84      0.86       151
weighted avg       0.90      0.89      0.88       151

confusion ma