In [1]:
import os

In [2]:
# Directory whose entries are listed and then read one by one as individual emails.
DATA_DIR = 'trec07p/data/'
# Index file with one whitespace-separated "<label> <path>" pair per line.
LABELS_FILE = 'trec07p/full/index'
# Fraction of the directory listing assigned to the training split;
# the remainder becomes the test split.
TRAINING_SET_RATIO = 0.7

In [3]:
# Maps an email's bare filename to its class: 1 for ham, 0 for any other label (spam).
labels = {}
# Created empty here and not populated by any cell visible in this part of the
# notebook; presumably per-class vocabularies for a later step — TODO confirm.
spam_words = set()
ham_words = set()

In [4]:
# Read the labels.
# Each index line is "<label> <path>"; the label is keyed by the last component
# of the path so it can later be looked up by bare filename.
with open(LABELS_FILE) as f:
    for line in f:
        fields = line.split()
        # Skip blank lines (e.g. a trailing newline at the end of the index)
        # instead of crashing on the two-value unpack below. Non-blank lines
        # with an unexpected number of fields still raise ValueError.
        if not fields:
            continue
        label, key = fields
        # Encoding: 1 = ham, 0 = anything else (i.e. spam).
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [5]:
# Split corpus into train and test sets.
# os.listdir returns entries in arbitrary order, so sort the listing
# (lexicographically by filename) to make the split reproducible across runs
# and machines.
filelist = sorted(os.listdir(DATA_DIR))
# Index of the first test file; everything before it is training data.
split_index = int(len(filelist) * TRAINING_SET_RATIO)
X_train = filelist[:split_index]
X_test = filelist[split_index:]

In [6]:
import nltk
# Fetch the 'punkt_tab' NLTK resource into the local nltk_data directory.
# No visible cell calls nltk directly; presumably the email-text helper
# imported later depends on this resource — TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/artaz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
from exercise_1.email_read_util import extract_email_text


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# Confusion-matrix counters. Spam (label 0) is treated as the positive class
# and ham (label 1) as the negative class, so a ham email predicted as spam
# counts as a false positive.
fp = 0
tp = 0
fn = 0
tn = 0

for filename in X_test:
    # Files without an entry in the label index cannot be scored; skip them.
    if filename not in labels:
        continue
    true_label = labels[filename]
    path = os.path.join(DATA_DIR, filename)

    # Extract the email's text with the project helper.
    email = extract_email_text(path)

    # Our prediction: 1 (ham) if the extracted text is non-empty, 0 (spam) if it is empty.
    # NOTE(review): this was previously described as a "has a subject" check, but
    # the test is on the whole string returned by extract_email_text. If that
    # helper returns more than the subject line, almost every email will be
    # predicted ham — confirm what the helper returns.
    predicted_label = 1 if email != '' else 0

    # Update confusion matrix
    if true_label == 1 and predicted_label == 1:
        tn += 1
    elif true_label == 1 and predicted_label == 0:
        fp += 1
    elif true_label == 0 and predicted_label == 1:
        fn += 1
    elif true_label == 0 and predicted_label == 0:
        tp += 1

# Calculate metrics from the confusion matrix.
# Every ratio goes through _safe_ratio so an empty denominator (e.g. no labelled
# test files, or no spam predictions at all) yields 0 instead of raising
# ZeroDivisionError.
count = tn + fp + fn + tp
accuracy = _safe_ratio(tp + tn, count)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
f1 = _safe_ratio(2 * (precision * recall), precision + recall)
false_positive_rate = _safe_ratio(fp, fp + tn)

# Print all metrics
print("\nClassification Metrics:")
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"Accuracy: {accuracy:.1%}")
print(f"Precision (Spam): {precision:.1%}")
print(f"Recall (Spam): {recall:.1%}")
print(f"F1 Score (Spam): {f1:.1%}")
print(f"False Positive Rate: {false_positive_rate:.1%}")


Classification Metrics:
True Positives (TP): 0
True Negatives (TN): 7490
False Positives (FP): 0
False Negatives (FN): 15136
Accuracy: 33.1%
Precision (Spam): 0.0%
Recall (Spam): 0.0%
F1 Score (Spam): 0.0%
False Positive Rate: 0.0%
