In [3]:
import os
import pickle
import email_read_util

In [4]:
DATA_DIR = 'trec07p/data/'
LABELS_FILE = 'trec07p/full/index'
TRAINING_SET_RATIO = 0.7

In [5]:
labels = {}
spam_words = set()
ham_words = set()

In [6]:
# Read the labels
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [7]:
# Split corpus into train and test sets
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

In [8]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/artaz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
from collections import defaultdict
from nltk import ngrams

In [10]:
# Parameters
NGRAM_SIZE = 2  # Use bigrams (you can change this to 3 for trigrams, etc.)
MIN_FREQ = 5    # Minimum frequency for n-grams to be considered

In [11]:

# Train the model - find frequent n-grams in spam
if not os.path.exists('spam_ngrams.pkl'):
    spam_ngram_counts = defaultdict(int)
    ham_ngram_counts = defaultdict(int)

    for filename in X_train:
        path = os.path.join(DATA_DIR, filename)
        if filename in labels:
            label = labels[filename]
            stems = email_read_util.load(path)
            if not stems:
                continue

            # Generate n-grams
            stem_ngrams = ngrams(stems, NGRAM_SIZE)

            # Count n-grams based on label
            if label == 0:  # Spam
                for ng in stem_ngrams:
                    spam_ngram_counts[ng] += 1
            else:  # Ham
                for ng in stem_ngrams:
                    ham_ngram_counts[ng] += 1

    # Filter n-grams that appear frequently in spam but rarely in ham
    spam_indicative_ngrams = set()
    for ng, count in spam_ngram_counts.items():
        if count >= MIN_FREQ and ham_ngram_counts.get(ng, 0) < count/2:
            spam_indicative_ngrams.add(ng)

    pickle.dump(spam_indicative_ngrams, open('spam_ngrams.pkl', 'wb'))
else:
    spam_indicative_ngrams = pickle.load(open('spam_ngrams.pkl', 'rb'))


print(f'Found {len(spam_indicative_ngrams)} spam-indicative {NGRAM_SIZE}-grams')


Found 57344 spam-indicative 2-grams


In [12]:

# Test the model
fp = 0
tp = 0
fn = 0
tn = 0

for filename in X_test:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        true_label = labels[filename]
        stems = email_read_util.load(path)
        if not stems:
            continue

        # Generate n-grams for this email
        stem_ngrams = set(ngrams(stems, NGRAM_SIZE))

        # Check for spam-indicative n-grams
        spam_score = len(stem_ngrams & spam_indicative_ngrams)

        # Predict spam if any spam-indicative n-grams found
        predicted_label = 0 if spam_score > 0 else 1

        # Update confusion matrix
        if true_label == 1 and predicted_label == 1:
            tn += 1
        elif true_label == 1 and predicted_label == 0:
            fp += 1
        elif true_label == 0 and predicted_label == 1:
            fn += 1
        elif true_label == 0 and predicted_label == 0:
            tp += 1


In [13]:
total = tp + tn + fp + fn
accuracy = (tp + tn) / total
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

# Print formatted metrics
print("\n📊 Comprehensive Classification Metrics:")
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}\n")

print(f"Accuracy: {accuracy:.1%}")
print(f"Precision (Spam): {precision:.1%}")
print(f"Recall (Spam): {recall:.1%}")
print(f"F1 Score (Spam): {f1_score:.1%}")
print(f"False Positive Rate: {false_positive_rate:.1%}")
print(f"False Negative Rate: {false_negative_rate:.1%}\n")

print(f"Spam Detection Rate (Recall): {recall:.1%}")
print(f"Ham Misclassification Rate (FPR): {false_positive_rate:.1%}")
print(f"Total Emails Classified: {total}")
print(f"Spam-Indicative {NGRAM_SIZE}-grams Used: {len(spam_indicative_ngrams)}")


📊 Comprehensive Classification Metrics:
True Positives (TP): 11504
True Negatives (TN): 3127
False Positives (FP): 4309
False Negatives (FN): 1874

Accuracy: 70.3%
Precision (Spam): 72.8%
Recall (Spam): 86.0%
F1 Score (Spam): 78.8%
False Positive Rate: 57.9%
False Negative Rate: 14.0%

Spam Detection Rate (Recall): 86.0%
Ham Misclassification Rate (FPR): 57.9%
Total Emails Classified: 20814
Spam-Indicative 2-grams Used: 57344
