In [None]:
import os
import pickle
import email_read_util

In [None]:
DATA_DIR = 'trec07p/data/'
LABELS_FILE = 'trec07p/full/index'
TRAINING_SET_RATIO = 0.7

In [None]:
labels = {}
spam_words = set()
ham_words = set()

In [None]:
# Read the labels
with open(LABELS_FILE) as f:
    for line in f:
        line = line.strip()
        label, key = line.split()
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [None]:
# Split corpus into train and test sets
filelist = os.listdir(DATA_DIR)
X_train = filelist[:int(len(filelist)*TRAINING_SET_RATIO)]
X_test = filelist[int(len(filelist)*TRAINING_SET_RATIO):]

In [None]:
import nltk
nltk.download('punkt_tab')

In [32]:
# Parameters
MIN_SPAM_PERCENT = 0.20  # حداقل حضور در اسپم
MAX_HAM_PERCENT = 0.05   # حداکثر حضور در هم
LAPLACE_SMOOTHING = 1    # برای جلوگیری از تقسیم بر صفر

if not os.path.exists('spam_scores.pkl'):
    spam_word_counts = defaultdict(int)
    ham_word_counts = defaultdict(int)
    total_spam = 0
    total_ham = 0

    # First pass: count words and total emails
    for filename in X_train:
        path = os.path.join(DATA_DIR, filename)
        if filename in labels:
            label = labels[filename]
            stems = email_read_util.load(path)
            if not stems:
                continue

            if label == 0:  # Spam
                total_spam += 1
                for word in set(stems):
                    spam_word_counts[word] += 1
            else:  # Ham
                total_ham += 1
                for word in set(stems):
                    ham_word_counts[word] += 1

    # Calculate spam scores for all words
    word_scores = {}
    for word in set(spam_word_counts.keys()).union(set(ham_word_counts.keys())):
        spam_count = spam_word_counts.get(word, 0)
        ham_count = ham_word_counts.get(word, 0)

        # با استفاده از smoothing برای جلوگیری از تقسیم بر صفر
        spam_prob = (spam_count + LAPLACE_SMOOTHING) / (total_spam + 2*LAPLACE_SMOOTHING)
        ham_prob = (ham_count + LAPLACE_SMOOTHING) / (total_ham + 2*LAPLACE_SMOOTHING)

        # محاسبه امتیاز اسپم به عنوان نسبت احتمال
        word_scores[word] = spam_prob / (spam_prob + ham_prob)

    pickle.dump(word_scores, open('spam_scores.pkl', 'wb'))
else:
    word_scores = pickle.load(open('spam_scores.pkl', 'rb'))

print(f'Total spam emails in training: {total_spam}')
print(f'Total ham emails in training: {total_ham}')

# Test the model
fp = 0
tp = 0
fn = 0
tn = 0
THRESHOLD = 0.7  # آستانه برای تشخیص اسپم

for filename in X_test:
    path = os.path.join(DATA_DIR, filename)
    if filename in labels:
        true_label = labels[filename]
        stems = email_read_util.load(path)
        if not stems:
            continue

        # محاسبه امتیاز کلی ایمیل
        email_score = 0
        found_words = 0
        for word in set(stems):
            if word in word_scores:
                email_score += word_scores[word]
                found_words += 1

        if found_words > 0:
            email_score /= found_words  # میانگین امتیاز کلمات

        # پیش‌بینی بر اساس آستانه
        predicted_label = 0 if email_score > THRESHOLD else 1

        # Update confusion matrix
        if true_label == 1 and predicted_label == 1:
            tn += 1
        elif true_label == 1 and predicted_label == 0:
            fp += 1
        elif true_label == 0 and predicted_label == 1:
            fn += 1
        elif true_label == 0 and predicted_label == 0:
            tp += 1

# نمایش نتایج
from IPython.display import HTML, display
conf_matrix = [[tn, fp],
               [fn, tp]]
display(HTML('<table><tr>{}</tr></table>'.format(
    '</tr><tr>'.join('<td>{}</td>'.format(
        '</td><td>'.join(str(_) for _ in row))
                     for row in conf_matrix))))

count = tn + tp + fn + fp
percent_matrix = [["{:.1%}".format(tn/count), "{:.1%}".format(fp/count)],
                  ["{:.1%}".format(fn/count), "{:.1%}".format(tp/count)]]
display(HTML('<table><tr>{}</tr></table>'.format(
    '</tr><tr>'.join('<td>{}</td>'.format(
        '</td><td>'.join(str(_) for _ in row))
                     for row in percent_matrix))))

print("Classification accuracy: {}".format("{:.1%}".format((tp+tn)/count)))
print("Precision (spam): {}".format("{:.1%}".format(tp/(tp+fp))))
print("Recall (spam): {}".format("{:.1%}".format(tp/(tp+fn))))
print("F1-score (spam): {}".format("{:.1%}".format(2*tp/(2*tp + fp + fn))))

Total spam emails in training: 31134
Total ham emails in training: 17615


0,1
7435,1
10178,3200


0,1
35.7%,0.0%
48.9%,15.4%


Classification accuracy: 51.1%
Precision (spam): 100.0%
Recall (spam): 23.9%
F1-score (spam): 38.6%
