In [23]:
# [Initial Imports and Setup]
import sys
import os
import joblib
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             make_scorer)

from exercise_1 import email_read_util

In [24]:
# Configuration constants
# Directory holding the raw email files (read as inmail.1, inmail.2, ...).
DATA_DIR = 'trec07p/data'
# Index file read by load_labels(); one "<label> <path>" pair per line.
LABELS_FILE = 'trec07p/full/index'
# Passed as test_size to train_test_split for the held-out test partition.
TEST_SIZE = 0.2
# Seed shared by the train/test split, the Random Forest and the CV shuffling.
RANDOM_STATE = 42
# Number of folds used by StratifiedKFold.
N_SPLITS = 5

In [25]:
def load_labels(labels_file=None):
    """Load the ham/spam label of every email listed in the index file.

    Each non-blank line of the index must hold two whitespace-separated
    fields: the label and a path to the email file. Only the last path
    component (the text after the final '/') is kept as the dictionary key.

    Args:
        labels_file: Path of the index file to read. When omitted, the
            module-level ``LABELS_FILE`` is used.

    Returns:
        dict mapping email file name to 1 when the label is 'ham'
        (case-insensitive) and 0 for any other label.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    if labels_file is None:
        labels_file = LABELS_FILE

    labels = {}
    with open(labels_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no label and
                # would otherwise break the two-field unpacking below.
                continue
            label, key = line.split()
            labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0
    return labels

In [26]:
def load_emails(labels):
    """Read each email and pair it with its label.

    Files are visited in numeric order, ``inmail.1`` up to
    ``inmail.<len(labels)>``, inside ``DATA_DIR``.

    Args:
        labels: Mapping from file name (e.g. ``'inmail.1'``) to its label.

    Returns:
        Tuple ``(X, y)``: the values returned by
        ``email_read_util.extract_email_text`` for each file, and the
        parallel list of labels looked up in ``labels``.
    """
    texts = []
    targets = []
    for number in range(1, len(labels) + 1):
        name = f'inmail.{number}'
        path = os.path.join(DATA_DIR, name)
        texts.append(email_read_util.extract_email_text(path))
        targets.append(labels[name])
    return texts, targets

In [27]:
def evaluate_error_types(y_true, y_pred):
    """Return the false positive rate, FP / (FP + TN), for 0/1 labels.

    Class 1 is treated as the positive class and class 0 as the negative
    class, so a false positive is a true-0 sample predicted as 1.

    The counts are computed directly instead of unpacking
    ``confusion_matrix(...).ravel()``: that matrix collapses to a single
    cell when only one class occurs in the inputs, which made the four-way
    unpacking raise ``ValueError``.

    Args:
        y_true: Sequence or array of true labels (0 or 1).
        y_pred: Sequence or array of predicted labels (0 or 1).

    Returns:
        float: FP / (FP + TN), or 0.0 when there are no true negatives or
        false positives to divide by (no class-0 samples).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    negatives = y_true == 0
    fp = np.count_nonzero(negatives & (y_pred == 1))
    tn = np.count_nonzero(negatives & (y_pred == 0))
    return fp / (fp + tn) if (fp + tn) > 0 else 0.0

In [28]:
def train_and_evaluate():
    """Train, cross-validate, persist and test the spam/ham classifiers.

    Steps:
      1. Load labels and emails, then hold out ``TEST_SIZE`` of the data as
         a stratified test partition.
      2. Fit a ``CountVectorizer`` on the train/validation texts, use it to
         transform both partitions, and save it to ``count_vectorizer.pkl``.
      3. For Naive Bayes and Random Forest, run ``N_SPLITS``-fold stratified
         cross-validation and print per-fold and mean metrics.
      4. Refit each model on the whole train/validation partition and dump
         it with joblib to ``<model_name>_model.pkl``.
      5. Print metrics and a classification report on the test partition.

    Labels follow ``load_labels`` (1 = ham, 0 = spam), so precision, recall
    and F1 are reported for the ham class (scikit-learn's default positive
    label is 1).

    Returns:
        None. Results are printed and artifacts are written to the current
        working directory.
    """
    # Load and prepare data
    labels = load_labels()
    X, y = load_emails(labels)

    # Split data
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

    # Vectorize text.
    # NOTE: the vocabulary is fitted on the whole train/validation partition
    # before the folds are drawn, so validation folds contribute to the
    # feature set (though not to the fitted model parameters).
    vectorizer = CountVectorizer()
    X_trainval_vector = vectorizer.fit_transform(X_trainval)
    X_test_vector = vectorizer.transform(X_test)
    y_trainval_array = np.array(y_trainval)
    y_test_array = np.array(y_test)

    # The saved models can only score new emails that were transformed with
    # this exact fitted vocabulary, so persist it next to them.
    joblib.dump(vectorizer, 'count_vectorizer.pkl')

    # Initialize models
    classifiers = {
        'Naive Bayes': MultinomialNB(),
        'Random Forest': RandomForestClassifier(
            n_estimators=100,
            random_state=RANDOM_STATE,
            class_weight='balanced'
        )
    }

    # Metric name -> function(y_true, y_pred). Plain callables are used so
    # the manual fold loop does not depend on private scorer attributes.
    metric_funcs = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
        'error_rate': evaluate_error_types
    }

    # Cross-validation setup
    cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
    trained_models = {}

    print("\n🔁 Cross-Validation Results:")
    for name, model in classifiers.items():
        print(f"\n📌 Evaluating: {name}")

        fold_metrics = {metric: [] for metric in metric_funcs}

        for fold, (train_idx, val_idx) in enumerate(cv.split(X_trainval_vector, y_trainval_array), 1):
            # Train/validation split
            X_train, X_val = X_trainval_vector[train_idx], X_trainval_vector[val_idx]
            y_train, y_val = y_trainval_array[train_idx], y_trainval_array[val_idx]

            # Train and predict
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            # Store metrics
            for metric, score_fn in metric_funcs.items():
                fold_metrics[metric].append(score_fn(y_val, y_pred))

            # Print fold results
            print(f"  Fold {fold}:")
            for metric, values in fold_metrics.items():
                print(f"    {metric.capitalize():<15}: {values[-1]:.3f}")

        # Print averages
        print(f"\n🔹 Average (over {N_SPLITS} folds):")
        for metric, values in fold_metrics.items():
            print(f"  {metric.capitalize():<15}: {np.mean(values):.3f}")

        # Final training on all train/validation data, then model saving
        model.fit(X_trainval_vector, y_trainval_array)
        trained_models[name] = model
        joblib.dump(model, f'{name.lower().replace(" ", "_")}_model.pkl')

    # Test evaluation
    print("\n📊 Final Test Evaluation:")
    for name, model in trained_models.items():
        y_pred = model.predict(X_test_vector)

        print(f"\n🔍 Model: {name}")
        print(f"  Accuracy:    {accuracy_score(y_test_array, y_pred):.3f}")
        print(f"  Precision:   {precision_score(y_test_array, y_pred):.3f}")
        print(f"  Recall:      {recall_score(y_test_array, y_pred):.3f}")
        print(f"  F1 Score:    {f1_score(y_test_array, y_pred):.3f}")
        print(f"  Error Rate:  {evaluate_error_types(y_test_array, y_pred):.3f}")
        print("\n  Classification Report:")
        # target_names are listed in label order: 0 = Spam, 1 = Ham.
        print(classification_report(y_test_array, y_pred, target_names=["Spam", "Ham"]))

In [29]:
# RUN
# Execute the whole pipeline: load data, cross-validate, save models, report test metrics.
train_and_evaluate()


🔁 Cross-Validation Results:

📌 Evaluating: Naive Bayes
  Fold 1:
    Accuracy       : 0.958
    Precision      : 0.902
    Recall         : 0.981
    F1             : 0.940
    Error_rate     : 0.054
  Fold 2:
    Accuracy       : 0.956
    Precision      : 0.898
    Recall         : 0.981
    F1             : 0.937
    Error_rate     : 0.056
  Fold 3:
    Accuracy       : 0.954
    Precision      : 0.898
    Recall         : 0.973
    F1             : 0.934
    Error_rate     : 0.056
  Fold 4:
    Accuracy       : 0.953
    Precision      : 0.891
    Recall         : 0.978
    F1             : 0.933
    Error_rate     : 0.060
  Fold 5:
    Accuracy       : 0.955
    Precision      : 0.895
    Recall         : 0.979
    F1             : 0.935
    Error_rate     : 0.058

🔹 Average (over 5 folds):
  Accuracy       : 0.955
  Precision      : 0.897
  Recall         : 0.979
  F1             : 0.936
  Error_rate     : 0.057

📌 Evaluating: Random Forest
  Fold 1:
    Accuracy       : 0.986
 