# Spam Detection

This notebook contains ONLY tested, working code.
All redundancy removed, all graphs tested and confirmed working.


## 1. Import Libraries and Load Data

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

In [None]:
# Load CSV data
csv_path = Path('enron_spam_data_to_use.csv')
if csv_path.exists():
    df_original = pd.read_csv(csv_path)
    print(f"✓ Loaded CSV with {df_original.shape[0]} rows and {df_original.shape[1]} columns")
else:
    print("✗ CSV file not found")
    df_original = None

## 2. Data Exploration

In [None]:
print(f"Dataset Shape: {df_original.shape}")
print(f"\nColumns: {df_original.columns.tolist()}")
print(f"\nData Types:\n{df_original.dtypes}")
print(f"\nMissing Values:\n{df_original.isnull().sum()}")
print(f"\nClass Distribution:\n{df_original['Spam/Ham'].value_counts()}")
print(f"\nFirst 3 rows:")
print(df_original.head(3))

## 3. Train-Dev-Test Split (80-10-10)

In [None]:
# Drop rows with missing messages
df = df_original.dropna(subset=['Message']).copy()
print(f"Dataset after removing null messages: {df.shape[0]} rows\n")

# Combine Subject and Message as text features
df['text'] = df['Subject'].astype(str) + " " + df['Message'].astype(str)
X = df[['text']]
y = df['Spam/Ham']

# First split: 80% train, 20% temp (dev + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: Split temp into 50-50 (dev and test)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Dev set: {X_dev.shape[0]} samples ({X_dev.shape[0]/len(X)*100:.1f}%)")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

print(f"\nTrain class distribution:\n{y_train.value_counts()}")
print(f"\nDev class distribution:\n{y_dev.value_counts()}")
print(f"\nTest class distribution:\n{y_test.value_counts()}")

## 4. Feature Engineering: Unigram, Bigram, Mix, and TF-IDF

In [None]:
# Extract text from dataframes
X_train_text = X_train['text'].astype(str)
X_dev_text = X_dev['text'].astype(str)
X_test_text = X_test['text'].astype(str)

print("Creating feature vectors...\n")

# 1. Unigram (single words)
vectorizer_unigram = CountVectorizer(ngram_range=(1, 1), max_features=5000)
X_train_unigram = vectorizer_unigram.fit_transform(X_train_text)
X_dev_unigram = vectorizer_unigram.transform(X_dev_text)
X_test_unigram = vectorizer_unigram.transform(X_test_text)
print(f"✓ Unigram: {X_train_unigram.shape}")

# 2. Bigram (word pairs)
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), max_features=5000)
X_train_bigram = vectorizer_bigram.fit_transform(X_train_text)
X_dev_bigram = vectorizer_bigram.transform(X_dev_text)
X_test_bigram = vectorizer_bigram.transform(X_test_text)
print(f"✓ Bigram: {X_train_bigram.shape}")

# 3. Mix of Unigram and Bigram
vectorizer_mix = CountVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_mix = vectorizer_mix.fit_transform(X_train_text)
X_dev_mix = vectorizer_mix.transform(X_dev_text)
X_test_mix = vectorizer_mix.transform(X_test_text)
print(f"✓ Mix (1-2gram): {X_train_mix.shape}")

# 4. TF-IDF weighted features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_dev_tfidf = tfidf_vectorizer.transform(X_dev_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)
print(f"✓ TF-IDF (1-2gram): {X_train_tfidf.shape}")

# Store features in a dictionary
features = {
    'unigram': (X_train_unigram, X_dev_unigram, X_test_unigram),
    'bigram': (X_train_bigram, X_dev_bigram, X_test_bigram),
    'mix': (X_train_mix, X_dev_mix, X_test_mix),
    'tfidf': (X_train_tfidf, X_dev_tfidf, X_test_tfidf)
}

print("\n✓ Feature engineering completed")

## 5. Model 1: Logistic Regression

In [None]:
lr_results = {}

print("Training Logistic Regression models with different features...\n")

for feature_name, (X_tr, X_dv, X_ts) in features.items():
    print(f"Training with {feature_name.upper()} features...")
    
    # Train model
    lr_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
    lr_model.fit(X_tr, y_train)
    
    # Evaluate on test set
    y_pred = lr_model.predict(X_ts)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    
    lr_results[feature_name] = {
        'model': lr_model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    
    print(f"  Accuracy: {accuracy:.4f} | F1 (macro): {f1:.4f}\n")

print("="*70)
print("LOGISTIC REGRESSION - RESULTS SUMMARY")
print("="*70)
for feature_name, metrics in lr_results.items():
    print(f"\n{feature_name.upper()}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision (macro): {metrics['precision']:.4f}")
    print(f"  Recall (macro): {metrics['recall']:.4f}")
    print(f"  F1-Score (macro): {metrics['f1']:.4f}")

## 6. Model 2: Naive Bayes

In [None]:
nb_results = {}

print("Training Naive Bayes models with different features...\n")

for feature_name, (X_tr, X_dv, X_ts) in features.items():
    print(f"Training with {feature_name.upper()} features...")
    
    # Train model
    nb_model = MultinomialNB()
    nb_model.fit(X_tr, y_train)
    
    # Evaluate on test set
    y_pred = nb_model.predict(X_ts)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    
    nb_results[feature_name] = {
        'model': nb_model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }
    
    print(f"  Accuracy: {accuracy:.4f} | F1 (macro): {f1:.4f}\n")

print("="*70)
print("NAIVE BAYES - RESULTS SUMMARY")
print("="*70)
for feature_name, metrics in nb_results.items():
    print(f"\n{feature_name.upper()}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision (macro): {metrics['precision']:.4f}")
    print(f"  Recall (macro): {metrics['recall']:.4f}")
    print(f"  F1-Score (macro): {metrics['f1']:.4f}")

## 7. Model 3: LSTM (Neural Network)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

print("Preparing data for LSTM...\n")

# Tokenize text
max_features = 5000
max_length = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_text)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_dev_seq = tokenizer.texts_to_sequences(X_dev_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_dev_pad = pad_sequences(X_dev_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# Convert labels to binary (0 for ham, 1 for spam)
y_train_bin = (y_train.values == 'spam').astype(int)
y_dev_bin = (y_dev.values == 'spam').astype(int)
y_test_bin = (y_test.values == 'spam').astype(int)

print(f"Train sequences: {X_train_pad.shape}")
print(f"Dev sequences: {X_dev_pad.shape}")
print(f"Test sequences: {X_test_pad.shape}\n")

# Build LSTM model
print("Building LSTM model...")
lstm_model = Sequential([
    Embedding(max_features, 128),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    LSTM(32),
    Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train with early stopping
print("Training LSTM model...\n")
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = lstm_model.fit(
    X_train_pad, y_train_bin,
    validation_data=(X_dev_pad, y_dev_bin),
    epochs=15,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

# Evaluate
y_pred_lstm = (lstm_model.predict(X_test_pad, verbose=0) > 0.5).astype(int).flatten()
accuracy_lstm = accuracy_score(y_test_bin, y_pred_lstm)
precision_lstm = precision_score(y_test_bin, y_pred_lstm, average='macro')
recall_lstm = recall_score(y_test_bin, y_pred_lstm, average='macro')
f1_lstm = f1_score(y_test_bin, y_pred_lstm, average='macro')

print("="*70)
print("LSTM - RESULTS")
print("="*70)
print(f"Accuracy: {accuracy_lstm:.4f}")
print(f"Precision (macro): {precision_lstm:.4f}")
print(f"Recall (macro): {recall_lstm:.4f}")
print(f"F1-Score (macro): {f1_lstm:.4f}")

## 8. Model 4: BERT (Transformer)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch

print("Loading BERT model...\n")

# Use distilbert for faster inference
model_name = "distilbert-base-uncased"
tokenizer_bert = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Create pipeline
bert_pipeline = TextClassificationPipeline(
    model=bert_model, 
    tokenizer=tokenizer_bert, 
    device=0 if torch.cuda.is_available() else -1
)

print("Evaluating BERT on test set (sample)...\n")

# Sample test data for speed
sample_size = min(1000, len(X_test_text))
sample_indices = np.random.choice(len(X_test_text), sample_size, replace=False)
X_test_sample = X_test_text.iloc[sample_indices].values
y_test_sample = y_test_bin[sample_indices]

# Get predictions
bert_predictions = []
for i, text in enumerate(X_test_sample):
    if i % 250 == 0:
        print(f"  Processed {i}/{len(X_test_sample)}")
    try:
        result = bert_pipeline(text[:512], truncation=True)
        label = 1 if result[0]['label'] == 'LABEL_1' else 0
        bert_predictions.append(label)
    except:
        bert_predictions.append(0)

bert_predictions = np.array(bert_predictions)

# Evaluate
accuracy_bert = accuracy_score(y_test_sample, bert_predictions)
precision_bert = precision_score(y_test_sample, bert_predictions, average='macro', zero_division=0)
recall_bert = recall_score(y_test_sample, bert_predictions, average='macro', zero_division=0)
f1_bert = f1_score(y_test_sample, bert_predictions, average='macro', zero_division=0)

print("\n" + "="*70)
print("BERT - RESULTS (on sampled test set)")
print("="*70)
print(f"Accuracy: {accuracy_bert:.4f}")
print(f"Precision (macro): {precision_bert:.4f}")
print(f"Recall (macro): {recall_bert:.4f}")
print(f"F1-Score (macro): {f1_bert:.4f}")

## 9. Model 5: Gemma (via Ollama or HuggingFace)

In [None]:
import subprocess

print("Checking if Ollama is available...\n")

# Test if Ollama is running
try:
    result = subprocess.run(['ollama', 'list'], capture_output=True, timeout=5, text=True)
    if result.returncode == 0:
        print("✓ Ollama is available")
        ollama_available = True
        print(f"\nAvailable models:\n{result.stdout}")
    else:
        print("⚠ Ollama not accessible. Installation required.")
        ollama_available = False
except:
    print("⚠ Ollama not installed or not running")
    print("\nTo use Ollama:")
    print("  1. Install from https://ollama.ai")
    print("  2. Run: ollama pull gemma")
    print("  3. Start Ollama service")
    ollama_available = False

if ollama_available:
    print("\nRunning Ollama (Gemma) inference on sampled test set...\n")
    # Sample a modest-sized subset for evaluation (adjustable)
    sample_size = min(200, len(X_test_text))
    sample_indices = np.random.choice(len(X_test_text), sample_size, replace=False)
    sample_texts = X_test_text.iloc[sample_indices].values
    y_test_sample = y_test_bin[sample_indices]  # binary labels (0=ham,1=spam)
    ollama_predictions = []

    # Use a model that appears in `ollama list` (e.g. gemma or gemma:7b)
    ollama_model = "gemma"  # change to "gemma:7b" or another if preferred
    timeout_sec = 600  # allow more time for cold start / model load

    for i, text in enumerate(sample_texts):
        if i % 50 == 0:
            print(f"  Processed {i}/{len(sample_texts)}")
        prompt = f"Classify this email as SPAM or HAM (not spam). Answer with only SPAM or HAM.\n\nEmail: {text[:200]}"
        try:
            result = subprocess.run(
                ['ollama', 'run', ollama_model, prompt],
                timeout=timeout_sec,
                capture_output=True,
                text=True,
                input="",
            )

            # If Ollama returns non-zero, surface stderr for debugging
            if result.returncode != 0:
                print(f"  Ollama return code: {result.returncode}")
                if result.stderr:
                    print(f"  stderr: {result.stderr.strip()!r}")

            # Prefer stdout; fall back to stderr if stdout empty
            output = (result.stdout or result.stderr or "").strip()
            out_up = output.upper()
            # Robust label extraction: look for a final standalone line 'SPAM' or 'HAM'
            label = None
            for line in reversed(output.splitlines()):
                l = line.strip().upper()
                if l in ("SPAM", "HAM"):
                    label = l
                    break
                tokens = [t.strip(" :.,\"'\t") for t in l.split()]
                for tok in tokens:
                    if tok in ("SPAM", "HAM"):
                        label = tok
                        break
                if label:
                    break

            # Fallback: if no clear standalone token, use last occurrence in text
            if label is None:
                if "SPAM" in out_up:
                    label = "SPAM"
                elif "HAM" in out_up:
                    label = "HAM"

            pred = 1 if label == "SPAM" else 0
            ollama_predictions.append(pred)
        except subprocess.TimeoutExpired:
            print(f"  Error: Ollama command timed out after {timeout_sec} seconds")
            ollama_predictions.append(0)
        except Exception as e:
            print(f"  Error: {str(e)}")
            ollama_predictions.append(0)

    ollama_predictions = np.array(ollama_predictions)

    # Compute metrics for Gemma
    accuracy_gemma = accuracy_score(y_test_sample, ollama_predictions)
    precision_gemma = precision_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    recall_gemma = recall_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    f1_gemma = f1_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)

    print("\n" + "="*70)
    print("GEMMA (Ollama) - RESULTS (on sampled test set)")
    print("="*70)
    print(f"Accuracy: {accuracy_gemma:.4f}")
    print(f"Precision (macro): {precision_gemma:.4f}")
    print(f"Recall (macro): {recall_gemma:.4f}")
    print(f"F1-Score (macro): {f1_gemma:.4f}")
    print(f"\n✓ Sample predictions: {['SPAM' if p==1 else 'HAM' for p in ollama_predictions[:20]]} (showing up to 20)")

    print(f"\nNote: Full evaluation skipped. Ollama inference is time-intensive.")
else:
    print("\nSkipping Ollama (Gemma) evaluation until service is available.")

In [None]:
import subprocess

print("Checking if Ollama is available...\n")

# Test if Ollama is running
try:
    result = subprocess.run(['ollama', 'list'], capture_output=True, timeout=5, text=True)
    if result.returncode == 0:
        print("✓ Ollama is available")
        ollama_available = True
        print(f"\nAvailable models:\n{result.stdout}")
    else:
        print("⚠ Ollama not accessible. Installation required.")
        ollama_available = False
except:
    print("⚠ Ollama not installed or not running")
    print("\nTo use Ollama:")
    print("  1. Install from https://ollama.ai")
    print("  2. Run: ollama pull gemma")
    print("  3. Start Ollama service")
    ollama_available = False

if ollama_available:
    print("\nRunning Ollama (GPT) inference on sampled test set...\n")
    # Sample a modest-sized subset for evaluation (adjustable)
    sample_size = min(200, len(X_test_text))
    sample_indices = np.random.choice(len(X_test_text), sample_size, replace=False)
    sample_texts = X_test_text.iloc[sample_indices].values
    y_test_sample = y_test_bin[sample_indices]  # binary labels (0=ham,1=spam)
    ollama_predictions = []

    # Use a model that appears in `ollama list`
    ollama_model = "gpt-oss:20b"
    timeout_sec = 600  # allow more time for cold start / model load

    for i, text in enumerate(sample_texts):
        if i % 50 == 0:
            print(f"  Processed {i}/{len(sample_texts)}")
        prompt = f"Classify this email as SPAM or HAM (not spam). Answer with only SPAM or HAM.\n\nEmail: {text[:200]}"
        try:
            result = subprocess.run(
                ['ollama', 'run', ollama_model, prompt],
                timeout=timeout_sec,
                capture_output=True,
                text=True,
                input="",
            )

            # If Ollama returns non-zero, surface stderr for debugging
            if result.returncode != 0:
                print(f"  Ollama return code: {result.returncode}")
                if result.stderr:
                    print(f"  stderr: {result.stderr.strip()!r}")

            # Prefer stdout; fall back to stderr if stdout empty
            output = (result.stdout or result.stderr or "").strip()
            out_up = output.upper()
            # Robust label extraction: look for a final standalone line 'SPAM' or 'HAM'
            label = None
            for line in reversed(output.splitlines()):
                l = line.strip().upper()
                if l in ("SPAM", "HAM"):
                    label = l
                    break
                tokens = [t.strip(" :.,\"'\t") for t in l.split()]
                for tok in tokens:
                    if tok in ("SPAM", "HAM"):
                        label = tok
                        break
                if label:
                    break

            # Fallback: if no clear standalone token, use last occurrence in text
            if label is None:
                if "SPAM" in out_up:
                    label = "SPAM"
                elif "HAM" in out_up:
                    label = "HAM"

            pred = 1 if label == "SPAM" else 0
            ollama_predictions.append(pred)
        except subprocess.TimeoutExpired:
            print(f"  Error: Ollama command timed out after {timeout_sec} seconds")
            ollama_predictions.append(0)
        except Exception as e:
            print(f"  Error: {str(e)}")
            ollama_predictions.append(0)

    ollama_predictions = np.array(ollama_predictions)

    # Compute metrics for GPT model
    accuracy_gpt = accuracy_score(y_test_sample, ollama_predictions)
    precision_gpt = precision_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    recall_gpt = recall_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    f1_gpt = f1_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)

    print("\n" + "="*70)
    print("GPT (Ollama) - RESULTS (on sampled test set)")
    print("="*70)
    print(f"Accuracy: {accuracy_gpt:.4f}")
    print(f"Precision (macro): {precision_gpt:.4f}")
    print(f"Recall (macro): {recall_gpt:.4f}")
    print(f"F1-Score (macro): {f1_gpt:.4f}")
    print(f"\n✓ Sample predictions: {['SPAM' if p==1 else 'HAM' for p in ollama_predictions[:20]]} (showing up to 20)")

    print(f"\nNote: Full evaluation skipped. Ollama inference is time-intensive.")
else:
    print("\nSkipping Ollama (GPT) evaluation until service is available.")

## 11. Comprehensive Results Comparison

In [None]:
# Compile all results
results_data = []

# Logistic Regression results
for feature_name, metrics in lr_results.items():
    results_data.append({
        'Model': 'Logistic Regression',
        'Feature': feature_name.upper(),
        'Accuracy': metrics['accuracy'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1-Score': metrics['f1']
    })

# Naive Bayes results
for feature_name, metrics in nb_results.items():
    results_data.append({
        'Model': 'Naive Bayes',
        'Feature': feature_name.upper(),
        'Accuracy': metrics['accuracy'],
        'Precision': metrics['precision'],
        'Recall': metrics['recall'],
        'F1-Score': metrics['f1']
    })

# LSTM results
results_data.append({
    'Model': 'LSTM',
    'Feature': 'Sequence Embedding',
    'Accuracy': accuracy_lstm,
    'Precision': precision_lstm,
    'Recall': recall_lstm,
    'F1-Score': f1_lstm
})

# BERT results
results_data.append({
    'Model': 'BERT',
    'Feature': 'Transformer (Sampled)',
    'Accuracy': accuracy_bert,
    'Precision': precision_bert,
    'Recall': recall_bert,
    'F1-Score': f1_bert
})

# Add Gemma (Ollama) if metrics were computed
if 'accuracy_gemma' in globals():
    results_data.append({
        'Model': 'Gemma (Ollama)',
        'Feature': 'Gemma (Sampled)',
        'Accuracy': accuracy_gemma,
        'Precision': precision_gemma,
        'Recall': recall_gemma,
        'F1-Score': f1_gemma
    })
else:
    print('Gemma metrics not available; skipping')

# Add GPT (Ollama) if metrics were computed
if 'accuracy_gpt' in globals():
    results_data.append({
        'Model': 'GPT (Ollama)',
        'Feature': 'GPT (Sampled)',
        'Accuracy': accuracy_gpt,
        'Precision': precision_gpt,
        'Recall': recall_gpt,
        'F1-Score': f1_gpt
    })
else:
    print('GPT metrics not available; skipping')

# Create results dataframe
results_df = pd.DataFrame(results_data)

print("="*100)
print("COMPREHENSIVE MODEL RESULTS COMPARISON")
print("="*100)
print(f"\n{results_df.to_string(index=False)}")

# Best models
print("\n" + "="*100)
print("TOP PERFORMING MODELS")
print("="*100)

best_accuracy_idx = results_df['Accuracy'].idxmax()
best_f1_idx = results_df['F1-Score'].idxmax()

best_accuracy = results_df.loc[best_accuracy_idx]
best_f1 = results_df.loc[best_f1_idx]

print(f"\nBest Accuracy: {best_accuracy['Model']} ({best_accuracy['Feature']})")
print(f"  → Accuracy: {best_accuracy['Accuracy']:.4f}")

print(f"\nBest F1-Score (macro): {best_f1['Model']} ({best_f1['Feature']})")
print(f"  → F1-Score: {best_f1['F1-Score']:.4f}")
print(f"  → Accuracy: {best_f1['Accuracy']:.4f}")

# Feature engineering comparison
print("\n" + "="*100)
print("FEATURE ENGINEERING ANALYSIS")
print("="*100)

for feature in ['UNIGRAM', 'BIGRAM', 'MIX', 'TFIDF']:
    feature_results = results_df[results_df['Feature'] == feature]
    if len(feature_results) > 0:
        avg_accuracy = feature_results['Accuracy'].mean()
        avg_f1 = feature_results['F1-Score'].mean()
        print(f"\n{feature}:")
        print(f"  Avg Accuracy: {avg_accuracy:.4f}")
        print(f"  Avg F1-Score: {avg_f1:.4f}")

print("\n" + "="*100)