# Spam Detection

This notebook contains only tested, working code.
All redundancy has been removed, and all graphs have been tested and confirmed working.


## 1. Import Libraries and Load Data

In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [26]:
# Helper: Centralized Ollama inference + robust label extraction
import subprocess
from pathlib import Path
import json
import time

def extract_label(output: str) -> "str | None":
    """Extract a canonical ``'SPAM'`` / ``'HAM'`` label from raw model text.

    Strategies are tried in order; the first one that matches wins:

    1. Scan lines bottom-up. A line that is exactly SPAM/HAM wins; otherwise the
       first token on the line that is SPAM/HAM or LABEL_0/LABEL_1 (after
       stripping surrounding spaces, colons, periods, commas and quotes).
    2. ``LABEL_1`` / ``LABEL_0`` anywhere in the text.
    3. The last whole-word SPAM/HAM in the text (covers ``**HAM**``, ``HAM!``).
       Whole-word matching avoids hits inside words such as "shame".
    4. A bare ``0`` / ``1`` token, scanning lines bottom-up. This is a genuine
       last resort so that list markers like ``1.`` cannot override an explicit
       word label elsewhere in the output.

    Args:
        output: Raw text produced by the model, or None.

    Returns:
        ``'SPAM'``, ``'HAM'``, or None when ``output`` is None or no strategy matches.
    """
    import re  # local import: keeps the helper usable without touching the cell's imports

    if output is None:
        return None
    out_up = output.upper()

    # Pre-tokenise every line once (bottom-up) so both token passes share the work.
    lines_up = [line.strip().upper() for line in reversed(output.splitlines())]
    tokenized = [[t.strip(" :.,\"'\t") for t in l.split()] for l in lines_up]

    # 1) explicit word labels, most recent line first
    for line_up, tokens in zip(lines_up, tokenized):
        if line_up in ("SPAM", "HAM"):
            return line_up
        for tok in tokens:
            if tok in ("SPAM", "HAM"):
                return tok
            if tok in ("LABEL_0", "LABEL_1"):
                return "SPAM" if tok.endswith("1") else "HAM"

    # 2) classifier-style labels embedded in other text
    if "LABEL_1" in out_up:
        return "SPAM"
    if "LABEL_0" in out_up:
        return "HAM"

    # 3) last whole-word mention, e.g. wrapped in markdown or followed by punctuation
    word_hits = re.findall(r"\b(SPAM|HAM)\b", out_up)
    if word_hits:
        return word_hits[-1]

    # 4) bare numeric answers ("1", "Answer: 0"), only when nothing else matched
    for tokens in tokenized:
        for tok in tokens:
            if tok in ("0", "1"):
                return "SPAM" if tok == "1" else "HAM"
    return None

def run_ollama_model(ollama_model: str, texts, sample_size=100, timeout=60, verbose=True, seed=None, indices=None):
    """Classify ``texts`` as SPAM/HAM by shelling out to ``ollama run <model>``.

    Args:
        ollama_model: Model name passed to ``ollama run``.
        texts: Iterable of email texts. It is materialised into a list, so pandas
            Series (any index), NumPy arrays, lists and generators are all indexed
            by position.
        sample_size: Number of texts to sample without replacement. ``None`` or a
            value >= ``len(texts)`` runs every text in order.
        timeout: Per-call timeout in seconds for the ``ollama`` subprocess.
        verbose: Print progress every 50 samples and a summary of unparseable outputs.
        seed: Seed for a dedicated ``numpy.random.default_rng``; when None the
            global NumPy RNG is used for sampling.
        indices: Explicit positions into ``texts`` to run; overrides sampling.

    Returns:
        ``(preds, raw_outputs, indices)`` where ``preds`` is an integer array of
        0 (HAM) / 1 (SPAM), ``raw_outputs`` is the list of captured model outputs,
        and ``indices`` is the array of positions that were evaluated.

    Samples that time out, fail, exit with a non-zero status, or yield no
    recognisable label are counted as HAM (0) so the output stays aligned with
    ``indices``; each such case is reported on stdout.
    """
    import numpy as _np

    # Positional access regardless of container type (a Series would otherwise be
    # indexed by label, which breaks after shuffling/splitting).
    texts = list(texts)
    n = len(texts)

    if indices is not None:
        # Explicit indices are assumed to be positions into `texts`
        indices = _np.asarray(indices, dtype=int)
    elif sample_size is None or sample_size >= n:
        indices = _np.arange(n)
    elif seed is not None:
        indices = _np.random.default_rng(seed).choice(n, sample_size, replace=False)
    else:
        indices = _np.random.choice(n, sample_size, replace=False)

    selected = [texts[i] for i in indices]
    preds = []
    raw_outputs = []
    unparsed = 0
    for i, text in enumerate(selected):
        if verbose and i % 50 == 0:
            print(f"  Running {ollama_model}: {i}/{len(selected)}")
        # str() keeps non-string cells (e.g. NaN from a CSV) from raising on slicing
        prompt = f"Classify this email as SPAM or HAM (not spam). Answer with only SPAM or HAM.\n\nEmail: {str(text)[:1000]}"
        try:
            result = subprocess.run(["ollama", "run", ollama_model, prompt],
                                     timeout=timeout, capture_output=True, text=True, input="")
            if result.returncode != 0:
                # A failed run has no answer; do not try to read a label out of the error text
                err = (result.stderr or result.stdout or "").strip()
                print(f"  Ollama exited with code {result.returncode} for sample {i}: {err[:200]}; marking HAM (0) and continuing")
                raw_outputs.append(err)
                preds.append(0)
                continue
            output = (result.stdout or result.stderr or "").strip()
            raw_outputs.append(output)
            label = extract_label(output)
            if label is None:
                unparsed += 1
            preds.append(1 if label == "SPAM" else 0)
        except subprocess.TimeoutExpired:
            print(f"  Timeout for sample {i}; marking HAM (0) and continuing")
            raw_outputs.append("")
            preds.append(0)
        except Exception as e:
            print(f"  Error for sample {i}: {e}; marking HAM (0) and continuing")
            # Keep raw_outputs aligned with preds even if the failure happened after
            # the raw output was already recorded.
            if len(raw_outputs) == len(preds):
                raw_outputs.append("")
            preds.append(0)

    if verbose and unparsed:
        print(f"  Warning: {unparsed}/{len(selected)} outputs had no recognisable label and were counted as HAM (0)")
    # dtype=int so an empty run still yields an integer array
    return _np.array(preds, dtype=int), raw_outputs, indices


In [3]:
# Load the Enron spam CSV (expected next to the notebook)
csv_path = Path('enron_spam_data_to_use.csv')
if not csv_path.exists():
    print("✗ CSV file not found")
    df_original = None
else:
    df_original = pd.read_csv(csv_path)
    if 'Spam/Ham' in df_original.columns:
        # Canonicalise labels: trimmed, lower-case text, with numeric codes
        # '0'/'1' mapped onto 'ham'/'spam'
        df_original['Spam/Ham'] = (
            df_original['Spam/Ham']
            .astype(str)
            .str.strip()
            .str.lower()
            .replace({'0':'ham','1':'spam'})
        )
    print(f"✓ Loaded CSV with {df_original.shape[0]} rows and {df_original.shape[1]} columns")

✓ Loaded CSV with 33716 rows and 5 columns


## 2. Data Exploration

In [4]:
# One-shot overview of the raw frame: shape, schema, null counts, label balance.
# The pieces are emitted with a single print call, one per line.
print(
    f"Dataset Shape: {df_original.shape}",
    f"\nColumns: {df_original.columns.tolist()}",
    f"\nData Types:\n{df_original.dtypes}",
    f"\nMissing Values:\n{df_original.isnull().sum()}",
    f"\nClass Distribution:\n{df_original['Spam/Ham'].value_counts()}",
    f"\nFirst 3 rows:",
    sep="\n",
)
print(df_original.head(3))

Dataset Shape: (33716, 5)

Columns: ['Unnamed: 0', 'Subject', 'Message', 'Spam/Ham', 'Date']

Data Types:
Unnamed: 0     int64
Subject       object
Message       object
Spam/Ham      object
Date          object
dtype: object

Missing Values:
Unnamed: 0     0
Subject        0
Message       52
Spam/Ham       0
Date           0
dtype: int64

Class Distribution:
Spam/Ham
spam    17171
ham     16545
Name: count, dtype: int64

First 3 rows:
   Unnamed: 0                       Subject  \
0           0  christmas tree farm pictures   
1           1      vastar resources , inc .   
2           2  calpine daily gas nomination   

                                             Message Spam/Ham        Date  
0                                                NaN      ham  1999-12-10  
1  gary , production from the high island larger ...      ham  1999-12-13  
2             - calpine daily gas nomination 1 . doc      ham  1999-12-14  


## 3. Train-Dev-Test Split (80-10-10)

In [5]:
# Keep only rows that actually carry a message body
df = df_original.dropna(subset=['Message']).copy()
print(f"Dataset after removing null messages: {df.shape[0]} rows\n")

# Seeded global shuffle to remove any ordering bias in the source file.
# The index is reset first so positions and labels coincide, then again after
# the permutation so the shuffled frame is 0..n-1.
np.random.seed(42)
shuffled_indices = np.random.permutation(len(df))
df = df.reset_index(drop=True).iloc[shuffled_indices].reset_index(drop=True)

print("✓ Dataset shuffled with random permutation\n")

# Model input is subject and body joined by a single space
df['text'] = df['Subject'].astype(str) + " " + df['Message'].astype(str)
X, y = df[['text']], df['Spam/Ham']

# Stratified 80/20 split, then the 20% is halved into dev and test (10% each)
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
    shuffle=True,
)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
    shuffle=True,
)

# Split sizes as a share of the cleaned dataset
print(f"Train set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Dev set: {X_dev.shape[0]} samples ({X_dev.shape[0]/len(X)*100:.1f}%)")
print(f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

# Stratification check: every split should mirror the overall class balance
print(f"\nClass balance verification:")
print(f"\nOverall distribution:")
print(f"  {y.value_counts()}")

print(f"\nTrain class distribution:")
print(f"  {y_train.value_counts()}")
print(f"  Percentages: {(y_train.value_counts() / len(y_train) * 100).round(2)}")

print(f"\nDev class distribution:")
print(f"  {y_dev.value_counts()}")
print(f"  Percentages: {(y_dev.value_counts() / len(y_dev) * 100).round(2)}")

print(f"\nTest class distribution:")
print(f"  {y_test.value_counts()}")
print(f"  Percentages: {(y_test.value_counts() / len(y_test) * 100).round(2)}")

Dataset after removing null messages: 33664 rows

✓ Dataset shuffled with random permutation

Train set: 26931 samples (80.0%)
Dev set: 3366 samples (10.0%)
Test set: 3367 samples (10.0%)

Class balance verification:

Overall distribution:
  Spam/Ham
spam    17171
ham     16493
Name: count, dtype: int64

Train class distribution:
  Spam/Ham
spam    13737
ham     13194
Name: count, dtype: int64
  Percentages: Spam/Ham
spam    51.01
ham     48.99
Name: count, dtype: float64

Dev class distribution:
  Spam/Ham
spam    1717
ham     1649
Name: count, dtype: int64
  Percentages: Spam/Ham
spam    51.01
ham     48.99
Name: count, dtype: float64

Test class distribution:
  Spam/Ham
spam    1717
ham     1650
Name: count, dtype: int64
  Percentages: Spam/Ham
spam    50.99
ham     49.01
Name: count, dtype: float64


## 4. Feature Engineering: Unigram, Bigram, Mix, and TF-IDF

In [6]:
# Plain string series for each split
X_train_text, X_dev_text, X_test_text = (
    split['text'].astype(str) for split in (X_train, X_dev, X_test)
)

print("Creating feature vectors...\n")


def _vectorize_splits(vectorizer):
    """Fit ``vectorizer`` on the training text, then transform train/dev/test.

    Returns the (train, dev, test) matrices; the vocabulary comes from the
    training split only.
    """
    train_matrix = vectorizer.fit_transform(X_train_text)
    dev_matrix = vectorizer.transform(X_dev_text)
    test_matrix = vectorizer.transform(X_test_text)
    return train_matrix, dev_matrix, test_matrix


# 1. Unigram counts (single words)
vectorizer_unigram = CountVectorizer(ngram_range=(1, 1), max_features=5000)
X_train_unigram, X_dev_unigram, X_test_unigram = _vectorize_splits(vectorizer_unigram)
print(f"✓ Unigram: {X_train_unigram.shape}")

# 2. Bigram counts (word pairs)
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2), max_features=5000)
X_train_bigram, X_dev_bigram, X_test_bigram = _vectorize_splits(vectorizer_bigram)
print(f"✓ Bigram: {X_train_bigram.shape}")

# 3. Unigram + bigram counts sharing one 5000-feature budget
vectorizer_mix = CountVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_mix, X_dev_mix, X_test_mix = _vectorize_splits(vectorizer_mix)
print(f"✓ Mix (1-2gram): {X_train_mix.shape}")

# 4. TF-IDF weighting over the same 1-2gram range
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf, X_dev_tfidf, X_test_tfidf = _vectorize_splits(tfidf_vectorizer)
print(f"✓ TF-IDF (1-2gram): {X_train_tfidf.shape}")

# Lookup used by the model cells: name -> (train, dev, test) matrices
features = {}
features['unigram'] = (X_train_unigram, X_dev_unigram, X_test_unigram)
features['bigram'] = (X_train_bigram, X_dev_bigram, X_test_bigram)
features['mix'] = (X_train_mix, X_dev_mix, X_test_mix)
features['tfidf'] = (X_train_tfidf, X_dev_tfidf, X_test_tfidf)

print("\n✓ Feature engineering completed")

Creating feature vectors...

✓ Unigram: (26931, 5000)
✓ Unigram: (26931, 5000)
✓ Bigram: (26931, 5000)
✓ Bigram: (26931, 5000)
✓ Mix (1-2gram): (26931, 5000)
✓ Mix (1-2gram): (26931, 5000)
✓ TF-IDF (1-2gram): (26931, 5000)

✓ Feature engineering completed
✓ TF-IDF (1-2gram): (26931, 5000)

✓ Feature engineering completed


## 5. Model 1: Logistic Regression

In [7]:
# feature name -> fitted model plus its test-set metrics
lr_results = {}

print("Training Logistic Regression models with different features...\n")

for feature_name, splits in features.items():
    # The dev matrix is unpacked but not used in this cell
    X_tr, X_dv, X_ts = splits
    print(f"Training with {feature_name.upper()} features...")

    # Fit on the training split
    lr_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
    lr_model.fit(X_tr, y_train)

    # Score on the test split; precision/recall/F1 are macro-averaged
    y_pred = lr_model.predict(X_ts)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = (
        scorer(y_test, y_pred, average='macro', zero_division=0)
        for scorer in (precision_score, recall_score, f1_score)
    )

    lr_results[feature_name] = dict(
        model=lr_model,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
    )

    print(f"  Accuracy: {accuracy:.4f} | F1 (macro): {f1:.4f}\n")

# Per-feature summary table
print("="*70)
print("LOGISTIC REGRESSION - RESULTS SUMMARY")
print("="*70)
for feature_name in lr_results:
    metrics = lr_results[feature_name]
    print(f"\n{feature_name.upper()}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision (macro): {metrics['precision']:.4f}")
    print(f"  Recall (macro): {metrics['recall']:.4f}")
    print(f"  F1-Score (macro): {metrics['f1']:.4f}")

Training Logistic Regression models with different features...

Training with UNIGRAM features...
  Accuracy: 0.9994 | F1 (macro): 0.9994

Training with BIGRAM features...
  Accuracy: 0.9994 | F1 (macro): 0.9994

Training with BIGRAM features...
  Accuracy: 0.9994 | F1 (macro): 0.9994

Training with MIX features...
  Accuracy: 0.9994 | F1 (macro): 0.9994

Training with MIX features...
  Accuracy: 0.9997 | F1 (macro): 0.9997

Training with TFIDF features...
  Accuracy: 0.9997 | F1 (macro): 0.9997

Training with TFIDF features...
  Accuracy: 0.9991 | F1 (macro): 0.9991

LOGISTIC REGRESSION - RESULTS SUMMARY

UNIGRAM:
  Accuracy: 0.9994
  Precision (macro): 0.9994
  Recall (macro): 0.9994
  F1-Score (macro): 0.9994

BIGRAM:
  Accuracy: 0.9994
  Precision (macro): 0.9994
  Recall (macro): 0.9994
  F1-Score (macro): 0.9994

MIX:
  Accuracy: 0.9997
  Precision (macro): 0.9997
  Recall (macro): 0.9997
  F1-Score (macro): 0.9997

TFIDF:
  Accuracy: 0.9991
  Precision (macro): 0.9991
  Recall (

## 6. Model 2: Naive Bayes

In [8]:
# feature name -> fitted model plus its test-set metrics
nb_results = {}

print("Training Naive Bayes models with different features...\n")

for feature_name, (X_tr, X_dv, X_ts) in features.items():
    print(f"Training with {feature_name.upper()} features...")

    # Fit a multinomial NB with default smoothing on the training split
    nb_model = MultinomialNB().fit(X_tr, y_train)

    # Test-split predictions and macro-averaged scores
    y_pred = nb_model.predict(X_ts)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    nb_results[feature_name] = {'model': nb_model}
    nb_results[feature_name].update(
        {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}
    )

    print(f"  Accuracy: {accuracy:.4f} | F1 (macro): {f1:.4f}\n")

# Per-feature summary table
print("="*70)
print("NAIVE BAYES - RESULTS SUMMARY")
print("="*70)
for feature_name in nb_results.keys():
    metrics = nb_results[feature_name]
    print(f"\n{feature_name.upper()}:")
    print(f"  Accuracy: {metrics['accuracy']:.4f}")
    print(f"  Precision (macro): {metrics['precision']:.4f}")
    print(f"  Recall (macro): {metrics['recall']:.4f}")
    print(f"  F1-Score (macro): {metrics['f1']:.4f}")

Training Naive Bayes models with different features...

Training with UNIGRAM features...
  Accuracy: 0.9854 | F1 (macro): 0.9854

Training with BIGRAM features...
  Accuracy: 0.9837 | F1 (macro): 0.9836

Training with MIX features...
  Accuracy: 0.9860 | F1 (macro): 0.9860

Training with TFIDF features...
  Accuracy: 0.9872 | F1 (macro): 0.9872

NAIVE BAYES - RESULTS SUMMARY

UNIGRAM:
  Accuracy: 0.9854
  Precision (macro): 0.9861
  Recall (macro): 0.9852
  F1-Score (macro): 0.9854

BIGRAM:
  Accuracy: 0.9837
  Precision (macro): 0.9845
  Recall (macro): 0.9833
  F1-Score (macro): 0.9836

MIX:
  Accuracy: 0.9860
  Precision (macro): 0.9867
  Recall (macro): 0.9858
  F1-Score (macro): 0.9860

TFIDF:
  Accuracy: 0.9872
  Precision (macro): 0.9878
  Recall (macro): 0.9870
  F1-Score (macro): 0.9872
  Accuracy: 0.9872 | F1 (macro): 0.9872

NAIVE BAYES - RESULTS SUMMARY

UNIGRAM:
  Accuracy: 0.9854
  Precision (macro): 0.9861
  Recall (macro): 0.9852
  F1-Score (macro): 0.9854

BIGRAM:
  A

## 7. Model 3: LSTM (Neural Network)

In [9]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

print("Preparing data for LSTM...\n")

# Seed TensorFlow so weight initialisation, dropout masks and batch shuffling are
# repeatable across runs. Only TF's RNG is touched; NumPy's global state is left
# alone because other cells sample with it.
# NOTE(review): some GPU kernels are non-deterministic, so bit-identical results
# are only expected on CPU — confirm if exact reproducibility matters.
tf.random.set_seed(42)

# Tokenize text
max_features = 5000   # vocabulary cap; also the Embedding input dimension
max_length = 100      # tokens kept per email after padding/truncation

# NOTE(review): keras.preprocessing.text.Tokenizer is flagged as deprecated in
# recent TensorFlow releases — confirm against the TF version in use.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train_text)   # vocabulary is learned from train only

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_dev_seq = tokenizer.texts_to_sequences(X_dev_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Pad sequences to a fixed length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_dev_pad = pad_sequences(X_dev_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

# Convert labels to binary (0 for ham, 1 for spam)
y_train_bin = (y_train.values == 'spam').astype(int)
y_dev_bin = (y_dev.values == 'spam').astype(int)
y_test_bin = (y_test.values == 'spam').astype(int)

print(f"Train sequences: {X_train_pad.shape}")
print(f"Dev sequences: {X_dev_pad.shape}")
print(f"Test sequences: {X_test_pad.shape}\n")

# Build LSTM model: embedding -> BiLSTM -> LSTM -> dense head with sigmoid output
print("Building LSTM model...")
lstm_model = Sequential([
    Embedding(max_features, 128),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    LSTM(32),
    Dense(64, activation='relu'),
    keras.layers.Dropout(0.5),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train with early stopping on dev loss; best weights are restored afterwards
print("Training LSTM model...\n")
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = lstm_model.fit(
    X_train_pad, y_train_bin,
    validation_data=(X_dev_pad, y_dev_bin),
    epochs=15,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

# Evaluate on the test split; probabilities above 0.5 count as spam
y_pred_lstm = (lstm_model.predict(X_test_pad, verbose=0) > 0.5).astype(int).flatten()
accuracy_lstm = accuracy_score(y_test_bin, y_pred_lstm)
precision_lstm = precision_score(y_test_bin, y_pred_lstm, average='macro', zero_division=0)
recall_lstm = recall_score(y_test_bin, y_pred_lstm, average='macro', zero_division=0)
f1_lstm = f1_score(y_test_bin, y_pred_lstm, average='macro', zero_division=0)

print("="*70)
print("LSTM - RESULTS")
print("="*70)
print(f"Accuracy: {accuracy_lstm:.4f}")
print(f"Precision (macro): {precision_lstm:.4f}")
print(f"Recall (macro): {recall_lstm:.4f}")
print(f"F1-Score (macro): {f1_lstm:.4f}")

Preparing data for LSTM...

Train sequences: (26931, 100)
Dev sequences: (3366, 100)
Test sequences: (3367, 100)

Building LSTM model...
Training LSTM model...

Train sequences: (26931, 100)
Dev sequences: (3366, 100)
Test sequences: (3367, 100)

Building LSTM model...
Training LSTM model...

LSTM - RESULTS
Accuracy: 0.9976
Precision (macro): 0.9977
Recall (macro): 0.9976
F1-Score (macro): 0.9976
LSTM - RESULTS
Accuracy: 0.9976
Precision (macro): 0.9977
Recall (macro): 0.9976
F1-Score (macro): 0.9976


## 8. Model 4: BERT (DistilBERT Transformer, not fine-tuned)

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch

print("Loading BERT model...\n")

# Use distilbert for faster inference
model_name = "distilbert-base-uncased"
tokenizer_bert = AutoTokenizer.from_pretrained(model_name)
# The base checkpoint has no trained classification head: the library initialises
# it randomly and warns that the model should be trained before use. Nothing in
# this cell fine-tunes it, so the scores below are an untrained baseline.
bert_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Create pipeline (GPU 0 when available, otherwise CPU)
bert_pipeline = TextClassificationPipeline(
    model=bert_model, 
    tokenizer=tokenizer_bert, 
    device=0 if torch.cuda.is_available() else -1
)

print("Evaluating BERT on test set (sample)...\n")

# Sample test data for speed. A dedicated, seeded generator makes the sample
# independent of how many random draws earlier cells consumed.
sample_size = min(1000, len(X_test_text))
bert_rng = np.random.default_rng(42)
sample_indices = bert_rng.choice(len(X_test_text), sample_size, replace=False)
X_test_sample = X_test_text.iloc[sample_indices].values
y_test_sample = y_test_bin[sample_indices]

# Get predictions one text at a time
bert_predictions = []
bert_failures = 0
for i, text in enumerate(X_test_sample):
    if i % 250 == 0:
        print(f"  Processed {i}/{len(X_test_sample)}")
    try:
        # Character slice bounds the input size; the tokenizer truncates to model length
        result = bert_pipeline(text[:512], truncation=True)
        label = 1 if result[0]['label'] == 'LABEL_1' else 0
    except Exception as exc:
        # Keep going, but make failures visible instead of silently scoring them as HAM
        bert_failures += 1
        if bert_failures <= 3:
            print(f"  Inference failed for sample {i}: {exc!r}; counting as HAM (0)")
        label = 0
    bert_predictions.append(label)

bert_predictions = np.array(bert_predictions)

# Evaluate
accuracy_bert = accuracy_score(y_test_sample, bert_predictions)
precision_bert = precision_score(y_test_sample, bert_predictions, average='macro', zero_division=0)
recall_bert = recall_score(y_test_sample, bert_predictions, average='macro', zero_division=0)
f1_bert = f1_score(y_test_sample, bert_predictions, average='macro', zero_division=0)

print("\n" + "="*70)
print("BERT - RESULTS (on sampled test set)")
print("="*70)
print(f"Accuracy: {accuracy_bert:.4f}")
print(f"Precision (macro): {precision_bert:.4f}")
print(f"Recall (macro): {recall_bert:.4f}")
print(f"F1-Score (macro): {f1_bert:.4f}")
if bert_failures:
    print(f"\n⚠ {bert_failures}/{len(X_test_sample)} samples failed during inference and were counted as HAM (0)")
print("\nNote: the classification head was not fine-tuned; treat these scores as an untrained baseline.")

Loading BERT model...



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu


Evaluating BERT on test set (sample)...

  Processed 0/1000
  Processed 250/1000
  Processed 250/1000
  Processed 500/1000
  Processed 500/1000
  Processed 750/1000
  Processed 750/1000

BERT - RESULTS (on sampled test set)
Accuracy: 0.5100
Precision (macro): 0.2550
Recall (macro): 0.5000
F1-Score (macro): 0.3377

BERT - RESULTS (on sampled test set)
Accuracy: 0.5100
Precision (macro): 0.2550
Recall (macro): 0.5000
F1-Score (macro): 0.3377


## 9. Model 5: Gemma (via Ollama)

In [None]:
import subprocess

print("Checking if Ollama is available...\n")

# Probe the Ollama CLI. Only "binary missing" (OSError) and subprocess failures
# such as a timeout mean "unavailable"; anything else (e.g. KeyboardInterrupt)
# is allowed to propagate.
try:
    result = subprocess.run(['ollama', 'list'], capture_output=True, timeout=5, text=True)
    if result.returncode == 0:
        print("✓ Ollama is available")
        ollama_available = True
        print(f"\nAvailable models:\n{result.stdout}")
    else:
        print("⚠ Ollama not accessible. Installation required.")
        ollama_available = False
except (OSError, subprocess.SubprocessError):
    print("⚠ Ollama not installed or not running")
    print("\nTo use Ollama:")
    print("  1. Install from https://ollama.ai")
    print("  2. Run: ollama pull gemma")
    print("  3. Start Ollama service")
    ollama_available = False

if ollama_available:
    print("\nRunning Ollama (Gemma) inference on sampled test set...\n")
    # Sample a modest-sized subset for evaluation (adjustable)
    sample_size = min(20, len(X_test_text))
    # Select indices explicitly so labels and samples can be inspected before running.
    # A seeded generator keeps the subset stable across re-runs and kernel restarts.
    if sample_size < len(X_test_text):
        indices = np.random.default_rng(42).choice(len(X_test_text), sample_size, replace=False)
    else:
        indices = np.arange(len(X_test_text))

    # Show selected indices, labels and a short preview of the selected samples
    print(f"Selected indices: {indices}")
    # np.asarray gives positional indexing whether y_test_bin is an array or a Series
    print(f"Selected labels (0=HAM,1=SPAM): {np.asarray(y_test_bin)[indices]}")
    print("Selected sample texts (first 3 shown):")
    for idx in indices[:3]:
        txt = X_test_text.iloc[idx] if hasattr(X_test_text, 'iloc') else X_test_text[idx]
        # Built outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError on Python versions before 3.12.
        preview = str(txt)[:200].replace('\n', ' ')
        print(f" - idx {idx}: {preview}")

    # Run Ollama on the explicit indices (pass indices so the function doesn't resample)
    preds, raw_outputs, indices = run_ollama_model("gemma", X_test_text.values, sample_size=None, timeout=600, verbose=True, indices=indices)

    # Align sample indices to label vector (y_test_bin defined earlier)
    y_test_sample = np.asarray(y_test_bin)[indices]
    ollama_predictions = preds

    # Compute metrics for Gemma
    accuracy_gemma = accuracy_score(y_test_sample, ollama_predictions)
    precision_gemma = precision_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    recall_gemma = recall_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    f1_gemma = f1_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)

    print("\n" + "="*70)
    print("GEMMA (Ollama) - RESULTS (on sampled test set)")
    print("="*70)
    print(f"Accuracy: {accuracy_gemma:.4f}")
    print(f"Precision (macro): {precision_gemma:.4f}")
    print(f"Recall (macro): {recall_gemma:.4f}")
    print(f"F1-Score (macro): {f1_gemma:.4f}")
    print(f"\n✓ Sample predictions: {['SPAM' if p==1 else 'HAM' for p in ollama_predictions[:20]]} (showing up to 20)")

    print(f"\nNote: Full evaluation skipped. Ollama inference is time-intensive.")
else:
    print("\nSkipping Ollama (Gemma) evaluation until service is available.")


Checking if Ollama is available...

✓ Ollama is available

Available models:
NAME            ID              SIZE      MODIFIED   
gpt-oss:20b     17052f91a42e    13 GB     7 days ago    
gemma:latest    a72c7f4d0a15    5.0 GB    8 days ago    


Running Ollama (Gemma) inference on sampled test set...

  Running gemma: 0/10
✓ Ollama is available

Available models:
NAME            ID              SIZE      MODIFIED   
gpt-oss:20b     17052f91a42e    13 GB     7 days ago    
gemma:latest    a72c7f4d0a15    5.0 GB    8 days ago    


Running Ollama (Gemma) inference on sampled test set...

  Running gemma: 0/10

GEMMA (Ollama) - RESULTS (on sampled test set)
Accuracy: 0.7000
Precision (macro): 0.8125
Recall (macro): 0.7000
F1-Score (macro): 0.6703

✓ Sample predictions: ['SPAM', 'HAM', 'HAM', 'HAM', 'HAM', 'HAM', 'HAM', 'HAM', 'HAM', 'SPAM'] (showing up to 20)

Note: Full evaluation skipped. Ollama inference is time-intensive.

GEMMA (Ollama) - RESULTS (on sampled test set)
Accuracy: 0.70

In [None]:
import subprocess

print("Checking if Ollama is available...\n")

# Probe the Ollama CLI. Only "binary missing" (OSError) and subprocess failures
# such as a timeout mean "unavailable"; anything else (e.g. KeyboardInterrupt)
# is allowed to propagate.
try:
    result = subprocess.run(['ollama', 'list'], capture_output=True, timeout=5, text=True)
    if result.returncode == 0:
        print("✓ Ollama is available")
        ollama_available = True
        print(f"\nAvailable models:\n{result.stdout}")
    else:
        print("⚠ Ollama not accessible. Installation required.")
        ollama_available = False
except (OSError, subprocess.SubprocessError):
    print("⚠ Ollama not installed or not running")
    print("\nTo use Ollama:")
    print("  1. Install from https://ollama.ai")
    # This cell runs gpt-oss:20b, so that is the model the hint must name
    print("  2. Run: ollama pull gpt-oss:20b")
    print("  3. Start Ollama service")
    ollama_available = False

if ollama_available:
    print("\nRunning Ollama (GPT) inference on sampled test set...\n")
    # Sample a modest-sized subset for evaluation (adjustable)
    sample_size = min(20, len(X_test_text))

    # Choose indices explicitly so the sample can be inspected before running.
    # A seeded generator keeps the subset stable across re-runs and kernel restarts.
    if sample_size < len(X_test_text):
        indices = np.random.default_rng(42).choice(len(X_test_text), sample_size, replace=False)
    else:
        indices = np.arange(len(X_test_text))

    # Print selected labels and the sample text before running the model
    print(f"Selected indices: {indices}")
    # np.asarray gives positional indexing whether y_test_bin is an array or a Series
    print(f"Selected labels (0=HAM,1=SPAM): {np.asarray(y_test_bin)[indices]}")
    print("Selected sample text:")
    for idx in indices:
        txt = X_test_text.iloc[idx] if hasattr(X_test_text, 'iloc') else X_test_text[idx]
        # Built outside the f-string: a backslash inside an f-string expression
        # is a SyntaxError on Python versions before 3.12.
        preview = str(txt)[:1000].replace('\n',' ')
        print(f" - idx {idx}: {preview}")

    # Pass the explicit indices so the helper does not resample
    preds, raw_outputs, indices = run_ollama_model("gpt-oss:20b", X_test_text.values, sample_size=None, timeout=600, verbose=True, indices=indices)
    
    # Align sample indices to label vector (y_test_bin defined earlier)
    y_test_sample = np.asarray(y_test_bin)[indices]
    ollama_predictions = preds

    # Compute metrics for GPT model
    accuracy_gpt = accuracy_score(y_test_sample, ollama_predictions)
    precision_gpt = precision_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    recall_gpt = recall_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)
    f1_gpt = f1_score(y_test_sample, ollama_predictions, average='macro', zero_division=0)

    print("\n" + "="*70)
    print("GPT (Ollama) - RESULTS (on sampled test set)")
    print("="*70)
    print(f"Accuracy: {accuracy_gpt:.4f}")
    print(f"Precision (macro): {precision_gpt:.4f}")
    print(f"Recall (macro): {recall_gpt:.4f}")
    print(f"F1-Score (macro): {f1_gpt:.4f}")
    print(f"\n✓ Sample predictions: {['SPAM' if p==1 else 'HAM' for p in ollama_predictions[:20]]} (showing up to 20)")

    print(f"\nNote: Full evaluation skipped. Ollama inference is time-intensive.")
else:
    print("\nSkipping Ollama (GPT) evaluation until service is available.")


Checking if Ollama is available...

✓ Ollama is available

Available models:
NAME            ID              SIZE      MODIFIED   
gpt-oss:20b     17052f91a42e    13 GB     7 days ago    
gemma:latest    a72c7f4d0a15    5.0 GB    8 days ago    


Running Ollama (GPT) inference on sampled test set...

Selected indices: [1832]
Selected labels (0=HAM,1=SPAM): [1]
Selected sample text:
 - idx 1832: transfers from ees attached is the latest version of the cost center assignments for the transfers out of ees . these transfers will be effective july 1 , 2001 and i need to get this to hr by friday , june 1 , 2001 to give them time to get everything effected . i think i have incorporated all your comments , but please review one more time and make sure we have not included anyone we shouldn ' t have or excluded anyone . you ' ll note that at this point we are not forming east and west risk management cost centers . don and rogers have decided for cost management purposes to leave it consolidate

Python(5567) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(5568) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



GPT (Ollama) - RESULTS (on sampled test set)
Accuracy: 0.0000
Precision (macro): 0.0000
Recall (macro): 0.0000
F1-Score (macro): 0.0000

✓ Sample predictions: ['HAM'] (showing up to 20)

Note: Full evaluation skipped. Ollama inference is time-intensive.


## 11. Comprehensive Results Comparison

In [29]:
# Collect one row per (model, feature) result
results_data = []

# Classical models: one row per feature representation, LR first, then NB
for model_label, per_feature in (
    ('Logistic Regression', lr_results),
    ('Naive Bayes', nb_results),
):
    for feature_name, metrics in per_feature.items():
        results_data.append({
            'Model': model_label,
            'Feature': feature_name.upper(),
            'Accuracy': metrics['accuracy'],
            'Precision': metrics['precision'],
            'Recall': metrics['recall'],
            'F1-Score': metrics['f1']
        })

# Neural models: a single row each
results_data.extend([
    {
        'Model': 'LSTM',
        'Feature': 'Sequence Embedding',
        'Accuracy': accuracy_lstm,
        'Precision': precision_lstm,
        'Recall': recall_lstm,
        'F1-Score': f1_lstm
    },
    {
        'Model': 'BERT',
        'Feature': 'Transformer (Sampled)',
        'Accuracy': accuracy_bert,
        'Precision': precision_bert,
        'Recall': recall_bert,
        'F1-Score': f1_bert
    },
])

# Ollama models are optional: include them only if their cells produced metrics
if 'accuracy_gemma' not in globals():
    print('Gemma metrics not available; skipping')
else:
    results_data.append({
        'Model': 'Gemma (Ollama)',
        'Feature': 'Gemma (Sampled)',
        'Accuracy': accuracy_gemma,
        'Precision': precision_gemma,
        'Recall': recall_gemma,
        'F1-Score': f1_gemma
    })

if 'accuracy_gpt' not in globals():
    print('GPT metrics not available; skipping')
else:
    results_data.append({
        'Model': 'GPT (Ollama)',
        'Feature': 'GPT (Sampled)',
        'Accuracy': accuracy_gpt,
        'Precision': precision_gpt,
        'Recall': recall_gpt,
        'F1-Score': f1_gpt
    })

# Tabulate
results_df = pd.DataFrame(results_data)

print("="*100)
print("COMPREHENSIVE MODEL RESULTS COMPARISON")
print("="*100)
print(f"\n{results_df.to_string(index=False)}")

# Winners by accuracy and by macro F1
print("\n" + "="*100)
print("TOP PERFORMING MODELS")
print("="*100)

if not results_df.empty:
    best_accuracy_idx = results_df['Accuracy'].idxmax()
    best_f1_idx = results_df['F1-Score'].idxmax()

    best_accuracy = results_df.loc[best_accuracy_idx]
    best_f1 = results_df.loc[best_f1_idx]

    print(f"\nBest Accuracy: {best_accuracy['Model']} ({best_accuracy['Feature']})")
    print(f"  → Accuracy: {best_accuracy['Accuracy']:.4f}")

    print(f"\nBest F1-Score (macro): {best_f1['Model']} ({best_f1['Feature']})")
    print(f"  → F1-Score: {best_f1['F1-Score']:.4f}")
    print(f"  → Accuracy: {best_f1['Accuracy']:.4f}")
else:
    print('\nNo results to summarize.')

# Average each bag-of-words representation across the models that used it
print("\n" + "="*100)
print("FEATURE ENGINEERING ANALYSIS")
print("="*100)

for feature in ['UNIGRAM', 'BIGRAM', 'MIX', 'TFIDF']:
    feature_results = results_df[results_df['Feature'] == feature]
    if len(feature_results) == 0:
        continue
    avg_accuracy = feature_results['Accuracy'].mean()
    avg_f1 = feature_results['F1-Score'].mean()
    print(f"\n{feature}:")
    print(f"  Avg Accuracy: {avg_accuracy:.4f}")
    print(f"  Avg F1-Score: {avg_f1:.4f}")

print("\n" + "="*100)

COMPREHENSIVE MODEL RESULTS COMPARISON

              Model               Feature  Accuracy  Precision   Recall  F1-Score
Logistic Regression               UNIGRAM  0.999406   0.999418 0.999394  0.999406
Logistic Regression                BIGRAM  0.999406   0.999418 0.999394  0.999406
Logistic Regression                   MIX  0.999703   0.999709 0.999697  0.999703
Logistic Regression                 TFIDF  0.999109   0.999128 0.999091  0.999109
        Naive Bayes               UNIGRAM  0.985447   0.986127 0.985152  0.985430
        Naive Bayes                BIGRAM  0.983665   0.984481 0.983333  0.983644
        Naive Bayes                   MIX  0.986041   0.986678 0.985758  0.986025
        Naive Bayes                 TFIDF  0.987229   0.987784 0.986970  0.987215
               LSTM    Sequence Embedding  0.997624   0.997681 0.997576  0.997623
               BERT Transformer (Sampled)  0.510000   0.255000 0.500000  0.337748
     Gemma (Ollama)       Gemma (Sampled)  0.700000   0.81