# Comprehensive Model Evaluation

This notebook provides a detailed evaluation of our email assistant models, including:
1. Intent Classification Metrics
2. Reply Generation Quality
3. System Performance Metrics
4. A/B Testing Results

### Imports and Data Load

In [1]:
import pandas as pd
import torch
import pickle
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import json
import tqdm

# Register tqdm's pandas integration so Series/DataFrame objects expose
# `progress_apply` (used by the evaluation cells below).
tqdm.tqdm.pandas()

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


### Load Test Data

In [2]:
# Read the processed email table, keep only the three columns this notebook
# uses, and drop every row that is missing any of them.
df = (
    pd.read_csv("../data/processed/clean_emails.csv")
    .loc[:, ["clean_body", "label", "entities"]]
    .dropna()
)

# Parse entities
def parse_entities(ent_str):
    """Decode a serialized entity mapping and drop its empty entries.

    The input may be real JSON or a Python-style dict literal (single-quoted
    keys/values). Decoders are tried from most to least faithful so that
    apostrophes inside values (e.g. "O'Brien") survive; the original blanket
    quote replacement is kept only as a last resort.

    Args:
        ent_str: Serialized mapping of entity type -> value(s).

    Returns:
        dict: The decoded mapping with falsy values (empty lists, None, "")
        removed, or an empty dict when the input is not a string, cannot be
        decoded, or does not decode to a dict.
    """
    import ast  # local import keeps this cell self-contained

    if not isinstance(ent_str, str):
        return {}

    decoders = (
        json.loads,
        ast.literal_eval,
        # Legacy fallback: treat single quotes as JSON double quotes.
        lambda raw: json.loads(raw.replace("'", '"')),
    )
    for decode in decoders:
        try:
            parsed = decode(ent_str)
        except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
            # json.JSONDecodeError is a ValueError; the remaining types are
            # what ast.literal_eval raises on malformed or oversized input.
            continue
        if isinstance(parsed, dict):
            return {k: v for k, v in parsed.items() if v}
    return {}

# Decode each row's serialized "entities" value with parse_entities and keep
# the resulting dict in a new "parsed_entities" column.
df["parsed_entities"] = df["entities"].apply(parse_entities)

### Evaluate Intent Classifier

In [3]:
# Load the intent classifier and its tokenizer from the local model directory;
# the model is moved to the selected device.
intent_model_dir = "../models/intent_classifier"

intent_model = DistilBertForSequenceClassification.from_pretrained(intent_model_dir)
intent_model = intent_model.to(device)
intent_tokenizer = DistilBertTokenizerFast.from_pretrained(intent_model_dir)

# The label encoder maps predicted class indices back to label values.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so this path must only ever point at a trusted artifact.
with open(f"{intent_model_dir}/label_encoder.pkl", "rb") as encoder_file:
    label_encoder = pickle.load(encoder_file)

def predict_intent(text):
    """Classify one email body and return its decoded intent label.

    Args:
        text: The email text to classify. The result is read with `.item()`,
            which requires exactly one prediction, so pass one text per call.

    Returns:
        The label produced by `label_encoder.inverse_transform` for the
        highest-scoring class.
    """
    encoded = intent_tokenizer(
        text,
        truncation=True,
        padding=True,
        return_tensors="pt",
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        class_scores = intent_model(**encoded).logits

    class_index = class_scores.argmax(dim=1).item()
    decoded = label_encoder.inverse_transform([class_index])
    return decoded[0]

# Predict an intent for every email body, one predict_intent call per row,
# with a tqdm progress bar.
df["predicted_label"] = df["clean_body"].progress_apply(predict_intent)

# Evaluate
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0.0 (the same value the default
# produces) without emitting an undefined-metric warning for each average.
report = classification_report(df["label"], df["predicted_label"], zero_division=0)
print("📊 Intent Classification Report:")
print(report)

100%|██████████| 425552/425552 [3:48:15<00:00, 31.07it/s]  
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


📊 Intent Classification Report:
                   precision    recall  f1-score   support

     Appreciation       0.95      0.97      0.96     60567
        Complaint       0.90      0.87      0.89     13994
     Data Request       0.93      0.97      0.95     57807
   Event Planning       0.00      0.00      0.00       904
         Farewell       0.00      0.00      0.00       884
          Finance       0.84      0.84      0.84     17805
  General Inquiry       0.74      0.93      0.82     18796
         Greeting       0.85      0.88      0.86     32509
      Job Inquiry       0.90      0.78      0.84     20744
            Legal       0.92      0.93      0.92     29772
  Meeting Request       0.94      0.93      0.93    162634
         Personal       0.00      0.00      0.00       663
   Project Update       0.97      0.08      0.16      1256
         Reminder       0.00      0.00      0.00       571
    Sales Inquiry       0.82      0.69      0.75      2372
Technical Support      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Evaluate Reply Generator

In [6]:
# Load the T5 reply generator and its tokenizer from the local model
# directory; the model is moved to the selected device.
reply_model_dir = "../models/reply_generator"
reply_tokenizer = T5Tokenizer.from_pretrained(reply_model_dir)
reply_model = T5ForConditionalGeneration.from_pretrained(reply_model_dir)
reply_model = reply_model.to(device)

# Prompt builder
def build_prompt(row):
    """Format one email row as the text prompt passed to the reply generator.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            "parsed_entities" (dict of entity type -> value or list of
            values), "label" and "clean_body".

    Returns:
        str: "Intent: ... | RecipientName: ... | Entities: ... | Email: ...".
        RecipientName is the first PERSON value, or "Unknown" when no PERSON
        value exists. Entities is "None" when the entity dict is empty.
    """

    def _as_strings(value):
        # A bare string is one value: indexing or joining it directly would
        # operate on its individual characters. Non-string items are
        # stringified so that str.join cannot raise TypeError.
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value]
        return [str(value)]

    entities = row["parsed_entities"] or {}

    people = _as_strings(entities["PERSON"]) if "PERSON" in entities else []
    recipient = people[0] if people else "Unknown"

    if entities:
        entities_str = " | ".join(
            f"{k}: {', '.join(_as_strings(v))}" for k, v in entities.items()
        )
    else:
        entities_str = "None"

    return f"Intent: {row['label']} | RecipientName: {recipient} | Entities: {entities_str} | Email: {row['clean_body']}"

# Sample a smaller subset for quick evaluation: 500 rows drawn with a fixed
# seed (random_state=42) and re-indexed from 0.
df_sample = df.sample(n=500, random_state=42).reset_index(drop=True)
# Build one prompt string per sampled row via build_prompt.
df_sample["prompt"] = df_sample.apply(build_prompt, axis=1)
# FIXME(review): "target" is a copy of the prompt column, not a reference
# reply. Any metric that scores generated text against "target" therefore
# measures overlap with the model's own input rather than reply quality.
# No reference-reply column is loaded in this notebook (only clean_body,
# label and entities), so the correct source for "target" must be confirmed.
df_sample["target"] = df_sample["prompt"]

# Generate replies
def generate_reply(prompt):
    """Generate a reply for one prompt with the T5 reply model.

    Args:
        prompt: Prompt text; it is tokenized with truncation at
            max_length=512 before being passed to the model.

    Returns:
        str: The first generated sequence decoded with special tokens
        removed. `generate` is called with max_length=128.
    """
    encoded = reply_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        generated_ids = reply_model.generate(**encoded, max_length=128)

    first_sequence = generated_ids[0]
    return reply_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Generate a reply for every sampled prompt, one generate_reply call per row,
# with a tqdm progress bar.
df_sample["generated_reply"] = df_sample["prompt"].progress_apply(generate_reply)

100%|██████████| 500/500 [02:33<00:00,  3.25it/s]
100%|██████████| 500/500 [02:33<00:00,  3.25it/s]


### Compute BLEU & ROUGE

In [7]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Score each generated reply against its "target" text:
#   - sentence-level BLEU on whitespace tokens, smoothed with method4
#   - ROUGE-L F-measure with stemming enabled
smoothie = SmoothingFunction().method4
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

bleu_scores = []
rouge_l_scores = []

for reference, candidate in zip(df_sample["target"], df_sample["generated_reply"]):
    # BLEU takes a list of tokenized references and one tokenized hypothesis.
    bleu_scores.append(
        sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothie)
    )
    # ROUGE-L works on the raw strings.
    rouge_l_scores.append(scorer.score(reference, candidate)["rougeL"].fmeasure)

avg_bleu = sum(bleu_scores) / len(bleu_scores)
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

print(f"📘 Avg BLEU: {avg_bleu:.4f}")
print(f"📕 Avg ROUGE-L: {avg_rouge_l:.4f}")

📘 Avg BLEU: 0.0012
📕 Avg ROUGE-L: 0.0580


### Save Evaluation Results

In [9]:
# Write the evaluated sample (input text, label, prompt and generated reply)
# to CSV without the index column.
export_columns = ["clean_body", "label", "prompt", "generated_reply"]
df_sample.to_csv(
    "../data/processed/evaluation_output.csv",
    columns=export_columns,
    index=False,
)
print("✅ Evaluation results saved to ../data/processed/evaluation_output.csv")

✅ Evaluation results saved to ../data/processed/evaluation_output.csv
