In [1]:
# ============================== Setup & imports ==============================
import os
import json
import pickle
import re # For text cleaning
import time
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd

# NEW: The library for keyword scoring
from rank_bm25 import BM25Okapi

# Filter warnings for cleaner output
# NOTE(review): called without a category or module argument, this filter
# silences every warning issued through the `warnings` module for the rest of
# the session (including deprecation and numeric runtime warnings), which can
# hide real problems in the evaluation below. Consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# ============================== CONFIGURATION ==============================

# --- Customer transcripts ---------------------------------------------------
# Pickle produced by the preprocessing step; it is expected to hold token
# lists (for BM25) rather than embedding vectors.
CUSTOMER_PKL = "transcript_data_bm25_ready.pkl"

# --- Evaluation prompts -----------------------------------------------------
# Directory holding one JSONL prompt file per domain.
EVAL_DIR = "Evaluation-data"
PROMPT_FILES = [
    "automotive_prompts.jsonl",
    "home_service_prompts.jsonl",
    "insurance_prompts.jsonl",
    "medical_equipment_prompts.jsonl",
]

# --- Category labels --------------------------------------------------------
# The position of a label in this list is its integer class index.
CATEGORIES = [
    "automotive - inbound call",
    "home service - inbound call",
    "insurance - outbound call",
    "medical equipment - outbound call",
]

# Lookup tables in both directions: index -> label and label -> index.
INDEX_TO_CATEGORY = {position: label for position, label in enumerate(CATEGORIES)}
CATEGORY_TO_INDEX = {label: position for position, label in enumerate(CATEGORIES)}

print("Configuration loaded. Ready to perform Keyword Search (BM25).")

Configuration loaded. Ready to perform Keyword Search (BM25).


In [2]:
# ============================== TEXT PROCESSING UTILS ==============================

def simple_tokenize(text: str) -> List[str]:
    """
    Splits text into lowercase alphanumeric words (tokens) for BM25.
    Must match the logic used to create the transcript pickle file!

    Args:
        text: Raw text to tokenize. Any non-string value yields an empty list.

    Returns:
        List of tokens made only of the characters a-z and 0-9, in their
        original order. Empty or punctuation-only input yields [].

    NOTE(review): tokens are split on any run of whitespace (spaces, tabs,
    newlines). Splitting on a literal space only would leave tabs/newlines
    embedded inside tokens (e.g. "a\\nb" would be one token). Confirm the
    preprocessing script that built the pickle tokenizes the same way.
    """
    if not isinstance(text, str):
        return []

    # 1. Lowercase
    text = text.lower()
    # 2. Remove punctuation (replace with space); whitespace is kept as-is
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # 3. Split on runs of ANY whitespace; split() never yields empty strings
    return text.split()

# Smoke test: echo a small sample and the tokens the tokenizer produces for it
sample_text = "Hello! This is an AUTO-motive call..."
print(f"Original: {sample_text}")
sample_tokens = simple_tokenize(sample_text)
print(f"Tokenized: {sample_tokens}")

Original: Hello! This is an AUTO-motive call...
Tokenized: ['hello', 'this', 'is', 'an', 'auto', 'motive', 'call']


In [3]:
# ============================== LOAD DATA ==============================

# 1. Load Customer Transcripts
if not os.path.exists(CUSTOMER_PKL):
    raise FileNotFoundError(f"File '{CUSTOMER_PKL}' not found.")

# SECURITY NOTE: unpickling can execute arbitrary code. Only load a pickle that
# was produced by your own preprocessing step, never one from an untrusted source.
with open(CUSTOMER_PKL, "rb") as f:
    customer_records: List[Dict[str, Any]] = pickle.load(f)

# Validation check
# Fail early with a clear message instead of an IndexError on an empty file.
if not customer_records:
    raise ValueError(f"File '{CUSTOMER_PKL}' contains no customer records.")

rec0 = customer_records[0]
# Check every record, not just the first: a single malformed record would
# otherwise only surface as a KeyError part-way through the evaluation.
assert all("bm25_tokens" in rec for rec in customer_records), "Data file is missing 'bm25_tokens'. Did you run the right preprocessing script?"
print(f"Loaded {len(customer_records)} customers.")

# 2. Load Prompts
def load_jsonl(path: str) -> List[Dict[str, Any]]:
    """
    Read a JSON Lines file (UTF-8) and return its parsed records.

    Lines that are empty or contain only whitespace are skipped; every other
    line is parsed with json.loads, so a malformed line raises its error.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

all_prompts: List[Dict[str, Any]] = []
missing_prompt_files: List[str] = []
for fname in PROMPT_FILES:
    p = os.path.join(EVAL_DIR, fname)
    if os.path.exists(p):
        all_prompts.extend(load_jsonl(p))
    else:
        # Remember skipped files so a partial prompt set never goes unnoticed.
        missing_prompt_files.append(p)

# Printed rather than issued via `warnings` so the message is always visible.
if missing_prompt_files:
    print(f"WARNING: prompt files not found and skipped: {missing_prompt_files}")

print(f"Loaded {len(all_prompts)} prompts.")

# Prepare prompt lists
# Parallel lists: position i in each list refers to the same prompt.
prompt_texts = [p["prompt"] for p in all_prompts]
prompt_ids   = [p["id"] for p in all_prompts]
prompt_cats  = [p["category"] for p in all_prompts]
prompt_diffs = [p["difficulty"] for p in all_prompts]

# PRE-TOKENIZE PROMPTS
# Prompts do not change between customers, so tokenize each one once here
# instead of repeating the work inside the evaluation loop.
print("Tokenizing prompts...")
prompt_tokens_list = [simple_tokenize(t) for t in prompt_texts]
print("Done.")

Loaded 50 customers.
Loaded 80 prompts.
Tokenizing prompts...
Done.


In [4]:
# ============================== METRICS UTILS ==============================

def softmax(x: np.ndarray) -> np.ndarray:
    """
    Numerically stable softmax that turns raw scores into probabilities.

    The input is cast to float32. The maximum is subtracted before
    exponentiating so large scores cannot overflow, and a small epsilon is
    added to the denominator.
    """
    logits = np.asarray(x, dtype=np.float32)
    shifted = logits - logits.max()
    exps = np.exp(shifted)
    return exps / (exps.sum() + 1e-12)

def loss_metrics(scores: np.ndarray, correct_idx: int) -> dict:
    """
    Calculates loss-style metrics for one prompt from its per-document scores.

    Args:
        scores: 1-D sequence (ndarray or plain list) with one score per
            candidate document; must contain at least two entries.
        correct_idx: Position in `scores` of the correct document.

    Returns:
        dict with:
            cross_entropy    - negative log of the softmax probability of the
                               correct document
            signed_margin    - correct score minus the best wrong score
                               (negative when a wrong document wins)
            correct_score    - raw score of the correct document
            best_other_score - highest raw score among the other documents
            rank_of_correct  - 1-based rank of the correct document when
                               scores are sorted descending; tied scores are
                               ranked by ascending index

    Raises:
        ValueError: if `scores` is not 1-D or has fewer than two entries.
        IndexError: if `correct_idx` is not a valid position in `scores`.
    """
    # Accept plain lists too; float32 -> float64 is exact, so values are unchanged.
    scores = np.asarray(scores, dtype=np.float64)
    if scores.ndim != 1 or scores.size < 2:
        raise ValueError(
            f"scores must be 1-D with at least 2 entries, got shape {scores.shape}"
        )
    if not 0 <= correct_idx < scores.size:
        raise IndexError(
            f"correct_idx {correct_idx} is out of range for {scores.size} scores"
        )

    probs = softmax(scores)

    # 1. Cross Entropy (how confident were we in the right answer?)
    # The epsilon keeps the log finite if the probability underflows to 0.
    cross_entropy = -np.log(probs[correct_idx] + 1e-12)

    # 2. Margin (Difference between Correct Score and the Best Wrong Score)
    correct_score = float(scores[correct_idx])
    best_other_score = float(np.max(np.delete(scores, correct_idx)))
    signed_margin = correct_score - best_other_score

    # 3. Rank
    # Stable sort so ties resolve the same way on every run.
    order = np.argsort(-scores, kind="stable")  # Sort descending
    rank_of_correct = int(np.where(order == correct_idx)[0][0]) + 1

    return {
        "cross_entropy": float(cross_entropy),
        "signed_margin": float(signed_margin),
        "correct_score": correct_score,
        "best_other_score": best_other_score,
        "rank_of_correct": rank_of_correct,
    }

In [5]:
# ============================== EVALUATION LOOP ==============================

results = []

print("Starting evaluation...")

for c_idx, rec in enumerate(customer_records, start=1):

    # 1. Get the corpus for this specific customer
    # This is a list of token lists, one per transcript: [[tokens_doc1], [tokens_doc2], ...]
    corpus_tokens = rec["bm25_tokens"]

    # Transcript position i is interpreted as CATEGORIES[i] below, so the
    # corpus must hold exactly one transcript per category. Fail loudly here
    # rather than with a KeyError/IndexError (or silently wrong labels) later.
    if len(corpus_tokens) != len(CATEGORIES):
        raise ValueError(
            f"Customer row {c_idx} has {len(corpus_tokens)} transcripts, "
            f"expected {len(CATEGORIES)} (one per category)."
        )

    # 2. Initialize BM25 for this customer
    # This acts as our "Search Engine" for this specific customer record
    bm25 = BM25Okapi(corpus_tokens)

    # 3. Loop through all prompts (parallel lists, pre-tokenized earlier)
    for pid, pcat, pdiff, q_tokens in zip(
        prompt_ids, prompt_cats, prompt_diffs, prompt_tokens_list
    ):

        # 4. Get BM25 Scores
        # One score per transcript in this customer's corpus
        scores_list = bm25.get_scores(q_tokens)
        scores = np.array(scores_list, dtype=np.float32)

        # 5. Determine Winner
        # NOTE: np.argmax returns the FIRST maximum, so when several transcripts
        # tie (e.g. no query token matches anything), the lowest index wins.
        top_idx = int(np.argmax(scores))

        # 6. Calculate Metrics
        correct_idx = CATEGORY_TO_INDEX[pcat]
        metrics = loss_metrics(scores, correct_idx)

        pred_category = INDEX_TO_CATEGORY[top_idx]
        is_correct = int(pred_category == pcat)

        row = {
            "customer_row": c_idx,
            "customer_id": rec.get("customer_id", None),
            "prompt_id": pid,
            "prompt_category": pcat,
            "prompt_difficulty": pdiff,
            "predicted_category": pred_category,
            "is_correct": is_correct,
            "top_index": top_idx,
            "top_score": float(scores[top_idx]),
            **metrics,
        }
        results.append(row)

    # Progress check
    if c_idx % 10 == 0:
        print(f"Evaluated {c_idx}/{len(customer_records)} customers...")

# One row per (customer, prompt) pair
df = pd.DataFrame(results)
print(f"Total evaluations: {len(df)}")
display(df.head())

Starting evaluation...
Evaluated 10/50 customers...
Evaluated 20/50 customers...
Evaluated 30/50 customers...
Evaluated 40/50 customers...
Evaluated 50/50 customers...
Total evaluations: 4000


Unnamed: 0,customer_row,customer_id,prompt_id,prompt_category,prompt_difficulty,predicted_category,is_correct,top_index,top_score,cross_entropy,signed_margin,correct_score,best_other_score,rank_of_correct
0,1,0,auto-e-01,automotive - inbound call,easy,home service - inbound call,0,1,2.519128,2.366547,-1.864948,0.65418,2.519128,3
1,1,0,auto-e-02,automotive - inbound call,easy,medical equipment - outbound call,0,3,1.767087,2.07463,-1.200462,0.566625,1.767087,3
2,1,0,auto-e-03,automotive - inbound call,easy,medical equipment - outbound call,0,3,1.735542,2.006757,-1.096479,0.639063,1.735542,4
3,1,0,auto-e-04,automotive - inbound call,easy,home service - inbound call,0,1,1.451721,1.748309,-0.975393,0.476329,1.451721,4
4,1,0,auto-e-05,automotive - inbound call,easy,home service - inbound call,0,1,2.488464,2.304553,-1.67426,0.814204,2.488464,4


In [6]:
# ============================== METRICS & TABLES ==============================

if len(df) > 0:
    def pct(x: pd.Series) -> float:
        """Return the mean of a 0/1 series as a percentage."""
        return 100.0 * x.mean()

    # Accuracy for every (category, difficulty) combination
    group_keys = ["prompt_category", "prompt_difficulty"]
    accuracy_by_group = df.groupby(group_keys)["is_correct"].apply(pct)
    pivot_cat_diff = (
        accuracy_by_group
        .rename("accuracy_pct")
        .reset_index()
        .sort_values(group_keys)
    )

    # Accuracy over all evaluations
    overall_accuracy_pct = pct(df["is_correct"])

    print("=== Accuracy by Category × Difficulty (BM25) ===")
    display(pivot_cat_diff)

    print(f"\n=== Overall Accuracy ===\n{overall_accuracy_pct:.2f}%")

    # Loss Metrics
    # 'deficit' is how many BM25 score points the correct transcript trailed
    # the best wrong one by; it is 0.0 on rows that were answered correctly.
    wrong_mask = df["is_correct"] == 0
    score_gap = df["best_other_score"] - df["correct_score"]
    df["deficit_if_wrong"] = np.where(wrong_mask, score_gap, 0.0)

    print("\n=== Loss Metrics ===")
    mean_cross_entropy = df["cross_entropy"].mean()
    mean_signed_margin = df["signed_margin"].mean()
    # Averaged over the wrong rows only
    mean_deficit_if_wrong = df.loc[wrong_mask, "deficit_if_wrong"].mean()
    overall_stats = pd.DataFrame({
        "overall_accuracy_pct": [overall_accuracy_pct],
        "mean_cross_entropy":   [mean_cross_entropy],
        "mean_signed_margin":   [mean_signed_margin],
        "mean_deficit_if_wrong":[mean_deficit_if_wrong]
    })
    display(overall_stats)

=== Accuracy by Category × Difficulty (BM25) ===


Unnamed: 0,prompt_category,prompt_difficulty,accuracy_pct
0,automotive - inbound call,easy,44.8
1,automotive - inbound call,hard,35.0
2,home service - inbound call,easy,54.0
3,home service - inbound call,hard,41.2
4,insurance - outbound call,easy,41.4
5,insurance - outbound call,hard,28.6
6,medical equipment - outbound call,easy,36.4
7,medical equipment - outbound call,hard,26.0



=== Overall Accuracy ===
38.42%

=== Loss Metrics ===


Unnamed: 0,overall_accuracy_pct,mean_cross_entropy,mean_signed_margin,mean_deficit_if_wrong
0,38.425,1.422146,-0.309185,1.231413
