In [1]:
# ============================== Setup & imports ==============================
import os
import json
import pickle
import re
import time
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
import torch

# 1. Vector Library
from sentence_transformers import SentenceTransformer

# 2. Keyword Library
from rank_bm25 import BM25Okapi

# Filter warnings
# NOTE(review): filterwarnings("ignore") with no category or module filter
# silences every warning for the rest of the kernel session, including
# deprecation and numerical warnings from numpy/pandas. Consider narrowing it
# to the specific warning that was noisy.
import warnings
warnings.filterwarnings("ignore")

# Banner so the reader can tell the import cell finished.
print("Hybrid Libraries Loaded.")

  from .autonotebook import tqdm as notebook_tqdm


Hybrid Libraries Loaded.


In [27]:
# ============================== CONFIGURATION ==============================
# All tunable settings for the hybrid (dense + BM25) evaluation live in this cell.

# 1. Hybrid Input Data
# Pickle file holding the per-customer records that the load cell reads.
CUSTOMER_PKL = "transcript_embeddings_hybrid.pkl"

# 2. Hybrid Balance (The most important setting!)
# HYBRID_ALPHA is the weight applied to the dense (vector) score;
# (1 - HYBRID_ALPHA) is the weight applied to the min-max-normalized BM25 score.
# Example: 0.7 would mean 70% vector (meaning) / 30% keywords.
# The value set here, 0.96, means 96% vector / 4% keywords.
# You can tweak this to see what works best for your data.
HYBRID_ALPHA = 0.96 

# 3. Model Name
# Passed to SentenceTransformer(...) when the embedding model is loaded.
LOCAL_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

# 4. Evaluation Data
# Directory and JSONL file names of the evaluation prompts.
EVAL_DIR = "Evaluation-data"
PROMPT_FILES = [
    "automotive_prompts.jsonl",
    "home_service_prompts.jsonl",
    "insurance_prompts.jsonl",
    "medical_equipment_prompts.jsonl",
]

# Category labels. ORDER MATTERS: a category's position in this list is used as
# the index into the per-record score array during evaluation
# (correct_idx = CATEGORY_TO_INDEX[prompt category]).
# NOTE(review): presumably each record stores its documents in this same
# order - verify against the data-generation code.
CATEGORIES = [
    "automotive - inbound call",
    "home service - inbound call",
    "insurance - outbound call",
    "medical equipment - outbound call",
]
# Lookup tables in both directions: index -> label and label -> index.
INDEX_TO_CATEGORY = dict(enumerate(CATEGORIES))
CATEGORY_TO_INDEX = {c: i for i, c in INDEX_TO_CATEGORY.items()}

print(f"Config set. Hybrid Alpha: {HYBRID_ALPHA}")

Config set. Hybrid Alpha: 0.96


In [3]:
# ============================== LOAD RESOURCES ==============================
# Loads the embedding model and the pickled customer records used by every
# later cell (globals: model, customer_records, rec0).

# 1. Load Local Qwen Model
# NOTE(review): trust_remote_code=True permits custom Python code shipped with
# the model repository to run locally - only use with a model source you trust.
print("Loading Qwen3 model...")
model = SentenceTransformer(LOCAL_MODEL_NAME, trust_remote_code=True)

# 2. Load Data
# Fail early with a clear message if the input pickle is missing.
if not os.path.exists(CUSTOMER_PKL):
    raise FileNotFoundError(f"File '{CUSTOMER_PKL}' not found.")

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by your own pipeline, never untrusted files.
with open(CUSTOMER_PKL, "rb") as f:
    customer_records: List[Dict[str, Any]] = pickle.load(f)

# Validation
# Only the FIRST record is checked for the two keys the evaluation loop reads;
# later records are not validated here. An empty list raises IndexError on the
# next line.
rec0 = customer_records[0]
assert "dense_embeddings" in rec0, "Missing vector embeddings"
assert "bm25_tokens" in rec0, "Missing BM25 tokens"

print(f"Loaded {len(customer_records)} hybrid records.")

Loading Qwen3 model...
Loaded 50 hybrid records.


In [4]:
# ============================== MATH & TEXT UTILS ==============================

def simple_tokenize(text: str) -> List[str]:
    """Lower-case `text`, drop punctuation and split it into word tokens.

    Every character that is not a lowercase ASCII letter, a digit or
    whitespace is replaced by a space, then the result is split on any run
    of whitespace.

    The previous implementation split on the single space character only, so
    tabs and newlines (which the regex deliberately keeps) stayed glued inside
    tokens, e.g. "brake\\tpads" became one token "brake\\tpads". Such tokens
    can never match a cleanly tokenized term, so splitting on all whitespace
    only adds usable matches.

    NOTE(review): this is meant to match the tokenizer used during data
    generation - confirm that code also splits on all whitespace.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of tokens; an empty list when `text` is not a string or contains
        no token characters.
    """
    if not isinstance(text, str):
        return []
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    # split() with no separator splits on any whitespace run and never yields
    # empty strings, so no extra filtering is required.
    return cleaned.split()

def l2_normalize(x: np.ndarray) -> np.ndarray:
    """Scale vectors to unit L2 length (as float32).

    A 1-D input is treated as a single vector. Any other input is normalized
    along axis 1, i.e. row by row for a 2-D matrix. A small epsilon (1e-12) is
    added to each norm so all-zero vectors do not divide by zero.
    """
    arr = np.asarray(x, dtype=np.float32)
    if arr.ndim != 1:
        # Matrix case: one norm per row, kept as a column for broadcasting.
        row_norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
        return arr / row_norms
    # Single-vector case.
    length = np.linalg.norm(arr) + 1e-12
    return arr / length

def min_max_normalize(scores: np.ndarray) -> np.ndarray:
    """Rescale an array of scores linearly to the range [0, 1].

    Used to make BM25 scores compatible with cosine scores.

    Args:
        scores: Array-like of numeric scores. Non-float input (e.g. integers)
            is converted to float64 so the result dtype is always floating,
            whichever branch is taken.

    Returns:
        A new float array of the same shape. An empty input yields an empty
        array. When all scores are equal (e.g. all 0) the result is all zeros,
        because there is no spread to rescale.
    """
    s = np.asarray(scores)
    if not np.issubdtype(s.dtype, np.floating):
        # Keeps the "all equal" branch from returning integer zeros while the
        # regular branch returns floats.
        s = s.astype(np.float64)

    # np.min / np.max raise ValueError on empty arrays; nothing to rescale.
    if s.size == 0:
        return s.copy()

    min_val = np.min(s)
    span = np.max(s) - min_val

    # If all scores are the same (e.g. all 0), return zeros
    if span == 0:
        return np.zeros_like(s)

    return (s - min_val) / span

def softmax(x: np.ndarray) -> np.ndarray:
    """Convert scores to a probability distribution (used by the metrics).

    The maximum is subtracted before exponentiating so large scores do not
    overflow; a 1e-12 epsilon guards the final division.
    """
    logits = np.asarray(x, dtype=np.float32)
    exps = np.exp(logits - np.max(logits))
    return exps / (np.sum(exps) + 1e-12)

def loss_metrics(scores: np.ndarray, correct_idx: int) -> dict:
    """Summarise how well `scores` ranks the entry at `correct_idx`.

    Returns a dict with:
        cross_entropy:    -log of the softmax probability of the correct entry.
        signed_margin:    correct score minus the best competing score
                          (positive means the correct entry wins).
        correct_score:    raw score of the correct entry.
        best_other_score: highest raw score among all other entries.
        rank_of_correct:  1-based position of the correct entry when scores
                          are sorted in descending order.
    """
    target_prob = softmax(scores)[correct_idx]
    target_score = float(scores[correct_idx])
    # Highest score once the correct entry has been removed.
    rival_score = float(np.max(np.delete(scores, correct_idx)))
    # Indices ordered from highest to lowest score.
    ranking = np.argsort(-scores)
    position = int(np.where(ranking == correct_idx)[0][0]) + 1
    return {
        "cross_entropy": -np.log(target_prob + 1e-12),
        "signed_margin": target_score - rival_score,
        "correct_score": target_score,
        "best_other_score": rival_score,
        "rank_of_correct": position,
    }

In [5]:
# ============================== PREPARE PROMPTS ==============================

# Load prompts
def load_jsonl(path):
    """Read a UTF-8 JSON Lines file and return its parsed records as a list.

    Lines that are empty or contain only whitespace are skipped; every other
    line is parsed with json.loads.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

# Collect the prompts from every file listed in PROMPT_FILES.
# A missing file is reported instead of being skipped silently; otherwise the
# accuracy figures would be computed on a subset of categories with no hint
# that data was absent.
all_prompts = []
for fname in PROMPT_FILES:
    p = os.path.join(EVAL_DIR, fname)
    if os.path.exists(p):
        all_prompts.extend(load_jsonl(p))
    else:
        print(f"WARNING: prompt file not found, skipped: {p}")

# 1. Generate Prompt Vectors (Dense)
# Each prompt record must provide a "prompt" text field.
print(f"Embedding {len(all_prompts)} prompts with Qwen...")
prompt_texts = [p["prompt"] for p in all_prompts]
raw_embeddings = model.encode(prompt_texts, convert_to_numpy=True, show_progress_bar=True)
# Unit-length rows so a dot product with a unit document vector is a cosine.
prompt_vectors = l2_normalize(raw_embeddings)

# 2. Generate Prompt Tokens (Sparse)
print("Tokenizing prompts for BM25...")
prompt_tokens_list = [simple_tokenize(t) for t in prompt_texts]

# Metadata
# Parallel lists, indexed the same way as prompt_vectors / prompt_tokens_list.
prompt_ids   = [p["id"] for p in all_prompts]
prompt_cats  = [p["category"] for p in all_prompts]
prompt_diffs = [p["difficulty"] for p in all_prompts]

print("Prompts ready for Hybrid evaluation.")

Embedding 80 prompts with Qwen...


Batches: 100%|██████████| 3/3 [00:27<00:00,  9.31s/it]

Tokenizing prompts for BM25...
Prompts ready for Hybrid evaluation.





In [28]:
# ============================== HYBRID EVALUATION LOOP ==============================
# For every customer record and every prompt: score the record's documents with
# dense cosine similarity and with BM25, fuse the two with HYBRID_ALPHA, and
# store the prediction plus ranking metrics as one row in `results`.

results = []

print(f"Starting evaluation with Alpha={HYBRID_ALPHA}...")

# c_idx is 1-based (start=1); it is stored as "customer_row" and used for the
# progress message.
for c_idx, rec in enumerate(customer_records, start=1):
    
    # --- A. Setup Customer Data ---
    # 1. Dense: one embedding row per document of this record.
    # NOTE(review): presumably 4 rows, one per entry in CATEGORIES and in the
    # same order - this is not validated here.
    doc_vecs = np.array(rec["dense_embeddings"], dtype=np.float32)
    doc_vecs = l2_normalize(doc_vecs) # Ensure normalized
    
    # 2. Sparse: build a BM25 index over this record's documents only.
    # NOTE(review): the BM25 statistics therefore come from a very small,
    # per-record corpus - confirm this is intended.
    doc_tokens = rec["bm25_tokens"]
    bm25_engine = BM25Okapi(doc_tokens)

    # --- B. Evaluate all Prompts ---
    for p_idx in range(len(all_prompts)):
        
        # 1. Dense Scoring (Cosine)
        # Both sides are L2-normalized, so the dot product is the cosine
        # similarity: one score per document, each in [-1, 1].
        q_vec = prompt_vectors[p_idx]
        dense_scores_raw = doc_vecs @ q_vec
        
        # 2. Sparse Scoring (BM25)
        # One unbounded BM25 score per document (e.g., [0.0, 12.5, 4.2, 0.0])
        q_tokens = prompt_tokens_list[p_idx]
        sparse_scores_raw = np.array(bm25_engine.get_scores(q_tokens))
        
        # 3. Normalization
        # Only the sparse scores are rescaled (min-max to [0, 1]); the dense
        # cosine scores are used as-is. When all sparse scores are equal,
        # min_max_normalize returns zeros and the ranking is decided by the
        # dense scores alone.
        # NOTE(review): cosine lies in [-1, 1], not [0, 1], and is not
        # stretched across the documents the way the sparse scores are, so the
        # two terms are not on the same scale - keep this in mind when
        # interpreting HYBRID_ALPHA.
        sparse_scores_norm = min_max_normalize(sparse_scores_raw)
        
        # 4. Hybrid Fusion (Weighted Sum)
        final_scores = (HYBRID_ALPHA * dense_scores_raw) + \
                       ((1 - HYBRID_ALPHA) * sparse_scores_norm)
        
        # --- C. Metrics & Storage ---
        # Predicted document = highest fused score.
        top_idx = int(np.argmax(final_scores))
        
        # The prompt's category label determines which document index counts
        # as the correct answer.
        pcat = prompt_cats[p_idx]
        correct_idx = CATEGORY_TO_INDEX[pcat]
        
        metrics = loss_metrics(final_scores, correct_idx)
        pred_category = INDEX_TO_CATEGORY[top_idx]
        is_correct = int(pred_category == pcat)

        row = {
            "customer_row": c_idx,
            "prompt_id": prompt_ids[p_idx],
            "prompt_category": pcat,
            "prompt_difficulty": prompt_diffs[p_idx],
            "predicted_category": pred_category,
            "is_correct": is_correct,
            "top_score": float(final_scores[top_idx]),
            # Debug info: index each scorer would have picked on its own.
            # np.argmax returns the first index on ties, so an all-equal
            # sparse score vector reports 0 here.
            "raw_dense_winner": int(np.argmax(dense_scores_raw)),
            "raw_sparse_winner": int(np.argmax(sparse_scores_raw)),
            **metrics,
        }
        results.append(row)

    # Progress message every 10 customers.
    if c_idx % 10 == 0:
        print(f"Evaluated {c_idx}/{len(customer_records)} customers...")

# One row per (customer, prompt) pair.
df_hybrid = pd.DataFrame(results)
print(f"Total evaluations: {len(df_hybrid)}")

Starting evaluation with Alpha=0.96...
Evaluated 10/50 customers...
Evaluated 20/50 customers...
Evaluated 30/50 customers...
Evaluated 40/50 customers...
Evaluated 50/50 customers...
Total evaluations: 4000


In [29]:
# ==============================  METRICS & TABLES ==============================

# Work on the evaluation results built by the hybrid loop.
df = df_hybrid 

if len(df) > 0:
    def pct(x: pd.Series) -> float:
        """Mean of a 0/1 series expressed as a percentage."""
        return 100.0 * x.mean()

    def _accuracy_table(frame: pd.DataFrame, keys) -> pd.DataFrame:
        """Accuracy (%) of `frame` per group of `keys`, sorted by `keys`."""
        grouped = frame.groupby(keys)["is_correct"].apply(pct)
        table = grouped.rename("accuracy_pct").reset_index()
        return table.sort_values(keys)

    # 1. Accuracy by Category × Difficulty
    # Pinpoints the specific combinations that fail (e.g., Hard Insurance calls).
    pivot_cat_diff = _accuracy_table(df, ["prompt_category", "prompt_difficulty"])

    # 2. Accuracy by Category (Summary)
    pivot_cat = _accuracy_table(df, "prompt_category")

    # 3. Accuracy by Difficulty (Summary)
    pivot_diff = _accuracy_table(df, "prompt_difficulty")

    # 4. Overall Accuracy
    overall_accuracy_pct = pct(df["is_correct"])

    # Report: headline number first, then the three breakdown tables.
    print(f"=== Hybrid Accuracy (Alpha={HYBRID_ALPHA}) ===")
    print(f"Overall: {overall_accuracy_pct:.2f}%")

    print("\n=== Accuracy by Category × Difficulty ===")
    display(pivot_cat_diff)

    print("\n=== Accuracy by Category ===")
    display(pivot_cat)

    print("\n=== Accuracy by Difficulty ===")
    display(pivot_diff)

    # ============================== PART 8: LOSS & DIAGNOSTICS ==============================
    
    # Calculate 'deficit':
    # (no code follows this heading in the cell)

=== Hybrid Accuracy (Alpha=0.96) ===
Overall: 51.15%

=== Accuracy by Category × Difficulty ===


Unnamed: 0,prompt_category,prompt_difficulty,accuracy_pct
0,automotive - inbound call,easy,60.4
1,automotive - inbound call,hard,50.6
2,home service - inbound call,easy,69.0
3,home service - inbound call,hard,60.2
4,insurance - outbound call,easy,46.4
5,insurance - outbound call,hard,37.6
6,medical equipment - outbound call,easy,45.6
7,medical equipment - outbound call,hard,39.4



=== Accuracy by Category ===


Unnamed: 0,prompt_category,accuracy_pct
0,automotive - inbound call,55.5
1,home service - inbound call,64.6
2,insurance - outbound call,42.0
3,medical equipment - outbound call,42.5



=== Accuracy by Difficulty ===


Unnamed: 0,prompt_difficulty,accuracy_pct
0,easy,55.35
1,hard,46.95
