In [None]:
# Run with Kernel: local-eval

In [2]:
# ============================== Setup & imports ==============================
import os
import time
import json
import pickle
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd
import torch

# NEW: We use this library to load the local Qwen model
from sentence_transformers import SentenceTransformer

# Disable warnings for cleaner output in notebooks
import warnings
warnings.filterwarnings("ignore")

print("Imports complete. Ready to configure.")



  from .autonotebook import tqdm as notebook_tqdm


Imports complete. Ready to configure.


In [3]:
# ============================== CONFIGURATION ==============================

# 1. Customer data input
# This file must contain embeddings generated by the SAME Qwen model
CUSTOMER_PKL = "transcript_embeddings_localQwen.pkl"

# 2. Evaluation Data
# Folder with 4 JSONL prompt files (Same as before)
EVAL_DIR = "Evaluation-data"
PROMPT_FILES = [
    "automotive_prompts.jsonl",
    "home_service_prompts.jsonl",
    "insurance_prompts.jsonl",
    "medical_equipment_prompts.jsonl",
]

# 3. Categories (Fixed order mapping)
CATEGORIES = [
    "automotive - inbound call",
    "home service - inbound call",
    "insurance - outbound call",
    "medical equipment - outbound call",
]
INDEX_TO_CATEGORY = dict(enumerate(CATEGORIES))
CATEGORY_TO_INDEX = {c: i for i, c in INDEX_TO_CATEGORY.items()}

# 4. Model Name
# We define the local Hugging Face model path here
LOCAL_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

print("Configuration set. Target data file:", CUSTOMER_PKL)



Configuration set. Target data file: transcript_embeddings_localQwen.pkl


In [4]:
# ============================== LOAD LOCAL MODEL ==============================

def load_local_model(model_name: str):
    """
    Loads the Qwen model from Hugging Face using SentenceTransformers.
    trust_remote_code=True is required for Qwen architectures.
    """
    print(f"Loading local model: {model_name}...")
    print("If this is the first run, it may take a minute to download.")
    
    model = SentenceTransformer(
        model_name,
        trust_remote_code=True,
        # 'device' will automatically pick GPU (cuda) if available, else CPU
    )
    return model

# Initialize the model now
model = load_local_model(LOCAL_MODEL_NAME)
print("Model loaded successfully!")

Loading local model: Qwen/Qwen3-Embedding-0.6B...
If this is the first run, it may take a minute to download.
Model loaded successfully!


In [5]:
# ============================== LOAD CUSTOMER DATA ==============================

if not os.path.exists(CUSTOMER_PKL):
    raise FileNotFoundError(
        f"Could not find '{CUSTOMER_PKL}'. \n"
        "Please run your 'Script 2' first to generate the Qwen embeddings for the transcripts."
    )

with open(CUSTOMER_PKL, "rb") as f:
    customer_records: List[Dict[str, Any]] = pickle.load(f)

# Validation
assert isinstance(customer_records, list) and len(customer_records) > 0
print(f"Loaded {len(customer_records)} customers from '{CUSTOMER_PKL}'.")

# Quick sanity check on the first record
rec0 = customer_records[0]
assert "transcripts" in rec0 and "embeddings" in rec0
# Check dimension of the first embedding vector
dim = len(rec0["embeddings"][0])
print(f"Sample customer_id: {rec0.get('customer_id')}")
print(f"Embedding dimensions detected: {dim} (Qwen 0.6B is typically 1024)")


Loaded 50 customers from 'transcript_embeddings_localQwen.pkl'.
Sample customer_id: 0
Embedding dimensions detected: 1024 (Qwen 0.6B is typically 1024)


In [6]:
# ============================== VECTOR / COSINE UTILS ==============================
def l2_normalize(x: np.ndarray) -> np.ndarray:
    """
    Row-wise L2 normalize. Accepts shape (d,) or (n, d). Returns same shape.
    """
    x = np.asarray(x, dtype=np.float32)
    if x.ndim == 1:
        n = np.linalg.norm(x) + 1e-12
        return x / n
    n = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
    return x / n

def cosine_top_rank(query_vec: np.ndarray, doc_vecs: np.ndarray) -> Tuple[int, np.ndarray]:
    """
    Given a (d,) query and (4, d) docs, return:
      top_index (int), scores (np.ndarray shape (4,))
    Cosine reduces to dot product on normalized vectors.
    """
    q = l2_normalize(query_vec)
    D = l2_normalize(doc_vecs)
    scores = D @ q  # (4,)
    top_idx = int(np.argmax(scores))
    return top_idx, scores


In [7]:
# --- Loss calucation ---

def softmax(x: np.ndarray) -> np.ndarray:
    """
    Stable softmax over a length-4 score vector (cosine scores).
    """
    x = np.asarray(x, dtype=np.float32)
    x = x - np.max(x)           # numerical stability
    e = np.exp(x)
    return e / (np.sum(e) + 1e-12)

def loss_metrics(scores: np.ndarray, correct_idx: int) -> dict:
    """
    Compute 'how far off' and a standard loss from a 4-way score vector.
    Returns:
      - cross_entropy:  -log P(correct) with softmax(scores)
      - signed_margin:  correct_score - best_other_score  (positive=good, negative=wrong)
      - correct_score:  cosine score of the correct transcript
      - best_other_score: highest cosine among the 3 incorrect choices
      - rank_of_correct: 1..4 position of the correct transcript by score (1=best)
    """
    assert scores.shape == (4,), "Expected 4 scores."
    probs = softmax(scores)
    cross_entropy = -np.log(probs[correct_idx] + 1e-12)

    correct_score = float(scores[correct_idx])
    best_other_score = float(np.max(np.delete(scores, correct_idx)))
    signed_margin = correct_score - best_other_score  # <0 means the model chose another category

    order = np.argsort(-scores)
    rank_of_correct = int(np.where(order == correct_idx)[0][0]) + 1  # 1..4

    return {
        "cross_entropy": float(cross_entropy),
        "signed_margin": float(signed_margin),
        "correct_score": correct_score,
        "best_other_score": best_other_score,
        "rank_of_correct": rank_of_correct,
    }


In [8]:
# ============================== LOAD EVALUATION PROMPTS ==============================
# Each JSONL line structure:
# {"id": "...", "category": "...", "difficulty": "easy|hard", "prompt": "..."}

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    items = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            items.append(json.loads(line))
    return items

all_prompts: List[Dict[str, Any]] = []
for fname in PROMPT_FILES:
    full_path = os.path.join(EVAL_DIR, fname)
    items = load_jsonl(full_path)
    # Minimal validation
    for it in items:
        assert "id" in it and "category" in it and "difficulty" in it and "prompt" in it
        # Ensure categories in file are among our 4 categories
        assert it["category"] in CATEGORIES, f"Unknown category in {fname}: {it['category']}"
        assert it["difficulty"] in {"easy", "hard"}
    all_prompts.extend(items)

print(f"Loaded {len(all_prompts)} prompts from {EVAL_DIR}:")
pd.DataFrame(all_prompts)[["id", "category", "difficulty"]].head(8)


Loaded 80 prompts from Evaluation-data:


Unnamed: 0,id,category,difficulty
0,auto-e-01,automotive - inbound call,easy
1,auto-e-02,automotive - inbound call,easy
2,auto-e-03,automotive - inbound call,easy
3,auto-e-04,automotive - inbound call,easy
4,auto-e-05,automotive - inbound call,easy
5,auto-e-06,automotive - inbound call,easy
6,auto-e-07,automotive - inbound call,easy
7,auto-e-08,automotive - inbound call,easy


In [9]:
# ============================== EMBED ALL PROMPTS (LOCAL) ==============================

def embed_texts_local(model, texts: List[str]) -> np.ndarray:
    """
    Uses the local Qwen model to embed a list of strings.
    Returns a numpy array of shape (n_texts, embedding_dim).
    """
    print(f"Embedding {len(texts)} prompts locally...")
    
    # model.encode automatically handles batching and GPU acceleration
    # convert_to_numpy=True ensures we get an array back, not a Tensor
    embeddings = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    
    # We normalize immediately so they are ready for Cosine Similarity
    return l2_normalize(embeddings)

# Prepare lists
prompt_texts = [p["prompt"] for p in all_prompts]
prompt_ids   = [p["id"] for p in all_prompts]
prompt_cats  = [p["category"] for p in all_prompts]
prompt_diffs = [p["difficulty"] for p in all_prompts]

# Run the embedding
# This replaces the slow loop with the Gemini API
if prompt_texts:
    prompt_embeddings = embed_texts_local(model, prompt_texts)
    print("Prompt embedding matrix shape:", prompt_embeddings.shape)
else:
    print("No prompts found! Please check your data folder.")
    prompt_embeddings = np.array([])


Embedding 80 prompts locally...


Batches: 100%|██████████| 3/3 [00:30<00:00, 10.20s/it]

Prompt embedding matrix shape: (80, 1024)





In [10]:
# ============================== EVALUATION LOOP ==============================
# For each (prompt, client):
# 1) take the pre-embedded prompt vector
# 2) take the client's 4 transcript embeddings
# 3) rank and pick top transcript index
# 4) map index -> predicted category
# 5) compare to prompt.category
# 6) store result

results = []

# Ensure we have data to run
if len(customer_records) > 0 and len(prompt_embeddings) > 0:
    
    # Loop over every customer
    for c_idx, rec in enumerate(customer_records, start=1):
        
        # 1. Get the 4 transcript vectors for this customer
        doc_vecs = np.array(rec["embeddings"], dtype=np.float32)

        # 2. Loop over every prompt
        for p_idx, (pid, pcat, pdiff) in enumerate(zip(prompt_ids, prompt_cats, prompt_diffs)):
            
            # Get the pre-calculated prompt vector
            q_vec = prompt_embeddings[p_idx] 

            # Compare: Find which transcript (0,1,2,3) matches the prompt best
            top_idx, scores = cosine_top_rank(q_vec, doc_vecs)

            # Metrics
            correct_idx = CATEGORY_TO_INDEX[pcat]
            metrics = loss_metrics(scores, correct_idx)

            pred_category = INDEX_TO_CATEGORY[top_idx]
            is_correct = int(pred_category == pcat)

            # Store Result
            row = {
                "customer_row": c_idx,
                "customer_id": rec.get("customer_id", None),
                "prompt_id": pid,
                "prompt_category": pcat,
                "prompt_difficulty": pdiff,
                "predicted_category": pred_category,
                "is_correct": is_correct,
                "top_index": top_idx,
                "top_score": float(scores[top_idx]),
                **metrics,
            }
            results.append(row)
            
        if c_idx % 10 == 0:
            print(f"Evaluated {c_idx}/{len(customer_records)} customers...")

    df = pd.DataFrame(results)
    print(f"Total evaluations: {len(df)}")
    display(df.head())

else:
    print("Skipping evaluation loop (missing data).")


Evaluated 10/50 customers...
Evaluated 20/50 customers...
Evaluated 30/50 customers...
Evaluated 40/50 customers...
Evaluated 50/50 customers...
Total evaluations: 4000


Unnamed: 0,customer_row,customer_id,prompt_id,prompt_category,prompt_difficulty,predicted_category,is_correct,top_index,top_score,cross_entropy,signed_margin,correct_score,best_other_score,rank_of_correct
0,1,0,auto-e-01,automotive - inbound call,easy,automotive - inbound call,1,0,0.554782,1.29965,0.103121,0.554782,0.451661,1
1,1,0,auto-e-02,automotive - inbound call,easy,automotive - inbound call,1,0,0.480021,1.330633,0.056779,0.480021,0.423242,1
2,1,0,auto-e-03,automotive - inbound call,easy,automotive - inbound call,1,0,0.524606,1.285894,0.120241,0.524606,0.404365,1
3,1,0,auto-e-04,automotive - inbound call,easy,automotive - inbound call,1,0,0.525146,1.307155,0.084811,0.525146,0.440336,1
4,1,0,auto-e-05,automotive - inbound call,easy,automotive - inbound call,1,0,0.522958,1.320275,0.05618,0.522958,0.466778,1


In [11]:
# ============================== METRICS & TABLES ==============================
def pct(x: pd.Series) -> float:
    return 100.0 * x.mean()

# Accuracy by category × difficulty
pivot_cat_diff = (
    df
    .groupby(["prompt_category", "prompt_difficulty"])["is_correct"]
    .apply(pct)
    .rename("accuracy_pct")
    .reset_index()
    .sort_values(["prompt_category", "prompt_difficulty"])
)

# Accuracy by category (all difficulties combined)
pivot_cat = (
    df
    .groupby("prompt_category")["is_correct"]
    .apply(pct)
    .rename("accuracy_pct")
    .reset_index()
    .sort_values("prompt_category")
)

# Accuracy by difficulty (across all categories)
pivot_diff = (
    df
    .groupby("prompt_difficulty")["is_correct"]
    .apply(pct)
    .rename("accuracy_pct")
    .reset_index()
    .sort_values("prompt_difficulty")
)

# Overall accuracy
overall_accuracy_pct = pct(df["is_correct"])

print("=== Accuracy by Category × Difficulty (Top-1, cosine) ===")
display(pivot_cat_diff)

print("\n=== Accuracy by Category (all difficulties) ===")
display(pivot_cat)

print("\n=== Accuracy by Difficulty (across all categories) ===")
display(pivot_diff)

print(f"\n=== Overall Accuracy ===\n{overall_accuracy_pct:.2f}%")


=== Accuracy by Category × Difficulty (Top-1, cosine) ===


Unnamed: 0,prompt_category,prompt_difficulty,accuracy_pct
0,automotive - inbound call,easy,58.4
1,automotive - inbound call,hard,50.4
2,home service - inbound call,easy,65.8
3,home service - inbound call,hard,55.6
4,insurance - outbound call,easy,44.0
5,insurance - outbound call,hard,35.0
6,medical equipment - outbound call,easy,39.6
7,medical equipment - outbound call,hard,37.6



=== Accuracy by Category (all difficulties) ===


Unnamed: 0,prompt_category,accuracy_pct
0,automotive - inbound call,54.4
1,home service - inbound call,60.7
2,insurance - outbound call,39.5
3,medical equipment - outbound call,38.6



=== Accuracy by Difficulty (across all categories) ===


Unnamed: 0,prompt_difficulty,accuracy_pct
0,easy,51.95
1,hard,44.65



=== Overall Accuracy ===
48.30%


In [12]:
# ============================== OPTIONAL: ERROR INSPECTION ==============================
# Quickly see where the model tends to confuse categories
errors = df[df["is_correct"] == 0].copy()
confusions = (
    errors
    .groupby(["prompt_category", "predicted_category"])
    .size()
    .rename("count")
    .reset_index()
    .sort_values("count", ascending=False)
)
display(confusions.head(12))

# Per-prompt accuracy (useful to refine the prompt set)
per_prompt = (
    df.groupby(["prompt_id", "prompt_category", "prompt_difficulty"])["is_correct"]
      .apply(pct).rename("accuracy_pct").reset_index().sort_values("accuracy_pct", ascending=True)
)
display(per_prompt.head(10))


Unnamed: 0,prompt_category,predicted_category,count
8,insurance - outbound call,medical equipment - outbound call,489
3,home service - inbound call,automotive - inbound call,259
0,automotive - inbound call,home service - inbound call,253
9,medical equipment - outbound call,automotive - inbound call,234
11,medical equipment - outbound call,insurance - outbound call,213
10,medical equipment - outbound call,home service - inbound call,167
2,automotive - inbound call,medical equipment - outbound call,131
5,home service - inbound call,medical equipment - outbound call,95
6,insurance - outbound call,automotive - inbound call,78
1,automotive - inbound call,insurance - outbound call,72


Unnamed: 0,prompt_id,prompt_category,prompt_difficulty,accuracy_pct
68,med-e-09,medical equipment - outbound call,easy,16.0
79,med-h-10,medical equipment - outbound call,hard,20.0
42,ins-e-03,insurance - outbound call,easy,20.0
14,auto-h-05,automotive - inbound call,hard,22.0
12,auto-h-03,automotive - inbound call,hard,22.0
59,ins-h-10,insurance - outbound call,hard,22.0
52,ins-h-03,insurance - outbound call,hard,24.0
56,ins-h-07,insurance - outbound call,hard,24.0
54,ins-h-05,insurance - outbound call,hard,24.0
77,med-h-08,medical equipment - outbound call,hard,24.0


In [13]:
# ============================== LOSS EVALUATION ==============================
# 'How far off' when wrong: deficit = (best_other_score - correct_score) for misclassifications
df["deficit_if_wrong"] = np.where(
    df["is_correct"] == 0,
    df["best_other_score"] - df["correct_score"],
    0.0
)

print("=== Overall loss metrics ===")
overall = pd.DataFrame({
    "overall_accuracy_pct": [100.0 * df["is_correct"].mean()],
    "mean_cross_entropy":   [df["cross_entropy"].mean()],
    "mean_signed_margin":   [df["signed_margin"].mean()],   # >0 is good
    "mean_rank_of_correct": [df["rank_of_correct"].mean()], # closer to 1 is better
    "mean_deficit_if_wrong":[df.loc[df["is_correct"]==0, "deficit_if_wrong"].mean()]
})
display(overall)

print("\n=== Loss by Category × Difficulty ===")
agg = {
    "is_correct": "mean",
    "cross_entropy": "mean",
    "signed_margin": "mean",
    "rank_of_correct": "mean",
    "deficit_if_wrong": "mean",
}
by_cat_diff = (
    df.groupby(["prompt_category","prompt_difficulty"])
      .agg(agg)
      .rename(columns={"is_correct":"accuracy_mean"})
      .reset_index()
)
by_cat_diff["accuracy_pct"] = 100.0 * by_cat_diff["accuracy_mean"]
display(by_cat_diff.drop(columns=["accuracy_mean"]).sort_values(["prompt_category","prompt_difficulty"]))

print("\n=== Wrong-only: How far off (deficit & rank) by Category ===")
wrong = df[df["is_correct"] == 0]
wrong_by_cat = (
    wrong.groupby("prompt_category")[["deficit_if_wrong","rank_of_correct"]]
         .mean()
         .reset_index()
         .sort_values("deficit_if_wrong", ascending=False)
)
display(wrong_by_cat)


=== Overall loss metrics ===


Unnamed: 0,overall_accuracy_pct,mean_cross_entropy,mean_signed_margin,mean_rank_of_correct,mean_deficit_if_wrong
0,48.3,1.34659,-0.001295,1.81325,0.055949



=== Loss by Category × Difficulty ===


Unnamed: 0,prompt_category,prompt_difficulty,cross_entropy,signed_margin,rank_of_correct,deficit_if_wrong,accuracy_pct
0,automotive - inbound call,easy,1.337299,0.016245,1.642,0.024037,58.4
1,automotive - inbound call,hard,1.34799,0.003499,1.842,0.027953,50.4
2,home service - inbound call,easy,1.32874,0.027883,1.598,0.024537,65.8
3,home service - inbound call,hard,1.343837,0.01335,1.696,0.021937,55.6
4,insurance - outbound call,easy,1.341044,-0.010552,1.818,0.029171,44.0
5,insurance - outbound call,hard,1.350469,-0.022384,1.89,0.034019,35.0
6,medical equipment - outbound call,easy,1.36113,-0.016982,1.972,0.032473,39.6
7,medical equipment - outbound call,hard,1.362207,-0.021415,2.048,0.037279,37.6



=== Wrong-only: How far off (deficit & rank) by Category ===


Unnamed: 0,prompt_category,deficit_if_wrong,rank_of_correct
1,home service - inbound call,0.059127,2.64631
0,automotive - inbound call,0.057007,2.627193
3,medical equipment - outbound call,0.056802,2.644951
2,insurance - outbound call,0.052223,2.41157
