In [None]:
# Run with Kernel: rag-eval

In [1]:
# ============================== Setup & imports ==============================

import os
import time
import json
import pickle
from typing import List, Dict, Any, Tuple

import numpy as np
import pandas as pd

# Google GenAI SDK (new)
from google import genai
from google.genai import types

In [2]:
# ============================== CONFIGURATION ==============================
# Customer data with precomputed 4×(3072-D) embeddings per client
CUSTOMER_PKL = "transcript_embeddings.pkl"

# Folder with 4 JSONL prompt files
EVAL_DIR = "Evaluation-data"
PROMPT_FILES = [
    "automotive_inbound.jsonl",
    "home_service_inbound.jsonl",
    "insurance_outbound.jsonl",
    "medical_equipment_outbound.jsonl",
]

# Fixed category order -> transcript index mapping (see assumption above)
CATEGORIES = [
    "automotive - inbound call",
    "home service - inbound call",
    "insurance - outbound call",
    "medical equipment - outbound call",
]
INDEX_TO_CATEGORY = dict(enumerate(CATEGORIES))
CATEGORY_TO_INDEX = {c: i for i, c in INDEX_TO_CATEGORY.items()}

# Models
EMBED_MODEL = "gemini-embedding-001"

# Rate limiting: sleep between *prompt* embedding calls
SLEEP_BETWEEN_EMBED_CALLS_SEC = 1  # adjust if needed


In [3]:
# === API key from key.env ===

from pathlib import Path

def read_key_from_env_file(filename: str = "key.env", var_name: str = "GEMINI_API_KEY") -> str:
    p = Path.cwd() / filename  # current working directory (your notebook folder)
    if not p.exists():
        raise FileNotFoundError(f"'{filename}' not found in {Path.cwd()}")
    for line in p.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if "=" in line:
            k, v = line.split("=", 1)
            if k.strip() == var_name:
                return v.strip().strip('"').strip("'")
    raise RuntimeError(f"{var_name} not found in {filename}")

API_KEY = read_key_from_env_file()
client = genai.Client(api_key=API_KEY)


In [4]:
# ============================== LOAD CUSTOMER DATA ==============================
# Expected structure (list of 50):


with open(CUSTOMER_PKL, "rb") as f:
    customer_records: List[Dict[str, Any]] = pickle.load(f)

assert isinstance(customer_records, list) and len(customer_records) > 0
print(f"Loaded {len(customer_records)} customers from '{CUSTOMER_PKL}'.")

# Quick shape sanity check on the first record
rec0 = customer_records[0]
assert "transcripts" in rec0 and "embeddings" in rec0
assert len(rec0["transcripts"]) == 4 and len(rec0["embeddings"]) == 4
print("Sample customer_id:", rec0.get("customer_id"))


Loaded 50 customers from 'transcript_embeddings.pkl'.
Sample customer_id: 0


In [5]:
# ============================== VECTOR / COSINE UTILS ==============================
def l2_normalize(x: np.ndarray) -> np.ndarray:
    """
    Row-wise L2 normalize. Accepts shape (d,) or (n, d). Returns same shape.
    """
    x = np.asarray(x, dtype=np.float32)
    if x.ndim == 1:
        n = np.linalg.norm(x) + 1e-12
        return x / n
    n = np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
    return x / n

def cosine_top_rank(query_vec: np.ndarray, doc_vecs: np.ndarray) -> Tuple[int, np.ndarray]:
    """
    Given a (d,) query and (4, d) docs, return:
      top_index (int), scores (np.ndarray shape (4,))
    Cosine reduces to dot product on normalized vectors.
    """
    q = l2_normalize(query_vec)
    D = l2_normalize(doc_vecs)
    scores = D @ q  # (4,)
    top_idx = int(np.argmax(scores))
    return top_idx, scores


In [6]:
# --- Loss calucation ---

def softmax(x: np.ndarray) -> np.ndarray:
    """
    Stable softmax over a length-4 score vector (cosine scores).
    """
    x = np.asarray(x, dtype=np.float32)
    x = x - np.max(x)           # numerical stability
    e = np.exp(x)
    return e / (np.sum(e) + 1e-12)

def loss_metrics(scores: np.ndarray, correct_idx: int) -> dict:
    """
    Compute 'how far off' and a standard loss from a 4-way score vector.
    Returns:
      - cross_entropy:  -log P(correct) with softmax(scores)
      - signed_margin:  correct_score - best_other_score  (positive=good, negative=wrong)
      - correct_score:  cosine score of the correct transcript
      - best_other_score: highest cosine among the 3 incorrect choices
      - rank_of_correct: 1..4 position of the correct transcript by score (1=best)
    """
    assert scores.shape == (4,), "Expected 4 scores."
    probs = softmax(scores)
    cross_entropy = -np.log(probs[correct_idx] + 1e-12)

    correct_score = float(scores[correct_idx])
    best_other_score = float(np.max(np.delete(scores, correct_idx)))
    signed_margin = correct_score - best_other_score  # <0 means the model chose another category

    order = np.argsort(-scores)
    rank_of_correct = int(np.where(order == correct_idx)[0][0]) + 1  # 1..4

    return {
        "cross_entropy": float(cross_entropy),
        "signed_margin": float(signed_margin),
        "correct_score": correct_score,
        "best_other_score": best_other_score,
        "rank_of_correct": rank_of_correct,
    }


In [7]:
# ============================== LOAD EVALUATION PROMPTS ==============================
# Each JSONL line structure:
# {"id": "...", "category": "...", "difficulty": "easy|hard", "prompt": "..."}

def load_jsonl(path: str) -> List[Dict[str, Any]]:
    items = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            items.append(json.loads(line))
    return items

all_prompts: List[Dict[str, Any]] = []
for fname in PROMPT_FILES:
    full_path = os.path.join(EVAL_DIR, fname)
    items = load_jsonl(full_path)
    # Minimal validation
    for it in items:
        assert "id" in it and "category" in it and "difficulty" in it and "prompt" in it
        # Ensure categories in file are among our 4 categories
        assert it["category"] in CATEGORIES, f"Unknown category in {fname}: {it['category']}"
        assert it["difficulty"] in {"easy", "hard"}
    all_prompts.extend(items)

print(f"Loaded {len(all_prompts)} prompts from {EVAL_DIR}:")
pd.DataFrame(all_prompts)[["id", "category", "difficulty"]].head(8)


Loaded 80 prompts from Evaluation-data:


Unnamed: 0,id,category,difficulty
0,auto-e-01,automotive - inbound call,easy
1,auto-e-02,automotive - inbound call,easy
2,auto-e-03,automotive - inbound call,easy
3,auto-e-04,automotive - inbound call,easy
4,auto-e-05,automotive - inbound call,easy
5,auto-e-06,automotive - inbound call,easy
6,auto-e-07,automotive - inbound call,easy
7,auto-e-08,automotive - inbound call,easy


In [8]:
# ============================== EMBED ALL PROMPTS (ONCE) ==============================
# We embed each unique prompt string once, then reuse for all 50 customers.
# A short sleep between calls helps avoid rate limits.

def embed_texts(texts: List[str]) -> np.ndarray:
    """
    Returns an array of shape (n, 3072) of L2-normalized embeddings.
    """
    out = []
    for i, t in enumerate(texts, start=1):
        res = client.models.embed_content(
            model=EMBED_MODEL,
            contents=t,
            # For RAG queries, retrieval query often performs well; keep default simple.
            config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
        )
        [emb] = res.embeddings
        vec = np.array(emb.values, dtype=np.float32)
        out.append(vec)
        # Simple pacing to be kind to the API
        time.sleep(SLEEP_BETWEEN_EMBED_CALLS_SEC)
        if i % 10 == 0:
            print(f"Embedded {i}/{len(texts)} prompts...")
    return l2_normalize(np.vstack(out))

# Prepare input order (keep stable index)
prompt_texts = [p["prompt"] for p in all_prompts]
prompt_ids    = [p["id"] for p in all_prompts]
prompt_cats   = [p["category"] for p in all_prompts]
prompt_diffs  = [p["difficulty"] for p in all_prompts]

prompt_embeddings = embed_texts(prompt_texts)  # shape: (N_prompts, 3072)
print("Prompt embedding matrix shape:", prompt_embeddings.shape)


Embedded 10/80 prompts...
Embedded 20/80 prompts...
Embedded 30/80 prompts...
Embedded 40/80 prompts...
Embedded 50/80 prompts...
Embedded 60/80 prompts...
Embedded 70/80 prompts...
Embedded 80/80 prompts...
Prompt embedding matrix shape: (80, 3072)


In [9]:
# ============================== EVALUATION LOOP ==============================
# For each (prompt, client):
# 1) take the pre-embedded prompt vector
# 2) take the client's 4 transcript embeddings
# 3) rank and pick top transcript index
# 4) map index -> predicted category
# 5) compare to prompt.category
# 6) store result

results = []

for c_idx, rec in enumerate(customer_records, start=1):
    doc_vecs = np.array(rec["embeddings"], dtype=np.float32)  # (4, 3072)

    for p_idx, (pid, pcat, pdiff) in enumerate(zip(prompt_ids, prompt_cats, prompt_diffs)):
        q_vec = prompt_embeddings[p_idx]  # (3072,)

        top_idx, scores = cosine_top_rank(q_vec, doc_vecs)

        # compute loss metrics with the correct index
        correct_idx = CATEGORY_TO_INDEX[pcat]
        metrics = loss_metrics(scores, correct_idx)

        pred_category = INDEX_TO_CATEGORY[top_idx]
        is_correct = int(pred_category == pcat)

        row = {
            "customer_row": c_idx,
            "customer_id": rec.get("customer_id", None),
            "prompt_id": pid,
            "prompt_category": pcat,
            "prompt_difficulty": pdiff,
            "predicted_category": pred_category,
            "is_correct": is_correct,
            "top_index": top_idx,
            "top_score": float(scores[top_idx]),
            # attach the loss/offset info
            **metrics,
        }
        results.append(row)

df = pd.DataFrame(results)
print(f"Total evaluations: {len(df)}")
df.head(6)


Total evaluations: 4000


Unnamed: 0,customer_row,customer_id,prompt_id,prompt_category,prompt_difficulty,predicted_category,is_correct,top_index,top_score,cross_entropy,signed_margin,correct_score,best_other_score,rank_of_correct
0,1,0,auto-e-01,automotive - inbound call,easy,automotive - inbound call,1,0,0.695805,1.352318,0.035667,0.695805,0.660138,1
1,1,0,auto-e-02,automotive - inbound call,easy,automotive - inbound call,1,0,0.688124,1.355202,0.026814,0.688124,0.66131,1
2,1,0,auto-e-03,automotive - inbound call,easy,automotive - inbound call,1,0,0.72256,1.346935,0.050526,0.72256,0.672034,1
3,1,0,auto-e-04,automotive - inbound call,easy,automotive - inbound call,1,0,0.721942,1.353878,0.037044,0.721942,0.684897,1
4,1,0,auto-e-05,automotive - inbound call,easy,automotive - inbound call,1,0,0.705459,1.357323,0.025233,0.705459,0.680226,1
5,1,0,auto-e-06,automotive - inbound call,easy,automotive - inbound call,1,0,0.760471,1.339058,0.06051,0.760471,0.699962,1


In [10]:
# ============================== METRICS & TABLES ==============================
def pct(x: pd.Series) -> float:
    return 100.0 * x.mean()

# Accuracy by category × difficulty
pivot_cat_diff = (
    df
    .groupby(["prompt_category", "prompt_difficulty"])["is_correct"]
    .apply(pct)
    .rename("accuracy_pct")
    .reset_index()
    .sort_values(["prompt_category", "prompt_difficulty"])
)

# Accuracy by category (all difficulties combined)
pivot_cat = (
    df
    .groupby("prompt_category")["is_correct"]
    .apply(pct)
    .rename("accuracy_pct")
    .reset_index()
    .sort_values("prompt_category")
)

# Accuracy by difficulty (across all categories)
pivot_diff = (
    df
    .groupby("prompt_difficulty")["is_correct"]
    .apply(pct)
    .rename("accuracy_pct")
    .reset_index()
    .sort_values("prompt_difficulty")
)

# Overall accuracy
overall_accuracy_pct = pct(df["is_correct"])

print("=== Accuracy by Category × Difficulty (Top-1, cosine) ===")
display(pivot_cat_diff)

print("\n=== Accuracy by Category (all difficulties) ===")
display(pivot_cat)

print("\n=== Accuracy by Difficulty (across all categories) ===")
display(pivot_diff)

print(f"\n=== Overall Accuracy ===\n{overall_accuracy_pct:.2f}%")


=== Accuracy by Category × Difficulty (Top-1, cosine) ===


Unnamed: 0,prompt_category,prompt_difficulty,accuracy_pct
0,automotive - inbound call,easy,74.4
1,automotive - inbound call,hard,51.0
2,home service - inbound call,easy,52.4
3,home service - inbound call,hard,45.8
4,insurance - outbound call,easy,47.0
5,insurance - outbound call,hard,37.2
6,medical equipment - outbound call,easy,46.4
7,medical equipment - outbound call,hard,42.0



=== Accuracy by Category (all difficulties) ===


Unnamed: 0,prompt_category,accuracy_pct
0,automotive - inbound call,62.7
1,home service - inbound call,49.1
2,insurance - outbound call,42.1
3,medical equipment - outbound call,44.2



=== Accuracy by Difficulty (across all categories) ===


Unnamed: 0,prompt_difficulty,accuracy_pct
0,easy,55.05
1,hard,44.0



=== Overall Accuracy ===
49.53%


In [11]:
# ============================== OPTIONAL: ERROR INSPECTION ==============================
# Quickly see where the model tends to confuse categories
errors = df[df["is_correct"] == 0].copy()
confusions = (
    errors
    .groupby(["prompt_category", "predicted_category"])
    .size()
    .rename("count")
    .reset_index()
    .sort_values("count", ascending=False)
)
display(confusions.head(12))

# Per-prompt accuracy (useful to refine the prompt set)
per_prompt = (
    df.groupby(["prompt_id", "prompt_category", "prompt_difficulty"])["is_correct"]
      .apply(pct).rename("accuracy_pct").reset_index().sort_values("accuracy_pct", ascending=True)
)
display(per_prompt.head(10))


Unnamed: 0,prompt_category,predicted_category,count
8,insurance - outbound call,medical equipment - outbound call,419
11,medical equipment - outbound call,insurance - outbound call,383
3,home service - inbound call,automotive - inbound call,245
0,automotive - inbound call,home service - inbound call,191
4,home service - inbound call,insurance - outbound call,133
5,home service - inbound call,medical equipment - outbound call,131
1,automotive - inbound call,insurance - outbound call,103
6,insurance - outbound call,automotive - inbound call,100
9,medical equipment - outbound call,automotive - inbound call,91
10,medical equipment - outbound call,home service - inbound call,84


Unnamed: 0,prompt_id,prompt_category,prompt_difficulty,accuracy_pct
13,auto-h-14,automotive - inbound call,hard,8.0
23,home-e-04,home service - inbound call,easy,12.0
44,ins-e-05,insurance - outbound call,easy,12.0
58,ins-h-19,insurance - outbound call,hard,14.0
63,med-e-04,medical equipment - outbound call,easy,16.0
77,med-h-18,medical equipment - outbound call,hard,18.0
37,home-h-18,home service - inbound call,hard,20.0
54,ins-h-15,insurance - outbound call,hard,26.0
52,ins-h-13,insurance - outbound call,hard,26.0
59,ins-h-20,insurance - outbound call,hard,28.0


In [12]:
# ============================== LOSS EVALUATION ==============================
# 'How far off' when wrong: deficit = (best_other_score - correct_score) for misclassifications
df["deficit_if_wrong"] = np.where(
    df["is_correct"] == 0,
    df["best_other_score"] - df["correct_score"],
    0.0
)

print("=== Overall loss metrics ===")
overall = pd.DataFrame({
    "overall_accuracy_pct": [100.0 * df["is_correct"].mean()],
    "mean_cross_entropy":   [df["cross_entropy"].mean()],
    "mean_signed_margin":   [df["signed_margin"].mean()],   # >0 is good
    "mean_rank_of_correct": [df["rank_of_correct"].mean()], # closer to 1 is better
    "mean_deficit_if_wrong":[df.loc[df["is_correct"]==0, "deficit_if_wrong"].mean()]
})
display(overall)

print("\n=== Loss by Category × Difficulty ===")
agg = {
    "is_correct": "mean",
    "cross_entropy": "mean",
    "signed_margin": "mean",
    "rank_of_correct": "mean",
    "deficit_if_wrong": "mean",
}
by_cat_diff = (
    df.groupby(["prompt_category","prompt_difficulty"])
      .agg(agg)
      .rename(columns={"is_correct":"accuracy_mean"})
      .reset_index()
)
by_cat_diff["accuracy_pct"] = 100.0 * by_cat_diff["accuracy_mean"]
display(by_cat_diff.drop(columns=["accuracy_mean"]).sort_values(["prompt_category","prompt_difficulty"]))

print("\n=== Wrong-only: How far off (deficit & rank) by Category ===")
wrong = df[df["is_correct"] == 0]
wrong_by_cat = (
    wrong.groupby("prompt_category")[["deficit_if_wrong","rank_of_correct"]]
         .mean()
         .reset_index()
         .sort_values("deficit_if_wrong", ascending=False)
)
display(wrong_by_cat)


=== Overall loss metrics ===


Unnamed: 0,overall_accuracy_pct,mean_cross_entropy,mean_signed_margin,mean_rank_of_correct,mean_deficit_if_wrong
0,49.525,1.371161,0.001073,1.82425,0.018522



=== Loss by Category × Difficulty ===


Unnamed: 0,prompt_category,prompt_difficulty,cross_entropy,signed_margin,rank_of_correct,deficit_if_wrong,accuracy_pct
0,automotive - inbound call,easy,1.363983,0.017314,1.486,0.005321,74.4
1,automotive - inbound call,hard,1.374917,0.001843,1.944,0.010167,51.0
2,home service - inbound call,easy,1.375706,0.001189,1.934,0.011243,52.4
3,home service - inbound call,hard,1.377555,-0.001713,2.0,0.011269,45.8
4,insurance - outbound call,easy,1.361227,-0.000264,1.71,0.00864,47.0
5,insurance - outbound call,hard,1.373045,-0.004508,1.966,0.010386,37.2
6,medical equipment - outbound call,easy,1.370727,-0.002348,1.764,0.009226,46.4
7,medical equipment - outbound call,hard,1.372132,-0.002926,1.79,0.008539,42.0



=== Wrong-only: How far off (deficit & rank) by Category ===


Unnamed: 0,prompt_category,deficit_if_wrong,rank_of_correct
1,home service - inbound call,0.022115,2.899804
0,automotive - inbound call,0.02076,2.91689
2,insurance - outbound call,0.01643,2.447323
3,medical equipment - outbound call,0.015919,2.392473
