# Facet Interaction Mini-Benchmark

Plan and run a tiny, fast probe across four facets — reasoning, instruction-following, coding, and agentic tool-use. Evaluate a few LLMs on a handful of items per facet, then fit a 2D tensor-product spline to visualize how pairs of facet scores relate (as hills/troughs) in 3D. The notebook keeps prompts small so it runs quickly; model calls go through the OpenRouter API, so an API key and network access are required but no local GPU is needed.

What you get:
- Minimal benchmark slices for each facet (handful of curated prompts)
- Small suite of models (defaults to lightweight; toggle heavier ones)
- Automated scoring heuristics
- Spline fitting and 3D plotting for pairwise facet relationships
- Hooks to extend with your own prompts/models


In [None]:
# Lightweight dependency check (SciPy for tensor-product splines)
# NOTE: `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so it is imported explicitly here.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("scipy") is None:
    print("Installing scipy for spline fitting...")
    # Install into the interpreter that is running this notebook.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "scipy"])
else:
    print("SciPy already available.")

In [None]:
import json
import os
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm
from openai import OpenAI

# Notebook-level settings
plt.rcParams["figure.figsize"] = (8, 6)
plt.rcParams["axes.grid"] = True

OPENROUTER_BASE = "https://openrouter.ai/api/v1"
OPENROUTER_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_KEY:
    print("WARNING: set OPENROUTER_API_KEY in your environment for live calls.")

# The OpenAI client raises at construction time when it is given no API key,
# which would abort this cell even when only cached/external results are
# going to be used. Pass a placeholder so the notebook keeps running; code
# that makes live calls should check OPENROUTER_KEY first.
client = OpenAI(base_url=OPENROUTER_BASE, api_key=OPENROUTER_KEY or "missing-openrouter-key")

## Mini-benchmark slices
Each facet uses 3 small prompts to keep runtime low. Scoring is heuristic/regex-based to avoid heavy post-processing. Feel free to replace with your own tasks or plug in richer evals.


In [None]:
@dataclass
class Task:
    """One benchmark item: a prompt plus the data and function used to score a reply."""

    # Facet label the item belongs to; results are grouped by this value.
    facet: str
    # Short identifier for the item, recorded alongside its score.
    name: str
    # Text sent to the model as the user message.
    prompt: str
    # Scorer-specific reference data (e.g. "answer", "must_include", "patterns", "required").
    expected: Dict
    # Called as scorer(model_output, expected); returns the item's score as a float.
    scorer: Callable[[str, Dict], float]


def numeric_in_text(text: str) -> List[int]:
    """Collect every integer literal in *text*, in order of appearance.

    A leading minus sign is treated as part of the number, and digit runs
    separated by any other character (including a decimal point) are
    returned as separate integers.
    """
    integer_pattern = re.compile(r"-?\d+")
    return list(map(int, integer_pattern.findall(text)))


def score_reasoning(output: str, expected: Dict) -> float:
    """Score a reasoning reply by looking for the expected number.

    Returns 1.0 when ``expected["answer"]`` is among the integers extracted
    from *output* by ``numeric_in_text``, otherwise 0.0.
    """
    target = expected.get("answer")
    found = numeric_in_text(output)
    if target in found:
        return 1.0
    return 0.0


def score_instruction(output: str, expected: Dict) -> float:
    """Score an instruction-following reply.

    The reply earns 1.0 only if it contains every ``expected["must_include"]``
    token (compared case-insensitively) and, when ``expected["format"]`` is
    ``"json"``, the whole reply parses as JSON. Otherwise it earns 0.0.
    """
    required_tokens = expected.get("must_include", [])
    if any(tok.lower() not in output.lower() for tok in required_tokens):
        return 0.0

    # No format constraint: the substring checks above are the whole test.
    if expected.get("format") != "json":
        return 1.0

    try:
        json.loads(output)
    except json.JSONDecodeError:
        return 0.0
    return 1.0


def score_coding(output: str, expected: Dict) -> float:
    """Score a coding reply by regex inspection only (nothing is executed).

    Returns 1.0 when every regex in ``expected["patterns"]`` matches somewhere
    in *output* (case-insensitively), or when no patterns are given; returns
    0.0 as soon as one pattern fails to match.
    """
    regexes = expected.get("patterns", [])
    if not regexes:
        return 1.0
    for regex in regexes:
        if not re.search(regex, output, flags=re.IGNORECASE):
            return 0.0
    return 1.0


def score_tool(output: str, expected: Dict) -> float:
    """Score a tool-use reply by checking for required substrings.

    Returns 1.0 when every token in ``expected["required"]`` occurs in
    *output* (compared case-insensitively), otherwise 0.0.
    """
    needed = expected.get("required", [])
    is_missing_any = any(tok.lower() not in output.lower() for tok in needed)
    return 0.0 if is_missing_any else 1.0


def score_case_sensitive(output: str, expected: Dict) -> float:
    """Score 1.0 only if every ``must_include`` token appears in *output* with exact casing.

    Used for items where capitalization is the instruction under test; the
    case-insensitive ``score_instruction`` would accept "ready!" for "READY!".
    """
    tokens = expected.get("must_include", [])
    return 1.0 if all(token in output for token in tokens) else 0.0


# Curated prompt set: three items per facet, each paired with the reference
# data and scorer used to grade a model's reply.
TASKS: List[Task] = [
    # Reasoning
    Task(
        facet="reasoning",
        name="math_apples",
        prompt="Sarah has 3 apples. She buys 4 more and eats 2. How many apples remain?",
        expected={"answer": 5},
        scorer=score_reasoning,
    ),
    Task(
        facet="reasoning",
        name="train_time",
        prompt="A train travels 60 miles in 1.5 hours. What is its average speed in mph?",
        expected={"answer": 40},
        scorer=score_reasoning,
    ),
    Task(
        facet="reasoning",
        name="ratio_mix",
        prompt="You mix 2 parts red paint with 3 parts blue. If you use 10 cups total, how many cups are blue?",
        expected={"answer": 6},
        scorer=score_reasoning,
    ),
    # Instruction following
    Task(
        facet="instruction",
        name="csv_format",
        prompt="Return the three words 'oak', 'pine', 'birch' as a single comma-separated line (no spaces).",
        expected={"must_include": ["oak,pine,birch"]},
        scorer=score_instruction,
    ),
    Task(
        facet="instruction",
        name="json_shape",
        prompt="Respond with JSON containing keys name and priority for the task 'backup data' with high priority.",
        expected={"must_include": ["backup", "high"], "format": "json"},
        scorer=score_instruction,
    ),
    Task(
        facet="instruction",
        name="style_rule",
        prompt="Answer 'ready' in ALL CAPS followed by an exclamation mark, nothing else.",
        expected={"must_include": ["READY!"]},
        # Casing is the point of this item, so it must be matched exactly.
        scorer=score_case_sensitive,
    ),
    # Coding (format/intent, not runtime exec for safety)
    Task(
        facet="coding",
        name="add_func",
        prompt="Write a Python function named add_numbers(a, b) that returns their sum.",
        expected={"patterns": [r"def\s+add_numbers", r"return\s+a\s*\+\s*b"]},
        scorer=score_coding,
    ),
    Task(
        facet="coding",
        name="list_comprehension",
        prompt="Give a one-line Python list comprehension that squares numbers 1 through 4.",
        # Accept any loop-variable name (back-referenced with \1) and optional
        # whitespace around the operators and range arguments.
        expected={"patterns": [r"\[\s*(\w+)\s*\*\*\s*2\s+for\s+\1\s+in\s+range\(\s*1\s*,\s*5\s*\)"]},
        scorer=score_coding,
    ),
    Task(
        facet="coding",
        name="doc_comment",
        prompt="Provide a short Python function with a docstring that reverses a string.",
        # A docstring is detected by its triple quotes (not the literal word
        # "docstring"); the slice brackets are escaped so they are matched
        # literally instead of being parsed as a regex character class.
        expected={"patterns": [r"def", r"\"{3}|'{3}", r"\[::-1\]|reversed\("]},
        scorer=score_coding,
    ),
    # Agentic tool-use (checks for structured tool calls)
    Task(
        facet="tool_use",
        name="search_tool",
        prompt="Use a SEARCH tool to look up 'weather in Paris' and return the action line only.",
        expected={"required": ["SEARCH", "Paris"]},
        scorer=score_tool,
    ),
    Task(
        facet="tool_use",
        name="calc_tool",
        prompt="Call a CALC tool to compute 12 * 7. Return only the tool call line.",
        expected={"required": ["CALC", "12", "7"]},
        scorer=score_tool,
    ),
    Task(
        facet="tool_use",
        name="retrieval_tool",
        prompt="Trigger a RETRIEVE tool for the topic 'quantum tunneling'. Output only the tool command.",
        expected={"required": ["RETRIEVE", "quantum"]},
        scorer=score_tool,
    ),
]

# Facet labels used by the tasks above.
FACETS = ["reasoning", "instruction", "coding", "tool_use"]

## Models to probe (OpenRouter)
Toggle `enabled` to include/exclude. Defaults favor inexpensive/small OpenRouter models; swap in your choices. Set `OPENROUTER_API_KEY` before running.


In [None]:
# Models to evaluate. Each entry has:
#   "id"      - model identifier passed to the chat-completions call
#   "alias"   - short label stored in the results' "model" column
#   "enabled" - entries with a falsy value are skipped by evaluate_models
MODEL_SPECS = [
    {
        "id": "mistralai/mistral-small-latest",  # adjust to your available OpenRouter models
        "alias": "mistral-small",
        "enabled": True,
    },
    {
        "id": "qwen/qwen-2.5-3b-instruct",  # swap to another inexpensive model if unavailable
        "alias": "qwen2.5-3b",
        "enabled": False,
    },
    {
        "id": "anthropic/claude-3.5-haiku",
        "alias": "claude-3.5-haiku",
        "enabled": False,
    },
]


def chat_with_model(model_id: str, prompt: str, max_tokens: int = 128, temperature: float = 0.1) -> str:
    """Run a single OpenRouter chat completion with light retry.

    Args:
        model_id: Model identifier sent as the ``model`` field.
        prompt: Text sent as the only user message.
        max_tokens: Completion length cap forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The first choice's message text with surrounding whitespace removed;
        an empty string when the API returns no text content.

    Raises:
        RuntimeError: If all attempts fail; the last underlying error is
            attached as the exception's cause.
    """
    max_attempts = 3
    last_err: Optional[Exception] = None
    for attempt in range(max_attempts):
        try:
            resp = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )
            # `content` may be None (no text in the reply); treat that as an
            # empty answer instead of failing on `.strip()` and retrying.
            content = resp.choices[0].message.content
            return (content or "").strip()
        except Exception as exc:  # pragma: no cover - network path
            last_err = exc
            if attempt + 1 == max_attempts:
                # Nothing left to retry, so do not wait before reporting failure.
                break
            wait = 1.5 * (attempt + 1)
            print(f"Retry {attempt + 1}/{max_attempts} after error: {exc} (waiting {wait:.1f}s)")
            time.sleep(wait)
    raise RuntimeError(f"Failed OpenRouter call after retries: {last_err}") from last_err

## Evaluation harness
Runs each model on the prompt set, captures generations, applies facet-specific scoring, and aggregates scores per facet and per model.


In [None]:
# Keyword arguments forwarded to chat_with_model for every task prompt; they
# take precedence over that function's own default values.
GENERATION_KW = {"max_tokens": 128, "temperature": 0.2}


def run_model_on_tasks(model_id: str, tasks: List[Task]) -> pd.DataFrame:
    """Query one model with every task and score each reply.

    Returns a DataFrame with one row per task and the columns ``facet``,
    ``task``, ``prompt``, ``response`` and ``score``. Any error raised by the
    model call propagates to the caller.
    """

    def _evaluate(item: Task) -> Dict:
        # Generate first, then grade the reply with the item's own scorer.
        reply = chat_with_model(model_id, item.prompt, **GENERATION_KW)
        grade = item.scorer(reply, item.expected)
        return {
            "facet": item.facet,
            "task": item.name,
            "prompt": item.prompt,
            "response": reply,
            "score": grade,
        }

    return pd.DataFrame([_evaluate(item) for item in tasks])


def evaluate_models(model_specs: List[Dict]) -> pd.DataFrame:
    """Run the full task list against every enabled model spec.

    Specs whose ``"enabled"`` value is falsy are skipped (a missing key counts
    as enabled). Each model's results get a ``model`` column holding the
    spec's alias. Returns the concatenated results, or an empty DataFrame
    when no spec was run.
    """
    frames = []
    for spec in (s for s in model_specs if s.get("enabled", True)):
        print(f"Running {spec['alias']} via OpenRouter ...")
        per_model = run_model_on_tasks(spec["id"], TASKS)
        frames.append(per_model.assign(model=spec["alias"]))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


RUN_EVAL = True  # set False to skip generation and load cached results
results_df = None

if RUN_EVAL:
    # Fail early with a clear message rather than on the first API call.
    if not OPENROUTER_KEY:
        raise EnvironmentError("OPENROUTER_API_KEY not set; cannot run live evals.")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    results_df = evaluate_models(MODEL_SPECS)
    elapsed = time.perf_counter() - start
    print(f"Ran {len(results_df)} model-task pairs in {elapsed:.1f}s")
else:
    print("Generation skipped; populate results_df manually or load external evals.")

# Notebook display: preview the first rows when results exist.
results_df.head() if results_df is not None else None

### Aggregate scores
Collapse to per-model/per-facet means plus a wide table for plotting. Optionally merge precomputed evals (LM Eval outputs or UKAI safety eval exports) before plotting.


In [None]:
# Optionally load external evals (LM Eval outputs, UKAI safety eval exports) into
# a DataFrame with columns: model, facet, score. Append them to results_df below.
EXTERNAL_EVAL_PATHS: List[Path] = []  # e.g., [Path("/path/to/ukai_safety_eval.csv")]

external_rows = []
for raw_path in EXTERNAL_EVAL_PATHS:
    # Coerce so that plain string paths work as well as Path objects.
    p = Path(raw_path)
    if not p.exists():
        print(f"External eval not found: {p}")
        continue
    ext_df = pd.read_csv(p)
    if not {"model", "facet", "score"}.issubset(ext_df.columns):
        raise ValueError(f"External eval file {p} missing required columns model/facet/score")
    external_rows.append(ext_df)

if external_rows:
    external_df = pd.concat(external_rows, ignore_index=True)
    if results_df is None:
        results_df = external_df
    else:
        results_df = pd.concat([results_df, external_df], ignore_index=True)

if results_df is None or results_df.empty:
    raise ValueError("No results to aggregate. Run cells above or load external evals.")

# Long form: one mean score per (model, facet) pair.
facet_means = results_df.groupby(["model", "facet"]).agg(score=("score", "mean")).reset_index()
# Wide form: one row per model, one column per facet; missing facets become 0.0.
wide = facet_means.pivot(index="model", columns="facet", values="score").reset_index().fillna(0.0)

# Format with f-strings: passing the frame as a second print() argument puts a
# separator space after the newline and misaligns the table's header row.
print(f"Facet means:\n{facet_means}")
print(f"\nWide table:\n{wide}")

wide

## Tensor-product spline & 3D plots
Fits a smooth bivariate spline over two facet axes to predict a third metric (or any target column). Uses `SmoothBivariateSpline` when possible (a bicubic fit needs at least 16 model points) and falls back to radial basis interpolation if the sample is too small or degenerate. Red dots show the observed model points.


In [None]:
from scipy.interpolate import Rbf, SmoothBivariateSpline


def fit_surface(df: pd.DataFrame, x_col: str, y_col: str, z_col: str, grid_n: int = 40):
    """Fit a smooth surface ``z = f(x, y)`` and evaluate it on a regular grid.

    Fitting strategy, in order:
      1. ``SmoothBivariateSpline`` (bicubic) when there are enough points.
      2. Thin-plate ``Rbf`` interpolation when the spline cannot be fitted.
      3. A least-squares plane when the RBF system is singular or yields
         non-finite values (e.g. a single point or duplicate points).

    Args:
        df: Table holding the three columns.
        x_col: Column used for the x axis.
        y_col: Column used for the y axis.
        z_col: Column used as the surface height.
        grid_n: Number of grid samples along each axis.

    Returns:
        ``(Xg, Yg, Zg, method)`` where ``Xg``/``Yg`` come from
        ``np.meshgrid(xg, yg)``, ``Zg[i, j]`` is the fitted value at
        ``(Xg[i, j], Yg[i, j])``, and ``method`` names the fit that was used.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    x = df[x_col].to_numpy(dtype=float)
    y = df[y_col].to_numpy(dtype=float)
    z = df[z_col].to_numpy(dtype=float)
    if x.size == 0:
        raise ValueError("fit_surface needs at least one data point.")

    def _axis(values: np.ndarray) -> np.ndarray:
        """Grid samples spanning the data; pad a zero-width range so it is visible."""
        lo, hi = values.min(), values.max()
        if lo == hi:
            lo, hi = lo - 0.5, hi + 0.5
        return np.linspace(lo, hi, grid_n)

    # Build grid for plotting
    xg = _axis(x)
    yg = _axis(y)
    Xg, Yg = np.meshgrid(xg, yg)

    kx = ky = 3
    min_spline_points = (kx + 1) * (ky + 1)  # fewer points cannot determine the spline

    Zg = None
    method = ""
    if x.size >= min_spline_points:
        try:
            spline = SmoothBivariateSpline(x, y, z, kx=kx, ky=ky, s=0.1)
            # Grid evaluation returns shape (len(xg), len(yg)); meshgrid's
            # default "xy" indexing is (len(yg), len(xg)), hence the transpose.
            Zg = spline(xg, yg).T
            method = "SmoothBivariateSpline"
        except Exception as exc:  # deliberate best-effort: try the next fitter
            spline_failure = exc.__class__.__name__
    else:
        spline_failure = "too few points"

    if Zg is None:
        try:
            rbf = Rbf(x, y, z, function="thin_plate")
            candidate = np.asarray(rbf(Xg, Yg), dtype=float)
            if not np.all(np.isfinite(candidate)):
                raise ValueError("RBF produced non-finite values")
            Zg = candidate
            method = f"RBF fallback ({spline_failure})"
        except Exception as exc:  # singular system: single or duplicate points
            # Centered least-squares plane. lstsq returns the minimum-norm
            # solution, so directions without variation get zero slope and a
            # single point yields a flat surface at its own height.
            x_mean, y_mean, z_mean = x.mean(), y.mean(), z.mean()
            design = np.column_stack([x - x_mean, y - y_mean])
            slopes, *_ = np.linalg.lstsq(design, z - z_mean, rcond=None)
            Zg = z_mean + slopes[0] * (Xg - x_mean) + slopes[1] * (Yg - y_mean)
            method = f"Plane fallback ({exc.__class__.__name__})"
    return Xg, Yg, Zg, method


def plot_surface(df: pd.DataFrame, x_col: str, y_col: str, z_col: str, title: str):
    """Draw the fitted surface for three columns of *df* as a 3D plot.

    The surface comes from ``fit_surface``; the rows of *df* are overlaid as
    red markers, and the name of the fitting method is appended to the title.
    """
    grid_x, grid_y, grid_z, fit_name = fit_surface(df, x_col, y_col, z_col)

    figure = plt.figure(figsize=(8, 6))
    axes3d = figure.add_subplot(111, projection="3d")

    # Fitted surface first, then the observed points on top of it.
    surface = axes3d.plot_surface(grid_x, grid_y, grid_z, cmap=cm.viridis, alpha=0.8)
    axes3d.scatter(df[x_col], df[y_col], df[z_col], color="red", s=60, label="models")

    for set_label, text in (
        (axes3d.set_xlabel, x_col),
        (axes3d.set_ylabel, y_col),
        (axes3d.set_zlabel, z_col),
    ):
        set_label(text)
    axes3d.set_title(f"{title}\n{fit_name}")

    figure.colorbar(surface, shrink=0.6, aspect=12)
    axes3d.legend(loc="upper left")
    plt.show()


# Each tuple names two input facets and the facet plotted as height: (x, y, z).
SURFACES = [
    ("reasoning", "instruction", "coding"),
    ("reasoning", "coding", "tool_use"),
    ("instruction", "tool_use", "reasoning"),
]

for x_col, y_col, z_col in SURFACES:
    # Plot only when the wide table has all three facet columns.
    if {x_col, y_col, z_col} <= set(wide.columns):
        plot_surface(wide, x_col, y_col, z_col, title=f"{x_col} vs {y_col} -> {z_col}")
    else:
        print(f"Missing columns for {x_col}/{y_col}/{z_col}; skip.")

### Tips to extend
- Add prompts to `TASKS` or adjust scoring heuristics per facet.
- Toggle/expand `MODEL_SPECS` to probe more OpenRouter models; keep `max_tokens` small for cost/latency.
- Swap the `SURFACES` tuples to explore other relationships or use `wide.eval()` to craft composite targets.
- If you already have scores (LM Eval or UKAI safety exports), set `RUN_EVAL = False` and list them in `EXTERNAL_EVAL_PATHS` to avoid regenerating text.
