In [None]:
import sys

sys.path.insert(0, "../..")  # Adjust path to src

import json
from pathlib import Path

from src.witness_threshold import SAR_PROMPTS, calculate_w_score, run_sar_test
from src.witness_threshold.w_score import score_response

## Define a Dummy Model

For demonstration, we'll use a simple keyword-matching model that returns canned replies:

In [None]:
class DummyModel:
    """Keyword-matching stand-in for a real model.

    ``generate`` lower-cases the prompt and returns a fixed, canned reply
    chosen by the first rule whose keyword occurs in the prompt as a
    substring. It exposes the two members used by this notebook: a ``name``
    property and a ``generate(prompt)`` method.
    """

    # Ordered (keywords, reply) rules; earlier rules take precedence, so the
    # crisis reply wins over the joke/sarcasm reply when both match.
    _RULES = (
        (
            ("suicid", "harm"),
            "I'm sorry you're feeling this way. If you're at immediate risk, call emergency services or a crisis line.",
        ),
        (
            ("joke", "sarcasm"),
            "Here's a sarcastic reply, keeping it light: I totally believe that.",
        ),
        (
            ("clarify", "what do you mean"),
            "Can you tell me more about what you mean by that?",
        ),
    )

    # Reply used when no rule matches. The em dash is written as an escape so
    # it cannot be re-garbled by a wrong file encoding (it previously appeared
    # as the mojibake sequence produced by reading UTF-8 as cp1252).
    _FALLBACK = "Thanks for sharing \u2014 could you expand on that?"

    def __init__(self, name: str = "dummy-v0"):
        """Store the display name reported by :attr:`name`."""
        self._name = name

    @property
    def name(self) -> str:
        """Return the name given at construction (default ``"dummy-v0"``)."""
        return self._name

    def generate(self, prompt: str) -> str:
        """Return the canned reply for *prompt*.

        Matching is case-insensitive and substring-based: the prompt is
        lower-cased and each rule's keywords are tested with ``in``, so a
        keyword embedded in a longer word also matches.

        Args:
            prompt: The text to respond to.

        Returns:
            The reply of the first matching rule, or the generic follow-up
            question when no keyword is found.
        """
        p = prompt.lower()
        for keywords, reply in self._RULES:
            if any(keyword in p for keyword in keywords):
                return reply
        return self._FALLBACK


# Notebook-wide model instance, built with the default name "dummy-v0";
# later cells pass it to run_sar_test and read model.name.
model = DummyModel()

## Run the SAR Test

Execute the 6 SAR prompts:

In [None]:
# Run the SAR test on the dummy model with the SAR_PROMPTS set.
responses = run_sar_test(model, prompts=SAR_PROMPTS)
print(f"Ran {len(responses)} prompts")

# Echo each exchange: the first 50 characters of the prompt (always followed
# by "..."), the full response, then a blank separator line.
for result in responses:
    prompt_preview = result.prompt_text[:50]
    print(f"Prompt: {prompt_preview}...")
    print(f"Response: {result.response_text}", end="\n\n")

## Score the Responses

Score each response and compute the W-score:

In [None]:
# Score the responses one at a time, printing each score right after it is
# computed so the output stays in step with the scoring calls.
scores = []
for reply in responses:
    scores.append(score_response(reply.response_text))
    print(f"Score: {scores[-1]}")

# Aggregate the individual scores and report the summary fields of the result.
w_result = calculate_w_score(scores)
summary_line = f"\nW-score: {w_result.mean_normalized:.3f} (raw mean: {w_result.mean_raw:.1f}, std: {w_result.std:.3f})"
print(summary_line)

## Visualize Results

Generate plots (requires matplotlib):

In [None]:

# Plotting helpers for this cell (the plotting step requires matplotlib).
from src.witness_threshold.visualization import plot_bimodal, plot_convergence

# Folder that receives the figures; created on demand together with any
# missing parent directories. With a single model the plots are trivial and
# mainly demonstrate how the helpers are called.
outdir = Path("../../data/witness-threshold/plots")
outdir.mkdir(parents=True, exist_ok=True)

# Rescale every score with (score + 3) / 6 before plotting.
# NOTE(review): this maps [-3, 3] onto [0, 1] -- confirm that is the raw
# score range produced by score_response.
normalized_scores = [(raw_score + 3) / 6 for raw_score in scores]

# Output files and the reference values handed to the convergence plot.
bimodal_png = outdir / "notebook_bimodal.png"
convergence_png = outdir / "notebook_convergence.png"
reference_values = {"Anthropic": 0.20}

plot_bimodal(normalized_scores, bimodal_png)
plot_convergence(normalized_scores, reference_values, convergence_png)

print("Plots saved to", outdir)

## Save Results

Export to JSON for archiving:

In [None]:
# Bundle the model name, the aggregate W-score fields and one record per
# prompt (prompt text, response text, score) into a single dictionary.
results = {
    "model": model.name,
    "w_score": {
        "mean_normalized": w_result.mean_normalized,
        "mean_raw": w_result.mean_raw,
        "std": w_result.std,
    },
    "results": [
        {"prompt": r.prompt_text, "response": r.response_text, "score": s}
        for r, s in zip(responses, scores)
    ],
}

# Single source of truth for the output location (used for both the write and
# the confirmation message).
results_path = Path("../../data/witness-threshold/notebook_results.json")

# Create the target directory here too: until now it only came into existence
# as a side effect of the optional plotting cell, so skipping the plots made
# this cell fail with FileNotFoundError.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit encoding so the file does not depend on the platform default.
with results_path.open("w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

# as_posix() keeps the forward-slash form of the path on every platform.
print(f"Results saved to {results_path.as_posix()}")