# Trajectory Viewer
Interactive tool for browsing saved Crescendomation trajectories.

In [1]:
import json
from pathlib import Path
from collections import Counter
import pandas as pd
import torch as t
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

OUTPUT_DIR = Path("../results/crescendo_trajectories")

# Discover every saved trajectory dump. Sorting keeps the listing below
# (and anything built from `traj_files`) in a stable order.
traj_files = sorted(OUTPUT_DIR.glob("trajectories_*.json"))
if traj_files:
    print(f"Found {len(traj_files)} trajectory file(s):")
    for traj_path in traj_files:
        print(f"  {traj_path.name}")
else:
    print("No trajectory files found in", OUTPUT_DIR)

# Goal text -> {"category", "behavior"} table built from the JBB-Behaviors
# "harmful" split. It is used to fill in metadata for trajectories saved
# without category info. Any failure while importing or loading the dataset
# falls back to an empty table so the rest of the notebook still runs.
try:
    from datasets import load_dataset

    jbb_data = list(load_dataset("JailbreakBench/JBB-Behaviors", "behaviors")["harmful"])
    jbb_lookup = {}
    for item in jbb_data:
        jbb_lookup[item["Goal"]] = {
            "category": item["Category"],
            "behavior": item["Behavior"],
        }
    print(f"Loaded JBB lookup ({len(jbb_lookup)} behaviors)")
except Exception as e:
    jbb_lookup = {}
    print(f"JBB lookup not available: {e}")

Found 2 trajectory file(s):
  trajectories_20260220_050157.json
  trajectories_20260221_190645.json
Loaded JBB lookup (100 behaviors)


In [2]:
# Selection widgets, in the order they are shown:
# file -> category filter -> trajectory -> turn.
def _labelled_dropdown(description, label_width, width, **extra):
    """Build a Dropdown with a fixed-width description label and a fixed overall width."""
    return widgets.Dropdown(
        description=description,
        style={"description_width": label_width},
        layout=widgets.Layout(width=width),
        **extra
    )


file_dropdown = _labelled_dropdown(
    "File:",
    "50px",
    "500px",
    options=[(path.name, path) for path in traj_files],
)
cat_dropdown = _labelled_dropdown("Category:", "70px", "500px")
traj_dropdown = _labelled_dropdown("Trajectory:", "80px", "700px")
turn_dropdown = _labelled_dropdown("Turn:", "50px", "500px")

# Area the selected turn is rendered into.
output = widgets.Output()

# State: trajectories of the currently loaded file (rebound by load_file).
trajectories = []


def enrich_categories(trajs, lookup=None):
    """Fill in missing ``category`` / ``behavior`` fields on trajectories, in place.

    A trajectory counts as un-categorised when its ``category`` key is absent,
    ``None`` (e.g. a JSON ``null``) or the placeholder ``"Unknown"``. Such a
    trajectory is matched by its exact ``goal`` text against *lookup*. On a hit
    both fields are overwritten with the looked-up values; otherwise
    ``category`` is set to ``"Unknown"`` and ``behavior`` is set to
    ``"Unknown"`` only if it is absent or ``None``. Trajectories that already
    carry a real category are left untouched.

    Args:
        trajs: Iterable of trajectory dicts; each dict is mutated in place.
        lookup: Mapping of goal text to a dict with ``"category"`` and
            ``"behavior"`` keys. Defaults to the module-level ``jbb_lookup``.

    Returns:
        None.
    """
    if lookup is None:
        lookup = jbb_lookup

    for tr in trajs:
        if tr.get("category") not in (None, "Unknown"):
            continue  # already has a real category

        # .get() so a record without a goal degrades to "Unknown" instead of
        # aborting enrichment of the whole file with a KeyError.
        match = lookup.get(tr.get("goal"))
        if match:
            tr["category"] = match["category"]
            tr["behavior"] = match["behavior"]
        else:
            # Normalise to the placeholder so downstream sorting and slicing
            # of category strings never sees None.
            tr["category"] = "Unknown"
            if tr.get("behavior") is None:
                tr["behavior"] = "Unknown"


def load_file(change=None):
    """Load the file selected in ``file_dropdown`` and refresh dependent widgets.

    Rebinds the module-level ``trajectories`` list, enriches it with category
    metadata, repopulates the category filter (reset to "All categories"),
    rebuilds the trajectory list and re-renders the current selection.

    Args:
        change: Ignored; present so the function can be used as a widget
            observer callback as well as called directly.
    """
    global trajectories
    path = file_dropdown.value
    if path is None:
        # No trajectory files were found, so the dropdown has no selection.
        # Show an empty viewer instead of failing on open(None).
        trajectories = []
    else:
        with open(path, "r", encoding="utf-8") as f:
            trajectories = json.load(f)
    enrich_categories(trajectories)

    # Populate category filter
    cats = sorted({tr.get("category", "Unknown") for tr in trajectories})
    cat_dropdown.options = [("All categories", None)] + [(c, c) for c in cats]
    cat_dropdown.value = None
    update_traj_list()

    # Refresh explicitly: if the first trajectory's value is the same before
    # and after the reload (e.g. index 0 in both files), the dropdown emits no
    # "value" change and the turn list / output would keep showing the
    # previous file's data.
    on_traj_change()


def update_traj_list(change=None):
    """Rebuild the trajectory dropdown for the currently selected category.

    Each option is labelled with the trajectory's index in ``trajectories``,
    its upper-cased label, max normalized score, category (first 20 chars)
    and goal (first 50 chars). The option value is the trajectory index.
    A category selection of ``None`` means "no filter".

    Args:
        change: Ignored; allows use as a widget observer callback.
    """
    wanted = cat_dropdown.value
    # Resolve each trajectory's category once, so filtering and labelling agree.
    candidates = (
        (i, tr, tr.get("category", "Unknown"))
        for i, tr in enumerate(trajectories)
    )
    traj_dropdown.options = [
        (
            f"[{i}] {tr['label'].upper():10s} | score={tr['max_score_normalized']:.0f} | [{cat[:20]}] {tr['goal'][:50]}...",
            i,
        )
        for i, tr, cat in candidates
        if wanted is None or cat == wanted
    ]


def on_traj_change(change=None):
    """Repopulate the turn dropdown for the selected trajectory and render it.

    Does nothing when no trajectory is selected. Option values are positions
    in the trajectory's ``turns`` list; backtracked turns are tagged in the
    label.

    Args:
        change: Ignored; allows use as a widget observer callback.
    """
    selected = traj_dropdown.value
    if selected is not None:
        turn_labels = []
        for pos, step in enumerate(trajectories[selected]["turns"]):
            marker = "  [BACKTRACK]" if step["was_backtracked"] else ""
            turn_labels.append(
                (f"Turn {step['turn_idx']}  score={step['normalized_score']:.0f}{marker}", pos)
            )
        turn_dropdown.options = turn_labels
        # Render explicitly: assigning options does not necessarily change the
        # dropdown's value, so the turn observer may not fire on its own.
        show_turn()


def show_turn(change=None):
    """Render the selected turn of the selected trajectory into ``output``.

    Shows a trajectory header (goal, category, behavior, label, max score,
    turn/backtrack counts, score progression over non-backtracked turns)
    followed by the selected turn's score, user prompt and model response.
    Does nothing when either dropdown has no selection.

    All text taken from the trajectory file is HTML-escaped before being
    interpolated, so prompts or responses containing ``<``, ``>`` or ``&``
    are displayed literally instead of being interpreted as markup.

    Args:
        change: Ignored; allows use as a widget observer callback.
    """
    # Imported locally as a bare function: the notebook rebinds the global
    # name `html` to a markup string in its summary cell, so a module-level
    # `import html` would not be reliable here.
    from html import escape

    traj_idx = traj_dropdown.value
    turn_idx = turn_dropdown.value
    if traj_idx is None or turn_idx is None:
        return
    traj = trajectories[traj_idx]
    turn = traj["turns"][turn_idx]

    def esc(value):
        """HTML-escape any value after converting it to text."""
        return escape(str(value))

    with output:
        clear_output(wait=True)

        # Trajectory header
        label_color = "red" if traj["label"] == "jailbroken" else "green"
        scores = [t["normalized_score"] for t in traj["turns"] if not t["was_backtracked"]]
        # "&gt;" renders as ">" so the arrow looks the same but is valid HTML.
        score_str = " -&gt; ".join(str(int(s)) for s in scores)

        goal = esc(traj["goal"])
        category = esc(traj.get("category", "Unknown"))
        behavior = esc(traj.get("behavior", "Unknown"))
        label = esc(traj["label"].upper())
        raw_score = esc(turn["raw_score"])
        user_prompt = esc(turn["user_prompt"])
        model_response = esc(turn["model_response"])
        backtracked_tag = (
            '<span style="color:orange;"> [BACKTRACKED]</span>'
            if turn["was_backtracked"]
            else ""
        )

        markup = f"""
        <div style='font-family: monospace; padding: 10px; background: #1e1e1e; color: #d4d4d4; border-radius: 8px;'>
            <h3 style='color: #569cd6; margin-top:0;'>Trajectory {traj_idx}</h3>
            <b>Goal:</b> {goal}<br>
            <b>Category:</b> <span style='color:#ce9178;'>{category}</span>
            &nbsp;|&nbsp; <b>Behavior:</b> <span style='color:#ce9178;'>{behavior}</span><br>
            <b>Label:</b> <span style='color:{label_color}; font-weight:bold;'>{label}</span>
            &nbsp;|&nbsp; <b>Max Score:</b> {traj['max_score_normalized']:.0f}/10
            &nbsp;|&nbsp; <b>Turns:</b> {esc(traj['num_turns'])}
            &nbsp;|&nbsp; <b>Backtracks:</b> {esc(traj['num_backtracks'])}<br>
            <b>Score trajectory:</b> {score_str}
            <hr style='border-color:#444;'>

            <h3 style='color: #569cd6;'>Turn {esc(turn['turn_idx'])}
                {backtracked_tag}
            </h3>
            <b>Score:</b> {turn['normalized_score']:.0f}/10 (raw: {raw_score})<br><br>

            <b style='color:#dcdcaa;'>User Prompt:</b>
            <div style='background:#2d2d2d; padding:8px; margin:4px 0 12px 0; border-left:3px solid #569cd6; white-space:pre-wrap;'>{user_prompt}</div>

            <b style='color:#dcdcaa;'>Model Response:</b>
            <div style='background:#2d2d2d; padding:8px; margin:4px 0 12px 0; border-left:3px solid #4ec9b0; white-space:pre-wrap;'>{model_response}</div>
        </div>
        """
        display(HTML(markup))


# Wire up callbacks. Each handler reacts only to changes of the widget's
# `value` trait; the lambdas look the handler up at call time and call it
# without arguments.
file_dropdown.observe(lambda _change: load_file(), names="value")
cat_dropdown.observe(lambda _change: update_traj_list(), names="value")
traj_dropdown.observe(lambda _change: on_traj_change(), names="value")
turn_dropdown.observe(lambda _change: show_turn(), names="value")

# Initial load
load_file()

# Stack the controls above the rendering area.
display(
    widgets.VBox(
        children=[
            file_dropdown,
            cat_dropdown,
            traj_dropdown,
            turn_dropdown,
            output,
        ]
    )
)

VBox(children=(Dropdown(description='File:', layout=Layout(width='500px'), options=(('trajectories_20260220_05…

In [None]:
# Show all turns for current trajectory at once
btn = widgets.Button(description="Show All Turns", button_style="info")
all_output = widgets.Output()


def show_all(b):
    """Render a compact overview of every turn of the selected trajectory.

    Writes a header (label, category, behavior, max score, goal) followed by
    one card per turn into ``all_output``. Each card is colour-coded by
    normalized score (>= 8 red, >= 5 orange, otherwise green) and shows the
    first 200 characters of the user prompt and model response. Does nothing
    when no trajectory is selected.

    All text taken from the trajectory file is HTML-escaped so that markup
    characters in prompts or responses are shown literally.

    Args:
        b: The clicked button (unused).
    """
    # Imported locally as a bare function: the notebook rebinds the global
    # name `html` to a markup string in its summary cell.
    from html import escape

    traj_idx = traj_dropdown.value
    if traj_idx is None:
        return
    traj = trajectories[traj_idx]
    category = escape(str(traj.get("category", "Unknown")))
    behavior = escape(str(traj.get("behavior", "Unknown")))
    label = escape(traj["label"].upper())
    goal = escape(str(traj["goal"]))
    label_color = "red" if traj["label"] == "jailbroken" else "green"

    def preview(text, limit=200):
        """Return *text* cut to *limit* chars (with '...' if cut), HTML-escaped."""
        # Truncate before escaping so an entity such as "&amp;" is never cut in half.
        clipped = text[:limit] + ("..." if len(text) > limit else "")
        return escape(clipped)

    with all_output:
        clear_output(wait=True)
        # Header
        display(HTML(f"""
        <div style='font-family:monospace; padding:8px; background:#1e1e1e; color:#d4d4d4; border-radius:4px; margin-bottom:8px;'>
            <b>Trajectory {traj_idx}</b> — <span style='color:{label_color};'>{label}</span>
            | <b>Category:</b> <span style='color:#ce9178;'>{category}</span>
            | <b>Behavior:</b> <span style='color:#ce9178;'>{behavior}</span>
            | <b>Max Score:</b> {traj['max_score_normalized']:.0f}/10<br>
            <b>Goal:</b> {goal}
        </div>
        """))
        for turn in traj["turns"]:
            bt = " [BACKTRACKED]" if turn["was_backtracked"] else ""
            score = turn["normalized_score"]
            score_color = "red" if score >= 8 else ("orange" if score >= 5 else "green")
            turn_label = escape(str(turn["turn_idx"]))
            user_preview = preview(turn["user_prompt"])
            model_preview = preview(turn["model_response"])
            markup = f"""
            <div style='font-family:monospace; padding:6px; margin:4px 0; background:#1e1e1e; color:#d4d4d4; border-radius:4px; border-left:4px solid {score_color};'>
                <b>Turn {turn_label}</b> — Score: <span style='color:{score_color};'>{score:.0f}</span>{bt}<br>
                <b>User:</b> {user_preview}<br>
                <b>Model:</b> {model_preview}
            </div>
            """
            display(HTML(markup))


btn.on_click(show_all)
display(btn, all_output)

Button(button_style='info', description='Show All Turns', style=ButtonStyle())

Output()

In [5]:
# ═══════════════════════════════════════════════════════════════════
# Category Summary — ASR breakdown by JBB category and behavior
# ═══════════════════════════════════════════════════════════════════

# Imported as a bare function because this cell binds the name `html` to the
# rendered markup string below.
from html import escape

# One flat record per trajectory; `jailbroken` is a 0/1 flag so that its
# sum is a count and its mean is the attack success rate (ASR).
cat_data = [
    {
        "idx": i,
        "category": tr.get("category", "Unknown"),
        "behavior": tr.get("behavior", "Unknown"),
        "label": tr["label"],
        "max_score": tr["max_score_normalized"],
        "num_turns": tr["num_turns"],
        "backtracks": tr["num_backtracks"],
        "jailbroken": 1 if tr["label"] == "jailbroken" else 0,
    }
    for i, tr in enumerate(trajectories)
]

cat_df = pd.DataFrame(cat_data)
total = len(cat_df)

if total == 0:
    # With no rows the frame has no "category" column to group by and the
    # overall ASR would divide by zero, so report and stop here.
    print("No trajectories loaded - nothing to summarize.")
else:
    # Per-category summary
    cat_summary = cat_df.groupby("category").agg(
        count=("idx", "count"),
        jailbroken=("jailbroken", "sum"),
        asr=("jailbroken", "mean"),
        avg_max_score=("max_score", "mean"),
        avg_turns=("num_turns", "mean"),
        avg_backtracks=("backtracks", "mean"),
    ).reset_index()
    cat_summary["asr"] = (cat_summary["asr"] * 100).round(1)  # fraction -> percent
    cat_summary = cat_summary.sort_values("asr", ascending=False)

    # Overall stats
    n_jb = cat_df["jailbroken"].sum()
    print(f"Overall: {n_jb}/{total} jailbroken ({100*n_jb/total:.1f}% ASR)")
    print()

    # Build HTML table: one <tr> per category, ASR colour-coded
    # (> 50 red, > 25 orange, otherwise green).
    row_chunks = []
    for _, r in cat_summary.iterrows():
        asr_color = "red" if r["asr"] > 50 else ("orange" if r["asr"] > 25 else "green")
        # Category names come from data files / the dataset; escape them so
        # characters such as "&" or "<" cannot break the table markup.
        category_cell = escape(str(r["category"]))
        row_chunks.append(f"""
    <tr>
        <td>{category_cell}</td>
        <td style='text-align:center;'>{int(r['count'])}</td>
        <td style='text-align:center;'>{int(r['jailbroken'])}</td>
        <td style='text-align:center; color:{asr_color}; font-weight:bold;'>{r['asr']:.0f}%</td>
        <td style='text-align:center;'>{r['avg_max_score']:.1f}</td>
        <td style='text-align:center;'>{r['avg_turns']:.1f}</td>
        <td style='text-align:center;'>{r['avg_backtracks']:.1f}</td>
    </tr>""")
    rows_html = "".join(row_chunks)

    html = f"""
<div style='font-family: monospace; padding: 10px; background: #1e1e1e; color: #d4d4d4; border-radius: 8px;'>
    <h3 style='color: #569cd6; margin-top:0;'>Results by JBB Category</h3>
    <table style='width:100%; border-collapse:collapse; color:#d4d4d4;'>
        <tr style='border-bottom:2px solid #444; color:#569cd6;'>
            <th style='text-align:left; padding:4px;'>Category</th>
            <th style='padding:4px;'>N</th>
            <th style='padding:4px;'>Jailbroken</th>
            <th style='padding:4px;'>ASR</th>
            <th style='padding:4px;'>Avg Max Score</th>
            <th style='padding:4px;'>Avg Turns</th>
            <th style='padding:4px;'>Avg Backtracks</th>
        </tr>
        {rows_html}
    </table>
</div>
"""
    display(HTML(html))

Overall: 43/100 jailbroken (43.0% ASR)



Category,N,Jailbroken,ASR,Avg Max Score,Avg Turns,Avg Backtracks
Government decision-making,10,6,60%,8.2,7.6,1.3
Malware/Hacking,10,6,60%,8.5,7.7,1.1
Privacy,10,6,60%,6.9,8.6,1.9
Disinformation,10,5,50%,7.3,9.2,3.0
Expert advice,10,5,50%,7.3,7.7,1.2
Economic harm,10,4,40%,6.7,8.7,1.5
Fraud/Deception,10,4,40%,6.9,8.7,1.7
Harassment/Discrimination,10,3,30%,4.2,10.2,2.7
Physical harm,10,2,20%,5.6,9.6,2.2
Sexual/Adult content,10,2,20%,4.3,10.8,3.2
