In [None]:
# Core
import os, json, glob, hashlib, math, gc
from pathlib import Path
from typing import Any, Dict, List, Iterable, Tuple

# Data / math
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl

# ML
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils import resample

# Stats
from scipy.stats import pearsonr, spearmanr

# HF
import torch
from transformers import AutoTokenizer, AutoModel  # base model to get hidden_states

# Style
# Global matplotlib style applied to every figure created after this cell runs.
mpl.rcParams.update({
    "figure.dpi": 120,
    "axes.spines.top": False, "axes.spines.right": False,
    "axes.grid": True, "grid.alpha": 0.22,
    "axes.titleweight": "bold", "axes.titlesize": 13,
    "axes.labelsize": 12, "legend.frameon": False, "font.size": 11,
})

# Seeded NumPy generator.
# NOTE(review): no live code in the visible notebook reads RNG (the sklearn calls pass
# random_state=42 directly) - confirm before relying on it or removing it.
RNG = np.random.RandomState(42)
# NOTE(review): the visible cells move tensors to `device` (taken from the loaded model's
# parameters), not to DEVICE - confirm whether this constant is still needed.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

HF_MODEL_ID = "Qwen/Qwen3-8B"   # use your exact HF id here

# === Embedding layer aggregation config ===
# These four values are passed to _aggregate_token_layers by embed_rendered_prompts.
EMB_MODE = "mid_k"          # options: "last_k" | "mid_k" | "layer_ids"

LAST_K = 4                  # used if EMB_MODE == "last_k"
MID_K = 6                   # how many middle layers to average (used if EMB_MODE == "mid_k")
MID_CENTER_FRAC = 0.50      # center of the window as a fraction in [0,1], 0.5 = exact middle

LAYER_IDS = [10,11,12]      # used if EMB_MODE == "layer_ids" (zero-based model layers)
                            # NOTE: hidden_states[0] is the embedding table output,
                            #       model layers are hidden_states[1..N], so layer i -> hs[i+1]

# NOTE(review): LAYER_AVG_LAST_K is not read by any live code in the visible notebook
# (LAST_K is the value handed to _aggregate_token_layers) - confirm it can be dropped.
LAYER_AVG_LAST_K = 4             # average last-k hidden layers
MAX_TOK_LEN = 2048               # max_length passed to the tokenizer (truncation=True)
BATCH_EMB = 4                    # small embedding batch to limit memory use (HF_MODEL_ID above is an 8B model)
SAVE_DIR = "./artifacts_svm"     # where embeddings, SVM weights, parquet and csv files are written
os.makedirs(SAVE_DIR, exist_ok=True)


In [None]:
def _aggregate_token_layers(hs, emb_mode="mid_k",
                            last_k=4, mid_k=6, mid_center_frac=0.5, layer_ids=None):
    """
    Average a selection of transformer layers into one per-token embedding.

    Parameters
    ----------
    hs : sequence of tensors
        hidden_states as returned by HF (len = n_layers + 1; hs[0] = embeddings),
        each element indexed as [B,T,H].
    emb_mode : str
        "last_k"    -> the last `last_k` model layers.
        "mid_k"     -> a window of `mid_k` consecutive layers centred at
                       `mid_center_frac` of the depth.
        "layer_ids" -> exactly the zero-based model layers in `layer_ids`
                       (ids outside [0, n_layers) are ignored).
    last_k, mid_k : int
        Window sizes; clamped to [1, n_layers].
    mid_center_frac : float
        Centre of the "mid_k" window as a fraction of the depth (0.5 = middle).
    layer_ids : list[int] | None
        Explicit layer selection for "layer_ids" mode.

    Returns
    -------
    (token_emb, idx_model)
        token_emb: mean of the selected layers, [B,T,H];
        idx_model: the zero-based model layer indices that were used.

    Raises
    ------
    ValueError
        If `hs` holds no model layers, `emb_mode` is unknown, or `layer_ids`
        selects nothing.
    """
    n_layers = len(hs) - 1                 # exclude hs[0] (input embeddings)
    if n_layers <= 0:
        raise ValueError("No model layers found in hidden_states")

    if emb_mode == "last_k":
        k = max(1, min(last_k, n_layers))
        idx_model = list(range(n_layers - k, n_layers))             # 0-based in [0..n_layers-1]
    elif emb_mode == "mid_k":
        k = max(1, min(mid_k, n_layers))
        center = int(round(mid_center_frac * (n_layers - 1)))       # 0..n_layers-1
        # Clamp the window start on BOTH sides so the window always holds exactly k
        # layers. Previously only the lower side was clamped, so a centre near the
        # top of the stack silently averaged fewer than k layers.
        start = min(max(0, center - k // 2), n_layers - k)
        idx_model = list(range(start, start + k))
    elif emb_mode == "layer_ids":
        ids = layer_ids or []
        idx_model = [i for i in ids if 0 <= i < n_layers]
        if len(idx_model) == 0:
            raise ValueError("LAYER_IDS produced an empty selection")
    else:
        raise ValueError(f"Unknown emb_mode: {emb_mode}")

    # Map model-layer index -> hidden_states index (+1)
    tensors = [hs[i+1] for i in idx_model]                          # each [B,T,H]
    if len(tensors) == 1:
        return tensors[0], idx_model

    # When the model is sharded across devices the selected layers can live on
    # different devices; torch.stack needs them on one, so gather on the first.
    target_device = tensors[0].device
    tensors = [t if t.device == target_device else t.to(target_device) for t in tensors]
    token_emb = torch.stack(tensors, 0).mean(0)
    return token_emb, idx_model


In [None]:
# Load the tokenizer for HF_MODEL_ID. If the checkpoint defines no pad token, reuse the EOS
# token for padding; batches are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the model via AutoModel (used to obtain hidden_states) and switch it to eval mode.
# Weights are bfloat16 when CUDA is available, float32 otherwise.
model = AutoModel.from_pretrained(
    HF_MODEL_ID,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
).eval()

# Device of the model's first parameter; render_and_embed_messages_batch moves inputs here.
# NOTE(review): with device_map="auto" the weights may be placed on more than one device,
# in which case this is only where the first parameter lives - confirm for multi-GPU runs.
device = next(model.parameters()).device
print("Loaded:", HF_MODEL_ID, "on", device)


In [None]:
def _expand_paths(paths_or_globs: List[str]) -> List[str]:
    """
    Resolve a mix of literal paths and glob patterns to existing paths.

    Entries containing any of ``* ? [ ]`` are expanded with ``glob.glob``
    (``recursive=True``); all other entries are taken literally. The result
    keeps the input order and drops anything that does not exist on disk.
    """
    wildcard_chars = set("*?[]")
    candidates: List[str] = []
    for entry in paths_or_globs:
        if wildcard_chars.intersection(entry):
            candidates += glob.glob(entry, recursive=True)
        else:
            candidates.append(entry)
    return list(filter(lambda candidate: Path(candidate).exists(), candidates))

def _iter_behavior_like(obj: Any):
    """
    Walk a nested JSON-like structure and yield behavior dictionaries.

    For every dict that has a dict under the key "behaviors", each dict-valued
    entry of that mapping is yielded. All dict values and list items are then
    searched recursively; other value types are ignored.
    """
    if isinstance(obj, list):
        children = obj
    elif isinstance(obj, dict):
        behaviors = obj.get("behaviors")
        if isinstance(behaviors, dict):
            yield from (entry for entry in behaviors.values() if isinstance(entry, dict))
        # keep scanning every value, including the "behaviors" mapping itself
        children = obj.values()
    else:
        return
    for child in children:
        yield from _iter_behavior_like(child)

def _rows_from_holder(holder: Dict[str, Any], behavior_number=None, behavior_text=None, source_path:str=""):
    """
    Yield one flat row dict per scored turn found in ``holder["conversation"]``.

    Parameters
    ----------
    holder : dict
        A strategy/behavior node; "set_number", "strategy_number" and
        "conversation" are read from it.
    behavior_number, behavior_text, source_path
        Copied verbatim into every row.

    Yields
    ------
    dict
        Keys: source_path, behavior_number, behavior_text, set_number,
        strategy_number, turn_idx_raw, score, reason, attacker, target,
        context_text.

    Notes
    -----
    * The score is the first truthy value among "evaluation_score",
      "eval_score" and ``evaluation["score"]``; turns without one are skipped.
    * ``turn_idx_raw`` is the file's "turn" value when present. When the key is
      missing or null, the turn's position in the conversation list is used, so
      file order survives the later sort on this column (previously every such
      turn got 0, and an explicit null raised TypeError).
    * Entries of the conversation list that are not dicts are skipped.
    """
    set_number = holder.get("set_number")
    strategy_number = holder.get("strategy_number")
    conv = holder.get("conversation", []) or []
    for pos, turn in enumerate(conv):
        if not isinstance(turn, dict):
            continue
        raw_turn = turn.get("turn")
        # may be 1-based in the file; callers normalize to a local 0-based index later
        t_idx = int(raw_turn) if raw_turn is not None else pos
        eva = turn.get("evaluation") if isinstance(turn.get("evaluation"), dict) else {}
        score = turn.get("evaluation_score") or turn.get("eval_score") or eva.get("score")
        reason = turn.get("evaluation_reason") or eva.get("reason")
        if score is None:
            continue
        attacker = (turn.get("attacker") or "").strip()
        target   = (turn.get("target")   or "").strip()
        row = {
            "source_path": source_path,
            "behavior_number": behavior_number,
            "behavior_text": behavior_text,
            "set_number": set_number,
            "strategy_number": strategy_number,
            "turn_idx_raw": t_idx,
            "score": int(score),
            "reason": reason,
            "attacker": attacker,
            "target": target,
            "context_text": f"Attacker: {attacker}\nTarget: {target}".strip(),
        }
        yield row

def load_scored_contexts(json_inputs: List[str]) -> pd.DataFrame:
    """
    Load scored conversation turns from JSON result files into one DataFrame.

    Parameters
    ----------
    json_inputs : list[str]
        Literal paths and/or glob patterns; resolved with ``_expand_paths``.

    Returns
    -------
    pd.DataFrame
        One row per scored turn (columns produced by ``_rows_from_holder``) plus:
        * group_id    - md5 of "source_path|behavior_number|set_number|strategy_number",
                        identifying one conversation thread;
        * turn_local0 / turn_local1 - 0-based / 1-based position of the turn inside
                        its group after sorting by turn_idx_raw;
        * label       - 0 for score in {1,2}, 1 for score in {4,5}, None otherwise.
        An empty frame (without these columns) is returned when nothing was loaded.
    """
    rows=[]
    for p in _expand_paths(json_inputs):
        with open(p, "r", encoding="utf-8") as f:
            data = json.load(f)
        # fall back to treating the whole document as a single behavior node
        behs = list(_iter_behavior_like(data)) or [data]
        for beh in behs:
            bnum = beh.get("behavior_number")
            beh_meta = beh.get("behavior")
            btxt = beh_meta.get("Behavior") if isinstance(beh_meta, dict) else None
            # either a non-empty strategies list, or the behavior itself holds the conversation
            strategies = beh.get("strategies")
            holders = strategies if isinstance(strategies, list) and strategies else [beh]
            for holder in holders:
                rows.extend(_rows_from_holder(holder, bnum, btxt, p))
    df = pd.DataFrame(rows)
    if df.empty:
        print("Loaded 0 rows.")
        return df

    # Group id = file+behavior+set+strategy to keep a single conversation thread
    def _gid(row):
        key = f"{row.get('source_path','')}|{row.get('behavior_number','')}|{row.get('set_number','')}|{row.get('strategy_number','')}"
        return hashlib.md5(key.encode()).hexdigest()
    df["group_id"] = df.apply(_gid, axis=1)

    # Normalize to a consistent local turn index within each group.
    # cumcount() numbers rows per group directly; the previous groupby(...).apply(assign)
    # depended on apply operating on the grouping column, which pandas has deprecated.
    df = df.sort_values(["group_id","turn_idx_raw"]).reset_index(drop=True)
    turn_pos = df.groupby("group_id").cumcount()
    df["turn_local0"] = turn_pos
    df["turn_local1"] = turn_pos + 1

    # final label buckets used elsewhere in the notebook
    df["label"] = np.select(
        [
            df["score"].isin([1,2]),
            df["score"].isin([4,5])
        ],
        [0, 1],  # 0=safe, 1=unsafe
        default=None
    )
    return df


In [None]:
# 1) Build messages list for a given group up to (and including) current attacker,
#    but EXCLUDING the assistant reply of that turn (so: “before it answers”).
def messages_for_turn(g: pd.DataFrame, idx_local0: int, system_text:str=None) -> List[Dict[str,str]]:
    """
    Build the chat-message list seen just before the reply to turn ``idx_local0``.

    The list is: an optional system message, then for every earlier row of ``g``
    (positions 0..idx_local0-1) its "attacker" text as a user message and its
    "target" text as an assistant message, and finally the current row's
    "attacker" text as a user message. Texts are stripped; empty ones are
    omitted. The current row's "target" is never included.
    """
    def _text(value):
        return (value or "").strip()

    history = [{"role":"system","content":system_text}] if system_text else []

    # completed turns: attacker -> user, target -> assistant
    for pos in range(idx_local0):
        past = g.iloc[pos]
        for role, column in (("user", "attacker"), ("assistant", "target")):
            content = _text(past[column])
            if content:
                history.append({"role": role, "content": content})

    # the prompt the model is about to answer
    current_prompt = _text(g.iloc[idx_local0]["attacker"])
    if current_prompt:
        history.append({"role":"user","content":current_prompt})

    return history

@torch.no_grad()
def embed_rendered_prompts(rendered_input_ids: torch.Tensor, attention_mask: torch.Tensor) -> np.ndarray:
    """
    Embed a padded batch of token ids as one vector per sequence.

    Runs the global ``model`` with ``output_hidden_states=True``, averages the
    layers selected by EMB_MODE / LAST_K / MID_K / MID_CENTER_FRAC / LAYER_IDS
    (see ``_aggregate_token_layers``), then mean-pools over the tokens whose
    attention-mask entry is non-zero. No L2-normalization is applied.

    Parameters
    ----------
    rendered_input_ids : torch.Tensor
        Token ids, indexed [B,T].
    attention_mask : torch.Tensor
        Same leading shape as the ids; non-zero marks tokens to pool over.

    Returns
    -------
    np.ndarray
        float32 array with one pooled embedding per input sequence.
    """
    out = model(input_ids=rendered_input_ids, attention_mask=attention_mask, output_hidden_states=True)

    token_emb, _used_layers = _aggregate_token_layers(
        out.hidden_states,
        emb_mode=EMB_MODE,
        last_k=LAST_K,
        mid_k=MID_K,
        mid_center_frac=MID_CENTER_FRAC,
        layer_ids=LAYER_IDS,
    )

    # Pool in float32: when the model runs in bfloat16 the masked sum and the
    # division would otherwise be rounded to bfloat16 before the final cast.
    token_emb = token_emb.float()
    # The selected layers may sit on a different device than the inputs when the
    # model is sharded, so align the mask with the hidden states.
    mask = attention_mask.unsqueeze(-1).to(device=token_emb.device, dtype=token_emb.dtype)
    sent = (token_emb * mask).sum(1) / mask.sum(1).clamp(min=1)
    return sent.cpu().numpy()

def render_and_embed_messages_batch(batch_msgs: List[List[Dict[str,str]]]) -> np.ndarray:
    """
    Render each message list with the tokenizer's chat template and embed it.

    Parameters
    ----------
    batch_msgs : list[list[dict]]
        One chat (list of {"role", "content"} dicts) per batch element.

    Returns
    -------
    np.ndarray
        Output of ``embed_rendered_prompts`` for the padded batch.

    Notes
    -----
    The attention mask comes from the tokenizer (``return_dict=True``) instead of
    being rebuilt as ``input_ids != pad_token_id``. The notebook falls back to
    ``pad_token = eos_token`` when no pad token exists, and in that case the
    comparison would also mask out genuine EOS tokens inside the rendered chat.
    """
    # NOTE(review): add_generation_prompt=False means the rendered text ends with the
    # last user message; confirm this matches how prompts are rendered at runtime.
    # NOTE(review): truncation follows the tokenizer's configured truncation side; if
    # that is the right side, conversations longer than MAX_TOK_LEN lose their most
    # recent tokens (the current user turn) - confirm the intended side.
    batch_inputs = tokenizer.apply_chat_template(
        batch_msgs,
        tokenize=True,
        add_generation_prompt=False,
        padding=True,
        truncation=True,
        max_length=MAX_TOK_LEN,
        return_tensors="pt",
        return_dict=True,
    )
    input_ids = batch_inputs["input_ids"].to(device)
    attn = batch_inputs["attention_mask"].to(device).long()
    return embed_rendered_prompts(input_ids, attn)


In [None]:
def compute_turn_context_embeddings(
    df: pd.DataFrame,
    system_text: str = None,
    allowed_scores: set = {1, 2,3, 4, 5},   # scores whose turns are embedded; the default includes 3
) -> pd.DataFrame:
    """
    Embed the pre-answer context of every turn whose score is in ``allowed_scores``.

    For each turn the context is built by ``messages_for_turn``:
      [system] + all (user,assistant) pairs up to t-1 + current user(t).
    Earlier turns are part of that history regardless of their own score; only
    the choice of which turns get an output row is filtered by ``allowed_scores``
    (``None`` disables the filter).

    Parameters
    ----------
    df : pd.DataFrame
        Must provide "group_id", "turn_local0", "score", "attacker" and "target".
    system_text : str, optional
        System message prepended to every context.
    allowed_scores : set or None
        Scores to embed. With the default every score 1..5 is embedded; pass
        e.g. {1,2,4,5} to keep score-3 turns in the history only.

    Returns
    -------
    pd.DataFrame
        The selected rows with an extra "emb" column holding one vector per row.
        Empty if nothing was selected.

    Side effects
    ------------
    When at least one row was embedded, writes "turn_context_embeddings.npy"
    (stacked embeddings) and "turn_context_meta.parquet" (all other columns)
    into SAVE_DIR and prints both paths.
    """
    out_rows = []

    # Batches never span conversations: each group is processed and flushed on its own.
    for gid, g in df.groupby("group_id", sort=False):
        g = g.sort_values("turn_local0").reset_index(drop=True)

        pending_msgs, pending_idxs = [], []
        for i in range(len(g)):
            msgs = messages_for_turn(g, i, system_text=system_text)

            # enqueue only if this turn's score is allowed
            if (allowed_scores is None) or (g.at[i, "score"] in allowed_scores):
                pending_msgs.append(msgs)
                pending_idxs.append(i)

            # flush when the batch is full or the group's last turn has been reached
            if pending_msgs and (len(pending_msgs) == BATCH_EMB or i == len(g) - 1):
                vecs = render_and_embed_messages_batch(pending_msgs)  # one vector per pending context
                for j, vec in zip(pending_idxs, vecs):
                    row = g.iloc[j].to_dict()
                    row["emb"] = vec
                    out_rows.append(row)
                pending_msgs.clear()
                pending_idxs.clear()

    df_emb = pd.DataFrame(out_rows)
    if df_emb.empty:
        print("[warn] No rows embedded (did your data have only score==3?)")
        return df_emb

    # Persist embeddings and metadata separately; row i of the .npy matches row i of the parquet.
    E = np.stack(df_emb["emb"].to_numpy(), axis=0)
    np.save(Path(SAVE_DIR) / "turn_context_embeddings.npy", E)
    meta_cols = [c for c in df_emb.columns if c != "emb"]
    df_emb[meta_cols].to_parquet(Path(SAVE_DIR) / "turn_context_meta.parquet", index=False)
    print(f"Saved: {E.shape} → {Path(SAVE_DIR) / 'turn_context_embeddings.npy'}")
    print(f"Saved meta → {Path(SAVE_DIR) / 'turn_context_meta.parquet'}")
    return df_emb


# ---- Load your files and compute embeddings ----
# Result files to load (literal paths or glob patterns).
# NOTE(review): these are absolute, machine-specific paths; consider deriving them
# from a configurable base directory.
JSON_INPUTS = [
    "/storage/users/visionintelligence/Nivya/x-teaming/attacks/2025-08-14_01-31-41_HINDI_50engStrategy/all_results.json",
    "/storage/users/visionintelligence/Nivya/x-teaming/attacks/2025-08-13_13-43-15_FRENCH_50engStrategy/all_results.json",
    "/storage/users/visionintelligence/Nivya/x-teaming/attacks/2025-08-15_02-58-29/all_results.json",
    "/storage/users/visionintelligence/Nivya/x-teaming/attacks/2025-08-22_10-54-11/all_results.json",
    "/storage/users/visionintelligence/Nivya/x-teaming/attacks/Hindi_attackThinkTrue_2025-08-27_06-46-57/all_results.json",
    "/storage/users/visionintelligence/Nivya/x-teaming/attacks/French_attackThinkTrue_2025-08-27_02-24-42/all_results.json"
    # e.g., your Hindi_50engStrategy files, etc.
    # "/another/path/all_results.json",
]
df_all = load_scored_contexts(JSON_INPUTS)
print("Loaded rows:", len(df_all))
if df_all.empty:
    # An empty frame has no turn_local0/group_id columns; fail with a clear message
    # instead of a KeyError a few lines below.
    raise RuntimeError("No scored turns were loaded; check that the JSON_INPUTS paths exist.")

# 1) Identify conversations whose first turn has an empty attacker prompt
suspect = df_all[(df_all["turn_local0"]==0) & (df_all["attacker"].fillna("").str.strip()=="")]
bad_gids = suspect["group_id"].unique().tolist()
print("Fixing groups:", bad_gids)

df_all = df_all[df_all["score"].isin([1, 2, 3, 4, 5])].copy()
# 2) Drop the offending first turn(s)
df_all = df_all[~((df_all["group_id"].isin(bad_gids)) & (df_all["turn_local0"]==0))].copy()

# 3) Recompute 0-based / 1-based local indices so trajectories start at 0 again.
#    cumcount() numbers the rows of each group directly; the previous
#    groupby(...).apply(assign) relied on apply seeing the grouping column,
#    which pandas has deprecated.
df_all = (df_all.sort_values(["group_id","turn_local0"])
                 .reset_index(drop=True)
                 .assign(turn_local0=lambda d: d.groupby("group_id").cumcount())
                 .assign(turn_local1=lambda d: d["turn_local0"] + 1))

print("After drop+reindex:", len(df_all))

df_emb = compute_turn_context_embeddings(df_all,allowed_scores={1,2,3,4,5}, system_text=None)  # add system text if you use one at runtime


In [None]:
# Keep scores 1,2,3,4,5 but use label: 0 for {1,2}, 1 for {4,5}; 3 is “neutral/other”, keep for analysis but not for SVM.
# Full turn table (all embedded scores); its existing "label" column is reused as-is.
df_train = df_emb.copy()


# SVM training pool: only turns scored 1, 2, 4 or 5 (score 3 stays in df_train for later analysis).
df_svm = df_train[df_train["score"].isin([1,2,4,5])].copy()

# Balance (downsample the majority) IF you want
def downsample_to_balance(df_lbl: pd.DataFrame) -> pd.DataFrame:
    """
    Return a class-balanced copy of ``df_lbl`` by downsampling without replacement.

    Rows with label 0 and rows with label 1 are each sampled (random_state=42)
    down to the size of the smaller class and concatenated, label-0 rows first,
    with a fresh index. If either class is empty the input is returned unchanged.
    """
    by_class = [df_lbl[df_lbl["label"]==cls] for cls in (0, 1)]
    sizes = [len(part) for part in by_class]
    if 0 in sizes:
        return df_lbl
    target = min(sizes)
    sampled = [
        resample(part, replace=False, n_samples=target, random_state=42)
        for part in by_class
    ]
    return pd.concat(sampled, ignore_index=True)

# Toggle: train on a class-balanced subset (True) or on every 1/2/4/5 row (False).
USE_DOWNSAMPLE = True
df_svm_bal = downsample_to_balance(df_svm) if USE_DOWNSAMPLE else df_svm

# Build X (stacked raw embeddings, one row per turn) and y (integer labels)
X = np.stack(df_svm_bal["emb"].to_numpy(), axis=0)  # no normalization, per your request
y = df_svm_bal["label"].astype(int).to_numpy()

print("SVM train shape:", X.shape, "positives:", (y==1).sum(), "negatives:", (y==0).sum())


In [None]:
# Fit a linear SVM on the (optionally balanced) training matrix built above.
svm = LinearSVC(C=1.0, class_weight="balanced", random_state=42)
svm.fit(X, y)

# Score EVERY embedded turn (including score-3 rows that were not used for training)
# and attach the margin to a copy of the full turn table for plotting.
X_all = np.stack(df_train["emb"].to_numpy(), axis=0)
margins_all = svm.decision_function(X_all)  # signed distance (up to scale)
df_plot = df_train.copy()
df_plot["margin"] = margins_all

# Save the weight vector, the intercept and the annotated table for reuse
np.save(Path(SAVE_DIR)/"svm_w.npy", svm.coef_.astype(np.float32))
np.save(Path(SAVE_DIR)/"svm_b.npy", np.array([svm.intercept_[0]], dtype=np.float32))
df_plot.to_parquet(Path(SAVE_DIR)/"df_plot_with_margins.parquet", index=False)

print("Saved SVM & df_plot with margins into", SAVE_DIR)


In [None]:
# Metrics on the training set (since you asked “use all samples”)
# Hard predictions at threshold 0. Note the strict ">" here; the threshold-sweep
# cell further down uses ">=" when comparing margins with tau.
pred_all = (margins_all > 0).astype(int)

# Only evaluate classification on 1/2 vs 4/5 rows.
# These are metrics on data that includes the SVM's own training rows, not held-out data.
mask_svm_eval = df_plot["score"].isin([1,2,4,5])
print("Accuracy:", (pred_all[mask_svm_eval]==df_plot.loc[mask_svm_eval,"label"].to_numpy()).mean())
try:
    print("ROC-AUC:", roc_auc_score(df_plot.loc[mask_svm_eval,"label"].to_numpy(),
                                    df_plot.loc[mask_svm_eval,"margin"].to_numpy()))
except Exception as e:
    # Any failure is reported and the cell keeps running.
    print("ROC-AUC error:", e)

print("\nClassification report:\n",
      classification_report(df_plot.loc[mask_svm_eval,"label"].to_numpy(),
                            pred_all[mask_svm_eval], digits=3))

# === Turn-level correlation: margin vs score (1..5), all rows including score 3 ===
pear = pearsonr(df_plot["margin"], df_plot["score"])
spear = spearmanr(df_plot["margin"], df_plot["score"])
print(f"\nPearson(margin, score): {pear.statistic:.3f} (p={pear.pvalue:.2g})")
print(f"Spearman(margin, score): {spear.correlation:.3f} (p={spear.pvalue:.2g})")

# (Optional) final-turn-only correlation: the last row of each group in turn_local0 order
final_rows = (df_plot.sort_values(["group_id","turn_local0"])
                    .groupby("group_id", as_index=False).tail(1))
pear_f = pearsonr(final_rows["margin"], final_rows["score"])
spear_f = spearmanr(final_rows["margin"], final_rows["score"])
print(f"\n[Final turn] Pearson: {pear_f.statistic:.3f} (p={pear_f.pvalue:.2g})")
print(f"[Final turn] Spearman: {spear_f.correlation:.3f} (p={spear_f.pvalue:.2g})")


In [None]:
# ===== Cell I — Core visualizations from df_plot =====
# Margin thresholds drawn and evaluated in this cell.
# NOTE(review): the same two constants are re-assigned in later cells; keep the values in sync.
TAU_EARLY = 0.20
TAU_BLOCK = 0.40

# Cell-local imports. roc_curve, auc, precision_recall_curve, average_precision_score and
# precision_recall_fscore_support are not imported by the notebook's first import cell.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, average_precision_score,
    precision_recall_fscore_support, confusion_matrix
)
from scipy.stats import pearsonr

# Fail early if df_plot is missing a column the plots below read.
assert {"group_id","turn_local0","score","label","margin"}.issubset(df_plot.columns), \
    "df_plot must have: group_id, turn_local0, score, label, margin"

# ---------------------------------------------
# 1) Histogram: margins by class (safe vs unsafe)
# ---------------------------------------------
# Restrict to rows with a binary label (scores 1/2 vs 4/5) and split margins by class.
mask_eval = df_plot["score"].isin([1,2,4,5])  # strict eval on 1/2 vs 4/5
safe   = df_plot.loc[(df_plot["label"]==0) & mask_eval, "margin"].to_numpy()
unsafe = df_plot.loc[(df_plot["label"]==1) & mask_eval, "margin"].to_numpy()

# Overlaid density histograms with the three thresholds marked as vertical lines.
plt.figure(figsize=(6.6,4.2))
plt.hist(safe,   bins=40, alpha=0.65, label="safe (1/2)",   density=True)
plt.hist(unsafe, bins=40, alpha=0.65, label="unsafe (4/5)", density=True)
plt.axvline(0.0,        ls="--", lw=1, color="k", label="τ=0")
plt.axvline(TAU_EARLY,  ls="--", lw=1, label=f"τ_early={TAU_EARLY:.2f}")
plt.axvline(TAU_BLOCK,  ls="--", lw=1, label=f"τ_block={TAU_BLOCK:.2f}")
plt.title("Margin distribution (pre-answer, chat-templated)")
plt.xlabel("SVM margin (w·x + b)")
plt.ylabel("density")
plt.legend(); plt.tight_layout(); plt.show()

# ---------------------------------------------
# 2) ROC & PR curves
# ---------------------------------------------
# Binary ground truth and continuous scores (SVM margins) for the 1/2 vs 4/5 rows.
# y_true and scores are reused by the threshold sweep and confusion-matrix readouts below.
y_true = df_plot.loc[mask_eval, "label"].to_numpy().astype(int)
scores = df_plot.loc[mask_eval, "margin"].to_numpy()

fpr, tpr, thr = roc_curve(y_true, scores)
roc_auc = auc(fpr, tpr)

prec, rec, thr_pr = precision_recall_curve(y_true, scores)
ap = average_precision_score(y_true, scores)

# ROC curve
plt.figure(figsize=(6.2,4.2))
plt.plot(fpr, tpr, lw=2)
plt.title(f"ROC (AUC={roc_auc:.3f})")
plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
plt.grid(alpha=.25); plt.tight_layout(); plt.show()

# Precision-recall curve
plt.figure(figsize=(6.2,4.2))
plt.plot(rec, prec, lw=2)
plt.title(f"Precision–Recall (AP={ap:.3f})")
plt.xlabel("Recall"); plt.ylabel("Precision")
plt.grid(alpha=.25); plt.tight_layout(); plt.show()

# ---------------------------------------------
# 3) Threshold sweep: Precision / Recall / F1 vs τ
# ---------------------------------------------
# Candidate thresholds: 60 percentiles (1st..99th) of the observed margins, de-duplicated.
taus = np.unique(np.percentile(scores, np.linspace(1, 99, 60)))
P,R,F = [],[],[]
for t in taus:
    # a turn is predicted unsafe when its margin is at or above the threshold
    yhat = (scores >= t).astype(int)
    p_, r_, f_, _ = precision_recall_fscore_support(y_true, yhat, average="binary", zero_division=0)
    P.append(p_); R.append(r_); F.append(f_)

plt.figure(figsize=(7.6,4.6))
plt.plot(taus, P, label="Precision", lw=2)
plt.plot(taus, R, label="Recall",    lw=2)
plt.plot(taus, F, label="F1",        lw=2)
# mark the three reference thresholds
for v,lab in [(0.0,"τ=0"), (TAU_EARLY,f"τ_early={TAU_EARLY:.2f}"), (TAU_BLOCK,f"τ_block={TAU_BLOCK:.2f}")]:
    plt.axvline(v, ls="--", lw=1, label=lab)
plt.title("Precision / Recall / F1 vs τ")
plt.xlabel("τ (threshold on margin)"); plt.ylabel("score")
plt.ylim(0,1.02); plt.legend(); plt.grid(alpha=.25); plt.tight_layout(); plt.show()

# ---------------------------------------------
# 4) Confusion matrix helper and readouts at key τ's
# ---------------------------------------------
def cm_at_tau(scores, labels, tau):
    """
    Confusion-matrix counts and summary metrics for the rule ``score >= tau``.

    Returns a dict with keys tau, TN, FP, FN, TP, precision, recall, f1, FPR,
    TPR and accuracy. Precision/recall/F1 come from
    ``precision_recall_fscore_support(average="binary", zero_division=0)``;
    FPR and accuracy add 1e-9 to their denominators to avoid division by zero.
    """
    predicted = (scores >= tau).astype(int)
    # ravel() of the 2x2 matrix [[TN,FP],[FN,TP]] gives the counts in this order
    tn, fp, fn, tp = confusion_matrix(labels, predicted, labels=[0,1]).ravel()
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0
    )
    negatives = fp + tn
    total = tn + fp + fn + tp
    return {
        "tau": tau,
        "TN": int(tn),
        "FP": int(fp),
        "FN": int(fn),
        "TP": int(tp),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "FPR": float(fp / (negatives + 1e-9)),
        "TPR": float(recall),
        "accuracy": float((tn+tp) / (total + 1e-9)),
    }

# Print one summary line (counts + metrics) for each of the three reference thresholds.
for t in [0.0, TAU_EARLY, TAU_BLOCK]:
    print("—"*60)
    stats = cm_at_tau(scores, y_true, t)
    print(f"τ={stats['tau']:.3f}  "
          f"TN={stats['TN']} FP={stats['FP']}  FN={stats['FN']} TP={stats['TP']}  "
          f"Acc={stats['accuracy']:.3f}  P={stats['precision']:.3f}  R={stats['recall']:.3f}  F1={stats['f1']:.3f}  "
          f"FPR={stats['FPR']:.3f}")

# ---------------------------------------------
# 5) Pearson heatmaps (turn-level & final-turn)
# ---------------------------------------------
def corr_heatmap(df_num: pd.DataFrame, title: str):
    """
    Draw an annotated heatmap of the pairwise Pearson correlations of ``df_num``.

    Parameters
    ----------
    df_num : pd.DataFrame
        Columns to correlate (passed to ``DataFrame.corr(method="pearson")``).
    title : str
        Figure title.

    Shows the figure; returns nothing.
    """
    C = df_num.corr(method="pearson")
    fig, ax = plt.subplots(figsize=(4.8,4.2))
    # fixed colour range so different heatmaps are comparable
    im = ax.imshow(C.values, vmin=-1, vmax=1, cmap="coolwarm")
    ax.set_xticks(range(len(C.columns))); ax.set_xticklabels(C.columns, rotation=45, ha="right")
    ax.set_yticks(range(len(C.index)));   ax.set_yticklabels(C.index)
    # write each coefficient into its cell
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            ax.text(j, i, f"{C.values[i,j]:.2f}", ha="center", va="center")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="r")
    ax.set_title(title, fontweight="bold"); fig.tight_layout(); plt.show()

# Turn-level
# Heatmap over every row of df_plot (all scores).
corr_heatmap(df_plot[["margin","score","label"]].astype(float), "Turn-level Pearson")

# Final-turn only (one row per conversation): last row of each group in turn_local0 order
final_rows = (df_plot.sort_values(["group_id","turn_local0"])
                      .groupby("group_id", as_index=False).tail(1))
corr_heatmap(final_rows[["margin","score","label"]].astype(float), "Final-turn Pearson")

# Also print r and p for margin vs score on both subsets
r_all, p_all = pearsonr(df_plot["margin"].astype(float), df_plot["score"].astype(float))
r_fin, p_fin = pearsonr(final_rows["margin"].astype(float), final_rows["score"].astype(float))
print(f"\nPearson(margin, score) — turn-level: r={r_all:.3f}, p={p_all:.3g}, n={len(df_plot)}")
print(f"Pearson(margin, score) — final-turn: r={r_fin:.3f}, p={p_fin:.3g}, n={len(final_rows)}")


In [None]:
# Margin thresholds passed to the per-conversation plots in this cell.
# NOTE(review): the same constants are assigned in other cells too; keep the values in sync.
TAU_EARLY = 0.20
TAU_BLOCK = 0.40

def plot_one_jb_conversation(df, gid, tau_early=0.20, tau_block=0.40, use_one_based=False, ylims=None):
    """
    Plot the SVM margin trajectory of a single conversation.

    Parameters
    ----------
    df : pd.DataFrame
        Needs "group_id", "turn_local0", "turn_local1", "margin" and "score".
    gid : str
        group_id of the conversation to plot; nothing is drawn if it has no rows.
    tau_early, tau_block : float
        Thresholds drawn as dashed lines and shaded bands.
    use_one_based : bool
        Use turn_local1 instead of turn_local0 on the x-axis.
    ylims : tuple(float, float) or None
        Fixed y-limits; when None they are derived from this conversation's margins.

    Shows the figure; returns nothing.
    """
    g = (df[df["group_id"]==gid]
         .sort_values("turn_local0")
         .reset_index(drop=True))
    if g.empty: return
    x = g["turn_local1"].to_numpy() if use_one_based else g["turn_local0"].to_numpy()
    y = g["margin"].to_numpy()
    s = g["score"].to_numpy()

    if ylims is None:
        # pad relative to the margin range, with a floor so flat trajectories stay visible
        pad = max(y.max()-y.min(), 0.05)
        ylo = y.min() - 0.10*pad; yhi = y.max() + 0.15*pad
    else:
        ylo, yhi = ylims

    fig, ax = plt.subplots(figsize=(10, 2.8))
    # shaded bands from each threshold up to the top of the axes
    ax.axhspan(tau_early, yhi, color="#ff7f0e", alpha=0.10, label=f"unsafe ≥ {tau_early:.2f}")
    ax.axhspan(tau_block, yhi, color="#d62728", alpha=0.12, label=f"block ≥ {tau_block:.2f}")
    ax.plot(x, y, lw=2.2, marker="o", markersize=5)

    # annotate every point with its margin and judge score
    for xi, yi, si in zip(x, y, s):
        ax.annotate(f"{yi:+.2f} (s={si})", (xi, yi),
                    textcoords="offset points", xytext=(0, 8),
                    ha="center", va="bottom", fontsize=10)

    ax.axhline(0.0, ls="--", lw=1, color="k", alpha=0.6)
    ax.axhline(tau_early, ls="--", lw=1, alpha=0.6)
    ax.axhline(tau_block, ls="--", lw=1, alpha=0.6)
    ax.set_ylim(ylo, yhi)
    ax.set_xlim(x.min()-0.1, x.max()+0.1)
    ax.set_xticks(np.arange(x.min(), x.max()+1, 1))
    ax.set_xlabel("turn index" + (" (1-based)" if use_one_based else " (0-based)"))
    ax.set_ylabel("SVM margin (w·x + b)")
    ax.set_title(f"{gid[:12]}…   start {y[0]:+.2f} → end {y[-1]:+.2f}")
    fig.tight_layout(); plt.show()

# same global y-limits across all JB conversations
# Conversations that contain at least one turn scored 5 (not only those ending on a 5).
jb_gids = (df_plot[df_plot["score"]==5]["group_id"].unique().tolist())
if jb_gids:
    # shared y-limits computed over the margins of all selected conversations
    all_m = np.concatenate([df_plot.loc[df_plot["group_id"]==g, "margin"].to_numpy() for g in jb_gids])
    pad = max(all_m.max()-all_m.min(), 0.05)
    YLIMS = (all_m.min()-0.10*pad, all_m.max()+0.15*pad)
else:
    YLIMS = None

# one figure per selected conversation
for gid in jb_gids:
    plot_one_jb_conversation(df_plot, gid, tau_early=TAU_EARLY, tau_block=TAU_BLOCK,
                             use_one_based=False, ylims=YLIMS)


In [None]:
import numpy as np
import pandas as pd

# Assumes df_plot has: group_id, turn_idx (or turn_local0), margin, score, label
# Column used to order turns: "turn_idx" when df_plot has it, otherwise "turn_local0".
turn_col = "turn_idx" if "turn_idx" in df_plot.columns else "turn_local0"

# Final turn per conversation
last = (df_plot.sort_values(["group_id", turn_col])
                .groupby("group_id", as_index=False).tail(1)
                .reset_index(drop=True))

# Focus on jailbreak conversations (final score = 5)
jb = last[last["score"] == 5].copy()
N = len(jb)
# NOTE(review): when N is 0 the fractions below are NaN; no guard is applied here.

# Pick your thresholds
TAU_EARLY = 0.20
TAU_BLOCK = 0.40

# A1) Final unsafe but margin < 0 (on the "safe" side of the plane)
a1 = (jb["margin"] < 0).mean()

# A2) Final unsafe but margin < TAU_EARLY (below early-warning)
a2 = (jb["margin"] < TAU_EARLY).mean()

# A3) Final unsafe but margin < TAU_BLOCK (below block threshold)
a3 = (jb["margin"] < TAU_BLOCK).mean()

# A4) "Downward" trajectories: first-turn margin exceeds last-turn margin by at least 0.10.
#     start/end are computed for every group, then restricted to the jailbreak groups.
deltas = (
    df_plot.sort_values(["group_id", turn_col])
           .groupby("group_id")
           .agg(start=("margin","first"), end=("margin","last"))
           .reset_index()
)
deltas = deltas.merge(jb[["group_id"]], on="group_id", how="inner")
a4 = (deltas["start"] - deltas["end"] >= 0.10).mean()

print(f"Total jailbreak convos (final score=5): {N}")
print(f"A1  final margin < 0              : {a1:.2%}")
print(f"A2  final margin < τ_early {TAU_EARLY:.2f}: {a2:.2%}")
print(f"A3  final margin < τ_block {TAU_BLOCK:.2f}: {a3:.2%}")
print(f"A4  start→end decreased ≥ 0.10     : {a4:.2%}")


In [None]:
# ---- One true place to compute Pearson on exactly-defined subsets ----
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Fail early if df_plot lacks a column used by this cell.
assert {"margin","score","group_id","turn_local0"}.issubset(df_plot.columns)

def pearson_from_mask(mask, title):
    """
    Print and plot the Pearson correlation between margin and score on a row subset.

    Parameters
    ----------
    mask : boolean indexer
        Selects rows of the global ``df_plot``; rows with NaN in either column
        are dropped after selection.
    title : str
        Used as the printed prefix and the figure title.

    Returns
    -------
    (r, p)
        Correlation coefficient and p-value from ``scipy.stats.pearsonr``.
    """
    sub = df_plot.loc[mask, ["margin","score"]].astype(float).dropna()
    x, y = sub["margin"].to_numpy(), sub["score"].to_numpy()
    r, p = pearsonr(x, y)
    print(f"{title}: r={r:.3f} (p={p:.3g}, n={len(sub)})")
    # 2x2 heatmap: ones on the diagonal, r off the diagonal
    C = np.array([[1.0, r],[r, 1.0]])
    fig, ax = plt.subplots(figsize=(4.6,4.0))
    im = ax.imshow(C, vmin=-1, vmax=1, cmap="coolwarm")
    ax.set_xticks([0,1]); ax.set_yticks([0,1])
    ax.set_xticklabels(["margin","score"], rotation=45, ha="right")
    ax.set_yticklabels(["margin","score"])
    for i in range(2):
        for j in range(2):
            ax.text(j, i, f"{C[i,j]:.2f}", ha="center", va="center")
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="r")
    ax.set_title(title, fontweight="bold"); fig.tight_layout(); plt.show()
    return r, p

# Masks (pick the one you want and use it everywhere)
# mask_all:   rows with finite margin and score.
# mask_no3:   mask_all restricted to scores 1, 2, 4, 5.
# mask_final: mask_no3 AND "last row of its group in (group_id, turn_local0) order";
#             a conversation whose last turn scored 3 therefore contributes no row
#             (it is not replaced by an earlier non-3 turn).
mask_all      = np.isfinite(df_plot["margin"]) & np.isfinite(df_plot["score"])
mask_no3      = mask_all & df_plot["score"].isin([1,2,4,5])      # drop 3's
mask_final    = mask_no3 & (df_plot.sort_values(["group_id","turn_local0"])
                            .groupby("group_id").cumcount(ascending=False).eq(0))

_ = pearson_from_mask(mask_all,   "Turn-level Pearson (1/2/3/4/5, ALL turns)")
_ = pearson_from_mask(mask_no3,   "Turn-level Pearson (no 3’s, ALL turns)")
_ = pearson_from_mask(mask_final, "Final-turn Pearson (no 3’s, last turn per conv)")


In [None]:
# Flatten emb matrix to columns e0..eH-1 for quick inspection (warning: wide CSV)
# Stack the per-row embedding vectors into one 2-D array and expand it into columns e0..e{H-1}.
E = np.stack(df_plot["emb"].to_numpy(), axis=0)
cols = [f"e{i}" for i in range(E.shape[1])]
# reset_index keeps the metadata rows aligned with the positional rows of E
df_save = pd.concat([df_plot.drop(columns=["emb"]).reset_index(drop=True),
                     pd.DataFrame(E, columns=cols)], axis=1)
out_csv = Path(SAVE_DIR)/"turn_context_embeddings_with_meta.csv"
df_save.to_csv(out_csv, index=False)
print("Wrote:", out_csv, "shape:", df_save.shape)
