# Evaluate on Harmbench/Jailbreakbench/StrongReject

In [1]:
from utils import *
from plot_utils import plot_line,plot_bar
from data_utils import *
from eval_refusal import *
from tqdm import tqdm
from collections import defaultdict,Counter
import os
import numpy as np
import pandas as pcd

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
torch.set_grad_enabled(False)

INFO 03-26 21:00:37 __init__.py:190] Automatically detected platform cuda.


<torch.autograd.grad_mode.set_grad_enabled at 0x155550cc3760>

In [2]:
from vllm import LLM, SamplingParams
def load_harmbench_classifier(torch_dtype=torch.bfloat16):
    classifier = LLM(model='cais/HarmBench-Llama-2-13b-cls', tensor_parallel_size=1,gpu_memory_utilization=0.5)
    classifier.llm_engine.tokenizer.truncation_side = "left"
    return classifier
hb_classifier = load_harmbench_classifier()




INFO 03-26 21:04:19 config.py:542] This model supports multiple tasks: {'generate', 'reward', 'score', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 03-26 21:04:19 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='cais/HarmBench-Llama-2-13b-cls', speculative_config=None, tokenizer='cais/HarmBench-Llama-2-13b-cls', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=cais/HarmBench-Llama-2-13b-cls, num

Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]


INFO 03-26 21:04:35 model_runner.py:1115] Loading model weights took 24.2836 GB
INFO 03-26 21:04:36 worker.py:267] Memory profiling takes 1.05 seconds
INFO 03-26 21:04:36 worker.py:267] the current vLLM instance can use total_gpu_memory (79.33GiB) x gpu_memory_utilization (0.50) = 39.66GiB
INFO 03-26 21:04:36 worker.py:267] model weights take 24.28GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 0.32GiB; the rest of the memory reserved for KV Cache is 14.97GiB.
INFO 03-26 21:04:36 executor_base.py:110] # CUDA blocks: 1226, # CPU blocks: 327
INFO 03-26 21:04:36 executor_base.py:115] Maximum concurrency for 2048 tokens per request: 9.58x
INFO 03-26 21:04:39 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_uti

Capturing CUDA graph shapes: 100%|████████████████████████████████████████████████████████████████████| 35/35 [00:25<00:00,  1.36it/s]

INFO 03-26 21:05:05 model_runner.py:1562] Graph capturing finished in 26 secs, took 1.04 GiB





INFO 03-26 21:05:05 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 30.67 seconds


In [3]:
def harmbench_judge(prompts,responses,classifier):
    harmful_acc = []
    classifier_params = SamplingParams(temperature=0.0, max_tokens=1)
    cls_prompt =[LLAMA2_CLS_PROMPT.format(behavior = p,generation = r) for p,r in zip(prompts,responses)]
    outputs = classifier.generate(cls_prompt, classifier_params, use_tqdm=False)
    output_str = [o.outputs[0].text for o in outputs]
    harmful_acc.extend([int(x.lower().strip() == 'yes') for x in output_str])
    return np.mean(harmful_acc)

In [None]:
# load classifiers

device = 'cuda'
lg_model,lg_tokenizer = load_llamaguard_model(device)
lg_kwargs = {'llamaguard_model':lg_model,'llamaguard_tokenizer':lg_tokenizer}

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Load completions

In [56]:
m_name = 'gemma' # either gemma or llama
result_dir = 'results'
clamp_vals = [0,-2] # 0 or -2 (-5 will cause degeneration)
os.makedirs(result_dir,exist_ok=True)

for steer_type in ['base','vec']:
    perf_store = defaultdict(dict)
    perf_path = f"{result_dir}/{m_name}_{steer_type}.json"
    if os.path.exists(perf_path):
        continue
    for ds_name in tqdm(['jailbreakbench','harmbench_test','strongreject'],total = 3):
        
        completion_path =  f"completion/{ds_name}/{m_name}_{steer_type}.jsonl"
        with open(completion_path,'r') as f:
            completion_ds = [json.loads(x) for x in f]
        prompts = [x['instruction'] for x in completion_ds]
        completions = [x['completion'] for x in completion_ds]

        perf_store[ds_name]['string'] = np.mean([substring_matching_judge_fn(x) for x in completions])

        if 'harmbench' in ds_name:
            perf_store[ds_name]['harm'] = harmbench_judge(prompts,completions,hb_classifier)
        else:
            perf_store[ds_name]['safe'] = llamaguard_eval(prompts,completions,**lg_kwargs)
    with open(perf_path,'w') as f:
        json.dump(perf_store,f)

# For ablation across diff K
K_vals = [80,90,100,110,120,130,-1] if 'gemma' in m_name else [24,30,32,34,40, -1]
for clamp_val in clamp_vals:
    sae_perf = defaultdict(lambda: defaultdict(list))
    sae_suffix = '' if clamp_val == 0 else f'_clamp{clamp_val}'
    sae_path = f"{result_dir}/{m_name}_sae{sae_suffix}.json"
    if not os.path.exists(sae_path):
        for ds_name in tqdm(['jailbreakbench','harmbench_test','strongreject'],total = 3):
            for K_val in K_vals:
                with open(f'completion/{ds_name}/{m_name}_sae_{K_val}{sae_suffix}.jsonl','r') as f:
                    sae_completion = [json.loads(x) for x in f]
                prompts = [x['instruction'] for x in sae_completion]
                sae_resp = [x['completion'] for x in sae_completion]
                sae_perf[ds_name]['string'].append(np.mean([substring_matching_judge_fn(x) for x in sae_resp]))
                if 'harmbench' in ds_name:
                    sae_perf[ds_name]['harm'].append(harmbench_judge(prompts,sae_resp,hb_classifier))
                else:
                    sae_perf[ds_name]['safe'].append(llamaguard_eval(prompts,sae_resp,**lg_kwargs))
                torch.cuda.empty_cache()
            
        with open(sae_path,'w') as f:
            json.dump(sae_perf,f)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [06:30<00:00, 130.32s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [07:34<00:00, 151.47s/it]


# Plot results

In [57]:
m_name = 'gemma' # either gemma or llama
clamp_vals = [0,-2]
all_sae_perf = []
for clamp_val in clamp_vals:
    sae_suffix = '' if clamp_val == 0 else f'_clamp{clamp_val}'
    sae_path = f"{result_dir}/{m_name}_sae{sae_suffix}.json"
    with open(sae_path,'r') as f:
        sae_perf = json.load(f)
    all_sae_perf.append(sae_perf)

with open(f"{result_dir}/{m_name}_base.json",'r') as f:
    base_perf = json.load(f)
with open(f"{result_dir}/{m_name}_vec.json",'r') as f:
    vec_perf = json.load(f)

all_sae_val = []

for ds in ['jailbreakbench','harmbench_test','strongreject']:
    single_scores = []
    single_keys = []
    for key in base_perf[ds].keys():
        if key == 'harm':
            single_scores.append(1-base_perf[ds][key]) # inverse to standardize all to lower is better
            single_keys.append(f'Base (Safe)')
            single_scores.append(1-vec_perf[ds][key])
            single_keys.append(f'Vec (Safe)')
        else:
            single_scores.append(base_perf[ds][key])
            single_keys.append(f'Base ({key.capitalize()})')
            single_scores.append(vec_perf[ds][key])
            single_keys.append(f'Vec ({key.capitalize()})')
    single_scores = np.array(single_scores) * 100

    sae_scores = []
    sae_keys = []
    for i,sae_perf in enumerate(all_sae_perf):
        for key in sae_perf[ds].keys():
            if key == 'harm':
                sae_scores.append(1-np.array(sae_perf[ds][key]))
                sae_keys.append(f'SAE (Safe, {clamp_vals[i]})')
            else:
                sae_scores.append(np.array(sae_perf[ds][key]))
                sae_keys.append(f'SAE ({key.capitalize()}, {clamp_vals[i]})')
    
    K_range = K_vals[:-1] + [650] if 'gemma' in m_name else K_vals[:-1] + [96]
    K_range = [str(x) for x in K_range]
    title_name = f'{ds} (SAE clamped at {clamp_val})' 
    plot_line(np.stack(sae_scores)*100,labels = sae_keys,xlabel = 'No. Feat',ylabel = 'Acc',title = title_name,x_tick = K_range,hlines = single_scores,hline_labels = single_keys)

    all_sae_val.append(np.stack(sae_scores).mean(0))

all_sae_val = np.stack(all_sae_val).mean(0)

for kk,v in zip(K_range,all_sae_val):
    print(f'{kk}: {v:.2f}')

    
    

80: 0.34
90: 0.31
100: 0.28
110: 0.27
120: 0.26
130: 0.25
650: 0.23
