In [3]:
from datetime import date

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from inference import unload_model, run_experiment_batch, safety_scores
from RefusalVectors import get_full_block_activations, load_prompts, get_refusal_vectors, save_candidate_vectors

## Load Model and data

In [6]:
# Authenticate with the Hugging Face Hub; a login is needed to access AdvBench.
# login() renders an interactive widget in the notebook that asks for an access token.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
model_name = "openai/gpt-oss-20b"

def load_model(model_name, device="cuda"):
    """Load a causal language model and its tokenizer onto a single device.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
        device: Device the model is moved to after loading. Defaults to
            "cuda", which matches the previous hard-coded behaviour.

    Returns:
        A ``(model, tokenizer, device)`` tuple; ``device`` is returned so later
        cells can place other models on the same device.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        # Let transformers pick the checkpoint's dtype; the whole model is then
        # moved to one device (no device_map sharding across GPUs).
        dtype="auto",
    ).to(device)
    return model, tokenizer, device

model, tokenizer, device = load_model(model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00000-of-00002.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.80G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [14]:
# ==== Global configuration ====
N_CLEAN_TEST = 50
N_HARM_TEST = 100
N_HARM_TRAIN = 150
N_CLEAN_TRAIN = 200
N_MALICIOUS_TRAIN = 50
TOTAL_MALICIOUS_ROWS = 100

# The harmful test prompts are split evenly between harm_range and malicious_range.
N_HARM_TEST_PER_SOURCE = N_HARM_TEST // 2

# ==== Data loading ====
harm_train, clean_train = load_prompts(
    harm_range=range(N_HARM_TRAIN),
    alpaca_range=range(N_CLEAN_TRAIN),
    malicious_range=range(N_MALICIOUS_TRAIN),
)

# range(N) covers rows 0..N-1, so the first row not used for training is N
# (not N + 1). range() stops are exclusive, so the malicious slice is capped at
# TOTAL_MALICIOUS_ROWS itself; capping at TOTAL_MALICIOUS_ROWS - 1 dropped the last row.
# NOTE(review): assumes the ranges are 0-based row indices into each dataset — confirm in load_prompts.
harm_test, clean_test = load_prompts(
    harm_range=range(N_HARM_TRAIN, N_HARM_TRAIN + N_HARM_TEST_PER_SOURCE),
    alpaca_range=range(N_CLEAN_TRAIN, N_CLEAN_TRAIN + N_CLEAN_TEST),
    malicious_range=range(
        N_MALICIOUS_TRAIN,
        min(N_MALICIOUS_TRAIN + N_HARM_TEST_PER_SOURCE, TOTAL_MALICIOUS_ROWS),
    ),
)

## Set of $R_1$

In [16]:
# Collect activations for the harmful and harmless training prompts, then
# derive the candidate refusal vectors from the two activation sets.
harmful_vectors = get_full_block_activations(model, tokenizer, harm_train)
harmless_vectors = get_full_block_activations(model, tokenizer, clean_train)
refusal_vectors = get_refusal_vectors(harmful_vectors, harmless_vectors)

# Use "/" as the separator: the previous "Data\{...}" literal relied on an invalid
# escape sequence (SyntaxWarning) and only formed a directory path on Windows.
# Single quotes inside the replacement field keep the f-string valid before Python 3.12.
r1_save_path = f"Data/{model_name.replace('/', '_')}_r1.pt"
r1_save_filename = save_candidate_vectors(r1_save_path, refusal_vectors)

  r1_save_filename = save_candidate_vectors(f"Data\{model_name.replace("/", "_")}_r1.pt", refusal_vectors)
100%|██████████| 200/200 [00:40<00:00,  4.99it/s]
100%|██████████| 200/200 [00:40<00:00,  4.98it/s]
100%|██████████| 24/24 [00:01<00:00, 13.76it/s]


## Run Experiment

In [17]:
# Run the experiment batch over the held-out harmful and harmless prompts using the candidate vectors.
r1_results = run_experiment_batch(harm_test, clean_test, refusal_vectors, model, tokenizer)

Harmless (UM):   0%|          | 0/4 [00:00<?, ?it/s]

Layers:   0%|          | 0/15 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

Positions:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Unload the gpt-oss model before the guard model is loaded in the next section.
# NOTE(review): the notebook-level names `model` and `tokenizer` stay bound after this
# call; confirm unload_model actually releases the memory despite those references.
unload_model(model, tokenizer)

### Load Safety Model

In [None]:
saftey_model_name = "meta-llama/Meta-Llama-Guard-2-8B"

# 4-bit NF4 weights with double quantization; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,  # stable low-precision compute
)

# Tokenizer first, then the quantized guard model placed on the same device as before.
guard_tokenizer = AutoTokenizer.from_pretrained(saftey_model_name, trust_remote_code=True)
guard_model = AutoModelForCausalLM.from_pretrained(
    saftey_model_name,
    trust_remote_code=True,
    device_map=device,
    quantization_config=quant_config,
)

# Register a dedicated padding token when the tokenizer has none, and grow the
# embedding matrix so the new token id has a row.
tokenizer_lacks_pad = guard_tokenizer.pad_token is None
if tokenizer_lacks_pad:
    guard_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    guard_model.resize_token_embeddings(len(guard_tokenizer))

# Pad on the left.
guard_tokenizer.padding_side = "left"

### Safety Score

In [None]:
# Score each (prompt, output) pair from the R1 experiment with the guard model.
# The function imported at the top of the notebook is `safety_scores`; the result is stored
# under a separate name so the import is not shadowed and the cell can be re-run.
r1_safety_scores = safety_scores(
    guard_tokenizer,
    guard_model,
    r1_results["Prompt"],
    r1_results["Output"],
)

NameError: name 'moderate_batch' is not defined