In [None]:
from datetime import date

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from inference import unload_model, run_experiment_batch
from RefusalVectors import load_prompts, get_full_block_activations, get_refusal_vectors, save_candidate_vectors, safety_score

## Load Model and data

In [None]:
model_name = "openai/gpt-oss-20b"


def load_model(model_name, device="cuda"):
    """Load a causal language model and its tokenizer, then move the model to ``device``.

    Args:
        model_name: Hugging Face model identifier (or local path) handed to
            ``from_pretrained`` for both the tokenizer and the model.
        device: Target passed to ``model.to``. Defaults to ``"cuda"``, the value
            that used to be hard-coded inside this function.

    Returns:
        A ``(model, tokenizer, device)`` tuple.
    """
    # NOTE(review): trust_remote_code=True lets the model repository run its own
    # Python code at load time; only use it with repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        # device_map="auto",   # alternative: let Hugging Face place the model across multiple GPUs
        dtype="auto",   # or torch.float16 / bfloat16
    ).to(device)
    return model, tokenizer, device


model, tokenizer, device = load_model(model_name)

In [None]:
# ==== Global configuration ====
# Number of prompts taken from each source for the train and test splits.
N_CLEAN_TEST = 50
N_HARM_TEST = 100
N_HARM_TRAIN = 150
N_CLEAN_TRAIN = 200
N_MALICIOUS_TRAIN = 50

# ==== Data loading ====
# Train split: the first N_*_TRAIN indices of each source.
harm_train, clean_train = load_prompts(
    harm_range=range(N_HARM_TRAIN),
    alpaca_range=range(N_CLEAN_TRAIN),
    malicious_range=range(N_MALICIOUS_TRAIN),
)

# Test split: every range starts past the matching train range, so train and
# test indices never overlap (each starts at N_*_TRAIN + 1, which leaves index
# N_*_TRAIN itself unused). N_HARM_TEST is divided evenly between the harm and
# malicious ranges.
harm_test, clean_test = load_prompts(
    harm_range=range(N_HARM_TRAIN + 1, N_HARM_TRAIN + 1 + N_HARM_TEST // 2),
    alpaca_range=range(N_CLEAN_TRAIN + 1, N_CLEAN_TRAIN + 1 + N_CLEAN_TEST),
    # Use the same keyword as the train call; this was previously `malicious_ds`,
    # which does not match the `malicious_range` keyword used for the train split.
    malicious_range=range(N_MALICIOUS_TRAIN + 1, N_MALICIOUS_TRAIN + 1 + N_HARM_TEST // 2),
)

## Set of $R_1$

In [None]:
# Collect activations for the harmful and the harmless training prompts, then
# derive the candidate refusal vectors from the two sets.
harmful_vectors = get_full_block_activations(model, tokenizer, harm_train)
harmless_vectors = get_full_block_activations(model, tokenizer, clean_train)
refusal_vectors = get_refusal_vectors(harmful_vectors, harmless_vectors)

# Build the output path with "/" (valid on every platform) and single quotes
# inside the replacement field: reusing double quotes inside an f-string is a
# syntax error before Python 3.12, and "\{" is an invalid escape sequence.
# The "/" in the model id is replaced so it does not create a sub-directory.
r1_save_filename = save_candidate_vectors(
    f"Data/{model_name.replace('/', '_')}_r1.pt",
    refusal_vectors,
)

NameError: name 'get_full_block_activations' is not defined

## Run Experiment

In [None]:
# Call run_experiment_batch with the held-out harmful/clean prompts, the
# candidate refusal vectors, and the loaded model and tokenizer. The result is
# indexed by "Prompt" and "Output" in the safety-scoring cell.
r1_results = run_experiment_batch(harm_test, clean_test, refusal_vectors, model, tokenizer)

In [None]:
# Hand the generation model and tokenizer to unload_model before the guard
# model is loaded below (presumably this frees GPU memory; confirm in inference.py).
unload_model(model, tokenizer)

### Load Safety model

In [None]:
saftey_model_name = "meta-llama/Meta-Llama-Guard-2-8B"

# 4-bit NF4 quantization with double quantization; compute runs in float16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer first, then the quantized guard model placed on `device`.
guard_tokenizer = AutoTokenizer.from_pretrained(
    saftey_model_name,
    trust_remote_code=True,
)

guard_model = AutoModelForCausalLM.from_pretrained(
    saftey_model_name,
    trust_remote_code=True,
    device_map=device,
    quantization_config=quant_config,
)

# Pad on the left side of each sequence.
guard_tokenizer.padding_side = "left"

# If the tokenizer has no pad token, register "[PAD]" and grow the embedding
# matrix so the new token id has a row.
if guard_tokenizer.pad_token is None:
    guard_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    guard_model.resize_token_embeddings(len(guard_tokenizer))

### Safety Score

In [None]:
# Pass the "Prompt" and "Output" entries of the R1 results to safety_score,
# together with the guard tokenizer and guard model loaded above.
safety_scores = safety_score(guard_tokenizer, guard_model, r1_results["Prompt"], r1_results["Output"])

NameError: name 'moderate_batch' is not defined