In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are re-imported before each cell runs and edits to the project's helper
# modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Preparation

## Define model

In [2]:
from nnsight import LanguageModel
import torch as t

# Prefer the first CUDA device, but fall back to CPU so the notebook still
# runs on a machine without a GPU instead of failing when the model is loaded.
device = t.device('cuda:0' if t.cuda.is_available() else 'cpu')
dtype = t.float32

# Release cached GPU memory left over from earlier runs in this kernel.
# Only meaningful when CUDA is actually in use.
if device.type == 'cuda':
    t.cuda.empty_cache()

# Language model under study, placed on `device` with parameters in `dtype`.
model = LanguageModel(
    'EleutherAI/pythia-70m-deduped',
    device_map=device,
    dispatch=True,
    torch_dtype=dtype,
)

  from .autonotebook import tqdm as notebook_tqdm


## Load data

In [3]:
import os
from data_loading_utils import load_examples_rct
from dictionary_loading_utils import load_saes_and_submodules

# Prompt category; selects the sub-directory of `prompts/` that is loaded and
# is embedded in the `save_base` label.
category = 'concise'

# Single source of truth for the example count. Previously the loader was
# called with a literal 50 while `save_base` advertised n100, so the label
# did not describe the data actually loaded. The value is kept at 50 so the
# loader call is unchanged; only the label changes (n100 -> n50).
# NOTE(review): assumes the loader's second argument is the number of
# examples - confirm against data_loading_utils.load_examples_rct, and check
# whether anything on disk is still keyed by the old "n100" name.
num_examples = 50

prompt_dir = os.path.join('prompts', category)
train_clean, train_patch = load_examples_rct(prompt_dir, num_examples)
save_base = f'pythia-70m-deduped_prompt_dir_{category}_n{num_examples}'

# Load the SAEs and their matching submodules for `model`, using the same
# device and dtype as the model itself.
submods, SAEs = load_saes_and_submodules(
    model,
    separate_by_type=False,
    include_embed=True,
    device=device,
    dtype=dtype,
)

# Experiment

## Clean run

In [4]:
from attribution import get_activations, create_batches
from metrics import eos_metric
from activation_utils import SparseAct

# Metric configuration. The call below reads these variables instead of
# repeating the literals, so changing the metric here changes this run and
# keeps it aligned with the later `pe_exact` call, which also takes `metric_fn`.
metric_fn = eos_metric
calc_metric = True
# NOTE(review): `metric_kwargs` is not passed to anything in the visible
# cells - confirm whether get_activations / pe_exact should receive it.
metric_kwargs = dict()

# Activations (and the metric value) for the clean prompts.
clean_acts, clean_metric = get_activations(
    model,
    train_clean,
    submods,
    SAEs,
    calc_metric=calc_metric,
    metric_fn=metric_fn,
)

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Patched run (with explicit behaviour elicitation)

In [5]:
# Activations (and the metric value) for the patch prompts. Uses the
# `calc_metric` / `metric_fn` settings defined with the clean run rather than
# repeating the literals, so this run cannot drift from the metric that is
# later handed to `pe_exact`.
patch_acts, patch_metric = get_activations(
    model,
    train_patch,
    submods,
    SAEs,
    calc_metric=calc_metric,
    metric_fn=metric_fn,
)

In [7]:
from attribution import pe_exact
# Effect of patching, computed from the clean prompts plus the clean and
# patched activations gathered above and scored with `metric_fn`.
# NOTE(review): presumably "pe" = patching effect - confirm in attribution.py.
# Slow: the recorded run took about 9 minutes for 19 iterations.
effect = pe_exact(
    model,
    train_clean,
    clean_acts,
    patch_acts,
    submods,
    SAEs,
    metric_fn,
)

100%|██████████| 19/19 [09:19<00:00, 29.46s/it]
