In [1]:
try:
    import google.colab
    IN_COLAB = True
    from tqdm.notebook import tqdm, trange

    from google.colab import drive
    drive.mount("/content/gdrive", force_remount=True)
    %cd /content/gdrive/MyDrive/feature-circuits
    %pip install -r requirements.txt
    !git submodule update --init
except:
    IN_COLAB = False
    from tqdm import tqdm, trange

import os

import torch
from nnsight import LanguageModel

from circuit import get_circuit
from utils import save_circuit
from utils import plot_circuit
from dictionary_learning import AutoEncoder

# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"DEVICE : {DEVICE}")

print(f"IN_COLAB : {IN_COLAB}")

DEVICE : cpu
IN_COLAB : False


In [2]:
pythia70m = LanguageModel("EleutherAI/pythia-70m-deduped", device_map=DEVICE, dispatch=True)

pythia70m_embed = pythia70m.gpt_neox.embed_in

# One entry per layer: the layer itself, its attention submodule, its MLP submodule.
pythia70m_resids, pythia70m_attns, pythia70m_mlps = [], [], []
transformer_blocks = pythia70m.gpt_neox.layers
for layer in range(len(transformer_blocks)):
    block = transformer_blocks[layer]
    pythia70m_resids.append(block)
    pythia70m_attns.append(block.attention)
    pythia70m_mlps.append(block.mlp)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
if IN_COLAB:
    base = "/content/gdrive/MyDrive/feature-circuits/"
else:
    base = "C:/Users/Grégoire/Documents/ENS/stages/AttentionGraph/Marks/feature-circuits/"
path = base + "dictionary_learning/dictionaires/pythia-70m-deduped/"

if not os.path.exists(path):
    if IN_COLAB:
        # go to base / dictionary_learning :
        %cd /content/gdrive/MyDrive/feature-circuits/dictionary_learning
        !apt-get update
        !apt-get install dos2unix
        !dos2unix pretrained_dictionary_downloader.sh
        !chmod +x pretrained_dictionary_downloader.sh
        !./pretrained_dictionary_downloader.sh
        %cd /content/gdrive/MyDrive/feature-circuits
    else:
        %cd C:/Users/Grégoire/Documents/ENS/stages/AttentionGraph/Marks/feature-circuits/dictionary_learning
        %run ./pretrained_dictionary_downloader.sh
        %cd C:/Users/Grégoire/Documents/ENS/stages/AttentionGraph/Marks/feature-circuits

# Maps each model submodule to the autoencoder loaded for it.
dictionaries = {}

d_model = 512
dict_size = 32768


def _load_autoencoder(subdir):
    """Create an AutoEncoder(d_model, dict_size) on DEVICE and load `path + subdir + "/ae.pt"` into it."""
    autoencoder = AutoEncoder(d_model, dict_size).to(DEVICE)
    state_dict = torch.load(path + subdir + "/ae.pt", map_location=DEVICE)
    autoencoder.load_state_dict(state_dict)
    return autoencoder


ae = _load_autoencoder("embed")
dictionaries[pythia70m_embed] = ae


for layer in range(len(pythia70m.gpt_neox.layers)):
    ae = _load_autoencoder(f"resid_out_layer{layer}")
    dictionaries[pythia70m_resids[layer]] = ae

    # Attention and MLP dictionaries are currently not loaded:
    # ae = _load_autoencoder(f"attn_out_layer{layer}")
    # dictionaries[pythia70m_attns[layer]] = ae

    # ae = _load_autoencoder(f"mlp_out_layer{layer}")
    # dictionaries[pythia70m_mlps[layer]] = ae

In [4]:
def metric_fn_v1(model, trg=None):
    """
    Return, for each row of the batch, the last-position logit of its target token.

    model : object exposing `embed_out.output`, indexed here as [:, -1, :]
    trg   : tensor of target token indices, one per batch row

    Raises ValueError when `trg` is not given.
    """
    if trg is None:
        raise ValueError("trg must be provided")
    last_position_logits = model.embed_out.output[:, -1, :]
    # Pair row i with target trg[i].
    batch_rows = torch.arange(trg.numel())
    return last_position_logits[batch_rows, trg]
    
def metric_fn_v2(model, trg=None):
    """
    Return one logit: first batch row, first position selected by trg[0], token trg[1].

    model : object exposing `embed_out.output`
    trg   : indexable pair; trg[0] selects along the second axis of the output
            and trg[1] selects along the last axis.
            NOTE(review): the final three-index lookup only works if trg[0] keeps
            the indexed axis (e.g. a 1-D index tensor) - confirm against callers.

    Raises ValueError when `trg` is not given.
    """
    if trg is None:
        raise ValueError("trg must be provided")
    picked = model.embed_out.output[:, trg[0], :]
    token = trg[1]
    return picked[0, 0, token]

def metric_fn_v3(model, trg=None):
    """
    Return the negative log-probability of the expected target token.

    model : object exposing `embed_out.output`; only the last position
            (index -1 on the second axis) is used.
    trg   : torch.Tensor, contains idxs of the target tokens (between 0 and d_vocab_out),
            one per batch row.

    Returns a 1-D tensor with one value per batch row.
    Raises ValueError when `trg` is not given.

    Warning: all last tokens are assumed to be in the last position
    (if padding, it must happen in front of the sequence, not after).
    """
    if trg is None:
        raise ValueError("trg must be provided")
    logits = model.embed_out.output[:, -1, :]
    # Normalise over the vocabulary axis, then pick each row's target entry.
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    target_log_probs = torch.gather(log_probs, dim=-1, index=trg.view(-1, 1)).squeeze(-1)
    return -1 * target_log_probs

In [5]:
batch_size = 1

# The same prompt, repeated once per batch element; no patch input is used.
clean = ["When Mary and John went to the store, John gave a drink to"] * batch_size
patch = None

# Only the first token id of the encoded target string is kept.
trg = " Mary"
first_trg_token = pythia70m.tokenizer.encode(trg)[0]
trg_idx = torch.tensor([first_trg_token] * batch_size, device=DEVICE)
print(trg_idx)

tensor([6393])


In [6]:
# Build the circuit for the clean prompt, scoring with metric_fn_v1 on trg_idx.
circuit = get_circuit(
    clean,
    patch,
    pythia70m,
    dictionaries,
    metric_fn_v1,
    pythia70m_embed,
    pythia70m_resids,
    metric_kwargs=dict(trg=trg_idx),
    original_marks=False,
    edge_threshold=1.5,
)

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Computing effects for layer 5 with 1 features
Computing effects for layer 4 with 1 features
Computing effects for layer 3 with 1 features
Computing effects for layer 2 with 12 features
Computing effects for layer 1 with 6 features
Computing effects for layer 0 with 1 features


In [35]:
import importlib
import evaluation

# Re-import so that edits made to evaluation.py are picked up without a kernel restart.
evaluation = importlib.reload(evaluation)

mask = evaluation.get_mask(circuit, 0.5)
pruned = evaluation.prune(mask)

Now looking at edges from upstream embed
arriving at downstream resid_0
Now looking at edges from upstream resid_0
arriving at downstream resid_1
Now looking at edges from upstream resid_1
arriving at downstream resid_2
Now looking at edges from upstream resid_2
arriving at downstream resid_3
Now looking at edges from upstream resid_3
arriving at downstream resid_4
Now looking at edges from upstream resid_4
arriving at downstream resid_5
Now looking at edges from upstream resid_5
arriving at downstream y


In [22]:
# Scratch check: a boolean mask over the last axis keeps only the selected columns.
mask = torch.rand(512).gt(0.5)
idxs = torch.randint(512, (2, 512))
new_idxs = idxs[:, mask]
print(new_idxs.shape)

torch.Size([2, 267])


- cpu :
    - 1 : 2m47
    - 2 : /
    - 10: Stop at 68m+

- gpu :
    - 1 : 42s
    - 2 : 1m32

In [None]:
from matplotlib import pyplot as plt

submod_1 = "resid_0"
submod_2 = "resid_1"

# Sparse edge-weight tensor between the two submodules. Its indices and values
# are read once here instead of being re-fetched on every loop iteration.
edge_weights = circuit[1][submod_1][submod_2]
edge_values = edge_weights.values()
alive_downstream = edge_weights.indices()[0]
set_downstream = list(set(alive_downstream.tolist()))

# Per-rank accumulators, where rank i is the i-th largest |weight| of a group:
#   ss[i]   - sum over groups of (cumulative signed weight / group total)
#   abss[i] - sum over groups of |weight|
#   nb_k[i] - number of groups that have at least i + 1 weights
ss = []
abss = []
nb_k = []

# `tqdm` is the one imported in the first cell (notebook-aware on Colab).
for k in tqdm(set_downstream):
    # All edge values whose first index equals k, in stored order.
    weights = edge_values[alive_downstream == k]

    # Sort the group by decreasing magnitude.
    perm = torch.argsort(weights.abs(), descending=True)
    weights = weights[perm]
    tot = weights.sum()  # NOTE(review): a zero total gives inf/nan fractions below
    s = 0
    for i in range(len(weights)):
        s += weights[i]
        if i == len(ss):
            # First group to reach this rank: open a new slot in every list.
            ss.append(0.0)
            abss.append(0.0)
            nb_k.append(0)
        ss[i] += (s / tot).item()
        abss[i] += weights[i].abs().item()
        nb_k[i] += 1

# Turn the sums into per-rank averages over the groups that reached each rank.
ss = [ss[i] / nb_k[i] for i in range(len(ss))]
abss = [abss[i] / nb_k[i] for i in range(len(abss))]

# Plot ss and abss on two different y-axes sharing the same x-axis.
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('weight index')
ax1.set_ylabel('cumulative % of total', color=color)
ax1.plot(ss, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('weight', color=color)
ax2.plot(abss, color=color)
ax2.tick_params(axis='y', labelcolor=color)

plt.show()

In [None]:
# Same two-axis plot as for the full curves, restricted to the first `max_weights` ranks.
max_weights = 100
ss_head = ss[:max_weights]
abss_head = abss[:max_weights]

fig, ax1 = plt.subplots()

# Left axis (red): averaged cumulative share.
color = 'tab:red'
ax1.set_xlabel('weight index')
ax1.set_ylabel('cumulative % of total', color=color)
ax1.plot(ss_head, color=color)
ax1.tick_params(axis='y', labelcolor=color)

# Right axis (blue): averaged absolute weight.
ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('weight', color=color)
ax2.plot(abss_head, color=color)
ax2.tick_params(axis='y', labelcolor=color)

plt.show()

In [None]:
import importlib
import circuit_plotting

# Re-import so that edits made to circuit_plotting.py are picked up, then render the circuit.
circuit_plotting = importlib.reload(circuit_plotting)
circuit_plotting.plot_circuit(
    circuit[0],
    circuit[1],
    save_dir='./circuit/cpu_2_',
)

In [None]:
# Collect the stored values of every sparse edge tensor into one flat tensor.
edge_value_chunks = [
    edge_tensor.values()
    for targets in circuit[1].values()
    for edge_tensor in targets.values()
]
all_weights = torch.cat(edge_value_chunks, dim=0)
print(all_weights.shape)
print(all_weights.abs().mean())

# Histogram of the weights whose magnitude exceeds 0.01.
large_weights = all_weights[all_weights.abs() > 0.01]
plt.hist(large_weights.detach().cpu().numpy(), bins=100)
plt.show()

In [None]:
# Scratch check: elementwise product versus batched matrix product.
A = torch.randn(1, 10, 50)
B = torch.randn(1, 10, 50)

# Elementwise product keeps the (1, 10, 50) shape.
print((A * B).shape)
# `A @ B` is undefined for two (1, 10, 50) operands (inner sizes 50 and 10
# differ) and raises; contract over the last axis instead -> (1, 10, 10).
# NOTE(review): the intended contraction is assumed - confirm.
print(A @ B.transpose(-1, -2))

In [None]:
import torch

# Scratch check: build a small 2-D sparse COO tensor and densify it.
# Each column of the index tensor is one (row, col) coordinate: (0, 1), (99, 2), (27, 199).
dummy_2d_sparse_idx = torch.tensor([[0, 99, 27], [1, 2, 199]])
# One value per coordinate: a (2, 3) value tensor does not match the three
# index columns and makes sparse_coo_tensor raise.
dummy_2d_sparse_values = torch.randn(3)

dummy_2d_sparse = torch.sparse_coo_tensor(
    dummy_2d_sparse_idx,
    dummy_2d_sparse_values,
    size=(100, 200)
)

print(dummy_2d_sparse.to_dense())

In [36]:
# Scratch check: str.split("_") returns a single-element list when the separator is absent.
sample = "hehehe_hahaha"
print(sample)
for text in (sample, "hehehehahaha"):
    print(text.split("_"))

hehehe_hahaha
['hehehe', 'hahaha']
['hehehehahaha']
