In [1]:
import argparse
import sys
import torch
from nnsight import LanguageModel
from pathlib import Path
from crosscoder.newcrosscoder import cc_config, Crosscoder_Model
import crosscoder.newcrosscoder as newcrosscoder
sys.modules['newcrosscoder'] = newcrosscoder
import numpy as np
import random
from circuitsvis.tokens import colored_tokens_multi

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook_connected"



In [2]:
# Restore the trained crosscoder from the step-40000 checkpoint, on CPU.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so only point this at checkpoints you trust.
checkpoint_path = "./checkpoints/crosscoder_step_40000.pt"
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=False)

# The weights sit under the "model_state_dict" key of the loaded checkpoint.
model = Crosscoder_Model(cc_config)
model.load_state_dict(state_dict["model_state_dict"])

# Inference mode; as the last expression this also displays the module repr.
model.eval()

Crosscoder_Model()

In [3]:
# Decoder weight shape; the recorded output is (32768, 12, 768) --
# presumably (features, layers, d_model), TODO confirm against the model code.
decoder_shape = model.W_dec.shape
print(decoder_shape)

torch.Size([32768, 12, 768])


In [4]:
# Spot check: norm of the decoder vector at W_dec[10000, 0].
# No no_grad/detach here, so the printed tensor carries a grad_fn.
probe_vector = model.W_dec[10000, 0]
print(probe_vector.norm())

tensor(0.0775, grad_fn=<LinalgVectorNormBackward0>)


In [5]:
# Plot the norm of one feature's decoder vector at each index of W_dec's
# second axis (treated as layers here), rescaled so the largest value is 1.
target_feature = 11111

# W_dec[target_feature] holds one decoder vector per layer; reducing over the
# last axis gives one norm per layer, so the layer count is not hard-coded.
with torch.no_grad():
    raw_norms = torch.linalg.norm(model.W_dec[target_feature], dim=-1).tolist()

x_values = np.arange(len(raw_norms))

# Normalize by the maximum. An all-zero feature has max 0, which would raise
# ZeroDivisionError, so in that case the (all-zero) norms are left as they are.
max_val = max(raw_norms)
norms = [value / max_val for value in raw_norms] if max_val > 0 else list(raw_norms)

fig = go.Figure(data=[go.Scatter(x=x_values, y=norms, mode='lines')])
# The title names the plotted feature (target_feature), not a loop index.
fig.update_layout(title=f"Per-Layer Decoder Norms for feature {target_feature}",
                  xaxis_title='Layers', yaxis_title='Norms')

fig.show()

In [13]:
# Run GPT-2 on a sample prompt, encode its per-layer activations with the
# crosscoder, and visualize the most active features token by token.
lm = LanguageModel('openai-community/gpt2', device_map='auto')
tokenizer = lm.tokenizer

prompt = """
Historically, the Indigenous peoples of the Americas have been usually recognized as constituting two broad cultural groupings, American Indians (a term now considered outdated) and Arctic peoples. American Indians are often further grouped by area of residence: Northern America (present-day United States and Canada), Middle America (present-day Mexico and Central America; sometimes called Mesoamerica), and South America. This article is a survey of the culture areas, prehistories, histories, and recent developments of the Indigenous peoples and cultures of the United States and Canada. Some of the terminology used in reference to Indigenous Americans is explained in Sidebar: Tribal Nomenclature: American Indian, Native American, and First Nation; Sidebar: The Difference Between a Tribe and a Band; and Sidebar: Native American Self-Names. An overview of all the Indigenous American peoples is presented in Indigenous peoples of the Americas; discussions of various aspects of Indigenous American cultures may also be found in the articles pre-Columbian civilizations; Middle American Indian; South American Indian; Arctic: The people; American Indian languages; Native American religions; and Native American arts.
"""

# == Step 1: Get residual stream activations from all layers ==
# Save one activation per layer slot of the crosscoder. The count is read from
# W_dec's second axis (12 for this checkpoint) instead of being hard-coded.
num_layers = model.W_dec.shape[1]
layer_acts_saved = []
with lm.trace(prompt):
    for layer_idx in range(num_layers):
        # output[0] of each transformer block is saved for use after the trace.
        layer_acts_saved.append(lm.transformer.h[layer_idx].output[0].save())

# After the trace exits, the saved entries are used directly as tensors.
layer_acts = layer_acts_saved

# == Step 2: Prepare the input for the crosscoder ==
# Stack the per-layer activations along a new axis 2.
stacked_acts = torch.stack(layer_acts, dim=2)
print(f"✅ Stacking successful. Shape: {stacked_acts.shape}")

# NOTE(review): `trainer` is not defined in any cell visible here. On a fresh
# kernel this falls through to the warning and the crosscoder receives
# unnormalized activations -- define/load the layer stds explicitly if the
# results must be reproducible.
layer_stds = trainer.buffer.layer_stds if 'trainer' in globals() else None
if layer_stds is not None:
    stacked_acts = stacked_acts / layer_stds.to(stacked_acts.device)
else:
    print("Warning: 'trainer' object not found or layer_stds is None. Skipping normalization.")

stacked_acts = stacked_acts.to(model.W_enc.device)

# == Step 3: Get feature activations from the crosscoder ==
with torch.no_grad():
    feature_activations = model.encode(stacked_acts)

# == Step 4 & 5: Find top features and visualize ==
# Rank features by total absolute activation over axis 1 (the token axis, as
# used below) and keep the strongest top_k for the single prompt in the batch.
top_k = 20
summed_activations = feature_activations.abs().sum(dim=1)
top_activations_indices = summed_activations.topk(top_k).indices[0]
top_feature_ids = top_activations_indices.tolist()

# One row per selected feature, one column per token.
compounded_tensor = torch.stack(
    [feature_activations[0, :, feature_id] for feature_id in top_feature_ids],
    dim=0,
).cpu()

tokens = tokenizer.encode(prompt)
str_tokens = [tokenizer.decode(t) for t in tokens]
feature_labels = [f"Feature {feature_id}" for feature_id in top_feature_ids]

# The prompt is tokenized here separately from the trace; fail loudly if the
# token strings do not line up one-to-one with the activation columns.
assert len(str_tokens) == compounded_tensor.shape[1], (
    f"token count {len(str_tokens)} != activation positions {compounded_tensor.shape[1]}"
)

print(f"✅ Visualization ready for top {top_k} features.")

# Values are passed as (tokens, features). The labels name each feature in the
# widget; they were previously built but never handed to the plot.
colored_tokens_multi(str_tokens, compounded_tensor.T, labels=feature_labels)

Applying fix: Using the returned list of tensors directly.
✅ Stacking successful. Shape: torch.Size([1, 222, 12, 768])
✅ Visualization ready for top 20 features.
