<a href="https://colab.research.google.com/github/Reusezer/Neuron-activations/blob/main/google_gemma_3_4b_it.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1-1. upgrade pip (optional)
!pip install -q --upgrade pip

# 1-2. core libs: HF Transformers, accelerate (for device mapping), pandas, datasets, torch
!pip install -q transformers accelerate pandas datasets torch


In [None]:
import os
import torch
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face Hub id of the checkpoint whose activations are inspected.
MODEL_NAME = "google/gemma-3-4b-it"

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# device_map="auto" delegates weight placement to accelerate (installed above
# for device mapping) instead of pinning the model to one device by hand.
hf_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)

# Put the model in evaluation mode; as the cell's last expression this also
# displays the returned module.
hf_model.eval()




In [None]:
# Remote location of the CrowS-Pairs CSV and the local file it is cached in.
CSV_URL = "https://raw.githubusercontent.com/nyu-mll/crows-pairs/master/data/crows_pairs_anonymized.csv"
csv_path = "crows_pairs_anonymized.csv"

# Download only when no local copy exists, so re-running the cell is cheap.
if not os.path.exists(csv_path):
    import urllib.request
    urllib.request.urlretrieve(CSV_URL, csv_path)

# Keep only the rows where both sentences of the pair are present.
pairs_raw = pd.read_csv(csv_path)
df = pairs_raw.dropna(subset=["sent_more", "sent_less"])
print("Total pairs loaded:", len(df))
df.head()


In [None]:
# Draw one pair at random (unseeded, so each run shows a different example)
# and print both of its sentences.
row = df.sample(n=1).iloc[0]
sent_more = row["sent_more"]
sent_less = row["sent_less"]

print("MORE (biased):", sent_more)
print("LESS (neutral):", sent_less)


In [None]:
# Print every torch.nn.Linear submodule whose qualified name mentions "mlp"
# (case-insensitive), together with the shape of its weight matrix.
for name, module in hf_model.named_modules():
    if not isinstance(module, torch.nn.Linear):
        continue
    if "mlp" not in name.lower():
        continue
    print(f"{name:60s} — weight shape: {tuple(module.weight.shape)}")


In [None]:
# Collect the qualified names of all submodules that end in "mlp.down_proj".
# This works for any model that uses that naming for its MLP output projection.
HOOKED_MODULES = [
    module_name
    for module_name, _ in hf_model.named_modules()
    if module_name.endswith("mlp.down_proj")
]

print("✅ Hooking the following MLP output layers:")
for hooked_name in HOOKED_MODULES:
    print("  ", hooked_name)





In [None]:
from collections import defaultdict
import torch

def get_activations(prompt: str):
    """Run one forward pass over ``prompt`` and capture hooked module outputs.

    A temporary forward hook is registered on every submodule of the global
    ``hf_model`` whose qualified name appears in the global ``HOOKED_MODULES``.
    The hooks are removed before this function exits, even when tokenization
    or the forward pass raises, so a failed call never leaves stale hooks
    attached to the model.

    Args:
        prompt: Text passed to the global ``tokenizer``.

    Returns:
        A dict mapping module name to a CPU tensor made by concatenating,
        along dim 0, every tensor output captured for that module. Modules
        that produced no tensor output during the pass are absent.

    Raises:
        Whatever the tokenizer or the model's forward pass raises; the
        exception propagates after the hooks have been removed.
    """
    activations = defaultdict(list)

    def make_hook(name):
        # Factory so that each hook closes over its own module name.
        def hook(module, input, output):
            if isinstance(output, torch.Tensor):
                # Detach and move to CPU so captured outputs do not pin
                # autograd state or accelerator memory.
                activations[name].append(output.detach().cpu())
        return hook

    # Set membership keeps the scan over named_modules() linear.
    wanted = set(HOOKED_MODULES)
    hooks = []
    try:
        for name, module in hf_model.named_modules():
            if name in wanted:
                hooks.append(module.register_forward_hook(make_hook(name)))

        tokens = tokenizer(prompt, return_tensors="pt").to(hf_model.device)
        with torch.no_grad():
            hf_model(**tokens)
    finally:
        # Always detach the hooks; otherwise a failed forward pass would leave
        # them registered and later calls would fire them again.
        for h in hooks:
            h.remove()

    return {k: torch.cat(v, dim=0) for k, v in activations.items()}




In [None]:
import torch
from tqdm import tqdm
from google.colab import files
import pandas as pd

# Auto-detect all MLP down_proj layers in the loaded model. get_activations()
# reads this global, so it is (re)built here to let the cell run on its own.
HOOKED_MODULES = [
    name for name, module in hf_model.named_modules()
    if name.endswith("mlp.down_proj")
]
print("✅ Hooking the following MLP output layers:")
for m in HOOKED_MODULES:
    print("  ", m)

# Number of highest-delta neurons recorded per (sentence pair, module).
TOP_K = 5


def _layer_index(module_name: str) -> int:
    """Return the integer that follows the "layers" component of a dotted
    module name, or -1 when the name has no such component."""
    parts = module_name.split(".")
    try:
        return int(parts[parts.index("layers") + 1])
    except (ValueError, IndexError):
        # ValueError: no "layers" component, or the next component is not an
        # integer. IndexError: "layers" is the last component.
        return -1


# The layer index depends only on the module name, so compute it once here
# instead of once per sentence pair.
layer_of = {name: _layer_index(name) for name in HOOKED_MODULES}

records = []

# Loop through every CrowS-Pairs row
for _, row in tqdm(df.iterrows(), total=len(df), desc="Comparing CrowS-Pairs"):
    try:
        sent_more = row["sent_more"]
        sent_less = row["sent_less"]

        act_more = get_activations(sent_more)
        act_less = get_activations(sent_less)

        for name in HOOKED_MODULES:
            if name not in act_more or name not in act_less:
                continue

            # The two sentences can differ in length along dim 1 (presumably
            # the token axis — TODO confirm), so only the shared leading
            # positions are compared.
            min_len = min(act_more[name].shape[1], act_less[name].shape[1])
            # Mean absolute difference over dims 0 and 1 leaves one value per
            # index of the last dimension.
            delta = (act_more[name][:, :min_len, :] - act_less[name][:, :min_len, :]).abs().mean(dim=(0, 1))
            # Clamp k so torch.topk cannot fail on a vector shorter than TOP_K.
            topk = torch.topk(delta, min(TOP_K, delta.numel()))

            for idx, val in zip(topk.indices.tolist(), topk.values.tolist()):
                records.append({
                    "module": name,
                    "layer": layer_of[name],
                    "neuron_index": idx,
                    "delta_value": float(val),
                    "prompt_more": sent_more,
                    "prompt_less": sent_less
                })
    except Exception as e:
        # Best-effort: report the failure and continue with the next pair.
        print(f"❗ Error on row: {e}")

# Save to CSV
if records:
    df_out = pd.DataFrame(records)
    output_path = "crows_bias_top_neurons.csv"
    df_out.to_csv(output_path, index=False)
    print(f"✅ Saved {len(df_out)} rows to {output_path}")
    # Colab-only helper that triggers a browser download of the file.
    files.download(output_path)
else:
    print("⚠️ No neuron activations recorded.")






In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the per-neuron results from the relative path the extraction cell
# writes to. A separate name is used so the CrowS-Pairs frame `df` is not
# overwritten and the earlier cells stay re-runnable.
neurons_df = pd.read_csv("crows_bias_top_neurons.csv")

# Derive the layer number from the module name. Names without a "layers.N"
# part get -1, the same sentinel the extraction cell uses, instead of
# crashing the integer cast on NaN.
layer_text = neurons_df["module"].str.extract(r"layers\.(\d+)", expand=False)
neurons_df["layer"] = pd.to_numeric(layer_text).fillna(-1).astype(int)

# Aggregate: mean delta per neuron per layer. (layer, neuron) combinations
# that never appeared are filled with 0.
pivot = neurons_df.pivot_table(
    index="layer",
    columns="neuron_index",
    values="delta_value",
    aggfunc="mean"
).fillna(0)

# Plot heatmap
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(pivot, cmap="viridis", xticklabels=False, ax=ax)
ax.set_title("Bias-Activated Neurons by Layer (mean ∆ activation)")
ax.set_xlabel("Neuron Index")
ax.set_ylabel("Layer")
fig.tight_layout()
plt.show()

