In [1]:
# %% Install dependencies
!pip install requests pandas neuronpedia


Collecting neuronpedia
  Downloading neuronpedia-1.2.0-py3-none-any.whl.metadata (3.7 kB)
Downloading neuronpedia-1.2.0-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neuronpedia
Successfully installed neuronpedia-1.2.0


In [3]:
import requests
import json
import pandas as pd
from typing import Optional, Dict, List, Any, Tuple
from dataclasses import dataclass

In [6]:
@dataclass
class FeatureConfig:
    """Identifies one Neuronpedia SAE feature and derives its URLs.

    The three fields are substituted, in order, into the feature page URL
    and the feature API URL.
    """
    model_id: str      # e.g., "gemma-2-9b-it"
    sae_id: str        # e.g., "20-gemmascope-res-16k" (layer-saename-width)
    feature_index: int # e.g., 12799

    @property
    def feature_url(self) -> str:
        """Human-facing page for this feature on Neuronpedia."""
        template = "https://neuronpedia.org/{}/{}/{}"
        return template.format(self.model_id, self.sae_id, self.feature_index)

    @property
    def api_url(self) -> str:
        """Endpoint that returns this feature's data."""
        template = "https://www.neuronpedia.org/api/feature/{}/{}/{}"
        return template.format(self.model_id, self.sae_id, self.feature_index)


In [7]:
def get_feature_info(config: FeatureConfig, timeout: float = 30.0) -> Dict[str, Any]:
    """
    Fetch feature information from the Neuronpedia API.

    Args:
        config: Feature configuration; its ``api_url`` is requested with GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits forever on a stalled connection.

    Returns:
        The parsed JSON body of the response. ``get_feature_explanation``
        reads its "explanations" key.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    response = requests.get(config.api_url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch feature: {response.status_code} - {response.text}")

    return response.json()

def get_feature_explanation(feature_data: Dict[str, Any]) -> str:
    """
    Extract the first explanation's description from feature data.

    Args:
        feature_data: Dict that may hold an "explanations" list of dicts,
            each of which may hold a "description".

    Returns:
        The first explanation's description, or "No explanation available"
        when there are no explanations or the description is missing,
        None, or empty.
    """
    fallback = "No explanation available"

    # `or []` also covers an explicit null under the "explanations" key.
    explanations = feature_data.get("explanations") or []
    if not explanations:
        return fallback

    # The first entry is treated as the best explanation.
    description = explanations[0].get("description")

    # A present-but-null description must not leak out as None.
    return description if description else fallback

In [24]:
def test_feature_activation(
    config: FeatureConfig,
    text: str,
    timeout: float = 60.0,
) -> Dict[str, Any]:
    """
    Test a feature's activation on custom text using Neuronpedia's API.

    Sends a POST request to the ``/api/activation/new`` endpoint with the
    feature identifiers and the text.

    Args:
        config: Feature configuration
        text: Input text to test
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits forever on a stalled connection.

    Returns:
        The parsed JSON body of the response (``parse_activation_results``
        reads "tokens" and "values" from it).

    Raises:
        Exception: If the response status code is not 200; the first 500
            characters of the body are printed beforehand.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # Primary endpoint for activation testing
    url = "https://www.neuronpedia.org/api/activation/new"

    payload = {
        "feature": {
            "modelId": config.model_id,
            "source": config.sae_id,
            # The feature index is sent as a string.
            "index": str(config.feature_index)
        },
        "customText": text
    }

    headers = {"Accept": "*/*", "Content-Type": "application/json"}

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)

    if response.status_code != 200:
        # Show a bounded slice of the body so large error pages do not flood the output.
        print(f"API Response: {response.text[:500]}")
        raise Exception(
            f"Activation test failed: {response.status_code}\n"
            f"Try checking the API at https://neuronpedia.org/api-doc\n"
            f"Or inspect network requests on {config.feature_url} when testing activation"
        )

    return response.json()


In [25]:
def parse_activation_results(
    activation_data: Dict[str, Any]
) -> pd.DataFrame:
    """
    Parse activation results into a DataFrame with tokens and values.

    Reads "tokens" and "values" from the top level of ``activation_data``;
    if either is missing or empty and a "result" key exists, both are read
    from ``activation_data["result"]`` instead.

    Returns:
        DataFrame with columns: position, token, activation, is_active.
        ``position`` is the 0-based index of the token and ``is_active`` is
        True where activation > 0. An empty result still has a numeric
        ``activation`` column so that numeric operations such as
        ``nlargest`` keep working.

    Raises:
        ValueError: If the number of tokens and values differ.
    """
    # `or []` also covers explicit nulls in the payload.
    tokens = activation_data.get("tokens") or []
    values = activation_data.get("values") or []

    if not tokens or not values:
        # Try alternative response format
        if "result" in activation_data:
            result = activation_data["result"]
            tokens = result.get("tokens") or []
            values = result.get("values") or []

    if len(tokens) != len(values):
        raise ValueError(
            f"Mismatched activation data: {len(tokens)} tokens but {len(values)} values"
        )

    df = pd.DataFrame({
        "position": range(len(tokens)),
        "token": tokens,
        "activation": values
    })

    if df.empty:
        # An empty list is inferred as object dtype, which nlargest rejects.
        df = df.astype({"activation": "float64"})

    df["is_active"] = df["activation"] > 0

    return df

In [26]:
def get_ranked_activations(df: pd.DataFrame, top_k: Optional[int] = None) -> pd.DataFrame:
    """
    Get activations sorted by activation value (highest first).

    Args:
        df: DataFrame from parse_activation_results
        top_k: If provided, return only the top K rows. ``None`` (the
            default) returns every row; ``0`` returns an empty frame.

    Returns:
        Sorted DataFrame with a fresh 0-based index.
    """
    sorted_df = df.sort_values("activation", ascending=False)

    # Compare against None explicitly so top_k=0 is honoured, not treated as "no limit".
    if top_k is not None:
        sorted_df = sorted_df.head(top_k)

    return sorted_df.reset_index(drop=True)

In [27]:
def analyze_feature_activation(
    config: FeatureConfig,
    text: str,
    show_all: bool = False,
    top_k: int = 10
) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
    """
    Run the full pipeline: fetch the explanation, score the text, rank tokens.

    Two API requests are made: one for the feature data and one for the
    activations of ``text``. Progress messages are printed before each.

    Args:
        config: Feature configuration
        text: Input text to analyze
        show_all: When True the second tuple element holds every token;
            when False it holds only rows where ``is_active`` is True.
        top_k: Number of highest-activation rows to keep in the ranked frame

    Returns:
        Tuple of (explanation, active_df, ranked_df). ``ranked_df`` is always
        built from the complete token frame, regardless of ``show_all``.
    """
    print(f"Fetching feature info for {config.feature_url}...")
    explanation = get_feature_explanation(get_feature_info(config))

    print("Testing activation on input text...")
    full_df = parse_activation_results(test_feature_activation(config, text))

    # Optionally narrow the per-token frame to active rows only.
    active_df = full_df if show_all else full_df[full_df["is_active"]]

    ranked_df = get_ranked_activations(full_df, top_k)

    return explanation, active_df, ranked_df


# %%
def display_results(
    explanation: str,
    active_df: pd.DataFrame,
    ranked_df: pd.DataFrame,
    text: str
):
    """Print the explanation, input text, ranked tokens and active tokens.

    Each section is introduced by a title framed with two 60-character
    rules. Empty frames are replaced by a short "no activations" message.
    """
    rule = "=" * 60

    def _section(title: str) -> None:
        # Title framed above and below by the rule.
        print(rule)
        print(title)
        print(rule)

    _section("FEATURE EXPLANATION")
    print(explanation)
    print()

    _section("INPUT TEXT")
    print(text)
    print()

    _section("TOP ACTIVATING TOKENS (ranked by activation value)")
    print(
        ranked_df.to_string(index=False)
        if len(ranked_df) > 0
        else "No activations found for this feature on the input text."
    )
    print()

    _section("ALL ACTIVE TOKENS")
    print(
        active_df.to_string(index=False)
        if len(active_df) > 0
        else "No activations found."
    )

In [28]:
def highlight_text_by_activation(
    text: str,
    df: pd.DataFrame,
    threshold: float = 0
) -> str:
    """
    Rebuild the text from the token rows, marking the tokens that fire.

    Rows are visited in frame order. A token whose activation exceeds
    ``threshold`` is rendered as ``[token](activation)`` with two decimals;
    every other token is emitted unchanged. The pieces are concatenated
    without a separator.

    Note: ``text`` is accepted but not read; the output is derived from
    ``df`` alone.
    """
    pieces = [
        f"[{record['token']}]({record['activation']:.2f})"
        if record["activation"] > threshold
        else record["token"]
        for record in df.to_dict("records")
    ]
    return "".join(pieces)

In [39]:
# Feature under study. FeatureConfig derives the Neuronpedia page URL and the
# API URL from these three fields.
config = FeatureConfig(
    model_id="gemma-2-9b-it",           # Model ID
    sae_id="31-gemmascope-res-16k",     # SAE ID: layer-saename-width
    feature_index=7811                  # Feature index
)

In [44]:
# Text to score against the feature. The literal opens with a line break, so
# the string sent to the API starts with "\n".
input_text = """
As we move ahead into the December quarter, I'd like to review our outlook, which includes the types of forward-looking information Suhasini referred to. Importantly, the color we're providing assumes that the global tariff rates, policies, and application remain in effect as of this call, and the global macroeconomic outlook does not worsen from today. We expect our December quarter total company revenue to grow by 10% to 12% year over year, which would be our best quarter ever. We expect iPhone revenue to grow double digits year over year, which would be our best iPhone quarter ever. On Mac, keep in mind we expect to face a very difficult compare against the M4 MacBook Pro, Mac Mini, and iMac launches in the year-ago quarter. We expect services revenue to grow at a year-over-year rate similar to what we reported in the fiscal year 2025"""


In [45]:
# Fetch the explanation and score `input_text`. With show_all=False, `active_df`
# keeps only rows whose activation is > 0; `ranked_df` holds the top_k rows of
# the complete token frame, sorted by activation.
explanation, active_df, ranked_df = analyze_feature_activation(
    config=config,
    text=input_text,
    show_all=False,  # Set to True to see all tokens
    top_k=15         # Number of top activations to show
)


Fetching feature info for https://neuronpedia.org/gemma-2-9b-it/31-gemmascope-res-16k/7811...
Testing activation on input text...


In [46]:
# Print the explanation, the input text, the ranked tokens and the active tokens.
display_results(explanation, active_df, ranked_df, input_text)


FEATURE EXPLANATION
 terms related to economic conditions and financial implications

INPUT TEXT

As we move ahead into the December quarter, I'd like to review our outlook, which includes the types of forward-looking information Suhasini referred to. Importantly, the color we're providing assumes that the global tariff rates, policies, and application remain in effect as of this call, and the global macroeconomic outlook does not worsen from today. We expect our December quarter total company revenue to grow by 10% to 12% year over year, which would be our best quarter ever. We expect iPhone revenue to grow double digits year over year, which would be our best iPhone quarter ever. On Mac, keep in mind we expect to face a very difficult compare against the M4 MacBook Pro, Mac Mini, and iMac launches in the year-ago quarter. We expect services revenue to grow at a year-over-year rate similar to what we reported in the fiscal year 2025

TOP ACTIVATING TOKENS (ranked by activation value)


In [47]:
# Show highlighted text with activation values
print("=" * 60, "HIGHLIGHTED TEXT (tokens with activations > 0)", "=" * 60, sep="\n")
# The API is queried again here to obtain the complete per-token frame;
# `full_df` is reused by the context cells further down.
full_df = parse_activation_results(test_feature_activation(config, input_text))
highlighted = highlight_text_by_activation(input_text, full_df, threshold=0)
print(highlighted)

HIGHLIGHTED TEXT (tokens with activations > 0)
<bos>
As we move ahead[ into](9.86) the December quarter[,](9.78) I'd like to review our outlook, which[ includes](21.28)[ the](32.81) types of forward-looking information Suhasini referred to. Importantly,[ the](20.59) color we're providing[ assumes](38.53)[ that](54.41)[ the](52.88)[ global](41.25)[ tariff](28.09)[ rates](9.20)[,](51.94)[ policies](13.74)[,](39.72)[ and](51.31)[ application](16.78) remain in effect as of this call,[ and](22.78)[ the](53.50)[ global](53.97)[ macroeconomic](19.47) outlook does not worsen from today. We[ expect](12.55) our December quarter total company revenue to grow by 10% to 12% year over year, which would be our best quarter ever. We expect iPhone revenue to grow double digits year over year, which would be our best iPhone quarter ever. On Mac, keep in[ mind](23.78)[ we](27.31)[ expect](19.84)[ to](11.65)[ face](36.38)[ a](35.00)[ very](25.88)[ difficult](17.62)[ compare](13.75)[ against](31.00)[ the](

In [49]:
def get_context_around_token(
    df: pd.DataFrame,
    position: int,
    context_window: int = 10
) -> Dict[str, Any]:
    """
    Get the context around a specific token position.

    ``position`` is used as a 0-based row offset into ``df``, so ``df``
    should be the complete token frame in its original order (not a
    filtered or re-sorted view).

    Args:
        df: DataFrame with columns 'token' and 'activation'
        position: The token position to get context for
        context_window: Number of tokens before/after to include

    Returns:
        Dict with position, before, target, after, activation, and
        full_context (the target wrapped in ``>>>`` / ``<<<``).

    Raises:
        IndexError: If ``position`` is negative or past the last token.
    """
    tokens = df["token"].tolist()
    activations = df["activation"].tolist()

    # Reject negatives explicitly: Python would otherwise wrap them around
    # and silently return the context of a different token.
    if not 0 <= position < len(tokens):
        raise IndexError(
            f"position {position} is out of range for {len(tokens)} tokens"
        )

    # Clamp the window to the bounds of the token list.
    start = max(0, position - context_window)
    end = min(len(tokens), position + context_window + 1)

    before = "".join(tokens[start:position])
    target = tokens[position]
    after = "".join(tokens[position + 1:end])
    activation = activations[position]

    # Full context with target highlighted
    full_context = f"{before}>>>{target}<<<{after}"

    return {
        "position": position,
        "before": before,
        "target": target,
        "after": after,
        "activation": activation,
        "full_context": full_context
    }


def display_top_activations_with_context(
    df: pd.DataFrame,
    top_k: int = 10,
    context_window: int = 10,
    min_activation: float = 0.0
):
    """
    Print the strongest-activating tokens, each with its surrounding tokens.

    Rows with activation above ``min_activation`` are considered, and the
    ``top_k`` largest are printed in descending order. The context for each
    is taken from the unfiltered ``df``.

    Args:
        df: DataFrame with columns 'position', 'token', 'activation'
        top_k: Number of top activations to show
        context_window: Number of tokens before/after to include
        min_activation: Minimum activation threshold (exclusive)
    """
    above_threshold = df[df["activation"] > min_activation].copy()
    strongest = above_threshold.nlargest(top_k, "activation")

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print(f"TOP {len(strongest)} ACTIVATING TOKENS WITH CONTEXT (±{context_window} tokens)")
    print(heavy_rule)
    print()

    for rank, entry in enumerate(strongest.itertuples(index=False), start=1):
        pos = int(entry.position)
        ctx = get_context_around_token(df, pos, context_window)

        print(f"#{rank} | Position: {pos} | Activation: {ctx['activation']:.4f}")
        print(light_rule)
        # Target token is wrapped in lenticular brackets.
        print(f"...{ctx['before']}【{ctx['target']}】{ctx['after']}...")
        print()


def get_top_activations_as_dataframe(
    df: pd.DataFrame,
    top_k: int = 10,
    context_window: int = 10,
    min_activation: float = 0.0
) -> pd.DataFrame:
    """
    Get top activating tokens with context as a DataFrame.

    Args:
        df: DataFrame with columns 'position', 'token', 'activation'
        top_k: Number of top activations to include
        context_window: Number of tokens before/after to include
        min_activation: Minimum activation threshold (exclusive)

    Returns:
        DataFrame with columns: rank, position, token, activation, context.
        The columns are present even when no row passes the threshold.
    """
    columns = ["rank", "position", "token", "activation", "context"]

    filtered_df = df[df["activation"] > min_activation]
    top_df = filtered_df.nlargest(top_k, "activation")

    results = []
    for rank, (_, row) in enumerate(top_df.iterrows(), 1):
        pos = int(row["position"])
        # Context comes from the unfiltered frame so neighbours are not dropped.
        ctx = get_context_around_token(df, pos, context_window)

        results.append({
            "rank": rank,
            "position": pos,
            "token": ctx["target"],
            "activation": ctx["activation"],
            "context": f"...{ctx['before']}【{ctx['target']}】{ctx['after']}..."
        })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(results, columns=columns)

In [50]:
# Option 1: Print nicely formatted output
# (`full_df` is the complete per-token frame built in the highlighted-text cell.)
display_top_activations_with_context(full_df, top_k=10, context_window=10)

TOP 10 ACTIVATING TOKENS WITH CONTEXT (±10 tokens)

#1 | Position: 44 | Activation: 54.4062
--------------------------------------------------------------------------------
.... Importantly, the color we're providing assumes【 that】 the global tariff rates, policies, and application remain...

#2 | Position: 64 | Activation: 53.9688
--------------------------------------------------------------------------------
... remain in effect as of this call, and the【 global】 macroeconomic outlook does not worsen from today. We expect...

#3 | Position: 63 | Activation: 53.5000
--------------------------------------------------------------------------------
... application remain in effect as of this call, and【 the】 global macroeconomic outlook does not worsen from today. We...

#4 | Position: 45 | Activation: 52.8750
--------------------------------------------------------------------------------
... Importantly, the color we're providing assumes that【 the】 global tariff rates, policies, and app

In [51]:
# Option 2: Get as DataFrame for further analysis
# (`full_df` is the complete per-token frame built in the highlighted-text cell.)
context_df = get_top_activations_as_dataframe(full_df, top_k=10, context_window=10)
# to_string(index=False) prints every column without the row index.
print(context_df.to_string(index=False))

 rank  position    token  activation                                                                                                                    context
    1        44     that    54.40625    .... Importantly, the color we're providing assumes【 that】 the global tariff rates, policies, and application remain...
    2        64   global    53.96875      ... remain in effect as of this call, and the【 global】 macroeconomic outlook does not worsen from today. We expect...
    3        63      the    53.50000 ... application remain in effect as of this call, and【 the】 global macroeconomic outlook does not worsen from today. We...
    4        45      the    52.87500  ... Importantly, the color we're providing assumes that【 the】 global tariff rates, policies, and application remain in...
    5        49        ,    51.93750       ... we're providing assumes that the global tariff rates【,】 policies, and application remain in effect as of this...
    6        52      and    51.31250   .