In [1]:
import torch
from transformer_lens import (
    HookedTransformer,
)
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the per-neuron metrics table (path is relative to the working directory).
# Later cells read its 'gpt2_activation', 'cosine_similarity', 'layer_idx' and
# 'neuron_idx' columns.
all_neuron_metrics = pd.read_csv(filepath_or_buffer='./all_neuron_metrics.csv')

##### Filter top toxic value vectors with initial positive activations

In [10]:
# Builds a ranked list of (layer_idx, neuron_idx) pairs for neurons whose
# gpt2_activation and cosine_similarity are both strictly positive, and writes
# that list to a CSV file.
# NOTE(review): a positive cosine_similarity is this notebook's criterion for a
# "toxic" value vector; what the similarity is measured against is not visible
# in this cell -- confirm against the code that produced all_neuron_metrics.csv.

# Step 1: Keep only rows with positive gpt2_activation AND positive cosine_similarity.
# Both conditions are combined into one mask, and .copy() gives an independent
# frame so the column assignment in Step 2 does not write into a slice of
# all_neuron_metrics (which is what triggers pandas' SettingWithCopyWarning).
has_positive_activation = all_neuron_metrics['gpt2_activation'] > 0
has_positive_cossim = all_neuron_metrics['cosine_similarity'] > 0
positive_activations = all_neuron_metrics.loc[has_positive_activation & has_positive_cossim].copy()

# Step 2: Rank these rows by cosine similarity in descending order.
# method='min' gives tied values the same (lowest) rank, so rank 1 is the
# largest cosine similarity among the kept rows.
positive_activations['cossim_rank'] = (
    positive_activations['cosine_similarity']
    .rank(ascending=False, method='min')
    .astype(int)
)

# Step 3: Sort the DataFrame by the ranks.
# kind='stable' keeps tied rows in their original file order so the saved
# ranking is reproducible across runs.
positive_activations_sorted = positive_activations.sort_values('cossim_rank', kind='stable')

# Step 4: Select only the layer_idx and neuron_idx columns
layer_neuron_ids = positive_activations_sorted[['layer_idx', 'neuron_idx']]

# Step 5: Save the result to a CSV file (row order encodes the ranking; the
# DataFrame index is not written)
output_file = 'toxic_positive_acts_idxs.csv'
layer_neuron_ids.to_csv(output_file, index=False)

print(f"Ranked layer_idx and neuron_idx saved to {output_file}")

Ranked layer_idx and neuron_idx saved to toxic_positive_acts_idxs.csv


In [8]:
# Report how many (layer_idx, neuron_idx) rows are in layer_neuron_ids.
print(layer_neuron_ids.shape[0])

5770


##### Filter top toxic value vectors with their initial key vector and bias term

In [5]:
# Ranks every neuron in all_neuron_metrics.csv by cosine similarity, looks up
# each neuron's MLP input ("key") vector and input bias in pretrained
# gpt2-medium, and saves the result to a CSV file.
# NOTE(review): no rows are filtered out here -- every row of the metrics file
# is kept, and each key_vector is stored as a Python list (serialized as text
# by to_csv). Confirm that this is intended, since the section heading says
# "Filter top".

# Load the CSV file
all_neuron_metrics = pd.read_csv('./all_neuron_metrics.csv')

# Step 1: Rank all rows based on cosine similarity (descending order).
# method='min' gives tied values the same (lowest) rank.
all_neuron_metrics['cossim_rank'] = (
    all_neuron_metrics['cosine_similarity']
    .rank(ascending=False, method='min')
    .astype(int)
)

# Step 2: Sort the DataFrame by the cossim_rank and keep the identifying columns.
# kind='stable' keeps tied rows in file order (reproducible output); .copy()
# makes ranked_df an independent frame so adding columns below does not
# trigger pandas' SettingWithCopyWarning.
ranked_df = (
    all_neuron_metrics
    .sort_values('cossim_rank', kind='stable')
    .loc[:, ['layer_idx', 'neuron_idx']]
    .copy()
)

# Initialize the model (gradients are disabled globally: this cell only reads weights)
torch.set_grad_enabled(False)
gpt2 = HookedTransformer.from_pretrained("gpt2-medium")
gpt2.tokenizer.padding_side = "left"
gpt2.tokenizer.pad_token_id = gpt2.tokenizer.eos_token_id

# Step 3: Add columns for key vectors and bias terms
layer_ids = ranked_df['layer_idx'].astype(int).tolist()
neuron_ids = ranked_df['neuron_idx'].astype(int).tolist()

# Convert each referenced layer's parameters to NumPy once instead of once per
# row. .detach() is required because .cpu() returns the parameter itself when
# the model already lives on the CPU, and a tensor that requires grad cannot be
# converted with .numpy().
w_in_by_layer = {}
b_in_by_layer = {}
for layer_idx in sorted(set(layer_ids)):
    mlp = gpt2.blocks[layer_idx].mlp
    w_in_by_layer[layer_idx] = mlp.W_in.detach().cpu().numpy()
    b_in_by_layer[layer_idx] = mlp.b_in.detach().cpu().numpy()

# Column neuron_idx of W_in is that neuron's key vector; b_in[neuron_idx] is its bias.
key_vectors = [
    w_in_by_layer[layer_idx][:, neuron_idx].tolist()  # Convert to list
    for layer_idx, neuron_idx in zip(layer_ids, neuron_ids)
]
bias_terms = [
    b_in_by_layer[layer_idx][neuron_idx].item()  # Bias term as a Python scalar
    for layer_idx, neuron_idx in zip(layer_ids, neuron_ids)
]

# Add the lists as new columns in the DataFrame (list order matches ranked_df row order)
ranked_df['key_vector'] = key_vectors
ranked_df['bias_term'] = bias_terms

# Display the updated DataFrame
print(ranked_df.head())

# Save the DataFrame to a CSV file
ranked_df.to_csv('toxic_neurons_with_key_vectors_and_bias.csv', index=False)

Loaded pretrained model gpt2-medium into HookedTransformer
       layer_idx  neuron_idx  \
78594         19         770   
49923         12         771   
76397         18        2669   
53916         13         668   
65791         16         255   

                                              key_vector  bias_term  
78594  [-0.024163583293557167, -0.017355741932988167,...  -0.226177  
49923  [-0.010582792572677135, 0.0035662075970321894,...  -0.694075  
76397  [-0.03616376593708992, 0.03283524513244629, -0...  -0.291046  
53916  [-0.006538663990795612, 0.019593868404626846, ...  -0.465313  
65791  [-0.030100855976343155, 0.055411819368600845, ...  -0.162906  
