In [1]:
import os
import json
import torch
import torch.nn.functional as F
from transformer_lens import (
    HookedTransformer,
)

import numpy as np
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

import matplotlib.pyplot as plt
from fig_utils import load_hooked

In [2]:
print(f"Available GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    gpu_properties = torch.cuda.get_device_properties(i)
    total_memory = gpu_properties.total_memory / 1024**3  # Convert bytes to GB
    allocated_memory = torch.cuda.memory_allocated(i) / 1024**3
    reserved_memory = torch.cuda.memory_reserved(i) / 1024**3
    free_memory = total_memory - allocated_memory - reserved_memory

    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"  - Total Memory: {total_memory:.2f} GB")
    print(f"  - Allocated Memory: {allocated_memory:.2f} GB")
    print(f"  - Reserved Memory: {reserved_memory:.2f} GB")
    print(f"  - Free Memory: {free_memory:.2f} GB")

Available GPUs: 4
GPU 0: NVIDIA L40S
  - Total Memory: 44.32 GB
  - Allocated Memory: 0.00 GB
  - Reserved Memory: 0.00 GB
  - Free Memory: 44.32 GB
GPU 1: NVIDIA L40S
  - Total Memory: 44.32 GB
  - Allocated Memory: 0.00 GB
  - Reserved Memory: 0.00 GB
  - Free Memory: 44.32 GB
GPU 2: NVIDIA A100 80GB PCIe
  - Total Memory: 79.15 GB
  - Allocated Memory: 0.00 GB
  - Reserved Memory: 0.00 GB
  - Free Memory: 79.15 GB
GPU 3: NVIDIA A100 80GB PCIe
  - Total Memory: 79.15 GB
  - Allocated Memory: 0.00 GB
  - Reserved Memory: 0.00 GB
  - Free Memory: 79.15 GB


In [2]:
torch.cuda.set_device(0) # Set all operations to run on cuda:1
device = torch.device("cuda:0") 

In [3]:
# torch.cuda.empty_cache()  # Clears unused cached memory

In [3]:
ROOT_DIR = '/data/kebl6672/dpo-toxic-neuron/checkpoints'
# Load model and move it to the correct device
dpo_model = load_hooked("gpt2-medium", os.path.join(ROOT_DIR, "dpo.pt"))

# Move the model explicitly to the correct device
dpo_model.to(device)
# # Force loading the weights to the correct device
# dpo_model.load_and_process_state_dict(
#     torch.load(os.path.join(ROOT_DIR, "dpo.pt"), map_location="cuda:2")
# )

  # Move DPO model to device
# print(f"Model is now on: {next(dpo_model.parameters()).device}")

# torch.load(os.path.join(ROOT_DIR, "dpo.pt"), map_location=device)

Loaded pretrained model gpt2-medium into HookedTransformer


  _weights = torch.load(weights_path, map_location=torch.device("cuda"))[


Moving model to device:  cuda


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-23): 24 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_out): HookPoint()
      (h

In [5]:
gpt2 = HookedTransformer.from_pretrained("gpt2-medium")
gpt2.tokenizer.padding_side = "left"
gpt2.tokenizer.pad_token_id = gpt2.tokenizer.eos_token_id
gpt2.to(device)  # Move GPT-2 to device

Loaded pretrained model gpt2-medium into HookedTransformer
Moving model to device:  cuda


HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-23): 24 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_out): HookPoint()
      (h

In [4]:
toxic_vector = torch.load(os.path.join(ROOT_DIR, "probe.pt")).to(device)  # Move toxic probe vector to device

  toxic_vector = torch.load(os.path.join(ROOT_DIR, "probe.pt")).to(device)  # Move toxic probe vector to device


In [5]:
# load data
DATA_DIR = '/data/kebl6672/dpo-toxic-neuron/data/intervene_data'

with open(
    os.path.join(DATA_DIR, "challenge_prompts_prefix_mmm.jsonl"), "r"
    # os.path.join(ROOT_DIR, "challenge_prompts.jsonl"), "r"
) as file_p:
    data = file_p.readlines()

prompts = [json.loads(x.strip())["prompt"] for x in data]

tokenized_prompts = dpo_model.to_tokens(prompts, prepend_bos=True).cuda()     

##### Input and output projection function

In [None]:
def compute_layer_toxic_projection_reduction(model, tokenized_prompts, toxic_vector, batch_size=64):

    # Dictionary to store hook data
    mlp_inputs = {}

    # Function to create a hook function for a specific layer
    def create_store_mlp_input_hook(layer_idx):
        def store_mlp_input_hook(module, inputs, outputs):
            mlp_inputs[layer_idx] = inputs[0].detach()
        return store_mlp_input_hook

    # Add the hooks to each MLP layer in the model
    for layer_idx, layer in enumerate(model.blocks):
        layer.mlp.register_forward_hook(create_store_mlp_input_hook(layer_idx))

    # Initialize dictionaries to store projections for all layers
    gpt2_input_projections = defaultdict(list)
    gpt2_output_projections = defaultdict(list)
    gpt2_projection_reductions = defaultdict(list)

    sample_size = tokenized_prompts.size(0)

    print("Computing MLP inputs and output projections...")
    for idx in tqdm(range(0, sample_size, batch_size)):
        batch = tokenized_prompts[idx : idx + batch_size, :]

        # Clear the mlp_inputs dictionary before running the batch
        mlp_inputs.clear()

        for timestep in range(20): # generate 20 tokens
            with torch.inference_mode():
                _, cache = model.run_with_cache(batch)

            sampled = model.unembed(cache["ln_final.hook_normalized"]).argmax(-1)[:, -1]

            for layer_idx in range(len(model.blocks)):
                # Retrieve the input to the MLP stored by the hook
                x_token = mlp_inputs[layer_idx][:, -1, :] # at the last token position

                # Get the output after the MLP has been applied (before adding back to the residual stream)
                y_token = cache[f"blocks.{layer_idx}.hook_mlp_out"][:, -1, :] # [batch_size, d_model]

                # print(torch.norm(x_token), torch.norm(y_token))
                
                # Compute the projection onto the toxic vector for the MLP input and output
                input_projection = torch.matmul(x_token, toxic_vector) / torch.norm(toxic_vector) # [batch_size]
                output_projection = torch.matmul(y_token, toxic_vector) / torch.norm(toxic_vector) # [batch_size]

                # Store the mean projection values for this batch and layer
                gpt2_input_projections[layer_idx].extend(input_projection.tolist())
                gpt2_output_projections[layer_idx].extend(output_projection.tolist())

                # Compute the projection reduction (how much the MLP changes the toxic direction)
                projection_reduction = input_projection - output_projection
                gpt2_projection_reductions[layer_idx].extend(projection_reduction.tolist())

            batch = torch.concat([batch, sampled.unsqueeze(-1)], dim=-1)

    # Compute final average projections across all batches and tokens for each layer
    avg_input_projections = {layer_idx: np.mean(projections) for layer_idx, projections in gpt2_input_projections.items()}
    avg_output_projections = {layer_idx: np.mean(projections) for layer_idx, projections in gpt2_output_projections.items()}
    avg_projection_reductions = {layer_idx: np.mean(projections) for layer_idx, projections in gpt2_projection_reductions.items()}

    return avg_input_projections, avg_output_projections, avg_projection_reductions


In [None]:
avg_input_projections, avg_output_projections, avg_projection_reductions = compute_layer_toxic_projection_reduction(gpt2, tokenized_prompts, toxic_vector)
avg_input_projections_dpo, avg_output_projections_dpo, avg_projection_reductions_dpo = compute_layer_toxic_projection_reduction(dpo_model, tokenized_prompts, toxic_vector)

##### Ablate top toxic value vectors and plot output projections

In [None]:
def scale_top_value_vectors(model, toxic_vector, topk_sorted_score, scale_factor=0):
    """
    Scales the value vectors of the MLPs based on the cosine similarity of the value vectors to a given toxic vector.
    
    Args:
        model (torch.nn.Module): The model being modified.
        toxic_vector (torch.Tensor): The vector to compare against for scaling.
        topk_sorted_score (int): The number of top vectors to scale.
        scale_factor (float): The factor by which to scale the selected vectors.
    
    Returns:
        torch.nn.Module: The modified model with scaled value vectors.
    """
    scores = []
    
    # Iterate through each layer of the model
    for layer_idx in range(len(model.blocks)):
        # Target the value vectors in the MLP
        value_vectors = model.blocks[layer_idx].mlp.W_out
        
        # Compute cosine similarities between the value vectors and the toxic vector
        cos_sims = F.cosine_similarity(value_vectors, toxic_vector.unsqueeze(0), dim=1)
        
        # Get the top k most similar value vectors
        _topk = cos_sims.topk(k=100)
        _values = [x.item() for x in _topk.values]
        _idxs = [x.item() for x in _topk.indices]
        topk = list(zip(_values, _idxs, [layer_idx] * _topk.indices.shape[0]))
        scores.extend(topk)

    # Sort the scores in descending order based on cosine similarity
    sorted_scores = sorted(scores, key=lambda x: x[0], reverse=True)
    
    # Select the top `topk_sorted_score` value vectors and scale them
    with torch.no_grad():
        for score in sorted_scores[:topk_sorted_score]:
            layer_idx, neuron_idx = score[2], score[1]
            print(f"Scaling vector at layer {layer_idx}, index {neuron_idx} by {scale_factor}")
            model.blocks[layer_idx].mlp.W_out[neuron_idx, :] *= scale_factor

    # Return the modified model
    return model

In [None]:
def scale_top_value_vectors_with_positive_activations(model, toxic_positive_acts_index_csv_path, topk_sorted_score, scale_factor=0):
    """
    Scales the value vectors with positive activations before DPO based on the ranks of their cosine similarity with the toxic probe.
    """

    # Load the sorted scores from the CSV
    sorted_scores_df = pd.read_csv(toxic_positive_acts_index_csv_path)
    
    # Select the top `topk_sorted_score` layer and neuron indices from the CSV
    top_layer_neuron_indices = sorted_scores_df.head(topk_sorted_score)
    
    # Scale the selected value vectors
    with torch.no_grad():
        for _, row in top_layer_neuron_indices.iterrows():
            layer_idx = row['layer_idx']
            neuron_idx = row['neuron_idx']
            print(f"Scaling vector at layer {layer_idx}, index {neuron_idx} by {scale_factor}")
            model.blocks[layer_idx].mlp.W_out[neuron_idx, :] *= scale_factor

    return model 


In [None]:
# Disable top 128 toxic value vectors
new_gpt2 = scale_top_value_vectors(gpt2, toxic_vector, 128, 0)

In [None]:
# Disable top 128 toxic value vectors with positive activations
# new_gpt2 = scale_top_value_vectors_with_positive_activations(gpt2, './toxic_positive_acts_idxs.csv', 128, 0)

In [None]:
print(new_gpt2.blocks[19].mlp.W_out[770, :])

In [None]:
new_avg_input_projections, new_avg_output_projections, new_avg_projection_reductions = compute_layer_toxic_projection_reduction(new_gpt2, tokenized_prompts, toxic_vector)

##### Plotting functions

In [None]:
def plot_two_layer_projections(layer_input_projections1, layer_output_projections1,
                           layer_input_projections2, layer_output_projections2):
    # Ensure all projection lists are sorted by layer index
    layers1 = sorted(layer_input_projections1.keys())
    layers2 = sorted(layer_input_projections2.keys())
    
    # Convert dictionaries to lists based on the sorted layers
    layer_input_projections1 = [layer_input_projections1[layer] for layer in layers1]
    layer_output_projections1 = [layer_output_projections1[layer] for layer in layers1]
    layer_input_projections2 = [layer_input_projections2[layer] for layer in layers2]
    layer_output_projections2 = [layer_output_projections2[layer] for layer in layers2]

    plt.figure(figsize=(10, 6))
    plt.plot(layers1, layer_input_projections1, label='Pre-DPO Input Projection', marker='o', linestyle='-', color='blue')
    plt.plot(layers1, layer_output_projections1, label='Pre-DPO Output Projection', marker='x', linestyle='--', color='orange')
    
    plt.plot(layers2, layer_input_projections2, label='DPO Input Projection', marker='o', linestyle='-', color='green')
    plt.plot(layers2, layer_output_projections2, label='DPO Output Projection', marker='x', linestyle='--', color='red')

    plt.title('Input and Output Projections Across Layers')
    plt.xlabel('Layer Index')
    plt.ylabel('Projection')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def plot_three_layer_projections(layer_input_projections1, layer_output_projections1,
                           layer_input_projections2, layer_output_projections2,
                           layer_input_projections3, layer_output_projections3):
    # Ensure all projection lists are sorted by layer index
    layers1 = sorted(layer_input_projections1.keys())
    layers2 = sorted(layer_input_projections2.keys())
    layers3 = sorted(layer_input_projections3.keys())
    
    # Convert dictionaries to lists based on the sorted layers
    layer_input_projections1 = [layer_input_projections1[layer] for layer in layers1]
    layer_output_projections1 = [layer_output_projections1[layer] for layer in layers1]
    layer_input_projections2 = [layer_input_projections2[layer] for layer in layers2]
    layer_output_projections2 = [layer_output_projections2[layer] for layer in layers2]
    layer_input_projections3 = [layer_input_projections3[layer] for layer in layers3]
    layer_output_projections3 = [layer_output_projections3[layer] for layer in layers3]

    plt.figure(figsize=(12, 8))
    
    # Plot first set of input and output projections
    plt.plot(layers1, layer_input_projections1, label='Pre-DPO Input Projection', marker='o', linestyle='-', color='blue')
    plt.plot(layers1, layer_output_projections1, label='Pre-DPO Output Projection', marker='x', linestyle='--', color='orange')
    
    # Plot second set of input and output projections
    plt.plot(layers2, layer_input_projections2, label='Disable Value Vector Input Projection', marker='o', linestyle='-', color='green')
    plt.plot(layers2, layer_output_projections2, label='Disable Value Vector Output Projection', marker='x', linestyle='--', color='red')
    
    # Plot third set of input and output projections
    plt.plot(layers3, layer_input_projections3, label='DPO Input Projection', marker='o', linestyle='-', color='purple')
    plt.plot(layers3, layer_output_projections3, label='DPO Output Projection', marker='x', linestyle='--', color='brown')

    plt.title('Input and Output Projections Across Layers')
    plt.xlabel('Layer Index')
    plt.ylabel('Projection')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
def plot_three_layer_projections_output(layer_output_projections1,
                                        layer_output_projections2,
                                        layer_output_projections3):
    # Ensure all projection lists are sorted by layer index
    layers1 = sorted(layer_output_projections1.keys())
    layers2 = sorted(layer_output_projections2.keys())
    layers3 = sorted(layer_output_projections3.keys())
    
    # Convert dictionaries to lists based on the sorted layers
    layer_output_projections1 = [layer_output_projections1[layer] for layer in layers1]
    layer_output_projections2 = [layer_output_projections2[layer] for layer in layers2]
    layer_output_projections3 = [layer_output_projections3[layer] for layer in layers3]

    plt.figure(figsize=(12, 8))
    
    # Plot first set of output projections with increased line width
    plt.plot(layers1, layer_output_projections1, label='Before DPO', marker='x', linestyle='--', color='red', linewidth=2.5)
    
    # Plot second set of output projections with increased line width
    plt.plot(layers2, layer_output_projections2, label='Ablate 128 toxic neurons', marker='x', linestyle='--', color='orange', linewidth=2.5)
    
    # Plot third set of output projections with increased line width
    plt.plot(layers3, layer_output_projections3, label='After DPO', marker='x', linestyle='--', color='green', linewidth=2.5)

    # plt.title('Output projection per layer')
    plt.xlabel('MLP layer index', fontsize=20)
    plt.ylabel('Output projection per layer', fontsize=20)
    plt.legend(fontsize=18)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    # plt.grid(True)
    plt.show()

In [None]:
def plot_two_layer_projections_output(layer_output_projections1,
                                 layer_output_projections2):
    # Ensure all projection lists are sorted by layer index
    layers1 = sorted(layer_output_projections1.keys())
    layers2 = sorted(layer_output_projections2.keys())
    
    # Convert dictionaries to lists based on the sorted layers
    layer_output_projections1 = [layer_output_projections1[layer] for layer in layers1]
    layer_output_projections2 = [layer_output_projections2[layer] for layer in layers2]

    plt.figure(figsize=(12, 8))
    
    # Plot first set of output projections
    plt.plot(layers1, layer_output_projections1, label='Pre-DPO Output Projection', marker='x', linestyle='--', color='red')
    
    # Plot second set of output projections
    plt.plot(layers2, layer_output_projections2, label='DPO Output Projection', marker='x', linestyle='--', color='green')

    plt.title('Output Projections Across Layers')
    plt.xlabel('Layer Index')
    plt.ylabel('Projection')
    plt.legend()
    plt.grid(True)
    plt.show()

##### Projection after disabling toxic value vectors

In [None]:
# Averaged over 20 tokens
plot_two_layer_projections_output(avg_output_projections, 
                      avg_output_projections_dpo)

In [None]:
# Averaged over 20 tokens
# Disable top 128 value vectors
plot_three_layer_projections_output(avg_output_projections, 
                      new_avg_output_projections,
                      avg_output_projections_dpo)

In [None]:
# Averaged over 20 tokens
# Disable top 200 value vectors with positive activations
plot_three_layer_projections_output(avg_output_projections, 
                      new_avg_output_projections,
                      avg_output_projections_dpo)

In [None]:
# Averaged over 20 tokens
# Disable top 1000 value vectors with positive activations (legend is wrong)
plot_three_layer_projections_output(avg_output_projections, 
                      new_avg_output_projections,
                      avg_output_projections_dpo)

In [None]:
# Averaged over 20 tokens
# Disable top 2000 value vectors with positive activations (legend is wrong)
plot_three_layer_projections_output(avg_output_projections, 
                      new_avg_output_projections,
                      avg_output_projections_dpo)

In [None]:
# Averaged over 20 tokens
# Disable top 200 value vectors
plot_three_layer_projections(avg_input_projections, avg_output_projections, 
                       new_avg_input_projections, new_avg_output_projections,
                       avg_input_projections_dpo, avg_output_projections_dpo)

In [None]:
# At the last token position only (for generating the next token)
# Disable top 200 value vectors
plot_layer_projections(avg_input_projections, avg_output_projections, 
                       avg_input_projections_dpo, avg_output_projections_dpo)

##### Neuron toxicity projection function

In [11]:
def compute_neuron_toxic_projection(model, tokenized_prompts, toxic_vector, batch_size=64):
    # Initialize dictionaries to store projections and activations for all layers
    gpt2_neuron_projections = defaultdict(list)

    sample_size = tokenized_prompts.size(0)

    print("Computing MLP neuron projections...")
    
    device = next(model.parameters()).device  # Get the model's device

    for idx in tqdm(range(0, sample_size, batch_size)):
        batch = tokenized_prompts[idx : idx + batch_size, :].to(device)

        for timestep in range(20):  # generate 20 tokens
            with torch.inference_mode():
                _, cache = model.run_with_cache(batch)

            # Ensure cache tensors are moved to the correct device
            cache = {k: v.to(device).detach().clone() for k, v in cache.items()}

            sampled = model.unembed(cache["ln_final.hook_normalized"]).argmax(-1).detach().to(device)[:, -1]

            for layer_idx in range(len(model.blocks)):
                neuron_acts = cache[f"blocks.{layer_idx}.mlp.hook_post"][:, -1, :].to(device)
                value_vectors = model.blocks[layer_idx].mlp.W_out.to(device)

                neuron_outputs = neuron_acts.unsqueeze(-1) * value_vectors
                neuron_projections = torch.matmul(neuron_outputs, toxic_vector.to(device)) / torch.norm(toxic_vector.to(device))

                for neuron_idx in range(neuron_projections.size(1)):
                    gpt2_neuron_projections[(layer_idx, neuron_idx)].extend(neuron_projections[:, neuron_idx].tolist())

            batch = torch.concat([batch, sampled.unsqueeze(-1)], dim=-1)

    # Compute final average neuron projections and average activations across all batches and tokens 
    avg_neuron_projections = {
        (layer_idx, neuron_idx): np.mean(projections)
        for (layer_idx, neuron_idx), projections in gpt2_neuron_projections.items()
    }

    return avg_neuron_projections

##### Check neuron projection calculation correctness

In [None]:
def compute_layer_projection_sums(model, tokenized_prompts, toxic_vector, batch_size=64):
    # Initialize dictionaries to store the sum and count of projections for all layers
    gpt2_neuron_projections_sum = defaultdict(float)
    gpt2_neuron_projections_count = defaultdict(int)

    sample_size = tokenized_prompts.size(0)

    print("Computing MLP neuron projections and averaging them for each layer...")
    for idx in tqdm(range(0, sample_size, batch_size)):
        batch = tokenized_prompts[idx : idx + batch_size, :]

        for timestep in range(20):  # generate 20 tokens
            with torch.inference_mode():
                _, cache = model.run_with_cache(batch)

            sampled = model.unembed(cache["ln_final.hook_normalized"]).argmax(-1)[:, -1]

            for layer_idx in range(len(model.blocks)):
                # Extract neuron activations before applying the output weights
                neuron_acts = cache[f"blocks.{layer_idx}.mlp.hook_post"][:, -1, :]  # [batch_size, d_mlp]
                value_vectors = model.blocks[layer_idx].mlp.W_out  # [d_mlp, d_model]

                # Compute the neuron outputs by multiplying neuron_acts with the corresponding value vector
                neuron_outputs = neuron_acts.unsqueeze(-1) * value_vectors  # [batch_size, d_mlp, d_model]

                # Compute the projection onto the toxic vector for each neuron output
                neuron_projections = torch.matmul(neuron_outputs, toxic_vector) / torch.norm(toxic_vector)  # [batch_size, d_mlp]
                
                # Sum the projections across all neurons in the layer for this batch
                layer_projection_sum = torch.sum(neuron_projections, dim=1)  # [batch_size]

                # Accumulate the sum of projections for the layer and keep track of the count
                gpt2_neuron_projections_sum[layer_idx] += torch.sum(layer_projection_sum).item()
                gpt2_neuron_projections_count[layer_idx] += batch_size

            # Update the batch with the newly sampled tokens
            batch = torch.concat([batch, sampled.unsqueeze(-1)], dim=-1)

    # Calculate the final averaged projection for each layer and add the bias projection
    final_layer_projections = {}
    for layer_idx in range(len(model.blocks)):
        # Average the accumulated projections
        avg_projection = gpt2_neuron_projections_sum[layer_idx] / gpt2_neuron_projections_count[layer_idx]
        
        # Add the bias projection
        bias = model.blocks[layer_idx].mlp.b_out  # [d_model]
        bias_projection = torch.dot(bias, toxic_vector) / torch.norm(toxic_vector)  # scalar

        # Final projection for this layer
        final_layer_projections[layer_idx] = avg_projection + bias_projection.item()

    return final_layer_projections

In [None]:
def plot_layer_projections(layer_projections):
    layers = list(layer_projections.keys())
    projections = list(layer_projections.values())

    plt.figure(figsize=(10, 6))
    plt.plot(layers, projections, marker='o', linestyle='-', color='b')
    plt.xlabel('Layer Index')
    plt.ylabel('Final Projection Sum')
    plt.title('Final Layer Projections for Each Layer')
    plt.grid(True)
    plt.show()

In [None]:
final_layer_projections = compute_layer_projection_sums(gpt2, tokenized_prompts, toxic_vector)
plot_layer_projections(final_layer_projections)

##### Compute neuron toxicity projections

In [12]:
# Save results to csv file
def save_neuron_projections_to_csv(avg_neuron_projections, filename):
    # Convert the dictionary to a list of tuples (layer_idx, neuron_idx, projection_value)
    data = [
        {"layer_idx": layer_idx, "neuron_idx": neuron_idx, "projection_value": projection}
        for (layer_idx, neuron_idx), projection in avg_neuron_projections.items()
    ]

    # Create a pandas DataFrame
    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    print(f"Neuron projections saved to {filename}")

In [18]:
# Compute the neuron data and average projections
avg_neuron_projections = compute_neuron_toxic_projection(gpt2, tokenized_prompts, toxic_vector)

Computing MLP neuron projections...


  0%|          | 0/38 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 44.32 GiB of which 8.50 MiB is free. Process 3078353 has 6.58 GiB memory in use. Including non-PyTorch memory, this process has 37.72 GiB memory in use. Of the allocated memory 35.06 GiB is allocated by PyTorch, and 2.17 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save to csv file
save_neuron_projections_to_csv(avg_neuron_projections, filename="gpt2_neuron_projections.csv")

In [13]:
device = torch.device("cuda:0")  # "cuda:1" 
dpo_model.to(device)
tokenized_prompts = tokenized_prompts.to(device)
toxic_vector = toxic_vector.to(device)

Moving model to device:  cuda


In [14]:
import torch
import gc

torch.cuda.empty_cache()  # Free up cached memory
gc.collect()  # Force garbage collection

0

In [15]:
# For the DPO-ed model
avg_neuron_projections_dpo = compute_neuron_toxic_projection(dpo_model, tokenized_prompts, toxic_vector)

Computing MLP neuron projections...


100%|██████████| 19/19 [27:02<00:00, 85.42s/it]


In [16]:
# Save to csv file
save_neuron_projections_to_csv(avg_neuron_projections_dpo, filename="dpo_neuron_projections.csv")

Neuron projections saved to dpo_neuron_projections.csv


In [None]:
# Top neuron contributors for each layer

# Step 1: Compute the difference in projections between the original model and DPO model
projection_diffs = {}

for (layer_idx, neuron_idx), proj in avg_neuron_projections.items():
    if (layer_idx, neuron_idx) in avg_neuron_projections_dpo:
        diff = proj - avg_neuron_projections_dpo[(layer_idx, neuron_idx)]
        if layer_idx not in projection_diffs:
            projection_diffs[layer_idx] = []
        projection_diffs[layer_idx].append((neuron_idx, diff))

# Step 2: Sort the differences within each layer to find the top neurons with the largest decrease
top_neurons_per_layer = {}

for layer_idx, neuron_diffs in projection_diffs.items():
    # Sort neurons by projection difference in descending order
    sorted_neurons = sorted(neuron_diffs, key=lambda x: x[1], reverse=True)
    # Select top neurons for this layer (you can choose the top N neurons per layer)
    top_neurons_per_layer[layer_idx] = sorted_neurons[:20]  # Adjust the number as needed

# Step 3: Retrieve the neuron activations before and after DPO for the top neurons in each layer
top_neuron_acts_per_layer = {}

for layer_idx, top_neurons in top_neurons_per_layer.items():
    top_neuron_acts_per_layer[layer_idx] = []
    for neuron_idx, diff in top_neurons:
        top_neuron_acts_per_layer[layer_idx].append({
            'layer_idx': layer_idx,
            'neuron_idx': neuron_idx,
            'projection_diff': diff,
        })

# Step 4: Print or return the top neuron information per layer
for layer_idx, neuron_acts in top_neuron_acts_per_layer.items():
    print(f"Layer {layer_idx}:")
    for neuron in neuron_acts:
        print(f"  Neuron {neuron['neuron_idx']}:")
        print(f"    Projection Decrease: {neuron['projection_diff']}")

In [None]:
# Top neuron contributors across all layers

# Step 1: Compute the difference in projections between the original model and DPO model
projection_diffs = {}

for (layer_idx, neuron_idx), proj in avg_neuron_projections.items():
    if (layer_idx, neuron_idx) in avg_neuron_projections_dpo:
        diff = proj - avg_neuron_projections_dpo[(layer_idx, neuron_idx)]
        projection_diffs[(layer_idx, neuron_idx)] = diff

# Step 2: Sort the differences to find the top neurons with the largest decrease
all_neurons_sorted = sorted(projection_diffs.items(), key=lambda x: x[1], reverse=True)

# Step 3: Retrieve the neuron activations before and after DPO for all neurons
all_neurons_projs = []

for (layer_idx, neuron_idx), diff in all_neurons_sorted:
    all_neurons_projs.append({
        'layer_idx': layer_idx,
        'neuron_idx': neuron_idx,
        'projection_diff': diff,
    })

# Step 4: Print or return the top 100 neuron information
for neuron in all_neurons_projs[:100]:
    print(f"Layer {neuron['layer_idx']}, Neuron {neuron['neuron_idx']}:")
    print(f"  Projection Decrease: {neuron['projection_diff']}")
    

##### Compute (value_vector * toxic direction)

In [None]:
# Compute the dot product of value vectors with the normalized toxic vector
def compute_all_value_vector_projection(model, toxic_vector, model_name="model"):
    value_vector_projections = []

    # Normalize the toxic vector
    normalized_toxic_vector = toxic_vector / toxic_vector.norm()

    # Iterate over all layers and all neurons in each layer
    for layer_idx in range(len(model.blocks)):
        # Get the weight matrix W_out for the current layer's MLP
        W_out = model.blocks[layer_idx].mlp.W_out  # [d_mlp, d_model]

        for neuron_idx in range(W_out.shape[0]):
            # Get the value vector for the specified neuron
            value_vector = W_out[neuron_idx]  # [d_model]

            # Compute the dot product between the value vector and the normalized toxic vector
            value_vector_projection = torch.dot(value_vector, normalized_toxic_vector).item()

            # Store the layer index, neuron index, and computed projection
            value_vector_projections.append({
                "layer_idx": layer_idx,
                "neuron_idx": neuron_idx,
                "value_vector_projection": value_vector_projection
            })
    
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(value_vector_projections)

    # Generate the CSV filename using the model name
    csv_filename = f"{model_name}_value_vector_projections.csv"

    # Save the DataFrame to a CSV file with the generated filename
    df.to_csv(csv_filename, index=False)
    print(f"Projections saved to {csv_filename}")

    return df

In [None]:
df_gpt2 = compute_all_value_vector_projection(gpt2, toxic_vector, model_name="gpt2")

In [None]:
df_dpo = compute_all_value_vector_projection(dpo_model, toxic_vector, model_name="dpo")

##### Extract all cossims of neurons

In [None]:
# Compute cossims of all neurons - very similar before and after DPO
def compute_all_neuron_cossims(model, toxic_vector, model_name="model"):
    gpt2_neuron_cossims = []

    # Iterate over all layers and all neurons in each layer
    for layer_idx in range(len(model.blocks)):
        # Get the weight matrix W_out for the current layer's MLP
        W_out = model.blocks[layer_idx].mlp.W_out  # [d_mlp, d_model]

        for neuron_idx in range(W_out.shape[0]):
            # Get the value vector for the specified neuron
            value_vector = W_out[neuron_idx]  # [d_model]

            # Compute the cosine similarity between the value vector and the toxic vector
            cossim = F.cosine_similarity(value_vector.unsqueeze(0), toxic_vector.unsqueeze(0), dim=1).item()

            # Store the layer index, neuron index, and computed cosine similarity
            gpt2_neuron_cossims.append({
                "layer_idx": layer_idx,
                "neuron_idx": neuron_idx,
                "cosine_similarity": cossim
            })
    
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(gpt2_neuron_cossims)

    # Generate the CSV filename using the model name
    csv_filename = f"{model_name}_neuron_cossims.csv"

    # Save the DataFrame to a CSV file with the generated filename
    df.to_csv(csv_filename, index=False)
    print(f"Cosine similarities saved to {csv_filename}")

    return df

In [None]:
df_gpt2 = compute_all_neuron_cossims(gpt2, toxic_vector, model_name = 'gpt2')

In [None]:
df_dpo = compute_all_neuron_cossims(dpo_model, toxic_vector, model_name = 'dpo')

In [None]:
top_100_neurons_indexes = [(19, 770), (19, 1438), (12, 882), (18, 2669), (16, 603), (21, 2876), (13, 668), (20, 2953), (21, 1404), (20, 2820), (12, 771), (15, 4051), (20, 3210), (21, 3336), (23, 4039), (22, 1406), (18, 2757), (18, 2062), (11, 175), (16, 3941), (23, 816), (18, 919), (3, 3680), (6, 3972), (23, 1295), (21, 2568), (21, 3929), (17, 368), (17, 3922), (13, 33), (19, 3341), (19, 2312), (18, 430), (21, 566), (13, 2258), (3, 3742), (8, 2854), (21, 387), (16, 255), (18, 2795), (20, 539), (0, 2352), (14, 1958), (19, 2191), (17, 3704), (20, 3384), (20, 474), (11, 1550), (20, 1748), (0, 3393), (18, 3606), (21, 1889), (17, 2875), (23, 3759), (20, 3773), (20, 2780), (23, 505), (14, 883), (17, 359), (22, 4077), (13, 3243), (16, 1291), (10, 3184), (22, 782), (18, 2982), (21, 3088), (19, 505), (17, 3336), (23, 2031), (23, 1054), (22, 1075), (21, 2318), (19, 1402), (15, 3116), (16, 2492), (17, 3162), (19, 955), (23, 4069), (19, 3244), (22, 3559), (23, 1029), (23, 1874), (23, 2954), (16, 1800), (10, 3477), (19, 2006), (22, 3980), (20, 2946), (23, 2220), (21, 3774), (23, 1268), (22, 2308), (22, 1418), (23, 1274), (17, 346), (15, 1517), (22, 268), (18, 1971), (11, 4021), (20, 1483)]

In [None]:
def compute_top_neuron_cossims(model, toxic_vector, top_neurons):
    gpt2_neuron_cossims = []

    for layer_idx, neuron_idx in top_neurons:
        # Get the value vector for the specified layer and neuron
        value_vector = model.blocks[layer_idx].mlp.W_out[neuron_idx] # [d_model]
        
        # Compute the cosine similarity between the value vector and the toxic vector
        cossim = F.cosine_similarity(value_vector.unsqueeze(0), toxic_vector.unsqueeze(0), dim=1).item()
        
        # Append the result to the list
        gpt2_neuron_cossims.append(cossim)
    
    return gpt2_neuron_cossims

In [None]:
top_neuron_cossims = compute_top_neuron_cossims(gpt2, toxic_vector, top_100_neurons_indexes)

In [None]:
# Prepare labels for the x-axis
labels = [f"Layer {layer}, Neuron {neuron}" for layer, neuron in top_100_neurons_indexes]

# Plotting the cossims
plt.figure(figsize=(14, 7))
plt.bar(labels, top_neuron_cossims)

# Adding labels and title
plt.xlabel("Layer and Neuron Index")
plt.ylabel("Cosine Similarity")
plt.title("Cosine Similarities in Order of Specified Top Neurons")

# Rotate x-axis labels for better readability
plt.xticks(rotation=90)

# Show plot
plt.tight_layout()
plt.show()

##### Compute (after-GELU) activations pre-DPO and post-DPO for all neurons

In [6]:
def compute_top_neuron_acts(model, tokenized_prompts, batch_size=32):
    # Initialize dictionaries to store activations for all neurons
    gpt2_neuron_acts = defaultdict(list)

    sample_size = tokenized_prompts.size(0)

    print("Computing MLP neuron activations...")
    for idx in tqdm(range(0, sample_size, batch_size)):
        batch = tokenized_prompts[idx : idx + batch_size, :]

        for timestep in range(20):  # generate 20 tokens
            with torch.inference_mode():
                _, cache = model.run_with_cache(batch)
            
            cache = {k: v.detach().clone() for k, v in cache.items()}

            sampled = model.unembed(cache["ln_final.hook_normalized"].detach()).argmax(-1).detach().to(device)[:, -1]


            for layer_idx in range(len(model.blocks)):
                # Extract (after Gelu) neuron activations before applying the output weights
                neuron_acts = cache[f"blocks.{layer_idx}.mlp.hook_post"][:, -1, :]  # [batch_size, d_mlp]

                # Store the neuron activations for this batch, layer, and neuron
                for neuron_idx in range(neuron_acts.size(1)):
                    gpt2_neuron_acts[(layer_idx, neuron_idx)].extend(neuron_acts[:, neuron_idx].tolist())

            batch = torch.concat([batch, sampled.unsqueeze(-1)], dim=-1)

    # Compute final average neuron activations across all batches and tokens
    avg_neuron_acts = {
        (layer_idx, neuron_idx): np.mean(acts)
        for (layer_idx, neuron_idx), acts in gpt2_neuron_acts.items()
    }

    return avg_neuron_acts

In [7]:
# Save results to csv file
def save_neuron_acts_to_csv(avg_neuron_acts, filename):
    # Convert the dictionary to a list of tuples (layer_idx, neuron_idx, acts)
    data = [
        {"layer_idx": layer_idx, "neuron_idx": neuron_idx, "activation": acts}
        for (layer_idx, neuron_idx), acts in avg_neuron_acts.items()
    ]

    # Create a pandas DataFrame
    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    print(f"Neuron activations saved to {filename}")

In [8]:
import torch
import gc

torch.cuda.empty_cache()  # Free up cached memory
gc.collect()  # Force garbage collection

0

In [None]:
# Compute the neuron acts before DPO
avg_neuron_acts = compute_top_neuron_acts(gpt2, tokenized_prompts)

In [None]:
# Save to csv file
save_neuron_acts_to_csv(avg_neuron_acts, filename="gpt2_neuron_activations.csv")

In [9]:
# Compute the neuron acts after DPO
avg_neuron_acts_dpo = compute_top_neuron_acts(dpo_model, tokenized_prompts)

Computing MLP neuron activations...


100%|██████████| 38/38 [42:21<00:00, 66.88s/it]


In [10]:
# Save to csv file
save_neuron_acts_to_csv(avg_neuron_acts_dpo, filename="dpo_neuron_activations.csv")

Neuron activations saved to dpo_neuron_activations.csv


##### Compute (pre-GELU) activations pre-DPO and post-DPO for all neurons

In [None]:
import math

def gelu(x):
    """Compute the GELU function."""
    return 0.5 * x * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x ** 3))))

def inverse_gelu(output, epsilon=1e-5, max_iter=100):
    """Approximate the inverse of the GELU function."""
    # Start with an initial guess for the input
    x = output
    for _ in range(max_iter):
        # Compute the output of GELU for the current guess
        gelu_output = gelu(x)
        
        # Calculate the derivative of GELU at current guess
        derivative = 0.5 * (1 + math.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x ** 3)))) \
                   + 0.5 * x * (1 / math.cosh(math.sqrt(2 / math.pi) * (x + 0.044715 * (x ** 3)))) ** 2 \
                   * (math.sqrt(2 / math.pi) * (1 + 0.134145 * (x ** 2)))
        
        # Update the guess using Newton's method
        x -= (gelu_output - output) / derivative
        
        # Stop if the guess is close enough to the actual input
        if abs(gelu_output - output) < epsilon:
            break
    return x

# Example usage
output_value = gelu(-0.2769)
input_value = inverse_gelu(output_value)
print(f"GELU Output: {output_value}")
print(f"Inverse GELU Input: {input_value}")


In [None]:
df = pd.read_csv('/code/eraser_neurons/all_neuron_metrics.csv')

# Apply the inverse GELU function to the 'dpo_activation' column
df['dpo_pregelu_activation'] = df['dpo_activation'].apply(inverse_gelu)

# Display the updated DataFrame
print(df.head)

df.to_csv('/code/eraser_neurons/all_neuron_metrics.csv', index=False)

In [None]:
import torch

# Define the values for which we want to compute the GeLU
values = torch.tensor([1.7812614, 3.557839])

# Compute GeLU using PyTorch's built-in function
gelu_values = torch.nn.functional.gelu(values)

gelu_values


##### Compute post-Gelu activation at only the last time step (for activation patching)

In [None]:
def compute_top_neuron_acts(model, tokenized_prompts, batch_size=64):
    # Initialize dictionaries to store activations for all neurons
    gpt2_neuron_acts = defaultdict(list)

    sample_size = tokenized_prompts.size(0)

    print("Computing MLP neuron activations...")
    for idx in tqdm(range(0, sample_size, batch_size)):
        batch = tokenized_prompts[idx : idx + batch_size, :]
        
        with torch.inference_mode():
            _, cache = model.run_with_cache(batch)

        sampled = model.unembed(cache["ln_final.hook_normalized"]).argmax(-1)[:, -1] # generate the next token only

        for layer_idx in range(len(model.blocks)):
            # Extract (after Gelu) neuron activations before applying the output weights
            neuron_acts = cache[f"blocks.{layer_idx}.mlp.hook_post"][:, -1, :]  # [batch_size, d_mlp]

            # Store the neuron activations for this batch, layer, and neuron
            for neuron_idx in range(neuron_acts.size(1)):
                gpt2_neuron_acts[(layer_idx, neuron_idx)].extend(neuron_acts[:, neuron_idx].tolist())

        batch = torch.concat([batch, sampled.unsqueeze(-1)], dim=-1)

    # Compute final average neuron activations across all batches and tokens
    avg_neuron_acts = {
        (layer_idx, neuron_idx): np.mean(acts)
        for (layer_idx, neuron_idx), acts in gpt2_neuron_acts.items()
    }

    return avg_neuron_acts

In [None]:
# Save results to csv file
def save_neuron_acts_to_csv(avg_neuron_acts, filename):
    # Convert the dictionary to a list of tuples (layer_idx, neuron_idx, acts)
    data = [
        {"layer_idx": layer_idx, "neuron_idx": neuron_idx, "activation": acts}
        for (layer_idx, neuron_idx), acts in avg_neuron_acts.items()
    ]

    # Create a pandas DataFrame
    df = pd.DataFrame(data)

    # Save the DataFrame to a CSV file
    df.to_csv(filename, index=False)
    print(f"Neuron activations saved to {filename}")

In [None]:
# Compute the neuron acts before DPO
avg_neuron_acts = compute_top_neuron_acts(gpt2, tokenized_prompts)

In [None]:
# Save to csv file
save_neuron_acts_to_csv(avg_neuron_acts, filename="gpt2_acts_last_token.csv")

In [None]:
# Compute the neuron acts after DPO
avg_neuron_acts_dpo = compute_top_neuron_acts(dpo_model, tokenized_prompts)

In [None]:
# Save to csv file
save_neuron_acts_to_csv(avg_neuron_acts_dpo, filename="dpo_acts_last_token.csv")