In [None]:
import torch
import numpy
import pandas as pd
import os
import random
import transformer_lens.utils as utils
from transformer_lens import ActivationCache, HookedTransformer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F


In [None]:
# Load the pretrained "gpt2-XL" checkpoint as a TransformerLens HookedTransformer.
# All four weight-processing options are switched on; see the TransformerLens
# `from_pretrained` documentation for what each option does to the weights.
model = HookedTransformer.from_pretrained(
    "gpt2-XL",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True
)


In [None]:
# Bare expression: the notebook shows the model's repr as this cell's output.
model

In [None]:
# CSV holding the evaluation prompts; the evaluation loops read its
# 'sentence' and 'answer' columns.
data_path = '../dataset_csvs_singular_plural/es_plurals.csv'

In [None]:
# Read the prompts into a DataFrame, then display its dimensions and column names.
data = pd.read_csv(data_path)
data.shape, data.columns

In [None]:
# Precomputed replacement values used by replace_hook_z, which indexes axis 0
# of this array by (hooked layer - start_layer).
# NOTE(review): how this array was produced, and whether axis 0 is meant to be
# an absolute layer index, is not visible in this notebook -- confirm.
average_attention_weights = np.load('../nouns_average_weights_sing_to_plu.npy')

In [None]:
# Display the dimensions of the loaded array.
average_attention_weights.shape

In [None]:
# Size constants used by the sweeps below: layer count, head count, per-head width.
# NOTE(review): presumably these mirror the loaded model's configuration
# (model.cfg.n_layers / n_heads / d_head) -- confirm they stay in sync.
layers, heads, dim = 48, 25, 64

In [None]:
# Mean-ablation sweep: overwrite attention hook_z at one token position, four layers at a time.

def replace_hook_z(z, hook, start_layer, position=4, mean_z=None):
    """Overwrite one sequence position of a ``hook_z`` activation in place.

    Written to be used as a forward hook: the layer number is parsed from the
    hook's name and used to pick which slice of the replacement array to write.

    Args:
        z: Activation tensor handed to the hook. It is indexed as
            ``z[:, position, :, :]``, so it must be 4-D with more than
            ``position`` entries on axis 1.
        hook: Hook point whose ``name`` has the layer number as its second
            dot-separated field (e.g. ``blocks.7.attn.hook_z``).
        start_layer: First layer of the group being patched. The replacement
            slice is selected with the offset ``layer - start_layer``.
        position: Index on axis 1 of ``z`` to overwrite. Defaults to 4, the
            value that used to be hard-coded.
        mean_z: NumPy array of replacement values, indexed on axis 0 by the
            layer offset. Defaults to the notebook-level
            ``average_attention_weights`` array.

    Returns:
        The same tensor ``z`` after the in-place write.

    Raises:
        IndexError: If the hooked layer lies before ``start_layer`` (a negative
            offset would otherwise silently wrap around to the end of the
            array), or if the offset is past the end of ``mean_z``.
    """
    if mean_z is None:
        mean_z = average_attention_weights

    layer_num = int(hook.name.split('.')[1])
    relative_layer = layer_num - start_layer
    if relative_layer < 0:
        raise IndexError(
            f"hook {hook.name!r} is at layer {layer_num}, before start_layer {start_layer}"
        )

    # NOTE(review): the slice is chosen by offset inside the group, not by the
    # absolute layer number. If the array stores one entry per model layer this
    # should probably be mean_z[layer_num] -- confirm against how it was built.
    # The leading axis added by unsqueeze(0) broadcasts over the batch dimension.
    replacement = torch.from_numpy(mean_z[relative_layer]).unsqueeze(0).to(z.device)
    z[:, position, :, :] = replacement
    return z

# Sweep over consecutive groups of `group_size` layers. For each group, patch
# attn.hook_z in every layer of the group and score next-token predictions on
# every prompt in `data`.
# NOTE(review): the sweep starts at layer 1, so layer 0 is never ablated --
# confirm that this is intended.
group_size = 4
for start_layer in range(1, layers, group_size):
    print(start_layer)
    # Clamp the group to the model depth. The previous `start_layer == 46`
    # special case could never fire (the starts are 1, 5, ..., 45), so the last
    # group asked for a hook on blocks.48, one past the last index when layers = 48.
    end_layer = min(start_layer + group_size, layers)

    print(f"Processing layers {start_layer} to {end_layer - 1}")
    results = []

    # The hooks depend only on the group, so build them once per group rather
    # than once per prompt. `sl=start_layer` binds the current value at
    # definition time instead of looking it up when the hook runs.
    hooks = [(f'blocks.{layer}.attn.hook_z', lambda z, hook, sl=start_layer: replace_hook_z(z, hook, sl))
             for layer in range(start_layer, end_layer)]

    for idx, row in data.iterrows():
        singular = row['sentence']
        plural = row['answer']

        # Forward pass with the hooks attached; gradients are not needed for
        # scoring, so skip building the autograd graph.
        with torch.no_grad(), model.hooks(fwd_hooks=hooks):
            logits = model(singular, prepend_bos=True, return_type="logits")

        # Most likely next token after the final prompt position, and its probability.
        probs = F.softmax(logits[0, -1], dim=-1)
        pred_id = probs.argmax().item()
        prediction = model.to_string(pred_id)
        probability = probs[pred_id].item()

        results.append({
            'singular': singular,
            'plural': plural,
            'prediction': prediction,
            'probability': probability
        })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    #results_df.to_csv(f'../mean_ablated_predictions_multiple_layers/predictions_s_plural_XL_layers_{start_layer}_to_{end_layer-1}.csv')

    # Count exact matches. `[1:]` drops the first character of the decoded
    # token, presumably the leading space of the token text -- TODO confirm.
    # Only one token is predicted, so only single-token answers can match.
    count = sum(r['plural'] == r['prediction'][1:] for r in results)
    total = len(results)
    if total:
        print(f"Correct predictions: {count}/{total} ({count/total*100:.2f}%)")
    else:
        # Avoid a ZeroDivisionError when `data` has no rows.
        print("Correct predictions: 0/0 (no rows evaluated)")

# ... rest of your code ...

In [None]:
# Same ablation as the previous cell, applied to a single block of `gap` consecutive layers.

def replace_hook_z(z, hook, start_layer, position=4, mean_z=None):
    """Overwrite one sequence position of a ``hook_z`` activation in place.

    Written to be used as a forward hook: the layer number is parsed from the
    hook's name and used to pick which slice of the replacement array to write.

    Args:
        z: Activation tensor handed to the hook. It is indexed as
            ``z[:, position, :, :]``, so it must be 4-D with more than
            ``position`` entries on axis 1.
        hook: Hook point whose ``name`` has the layer number as its second
            dot-separated field (e.g. ``blocks.7.attn.hook_z``).
        start_layer: First layer of the group being patched. The replacement
            slice is selected with the offset ``layer - start_layer``.
        position: Index on axis 1 of ``z`` to overwrite. Defaults to 4, the
            value that used to be hard-coded.
        mean_z: NumPy array of replacement values, indexed on axis 0 by the
            layer offset. Defaults to the notebook-level
            ``average_attention_weights`` array.

    Returns:
        The same tensor ``z`` after the in-place write.

    Raises:
        IndexError: If the hooked layer lies before ``start_layer`` (a negative
            offset would otherwise silently wrap around to the end of the
            array), or if the offset is past the end of ``mean_z``.
    """
    if mean_z is None:
        mean_z = average_attention_weights

    layer_num = int(hook.name.split('.')[1])
    relative_layer = layer_num - start_layer
    if relative_layer < 0:
        raise IndexError(
            f"hook {hook.name!r} is at layer {layer_num}, before start_layer {start_layer}"
        )

    # NOTE(review): the slice is chosen by offset inside the group, not by the
    # absolute layer number. If the array stores one entry per model layer this
    # should probably be mean_z[layer_num] -- confirm against how it was built.
    # The leading axis added by unsqueeze(0) broadcasts over the batch dimension.
    replacement = torch.from_numpy(mean_z[relative_layer]).unsqueeze(0).to(z.device)
    z[:, position, :, :] = replacement
    return z

# Number of consecutive layers patched together in one run.
gap = 6
# To sweep the whole model instead of a single group, iterate over
# range(0, layers, gap) here.
for start_layer in range(6, 12, gap):  # yields only start_layer = 6
    print(start_layer)

    # Clamp to the model depth so a `gap` that does not divide `layers` cannot
    # request hooks on blocks that do not exist.
    end_layer = min(start_layer + gap, layers)

    print(f"Processing layers {start_layer} to {end_layer-1}")
    results = []

    # The hooks depend only on the group, so build them once per group rather
    # than once per prompt. `sl=start_layer` binds the current value at
    # definition time instead of looking it up when the hook runs.
    hooks = [(f'blocks.{layer}.attn.hook_z', lambda z, hook, sl=start_layer: replace_hook_z(z, hook, sl))
             for layer in range(start_layer, end_layer)]

    for idx, row in data.iterrows():
        singular = row['sentence']
        plural = row['answer']

        # Forward pass with the hooks attached; gradients are not needed for
        # scoring, so skip building the autograd graph.
        with torch.no_grad(), model.hooks(fwd_hooks=hooks):
            logits = model(singular, prepend_bos=True, return_type="logits")

        # Most likely next token after the final prompt position, and its probability.
        probs = F.softmax(logits[0, -1], dim=-1)
        pred_id = probs.argmax().item()
        prediction = model.to_string(pred_id)
        probability = probs[pred_id].item()

        results.append({
            'singular': singular,
            'plural': plural,
            'prediction': prediction,
            'probability': probability,
            'gap': gap
        })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    #results_df.to_csv(f'../mean_ablation_predictions_different_gaps/predictions_s_plural_XL_layers_{start_layer}_to_{end_layer-1}_{gap}.csv')

    # Count exact matches. `[1:]` drops the first character of the decoded
    # token, presumably the leading space of the token text -- TODO confirm.
    # Only one token is predicted, so only single-token answers can match.
    count = sum(r['plural'] == r['prediction'][1:] for r in results)
    total = len(results)
    if total:
        print(f"Correct predictions: {count}/{total} ({count/total*100:.2f}%)")
    else:
        # Avoid a ZeroDivisionError when `data` has no rows.
        print("Correct predictions: 0/0 (no rows evaluated)")

# ... rest of your code ...

In [None]:
# Swap both normalisation modules of blocks 6 through 11 for pass-through modules.
# NOTE(review): this mutates `model` for the rest of the session and nothing
# here restores it; any cell re-run afterwards sees the modified blocks until
# the model is reloaded.
for layer in range(6, 12):
    block = model.blocks[layer]
    block.ln1 = torch.nn.Identity()
    block.ln2 = torch.nn.Identity()

# Score every prompt with the modified model (no hooks attached this time).
results = []
for idx, row in data.iterrows():
    singular, plural = row['sentence'], row['answer']

    logits = model(singular, prepend_bos=True, return_type="logits")

    # Distribution over the token following the last prompt position.
    probs = logits[0, -1].softmax(dim=-1)
    pred_id = int(torch.argmax(probs))
    prediction = model.to_string(pred_id)
    probability = probs[pred_id].item()

    results.append(dict(
        singular=singular,
        plural=plural,
        prediction=prediction,
        probability=probability,
    ))

# Collect the per-prompt records into a table.
results_df = pd.DataFrame(results)
#results_df.to_csv(f'../mean_ablated_predictions_multiple_layers/predictions_s_plural_XL_layers_{start_layer}_to_{end_layer-1}.csv')

# Tally exact matches between the answer and the prediction minus its first character.
count = 0
for _row_idx, scored in results_df.iterrows():
    count += scored['plural'] == scored['prediction'][1:]
print(f"Correct predictions: {count}/{len(results_df)} ({count/len(results_df)*100:.2f}%)")

# ... rest of your code ...