In [1]:
import torch
import numpy
import pandas as pd
import os
import random
import transformer_lens.utils as utils
from transformer_lens import ActivationCache, HookedTransformer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
model = HookedTransformer.from_pretrained(
    "gpt2-XL",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True
)


Loaded pretrained model gpt2-XL into HookedTransformer


In [3]:
model

HookedTransformer(
  (embed): Embed()
  (hook_embed): HookPoint()
  (pos_embed): PosEmbed()
  (hook_pos_embed): HookPoint()
  (blocks): ModuleList(
    (0-47): 48 x TransformerBlock(
      (ln1): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (ln2): LayerNormPre(
        (hook_scale): HookPoint()
        (hook_normalized): HookPoint()
      )
      (attn): Attention(
        (hook_k): HookPoint()
        (hook_q): HookPoint()
        (hook_v): HookPoint()
        (hook_z): HookPoint()
        (hook_attn_scores): HookPoint()
        (hook_pattern): HookPoint()
        (hook_result): HookPoint()
      )
      (mlp): MLP(
        (hook_pre): HookPoint()
        (hook_post): HookPoint()
      )
      (hook_attn_in): HookPoint()
      (hook_q_input): HookPoint()
      (hook_k_input): HookPoint()
      (hook_v_input): HookPoint()
      (hook_mlp_in): HookPoint()
      (hook_attn_out): HookPoint()
      (hook_mlp_out): HookPoint()
      (h

In [4]:
blocks = model.blocks
blocks[0]

TransformerBlock(
  (ln1): LayerNormPre(
    (hook_scale): HookPoint()
    (hook_normalized): HookPoint()
  )
  (ln2): LayerNormPre(
    (hook_scale): HookPoint()
    (hook_normalized): HookPoint()
  )
  (attn): Attention(
    (hook_k): HookPoint()
    (hook_q): HookPoint()
    (hook_v): HookPoint()
    (hook_z): HookPoint()
    (hook_attn_scores): HookPoint()
    (hook_pattern): HookPoint()
    (hook_result): HookPoint()
  )
  (mlp): MLP(
    (hook_pre): HookPoint()
    (hook_post): HookPoint()
  )
  (hook_attn_in): HookPoint()
  (hook_q_input): HookPoint()
  (hook_k_input): HookPoint()
  (hook_v_input): HookPoint()
  (hook_mlp_in): HookPoint()
  (hook_attn_out): HookPoint()
  (hook_mlp_out): HookPoint()
  (hook_resid_pre): HookPoint()
  (hook_resid_mid): HookPoint()
  (hook_resid_post): HookPoint()
)

In [5]:
blocks[0].attn.hook_k

HookPoint()

In [6]:
data_path = '../dataset_csvs_singular_plural/predictions/correct_preds_xl_s_plural.csv'

In [7]:
data = pd.read_csv(data_path)
data.shape

(80, 4)

In [8]:
data.head(3)

Unnamed: 0,sentence,snswer,predictions,probabilities
0,The plural of cat is,cats,cats,0.526731
1,The plural of dog is,dogs,dogs,0.599398
2,The plural of book is,books,books,0.422725


In [9]:
# Initialize a list to store hook_attn_scores values
# for all datapoints , for all layers
hook_attn_scores = []
predictions = []
probabilities = []
k = 1

# Define a function to store hook_attn_scores values
def store_hook_attn_scores(module, input, output):
    hook_attn_scores.append(output)

# Register hooks for layers 6 to 11
for layer in range(0, 26):
    #blocks[layer].hook_attn_out.register_forward_hook(store_hook_attn_scores)
    blocks[layer].attn.hook_pattern.register_forward_hook(store_hook_attn_scores)

for data_i, row in data.iterrows():
    print(data_i)
    example_prompt = row['sentence']
    example_answer = row['snswer']
    tokens = model.to_tokens(example_prompt, prepend_bos=True)
    logits, cache = model.run_with_cache(tokens) 
    year_probs = torch.softmax(logits[:, -1, :], dim=-1)
    topk = torch.topk(year_probs, k=k)
    topk_tokens = [[model.tokenizer.decode(top) for top in ex] for ex in topk.indices]
    probs_array = topk.values.cpu().detach().numpy()[0].tolist()
    predictions.extend(topk_tokens[0])
    probabilities.extend(probs_array)
    words = ['bos']
    w = example_prompt.split(' ')
    words.extend(w)
    for layer_i, layer in enumerate(hook_attn_scores):
        for head_i, head in enumerate(layer[0]):
            np.save('../attention_arrays/s_plural_datapoint_' + str(data_i) + '_' + str(layer_i) + '_' + str(head_i) + '.npy', head.detach().cpu().numpy())
            
    
    hook_attn_scores.clear()


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79


In [9]:
# Initialize a list to store hook_attn_scores values
hook_attn_scores = []
predictions = []
probabilities = []
k = 1

# Define a function to store hook_attn_scores values
def store_hook_attn_scores(module, input, output):
    hook_attn_scores.append(output)

# Register hooks for layers 6 to 11
for layer in range(6, 12):
    #blocks[layer].hook_attn_out.register_forward_hook(store_hook_attn_scores)
    blocks[layer].attn.hook_pattern.register_forward_hook(store_hook_attn_scores)

for data_i, row in data.iterrows():
    print(data_i)
    example_prompt = row['sentence']
    example_answer = row['snswer']
    tokens = model.to_tokens(example_prompt, prepend_bos=True)
    logits, cache = model.run_with_cache(tokens) 
    year_probs = torch.softmax(logits[:, -1, :], dim=-1)
    topk = torch.topk(year_probs, k=k)
    topk_tokens = [[model.tokenizer.decode(top) for top in ex] for ex in topk.indices]
    probs_array = topk.values.cpu().detach().numpy()[0].tolist()
    predictions.extend(topk_tokens[0])
    probabilities.extend(probs_array)
    print(probabilities)
    words = ['bos']
    w = example_prompt.split(' ')
    words.extend(w)
    for layer_i, layer in enumerate(hook_attn_scores):
        for head_i, head in enumerate(layer[0]):
            plt.figure(figsize=(10, 5))
            plt.title(f'Attention Map for Layer {layer_i} head {head_i}')

            # Create the heatmap
            heatmap = plt.imshow(head.detach().cpu().numpy(), aspect='auto', cmap='viridis')
            plt.colorbar()

            # Add text annotations for the attention values
            for (i, j), val in np.ndenumerate(head.detach().cpu().numpy()):
                plt.text(j, i, f'{val:.2f}', ha='center', va='center', color='white')

            plt.xlabel('Tokens')
            plt.xticks(ticks=range(len(words)), 
                    labels=words)
            plt.ylabel('Tokens')
            plt.yticks(ticks=range(len(words)), 
                    labels=words)

            #plt.savefig('../feature_maps_analysis_plots/s_plural_gap_6_datapoint_' + str(data_i) + '_' + str(layer_i) + '_' + str(head_i) + '.png')
            #np.save('../feature_maps_analysis_arrays/s_plural_gap_6_datapoint_' + str(data_i) + '_' + str(layer_i) + '_' + str(head_i) + '.npy', head.detach().cpu().numpy())
            
            plt.close()
    break
    hook_attn_scores.clear()


0
[0.5267309546470642]


In [14]:
import numpy as np

# Example attention values for each token (from a specific head and layer)
attention_values = np.array([0.11, 0.15, 0.12, 0.1, 0.15, 0.1, 0, 0, 0])


# Example output confidence values (logit or probability for the correct plural form)
output_value = 0.8
output_values = np.full(attention_values.shape, output_value)

# Calculate the correlation coefficient
correlation_coefficient = np.corrcoef(attention_values, output_values)[0, 1]

print(f"Correlation coefficient: {correlation_coefficient}")


Correlation coefficient: nan


In [16]:
import numpy as np

# Example attention values for each token (from a specific head and layer)
attention_values = np.array([0.11, 0.15, 0.12, 0.1, 0.15, 0.1, 0, 0, 0])

# Remove all 0s from attention_values
attention_values = attention_values[attention_values != 0]

# Example output confidence values (logit or probability for the correct plural form)
output_value = 0.8
output_values = np.full(attention_values.shape, output_value)

# Calculate the correlation coefficient
correlation_coefficient = np.corrcoef(attention_values, output_values)[0, 1]

print(f"Correlation coefficient: {correlation_coefficient}")

Correlation coefficient: 3.2811686960750074e-16


In [None]:
len(hook_attn_scores)


In [None]:
hook_attn_scores[0].shape

In [None]:
a = hook_attn_scores[6][0][3]
plt.figure(figsize=(10, 5))
plt.title(f'Attention Map for Layer 1 head 1')

# Create the heatmap
heatmap = plt.imshow(a.detach().cpu().numpy(), aspect='auto', cmap='viridis')
plt.colorbar()

# Add text annotations for the attention values
for (i, j), val in np.ndenumerate(a.detach().cpu().numpy()):
    plt.text(j, i, f'{val:.2f}', ha='center', va='center', color='white')

plt.xlabel('Tokens')
plt.xticks(ticks=range(len(['bos', 'the', 'plural', 'of', 'house', 'is'])), 
           labels=['bos', 'the', 'plural', 'of', 'house', 'is'])
plt.ylabel('Tokens')
plt.yticks(ticks=range(len(['bos', 'the', 'plural', 'of', 'house', 'is'])), 
           labels=['bos', 'the', 'plural', 'of', 'house', 'is'])

plt.show()

In [1]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl')
model = GPT2Model.from_pretrained('gpt2-xl')

# Tokenize words and ensure they are in the format expected by the model
tokens_house = tokenizer('mouse', return_tensors='pt')
tokens_houses = tokenizer('mice', return_tensors='pt')

# Extract embeddings
with torch.no_grad():
    outputs_house = model(tokens_house['input_ids'])
    outputs_houses = model(tokens_houses['input_ids'])

    # Assuming using only the last hidden state for embedding
    emb_house = outputs_house.last_hidden_state.squeeze(0)  # Remove batch dimension
    emb_houses = outputs_houses.last_hidden_state.squeeze(0)  # Remove batch dimension

    # Mean over all token positions to get a single vector per input
    emb_house_mean = torch.mean(emb_house, dim=0)
    emb_houses_mean = torch.mean(emb_houses, dim=0)

# Calculate Euclidean distance
euclidean_distance = torch.norm(emb_house_mean - emb_houses_mean, p=2).item()

# Calculate cosine similarity
cosine_similarity = torch.nn.functional.cosine_similarity(emb_house_mean.unsqueeze(0), emb_houses_mean.unsqueeze(0), dim=1).item()

print(f"Euclidean Distance: {euclidean_distance}")
print(f"Cosine Similarity: {cosine_similarity}")


Euclidean Distance: 21.64276123046875
Cosine Similarity: 0.5845746994018555


In [2]:
0.21 + 0.45 + 0.03 + 0.04 + 0.02

0.7500000000000001

In [8]:
a = torch.tensor([0.52673])
b = torch.tensor([0.75])
euclidean_distance = torch.norm(b - a, p=2).item()
cosine_similarity = torch.nn.functional.cosine_similarity(b, a, dim=0).item()

print(f"Euclidean Distance: {euclidean_distance}")
print(f"Cosine Similarity: {cosine_similarity}")


Euclidean Distance: 0.2232699990272522
Cosine Similarity: 1.0
