|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[41] Patching heads in IOI</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

from tqdm import tqdm

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

In [None]:
### Run this cell only if you're using "dark mode"
# (the dark colour overrides below are commented out; as written, the cell only
#  switches inline figures to SVG and adjusts spines, font weights and save DPI)

# render inline figures as SVG (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# optional dark-theme colours -- uncomment to enable
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right/top axis spines by default
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used by plt.savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: The IOI task** (from Project 32)

In [None]:
# Note: I didn't remove any code for Part 1. Just run the cells and enjoy :)

In [None]:
# use the GPU when torch can see one; otherwise stay on the CPU
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'

# pretrained GPT-2 (large) tokenizer and language model; the model is moved to `device`
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
model = model.to(device)

# switch to evaluation mode (last expression, so the cell also displays the model)
model.eval()

In [None]:
# The two prompts differ only in which name is repeated as the subject.
text_ME = 'When Mike and Emma went to the cafe, Mike gave a coffee to'
text_EM = 'When Mike and Emma went to the cafe, Emma gave a coffee to'

# Token ids of the two candidate completions (note the leading space).
target_M = tokenizer.encode(' Mike')[0]
target_E = tokenizer.encode(' Emma')[0]

# The [0] above would silently keep only the first sub-token of a multi-token
# name, so confirm that each name really is a single token.
assert tokenizer.encode(' Mike')==[target_M], '" Mike" must encode to exactly one token'
assert tokenizer.encode(' Emma')==[target_E], '" Emma" must encode to exactly one token'

# Tokenized prompts as (1, ntokens) tensors on the model's device.
tokensME = tokenizer.encode(text_ME,return_tensors='pt').to(device)
tokensEM = tokenizer.encode(text_EM,return_tensors='pt').to(device)

# Later cells patch ME activations into the EM run position-by-position and
# reuse one (nbatches,ntokens) pair for both prompts, so the shapes must agree.
assert tokensME.shape==tokensEM.shape, 'both prompts must tokenize to the same length'

nbatches,ntokens = tokensME.shape

In [None]:
# clean (unmodified) forward passes for both prompts; gradients are not needed
with torch.no_grad():
  outME,outEM = model(tokensME),model(tokensEM)

In [None]:
# most likely next token at the final position of each prompt
nextword_ME = outME.logits[0,-1,:].argmax()
nextword_EM = outEM.logits[0,-1,:].argmax()

# show each prompt followed by its predicted continuation
for sentence,nextword in zip((text_ME,text_EM),(nextword_ME,nextword_EM)):
  print(f'{sentence} "{tokenizer.decode(nextword)}"')

In [None]:
# log-softmax over the vocabulary at the final position (moved to the CPU for plotting)
logsm_ME = outME.logits[0,-1,:].detach().cpu().log_softmax(dim=0)
logsm_EM = outEM.logits[0,-1,:].detach().cpu().log_softmax(dim=0)


# layout: two wide panels (one per prompt) and one narrow comparison panel
fig = plt.figure(figsize=(12,3))
gs = GridSpec(1,5,figure=fig)
ax1 = fig.add_subplot(gs[:2])
ax2 = fig.add_subplot(gs[2:4])
ax3 = fig.add_subplot(gs[-1])

# same recipe for both prompts: "EM" in the left panel, "ME" in the middle one
for ax,logsm,text in ((ax1,logsm_EM,text_EM),(ax2,logsm_ME,text_ME)):
  ax.plot(target_M,logsm[target_M],'go',label='"Mike"')   # highlight the two names...
  ax.plot(target_E,logsm[target_E],'rs',label='"Emma"')
  ax.plot(logsm,'k.',alpha=.2)                            # ...then the whole vocabulary
  ax.legend(fontsize=8)
  ax.set(xlabel='Vocab index',ylabel='Log softmax',
         title=text[-21:]+'...',xlim=[-100,tokenizer.vocab_size+100])

# token-by-token comparison of the two prompts
ax3.plot(logsm_ME,logsm_EM,'k.',alpha=.3)
ax3.set(xlabel='ME log-sm logits',ylabel='EM log-sm logits',title='ME vs. EM')

# this figure is a replication from project 32 and is not shown in the book
plt.tight_layout()
plt.show()

In [None]:
# IOI score = final-position logit for " Mike" minus the logit for " Emma"
IOI_score_ME = outME.logits[0,-1][target_M] - outME.logits[0,-1][target_E]
IOI_score_EM = outEM.logits[0,-1][target_M] - outEM.logits[0,-1][target_E]

# report both clean scores
for label,score in (('ME',IOI_score_ME),('EM',IOI_score_EM)):
  print(f'IOI score for text "{label}": {score:6.3f}')

In [None]:
# model dimensions read from the config: heads per layer, number of layers, embedding width
n_heads,n_layers,n_embd = (getattr(model.config,key) for key in ('n_head','n_layer','n_embd'))

# width of a single attention head
head_dim = n_embd // n_heads

# **Part 2: Impact of single-head permutation on IOI**

In [None]:
# Small demo of the shuffle used to "silence" a head: same values, same shape,
# but every element is moved to a random position.

# 1) create a matrix
Ho = torch.arange(12).reshape(3,4)

# 2) vectorize
H = Ho.flatten()

# 3) random permute
H = H[torch.randperm(len(H))]

# 4) reshape back
H = H.view(Ho.shape)

# print
print('Original:\n',Ho)
print('\nRandomized:\n',H)

In [None]:
# can also do it in one line: vectorize -> index with a random permutation -> restore the shape
H = Ho.reshape(-1)[torch.randperm(Ho.numel())].reshape(Ho.shape)
H

In [None]:
# For every (layer, head): shuffle that head's activations going into c_proj
# during a forward pass of the "EM" prompt, then record the resulting IOI score
# and the norm of the head's activation vector.
# NOTE: `outEM` is overwritten on every iteration, so after this cell it no
# longer holds the clean model output.

# initializations (heads x layers)
IOI_scores = np.zeros((n_heads,n_layers))
headnorms = np.zeros((n_heads,n_layers))

# loop over layers
for layeri in tqdm(range(n_layers)):
  for headi in range(n_heads):


    ### --- silence this head --- ###
    # pre-hook on c_proj: receives the tuple of inputs and returns the modified tuple
    def hook(module,input):
      head_tensor = input[0].view(nbatches,ntokens,n_heads,head_dim) # reshape to index one head
      H = head_tensor[:,:,headi,:].flatten()                         # isolate and vectorize one head
      H = H[torch.randperm(len(H),device=H.device)]                  # randomly permute the elements
      head_tensor[:,:,headi,:] = H.view(nbatches,ntokens,head_dim)   # reshape and replace
      head_tensor = head_tensor.view(nbatches,ntokens,n_embd)        # reshape back to tensor
      input = (head_tensor,*input[1:])                               # repackage the tuple
      headnorms[headi,layeri] = np.linalg.norm(H.cpu().numpy())      # norm of head in numpy
      return input

    # implant the hook
    handle = model.transformer.h[layeri].attn.c_proj.register_forward_pre_hook(hook)


    ### --- forward pass --- ###
    try:
      with torch.no_grad():
        outEM = model(tokensEM)

    # remove the hook (also when the forward pass fails, so it cannot leak into later runs)
    finally:
      handle.remove()


    ### --- calculate IOI score --- ###
    # .item() gives a Python float that can be stored in the NumPy array
    IOI_scores[headi,layeri] = (outEM.logits[0,-1,target_M] - outEM.logits[0,-1,target_E]).item()

In [None]:
# Three-panel summary of the head-silencing experiment:
#   A) change in IOI score for every (head, layer)
#   B) mean +/- std of that change across heads, per layer
#   C) head activation norm vs. change in IOI score
fig,axs = plt.subplots(1,3,figsize=(12,3))

# image data to show: silenced-minus-clean IOI score (clean score as a Python float)
I = IOI_scores - IOI_score_EM.item()

# find color limits based on L1-mean (mean absolute change), symmetric around zero
clim = np.abs(I).mean()

# create the image
h = axs[0].pcolor(range(n_layers),range(n_heads),I,vmin=-clim,vmax=clim,cmap='RdBu_r')
axs[0].set(xlabel='Transformer layer',ylabel='Attention head index',yticks=range(0,n_heads,2),
           title='A) $\\Delta$ IOI (silenced - clean)')
c = fig.colorbar(h,ax=axs[0],pad=.02)
c.ax.tick_params(labelsize=8)
axs[0].spines.top.set_visible(True)
axs[0].spines.right.set_visible(True)

# create the error bar plot (statistics are taken across heads, i.e. axis 0)
Imean = I.mean(axis=0)
Istd = I.std(axis=0)
axs[1].errorbar(range(n_layers),Imean,Istd,color='k',linestyle='none',capsize=2,zorder=-1)
axs[1].plot(range(n_layers),Imean,'ks',markerfacecolor=[.7,.9,.7],markersize=6)
axs[1].axhline(0,linestyle='--',color='gray',linewidth=.8,zorder=-2)
axs[1].set(xlabel='Transformer layer',ylabel='Average $\\Delta$ IOI',title='B) Head-average results')

# one dot per (head, layer), coloured by the same quantity shown on the y axis
axs[2].scatter(headnorms.flatten(),I.flatten(),30,c=I.flatten(),edgecolor='k',
               linewidth=.4,cmap='RdBu_r',vmin=-clim,vmax=clim,alpha=.7)
axs[2].set(xlabel='Head norm',ylabel='$\\Delta$IOI',title='C) Head norm vs. IOI impact')

plt.tight_layout()
plt.savefig('ch6_proj41_part2.png')
plt.show()

# **Part 3: Get all ME-related attention activations**

In [None]:
# Store, for every layer, the tensor that enters the attention output
# projection (c_proj) during a clean pass of the "ME" prompt, reshaped so that
# each head can be indexed separately.
MEattn = {}

def implant_hook(layer_number):
  """Return a forward pre-hook that saves the c_proj input of one layer.

  The saved tensor is viewed as (nbatches, ntokens, n_heads, head_dim),
  detached, and stored in `MEattn` under the key 'L<layer_number>'.
  The hook returns None, so the forward pass itself is left unchanged.
  """
  def hook(module,input):
    MEattn[f'L{layer_number}'] = input[0].view(nbatches,ntokens,n_heads,head_dim).detach()
  return hook


# implant the hooks (one per transformer block)
handles = []
for i in range(n_layers):
  h = model.transformer.h[i].attn.c_proj.register_forward_pre_hook(implant_hook(i))
  handles.append(h)

# run the clean model to get all the activations
with torch.no_grad():
  outME = model(tokensME)

# remove the hooks to avoid risk of overwriting in the experiment
for h in handles:
  h.remove()

In [None]:
# list every stored layer key together with the shape of its activation tensor
for v,p in MEattn.items():
  print(f'{v}: {list(p.shape)}')

In [None]:
# some visualizations: one heatmap per token of the layer-4 head activations
_,axs = plt.subplots(1,ntokens,figsize=(14,3))
for i in range(ntokens):
  # (n_heads, head_dim) for this token, transposed so heads run along x and dimensions along y
  I = MEattn['L4'][0,i,:,:].T.cpu().numpy()
  # percentile-based color limits so a few extreme values do not wash out the image
  cmin,cmax = np.percentile(I,[2,98])
  axs[i].imshow(I,origin='lower',aspect='auto',cmap='plasma',vmin=cmin,vmax=cmax)
  axs[i].set(title=f'"{tokenizer.decode(tokensME[0,i])}"',xticklabels=[],yticklabels=[])

axs[0].set(xlabel='Head',ylabel='Dimension')
plt.suptitle('Head activation heatmaps from layer 4',fontsize=16,fontweight='bold')

plt.tight_layout()
plt.savefig('ch6_proj41_part3a.png')
plt.show()

In [None]:
# layers x tokens matrix of head-averaged activation norms
# (note: this reuses the name `headnorms` from Part 2 for a different quantity)
headnorms = torch.zeros((n_layers,ntokens))

# average of attention head vector norms per token and layer
for layeri in range(n_layers):
  # [0] drops the batch axis -> (ntokens, n_heads, head_dim);
  # norm over head_dim, then mean over heads -> (ntokens,)
  headnorms[layeri,:] = MEattn[f'L{layeri}'][0,:,:,:].norm(dim=-1).mean(dim=-1).cpu()

# visualize
plt.figure(figsize=(11,4))
cmin,cmax = np.percentile(headnorms.numpy(),[2,98])
plt.imshow(headnorms.numpy(),origin='lower',aspect='auto',cmap='plasma',vmin=cmin,vmax=cmax)
plt.colorbar(pad=.01)
plt.gca().set(xticks=range(ntokens),xticklabels=[tokenizer.decode(t) for t in tokensME[0]],
              ylabel='Transformer layer',title='Average attention head norms')

plt.tight_layout()
plt.savefig('ch6_proj41_part3b.png')
plt.show()

# **Part 4: An interesting interlude**

In [None]:
# Compare the head activations at the two " Mike" tokens of the ME prompt,
# showing every `layerskip`-th layer.

# positions of the Mikes
mike_idx = torch.where(tokensME[0]==target_M)[0]
assert len(mike_idx)==2, 'expected exactly two " Mike" tokens in the ME prompt'

layerskip = 3
_,axs = plt.subplots(1,n_layers//layerskip,figsize=(12,3))

# visit exactly as many layers as there are panels
for layeri in range(0,layerskip*len(axs),layerskip):

  # extract the Mikes (each is n_heads x head_dim)
  Mike1 = MEattn[f'L{layeri}'][0,mike_idx[0],:,:].cpu()
  Mike2 = MEattn[f'L{layeri}'][0,mike_idx[1],:,:].cpu()
  MikeDiff = Mike1 - Mike2

  # visualize their difference (transposed: heads along x, dimensions along y),
  # with color limits symmetric around zero
  clim = MikeDiff.abs().max().item()
  axs[layeri//layerskip].imshow(MikeDiff.T.numpy(),origin='lower',aspect='auto',
                                cmap='RdBu_r',vmin=-clim,vmax=clim)
  axs[layeri//layerskip].set(title=f'L{layeri}',xticks=[],yticks=[])
  axs[layeri//layerskip].spines.top.set_visible(True)
  axs[layeri//layerskip].spines.right.set_visible(True)

axs[0].set(xlabel='Head',ylabel='Dimension')
plt.suptitle('The difference of the Mikes',fontweight='bold',fontsize=14)

plt.tight_layout()
plt.savefig('ch6_proj41_part4a.png')
plt.show()

In [None]:
# Per layer: how unevenly is the difference between the two " Mike" tokens
# spread across heads? (uses `mike_idx` from the previous cell)

# initialize
meanratio = np.zeros(n_layers)

for layeri in range(n_layers):

  # extract the Mikes (each is n_heads x head_dim)
  Mike1 = MEattn[f'L{layeri}'][0,mike_idx[0],:,:].cpu()
  Mike2 = MEattn[f'L{layeri}'][0,mike_idx[1],:,:].cpu()
  MikeDiff = Mike1 - Mike2

  # L1 mean: average absolute difference per head, sorted in ascending order
  means = MikeDiff.abs().mean(dim=1).sort().values

  # ratio of max2 to min2 (averaging 2 to boost SNR)
  meanratio[layeri] = (means[-2:].mean() / means[:2].mean()).item()

# and plot
plt.figure(figsize=(10,3))
plt.plot(meanratio,'kh',markerfacecolor=[.7,.9,.9],markersize=12)
plt.gca().set(xlabel='Layer',ylabel='Head selectivity ratio')

plt.tight_layout()
plt.savefig('ch6_proj41_part4b.png')
plt.show()

# **Part 5: Head-specific IOI patching**

In [None]:
# For every (layer, head): run the "EM" prompt while replacing that one head's
# c_proj input with the activation stored from the clean "ME" run, then record
# the resulting IOI score.

# initializations
IOI_scores = np.zeros((n_heads,n_layers))

# loop over layers
for layeri in tqdm(range(n_layers)):
  for headi in range(n_heads):


    ### --- patch this head --- ###
    def hook(module,input):
      # reshape, patch one head, reshape back to tensor
      head_tensor = input[0].view(nbatches,ntokens,n_heads,head_dim) # reshape to index one head
      head_tensor[:,:,headi,:] = MEattn[f'L{layeri}'][:,:,headi,:]   # patch this head from ME
      head_tensor = head_tensor.view(nbatches,ntokens,n_embd)        # reshape back to tensor
      input = (head_tensor,*input[1:])                               # repackage the tuple
      return input

    # implant the hook
    handle = model.transformer.h[layeri].attn.c_proj.register_forward_pre_hook(hook)


    ### --- forward pass --- ###
    try:
      with torch.no_grad():
        outEM = model(tokensEM)

    # remove the hook (also when the forward pass fails, so it cannot leak into later runs)
    finally:
      handle.remove()


    ### --- calculate IOI score --- ###
    # .item() gives a Python float that can be stored in the NumPy array
    IOI_scores[headi,layeri] = (outEM.logits[0,-1,target_M] - outEM.logits[0,-1,target_E]).item()

In [None]:
# Three-panel summary of the head-patching experiment:
#   A) change in IOI score for every (head, layer)
#   B) mean +/- std of that change across heads, per layer
#   C) across-head standard deviation per layer
fig,axs = plt.subplots(1,3,figsize=(12,3))

# image data to show: patched-minus-clean IOI score (clean score as a Python float)
I = IOI_scores - IOI_score_EM.item()

# find color limits based on L1-mean (mean absolute change), symmetric around zero
clim = np.abs(I).mean()

# create the image
h = axs[0].pcolor(range(n_layers),range(n_heads),I,vmin=-clim,vmax=clim,cmap='RdBu_r')
axs[0].set(xlabel='Transformer layer',ylabel='Attention head index',yticks=range(0,n_heads,2),
           title='A) $\\Delta$ IOI (patched - clean)')
c = fig.colorbar(h,ax=axs[0],pad=.02)
c.ax.tick_params(labelsize=8)
axs[0].spines.top.set_visible(True)
axs[0].spines.right.set_visible(True)

# statistics across heads (axis 0), one value per layer
Imean = I.mean(axis=0)
Istd = I.std(axis=0)
axs[1].errorbar(range(n_layers),Imean,Istd,color='k',linestyle='none',capsize=2,zorder=-1)
axs[1].plot(range(n_layers),Imean,'ks',markerfacecolor=[.7,.9,.7],markersize=6)
axs[1].axhline(0,linestyle='--',color='gray',linewidth=.8,zorder=-2)
axs[1].set(xlabel='Transformer layer',ylabel='Average $\\Delta$ IOI',title='B) Head-average results')

axs[2].plot(range(n_layers),Istd,'ko-',markerfacecolor=[.9,.7,.7],markersize=6)
axs[2].set(xlabel='Transformer layer',ylabel='Stdev across $\\Delta$ IOI',title='C) Head-variability results')

plt.tight_layout()
plt.savefig('ch6_proj41_part5.png')
plt.show()