|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[36] Characteristics of attention adjustment magnitudes</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

from datasets import load_dataset

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer
import torch.nn.functional as F

In [None]:
### matplotlib setup (the dark-mode palette is kept below, disabled)

# render inline figures as SVG (sharper than the default raster output)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# optional dark-mode colors: uncomment these lines to enable them
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right and top axis spines by default
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution (dots per inch) of figures written with plt.savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Model, tokens, projection vectors**

In [None]:
# load GPT2 model and tokenizer
# Both are loaded from the same checkpoint name so that the tokenizer always
# matches the model, also when the model size is changed later.
model_name = 'gpt2-large'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# switch to evaluation mode (turns off dropout); as the last expression in the
# cell, this also displays the model architecture as the cell output
model.eval()

In [None]:
# number of transformer blocks in the model; used below to loop over layers
# and to scale the per-layer colormap values
n_layers = model.config.n_layer

In [None]:
# hook the attention vectors
# `activations` collects whatever the hooks store; later cells read entries
# keyed as f'attn_L{layer}' (for example activations['attn_L4'])
activations = {}

def implant_hook(layer_number):
  # Factory: returns the callback `hook` for the layer index `layer_number`.
  # The (module, input, output) signature matches a PyTorch forward hook.
  def hook(module,input,output):
    # TODO: exercise placeholder -- the hook body has not been filled in yet.
    # Later cells take torch.norm(..., dim=-1) of the stored value and index the
    # result as [sequence, token], so the stored entry is presumably a
    # (batch, tokens, features) tensor -- confirm against the book.

  return hook

# implant the hooks
# the value assigned to `h` in each iteration is collected in `handles`
# (presumably hook handles, so the hooks can be removed later -- confirm)
handles = []
for i in range(n_layers):
  # TODO: exercise placeholder -- the expression after `h[i].` is incomplete;
  # presumably a forward hook created by implant_hook(i) is registered on the
  # attention sublayer of block i
  h = model.transformer.h[i].
  handles.append(h)

In [None]:
# load a dataset
# (the 'test' split of 'cornell-movie-review-data/rotten_tomatoes')
dataset = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='test')
# bare name as the last expression: the notebook shows the dataset summary
dataset

In [None]:
# inspect a single entry (index 30) to see what one dataset item looks like
dataset[30]

In [None]:
# number of sequences in the batch (also used later to loop over sequences)
batchsize = 64

# TODO: exercise placeholder -- the list of texts to tokenize is still empty
# (presumably `batchsize` texts taken from `dataset` -- confirm against the book)
txt = []

# TODO: exercise placeholders -- the next three assignments are incomplete.
# A padding token is presumably needed so that sequences of different lengths
# can be padded into one batch -- confirm.
tokenizer.pad_token =
tokens = tokenizer(txt,)
n_tokens =  # will be helpful later

# create the boolean attention mask
# TODO: exercise placeholder (later cells index the activations with `token_mask`)
token_mask =

# visualize
# 1x3 grid: ax0 spans the first two columns, ax1 takes the last column
fig = plt.figure(figsize=(12,3.5))
gs = GridSpec(1,3,figure=fig)
ax0 = fig.add_subplot(gs[:-1])
ax1 = fig.add_subplot(gs[-1])

# TODO: exercise placeholder -- the pcolor call and its arguments are missing
ax0.pcolor
ax0.set(xlabel='Token position',ylabel='Sequence in batch',title='A) Heatmap of valid token positions')
ax0.spines.top.set_visible(True) # switched off by default, but I want the top spine here

# TODO: exercise placeholder -- the data argument is blank; bins='fd' selects the
# Freedman-Diaconis rule for the bin width
# NOTE(review): if the histogram is over per-sequence lengths, the y-axis counts
# sequences rather than tokens -- confirm the 'Token count' label
ax1.hist(,bins='fd',color='gray',edgecolor='k')
ax1.set(xlabel='Sequence length',ylabel='Token count',title='B) Distribution of sequence lengths')

plt.tight_layout()
plt.savefig('ch6_proj36_part1.png')
plt.show()

In [None]:
# run inside torch.no_grad() so that no gradients are tracked
# TODO: exercise placeholder -- the forward-pass call is missing; presumably the
# model is run on the tokenized batch so that the hooks fill `activations`
with torch.no_grad():
  outputs =

# list the captured entries ({k:>8} right-aligns the key in 8 characters)
# TODO: exercise placeholder -- the f-string stops after 'has shape'; presumably
# the shape of `v` is meant to follow
for k,v in activations.items():
  print(f'{k:>8} has shape')

# **Part 2: Laminar profile of attention adjustment magnitudes**

In [None]:
# on broadcast-masking
# Compare indexing with the tokenizer's attention mask against indexing with
# the boolean `token_mask`.
# NOTE(review): the tokenizer mask presumably holds integer 0/1 values, in which
# case PyTorch treats it as integer indices (selecting entries 0 and 1 along
# the first dimension) instead of as a mask, whereas the boolean mask keeps
# only the True positions -- compare the two printed shapes to confirm.
print(activations['attn_L4'][tokens['attention_mask']].shape)
print(activations['attn_L4'][token_mask].shape)

In [None]:
# look at one slice of the result of indexing with the tokenizer's attention mask
# (four subscripts are used here, so that result has at least four dimensions)
activations['attn_L4'][tokens['attention_mask']][:,1,1,:]

In [None]:
# one row, two panels: A) per-layer distributions, B) per-layer averages
_,axs = plt.subplots(1,2,figsize=(12,3.5))

for layeri in range(n_layers):

  # get the adjustments (ignoring the first token)
  # TODO: exercise placeholders -- the layer key and both index expressions are blank
  att_projs = activations[f'attn_L{}'][][]
  att_projs =  # using numpy here

  # histogram of log-norms
  # TODO: exercise placeholders -- the two numpy function names, axis, bins and
  # density are blank
  norms = np.( np.(att_projs,axis=) )
  # np.histogram returns (values, bin edges); the edges array has one more
  # element than the values array
  y,x = np.histogram(norms,bins=,density=)

  # and visualize
  # each layer gets its own color from the plasma colormap (layeri/n_layers
  # runs from 0 to just below 1)
  # TODO: exercise placeholders -- the data arguments of the three calls are blank
  axs[0].plot(,color=plt.cm.plasma(layeri/n_layers),linewidth=2,label=f'L{layeri}')
  axs[1].errorbar(,color=plt.cm.plasma(layeri/n_layers))
  axs[1].plot(,'kh',markeredgewidth=.5,markersize=12,markerfacecolor=plt.cm.plasma(layeri/n_layers))

# plot adjustments
axs[0].set(xlabel='Log vector norms',ylabel='Density',title='A) Distribution of attention adjustment magnitudes')
axs[1].set(xlabel='Transformer layer',ylabel='Log vector norm (ave)',title='B) Average adjustment magnitude per layer')

plt.tight_layout()
plt.savefig('ch6_proj36_part2.png')
plt.show()

# **Part 3: Adjustment norm by token position**

In [None]:
# reminder for reference
# (shape of the activations stored for one layer, shown as the cell output)
activations['attn_L4'].shape

In [None]:
# example of averaging only the valid tokens
# TODO: exercise placeholders -- the arguments of torch.norm and np.zeros are blank
all_norms = torch.norm()
ave_norms = np.zeros()

# solved with a for-loop (list comprehension solution in the next code block)
# TODO: exercise placeholders -- the range() argument and the three right-hand
# sides are blank (going by the names: a mask for position t, the norms that
# mask selects, and their average)
for t in range():
  this_pos_mask =
  valid_norms =
  ave_norms[t] =

# show both shapes as the cell output
all_norms.shape, ave_norms.shape

In [None]:
plt.figure(figsize=(10,4))

# layers-by-positions matrix: one row per layer, one column per token position
X = np.zeros((n_layers,n_tokens))

# loop over layers
# TODO: exercise placeholder -- the range() argument is blank
for layeri in range():

  # get the norms
  # TODO: exercise placeholder
  norms =

  # average all the norms for each position, only for rows with mask=True
  # TODO: exercise placeholder
  norm_by_pos =
  X[layeri,:] = norm_by_pos

# percentile-based color values
# TODO: exercise placeholder -- the data and the two percentile values are blank
cmin,cmax = np.percentile(,[,])

# and visualize
# origin='lower' draws the first row of X at the bottom; the color scale is
# limited to [cmin,cmax]
plt.imshow(X,origin='lower',vmin=cmin,vmax=cmax,aspect='auto',cmap='magma')

# TODO: no axis labels, title, or colorbar are set in this template

plt.tight_layout()
plt.savefig('ch6_proj36_part3a.png')
plt.show()

In [None]:
plt.figure(figsize=(10,3))
# TODO: exercise placeholder -- the data argument is blank
# 'kh' = black hexagon markers; the face color is an RGBA list (alpha 0.7)
plt.plot(,'kh',markersize=12,markerfacecolor=[.9,.7,.7,.7])

# TODO: no axis labels or title are set in this template

plt.tight_layout()
plt.savefig('ch6_proj36_part3b.png')
plt.show()

# **Part 4: Predicting adjustment norm from token position**

In [None]:
# presumably the number of token positions included in the regression
# (panel A plots against range(1,tokens2use+1)) -- confirm
tokens2use = 30

# the design matrix with two IVs
# TODO: exercise placeholders -- the arguments of np.ones and np.arange are
# blank, and the np.vstack(( call is never closed
# NOTE(review): np.vstack stacks the two regressors as rows, while
# np.linalg.lstsq expects one observation per row, so a transpose is
# presumably needed -- confirm
designMatrix = np.vstack((
    np.ones(),  # intercept
    np.arange()  # token position


# one row, three panels
_,axs = plt.subplots(1,3,figsize=(12,3))

for layeri in range(n_layers):

  # same code as in previous part
  # TODO: exercise placeholders
  norms =
  norm_by_pos =
  # work with log-transformed norms
  norm_by_pos = np.log(norm_by_pos)

  # fit a least-squares model
  # np.linalg.lstsq returns a tuple whose first element is the coefficient vector
  # TODO: exercise placeholders -- the design-matrix argument, the index into
  # norm_by_pos, and the data arguments of the three plot calls are blank
  betas = np.linalg.lstsq(,norm_by_pos[])
  axs[0].plot(range(1,tokens2use+1),,'.-',markersize=9,color=plt.cm.plasma(layeri/n_layers))
  axs[1].plot(,'ks',markerfacecolor=plt.cm.plasma(layeri/n_layers),markersize=10)
  axs[2].plot(,'ko',markerfacecolor=plt.cm.plasma(layeri/n_layers),markersize=10)


# NOTE(review): only the first two panels get labels here; axs[2] has no set() call
axs[0].set(xlabel='Token index',ylabel='Log vector norm',title='A) Attention projection norm')
axs[1].set(xlabel='Transformer layer',ylabel='Intercept ($\\beta_0$)',title='B) Regression intercept')

plt.tight_layout()
plt.savefig('ch6_proj36_part4.png')
plt.show()

# **Part 5: Adjustment norm by previous norms**

In [None]:
# initialize matrix of beta values
# (one row per layer, three coefficients per row)
betas = np.zeros((n_layers,3))

# loop over layers
for layeri in range(n_layers):

  # initialize an empty list to hold the data
  # TODO: exercise placeholder
  data =

  # and get all the norms from this layer
  # (vector norm over the last dimension of the stored activations)
  norms = torch.norm(activations[f'attn_L{layeri}'],dim=-1)

  # loop over all sequences in the batch
  for seqi in range(batchsize):

    # just the norms for valid tokens (excluding the first)
    # TODO: exercise placeholder
    validtokens =
    normseq = norms[seqi,validtokens]

    # create each row in the dataset
    # starting at i=2 means every row has two earlier positions available
    # TODO: exercise placeholder -- append has no call or argument yet
    # (presumably the norms at positions i-2, i-1 and i -- confirm)
    for i in range(2,len(normseq)):
      data.append

  # and stack them into an array
  # TODO: exercise placeholders -- the np.log( and np.hstack(( calls are left open
  data = np.log(

  designMat = np.hstack((
  # the last column of `data` is the dependent variable
  y = data[:,-1]

  # least-squares fit; [0] takes the coefficient vector from the returned tuple
  # TODO: exercise placeholder -- both lstsq arguments are blank
  betas[layeri,:] = np.linalg.lstsq(,)[0]

In [None]:
# `data` is reassigned for every layer in the loop, so this is the shape of
# the array built for the last layer
data.shape

In [None]:
plt.figure(figsize=(10,4))

# dashed horizontal line at zero as a reference for the sign of the coefficients
plt.axhline(0,linestyle='--',color='k',linewidth=.5)
# TODO: exercise placeholders -- the data arguments of both plot calls are blank
# (presumably the columns of `betas` that belong to the t-2 and t-1 regressors)
plt.plot(,'gs-',linewidth=.5,markerfacecolor=[.7,.9,.7,.7],markersize=12,label='t-2')
plt.plot(,'bo-',linewidth=.5,markerfacecolor=[.7,.7,.9,.7],markersize=12,label='t-1')

plt.gca().set(xlabel='Transformer layer',ylabel='$\\beta$ coefficient',title='Impact of previous attention adjustment')
plt.legend()

plt.tight_layout()
plt.savefig('ch6_proj36_part5.png')
plt.show()