|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[35] Raw and softmax attention scores</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer
import torch.nn.functional as F

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# render inline figures as svg (higher-res than the default output)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: uncomment this call to enable them
# plt.rcParams.update({
#     'figure.facecolor': '#282a2c',
#     'figure.edgecolor': '#282a2c',
#     'axes.facecolor':   '#282a2c',
#     'axes.edgecolor':   '#DDE2F4',
#     'axes.labelcolor':  '#DDE2F4',
#     'xtick.color':      '#DDE2F4',
#     'ytick.color':      '#DDE2F4',
#     'text.color':       '#DDE2F4',
# })

# open-box axes: hide the top and right spines by default
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when figures are saved to disk
plt.rcParams['savefig.dpi'] = 300

# **Part 1: The reason to scale by sqrt(d_h)**

In [None]:
# For each matrix size, build a matrix product and a scaled copy of it, and
# plot one value per size for each (the y-axis is later labelled
# 'Matrix standard deviation').
# NOTE(review): several expressions below are left blank (presumably exercise
# placeholders to be completed from the book); the cell does not run as written.
matsizes = np.arange(5,101) # sizes 5 through 100

plt.figure(figsize=(10,4))

for n in matsizes:

  # create the matrices
  M1 =  @ # matrix product; both operands are left blank
  M2 = M1 / # scaled copy of M1; the divisor is left blank

  # plot
  # one marker per size: hexagon first, then square (y-values left blank)
  plt.plot(n,,'kh',markerfacecolor=[.9,.7,.7,.5],markersize=10)
  plt.plot(n,,'ks',markerfacecolor=[.7,.9,.7,.5],markersize=10)


# theoretical curve in red, drawn after the loop so it is the last line on the axes
# NOTE(review): the y-expression inside the parentheses is left blank
# (presumably an exercise placeholder to be completed from the book).
plt.plot(matsizes,(),'r')
plt.gca().set(xlabel='Matrix size',ylabel='Matrix standard deviation')

# legend(labels) on its own pairs the labels with the first artists in drawing
# order, which would attach 'Theory' to a marker drawn inside the loop rather
# than to the red curve. Pass the handles explicitly instead: the first two
# lines drawn by the loop (hexagon, square) and the last line (theory curve).
drawn = plt.gca().lines
plt.legend([drawn[0],drawn[1],drawn[-1]],
           ['No scaling','$\\sqrt{N}$ scaling','Theory'])

plt.tight_layout()
plt.savefig('ch6_proj35_part1.png')
plt.show()

# **Part 2: Model, tokens, QKV activations**

In [None]:
# load GPT2 model and tokenizer
# the tokenizer comes from the 'gpt2' checkpoint, the weights from 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2-medium')

# switch to evaluation mode; kept as the cell's final expression
model.eval()

In [None]:
# hook the attention vectors
# NOTE(review): the module path in the register_forward_hook line is left blank
# (presumably an exercise placeholder); the cell does not run as written.

# storage for captured outputs, keyed by the name built below
activations = {}

def hook(module,input,output):
  # keep the hooked module's output, detached from the autograd graph;
  # keyName is looked up in the global scope when the hook fires
  activations[keyName] = output.detach()

# implant the hooks
whichlayer = 6 # layer index used to build the dictionary key
keyName = f'attn_{whichlayer}'
# the returned handle is kept so the hook can be removed later
hookhandle = model.transformer....register_forward_hook(hook)

In [None]:
# https://en.wikipedia.org/wiki/Purple
# sample passage that gets tokenized for the model
txt = "Purple is a color similar in appearance to violet light. In the RYB color model historically used in the arts, purple is a secondary color created by combining red and blue pigments. In the CMYK color model used in modern printing, purple is made by combining magenta pigment with either cyan pigment, black pigment, or both. In the RGB color model used in computer and television screens, purple is created by mixing red and blue light in order to create colors that appear similar to violet light. According to color theory, purple is considered a cool color."

# tokenize
# return_tensors='pt' asks for a PyTorch tensor rather than a Python list
tokens = tokenizer.encode(txt,return_tensors='pt')
# NOTE(review): value left blank (presumably the number of tokens; exercise placeholder)
ntokens =

# run through the model
# gradients are not tracked inside this context
with torch.no_grad():
  # NOTE(review): incomplete - as written this line only names `model` and does not call it
  model

In [None]:
# checking sizes
# first line: keys captured by the hook; second line: size of the stored tensor
print(activations.keys(),activations[keyName].shape,sep='\n')

# **Part 3: Split into heads**

In [None]:
# some helpful variables
# NOTE(review): every right-hand side below is incomplete (presumably exercise
# placeholders); the attribute names after `model.config.` are not given here,
# so the cell does not run as written.
n_layers = model.config.
n_emb = model.config.
n_heads = model.config.
# integer division of one config value by another (both operands left blank)
head_dim = model.config. //
# presumably the square root of the head dimension - confirm against the book
sqrtD =

print(f'There are {n_heads} heads, each with {head_dim} dimensions.')

In [None]:
# first, separate the Q,K,V matrices
# NOTE(review): the torch.split call is unfinished (presumably an exercise
# placeholder), so this cell does not parse as written.
# The three returned pieces are unpacked as Q, K and V; the second line echoes Q's size.
Q,K,V = torch.split(
Q.shape

In [None]:
# now split into heads
# NOTE(review): the chunk-size and dim arguments are left blank (presumably
# exercise placeholders); the cell does not parse as written.
# Q_h and K_h are sequences with one entry per head (see len() and indexing below).
Q_h = torch.split(Q,,dim=)
K_h = torch.split(K,,dim=)

print(f'There are {len(Q_h)} heads')
# the entry at index 2 is used as a representative example
print(f'Each head has size {Q_h[2].shape}')

In [None]:
# visualize
# 4x4 grid of heatmaps, one panel per index i (labelled Qh0 through Qh15)
# NOTE(review): the data argument to pcolor is left blank (presumably an
# exercise placeholder); the cell does not parse as written.
_,axs = plt.subplots(4,4,figsize=(12,6))

for i,ax in enumerate(axs.flatten()):
  # color range is fixed to [-2,2] so all panels share one scale
  ax.pcolor(,cmap='plasma',vmin=-2,vmax=2)
  # the label is drawn twice, black then white at a one-unit offset
  ax.text(2,head_dim-1,f'Qh{i}',fontsize=12,fontweight='bold',color='k',ha='left',va='top')
  ax.text(1,head_dim-2,f'Qh{i}',fontsize=12,fontweight='bold',color='w',ha='left',va='top')
  ax.set(xticks=[],yticks=[])

# finalize
# axis labels are shown on the bottom-left panel only
axs[3,0].set(ylabel='Head dim',xlabel='Token position')

plt.tight_layout()
plt.savefig('ch6_proj35_part3.png')
plt.show()

# **Part 4: Raw attention scores in one layer**

In [None]:
# initializations
# dot products are pooled into two groups: query and key taken from the same
# head (qi==ki) versus from different heads
# NOTE(review): the dot-product, conversion and concatenation expressions are
# left blank (presumably exercise placeholders); the cell does not parse as written.
withinhead_dp = np.array([])
acrosshead_dp = np.array([])

# loop over pairs of heads
for qi in range(n_heads):
  for ki in range(n_heads):

    # dot product for last token in Q with all previous tokens in K (excluding first token)
    dp = Q_h[][] @ K_h[][].t() /
    dp =  # numpy will convert from pytorch, but this is cleaner

    # store in the appropriate matrix
    if qi==ki:
      withinhead_dp = np.concatenate((,))
    else:
      acrosshead_dp = np.concatenate((,))

# report how many values ended up in each group (`:5` pads the count to width 5)
print(f'There are {len(acrosshead_dp)} values in "across head"')
print(f'      and {len(withinhead_dp):5} values in "within head".')

In [None]:
## visualizations
# Left panel: individual values plus violins for the two groups.
# Right panel: histogram curves for the same two groups.
# NOTE(review): the data arguments to plot, violinplot and np.histogram are
# left blank (presumably exercise placeholders); the cell does not parse as written.
_,axs = plt.subplots(1,2,figsize=(10,4))

# plot the raw data
# green circles first, then red squares (x and y left blank)
axs[0].plot(,,'ko',linewidth=.1,markerfacecolor=[.7,.9,.7,.3],markersize=3)
axs[0].plot(,,'ks',linewidth=.1,markerfacecolor=[.9,.7,.7,.3],markersize=3)

# and the violin plot
v = axs[0].violinplot()

# change the colors
# body 0 is green and body 1 is red, matching the marker colors above
v['bodies'][0].set_facecolor([.7,.9,.7])
v['bodies'][1].set_facecolor([.9,.7,.7])
v['bodies'][0].set_alpha([.9])
v['bodies'][1].set_alpha([.9])
# black edges for the center bars and the min/max caps
v['cbars'].set_edgecolor('k')
v['cmins'].set_edgecolor('k')
v['cmaxes'].set_edgecolor('k')

# dashed zero reference line, pushed behind the data via a negative zorder
axs[0].axhline(0,linestyle='--',color=[.7,.7,.7],zorder=-3)
axs[0].set(xticks=[1,2],xticklabels=['Same head','Diff heads'],
              ylabel='QK$^\\top$ dot products',title='A) Raw attention scores',xlim=[.5,2.5])


# distributions
# np.histogram returns (values, edges); x[:-1] drops the final edge so that
# x and y have the same length
y,x = np.histogram()
axs[1].plot(x[:-1],y,'g',linewidth=2,label='Same head')

y,x = np.histogram()
axs[1].plot(x[:-1],y,'r',linewidth=2,label='Diff heads')

axs[1].legend()
axs[1].set(xlabel='Dot product value',ylabel='Density',title='B) Distributions')
axs[1].axvline(0,linestyle='--',color=[.7,.7,.7])

plt.tight_layout()
plt.savefig('ch6_proj35_part4.png')
plt.show()

# **Part 5: Attention score distributions over layers**

In [None]:
# remove the single-layer hook via its handle before new hooks are implanted
hookhandle.remove()

In [None]:
# hook the attention vectors
# NOTE(review): the value stored by the inner hook is left blank (presumably an
# exercise placeholder); the cell does not parse as written.
# the dictionary is re-created empty and will hold one entry per layer
activations = {}

def implant_hook(layer_number):
  # factory: returns a hook that writes to a key specific to layer_number
  def hook(module, input, output):
    activations[f'attn_L{layer_number}'] =
  return hook

# implant the hooks
# the handles are collected in a list as they are created
handles = []
for i in range(n_layers):
  # hook the c_attn module of each transformer block's attention sublayer
  h = model.transformer.h[i].attn.c_attn.register_forward_hook(implant_hook(i))
  handles.append(h)

In [None]:
# run through the model
# the return value is discarded; the forward pass is run so the hooks fire
with torch.no_grad():
  model(tokens)

In [None]:
# list the keys written by the hooks (named 'attn_L<layer>')
activations.keys()

In [None]:
# Repeat the within-/across-head dot-product analysis for every layer and
# store one histogram per layer and group.
# NOTE(review): many expressions below are left blank (presumably exercise
# placeholders to be completed from the book); the cell does not parse as written.

# shared bin edges so the histograms are comparable across layers
histedges = np.linspace()

# indexed below as [layer, group, bin], with group 0 = within heads and 1 = across heads
layerHists = np.zeros((,,))


# loop over layers
for layeri in range(n_layers):

  # get the activations
  Q,K,V = torch.split(
  Q_h = torch.split(Q,)
  K_h = torch.split(K,)


  # re-initialize
  withinhead_dp = np.array([])
  acrosshead_dp = np.array([])

  # loop over pairs of heads
  for qi in range(n_heads):
    for ki in range(n_heads):

      # dot product for last token in Q with all previous tokens in K (excluding first token)
      dp =
      dp =  # pytorch -> numpy

      # store in the appropriate matrix
      if qi==ki:
        withinhead_dp =
      else:
        acrosshead_dp =


  # distributions
  # only the histogram values are kept; the returned edges are discarded
  y,_ = np.histogram(withinhead_dp,bins=,density=)
  layerHists[layeri,0,:] = y

  y,_ = np.histogram()
  layerHists[layeri,1,:] = y

In [None]:
# Two heatmaps (titled 'Within heads' and 'Across heads'): x-axis spans the
# histogram edges, y-axis spans the layers.
# NOTE(review): the image argument to both imshow calls is left blank
# (presumably exercise placeholders); the cell does not parse as written.
_,axs = plt.subplots(1,2,figsize=(10,4))
# extent maps image columns onto the bin-edge range and rows onto 0..n_layers
axs[0].imshow(,origin='lower',aspect='auto',cmap='magma',
              extent=[histedges[0],histedges[-1],0,n_layers],vmin=0,vmax=.15)
# thin white dashed line marking zero (drawn on the left panel only)
axs[0].axvline(0,linestyle='--',color='w',linewidth=.4)

axs[1].imshow(,origin='lower',aspect='auto',cmap='magma',
              extent=[histedges[0],histedges[-1],0,n_layers],vmin=0,vmax=.15)

axs[0].set(xlabel='$\\mathbf{QK^\\top}$ activation value',ylabel='Transformer layer',title='A) Within heads')
axs[1].set(xlabel='$\\mathbf{QK^\\top}$ activation value',ylabel='Transformer layer',title='B) Across heads')

plt.tight_layout()
plt.savefig('ch6_proj35_part5a.png')
plt.show()

In [None]:
# get the means and standard deviations
# group index 0 of layerHists holds the within-head histograms; y1/y1e are
# plotted further down as the across-head curves
# NOTE(review): all four expressions are incomplete (presumably exercise
# placeholders); the cell does not parse as written.
y0  = layerHists[:,0,:].mean(axis=
y0e = layerHists[:,0,:].std
y1  =
y1e =

# One shaded band (center -/+ spread) plus a center line per group.
plt.figure(figsize=(10,4))

# histedges[:-1] drops the final edge so x has one point per histogram bin
plt.fill_between(histedges[:-1],y0-y0e,y0+y0e,color=[.7,.7,.9,.7],label='Within heads')
plt.plot(histedges[:-1],y0,'b')
# label is plural to match 'Within heads' and the 'Across heads' panel title
plt.fill_between(histedges[:-1],y1-y1e,y1+y1e,color=[.9,.7,.7,.7],label='Across heads')
plt.plot(histedges[:-1],y1,'r')

# only the shaded bands carry labels, so the legend has two entries
plt.legend()
plt.gca().set(xlabel='$\\mathbf{QK^\\top}$ activation value',ylabel='Density',xlim=histedges[[0,-1]])

plt.tight_layout()
plt.savefig('ch6_proj35_part5b.png')
plt.show()

# **Part 6: Laminar distributions of raw and softmax scores**

In [None]:
# Three panels, one curve or marker per layer, colored by layer index:
# A) mean scores, B) distribution of raw scores, C) distribution of softmax values.
# NOTE(review): several expressions below are left blank (presumably exercise
# placeholders to be completed from the book); the cell does not parse as written.
_,axs = plt.subplots(1,3,figsize=(12,3))


# normalization for mapping line colors to colorbar
cmap = plt.cm.plasma
norm = mpl.colors.Normalize(vmin=0,vmax=n_layers)


# loop over all the layers
for layeri in range(n_layers):

  # split the matrices
  # chunks of width n_emb along the last dimension, unpacked as Q, K, V
  Q,K,V = torch.split(activations[f'attn_L{layeri}'],n_emb,dim=-1)

  # calculate the attention activations
  # view() then permute(0,2,1,3) swaps the two middle dimensions of a 4D tensor
  Qh = Q.view().permute(0,2,1,3)
  Kh = K.view(
  qkt = (Qh @ Kh.

  # plot the average QK^T scores
  axs[0].errorbar(layeri,,,color=plt.cm.plasma(layeri/n_layers))
  axs[0].plot(layeri,,'kh',markersize=10,
              markerfacecolor=plt.cm.plasma(layeri/n_layers))

  # distribution of "raw" values
  # fixed edges: 201 points from -15 to 15
  # cmap(norm(layeri)) is the same color as plt.cm.plasma(layeri/n_layers) used elsewhere in this loop
  y,x = torch.histogram(,torch.linspace(-15,15,201),density=True)
  axs[1].plot(x[:-1],y,color=cmap(norm(layeri)),label=f'Layer {layeri}')

  # distribution of softmax-prob values
  # here 201 is passed as a bin count rather than as explicit edges
  y,x = torch.histogram(,201,density=True)
  axs[2].plot(x[:-1],y,color=plt.cm.plasma(layeri/n_layers),label=f'Layer {layeri}')


# plot adjustments
axs[0].set(xlabel='Transformer layer',ylabel='Activation mean',title='A) Means of $\\mathbf{QK^\\top}$')
axs[1].set(xlabel='Activation value',ylabel='Density',title='B) Distributions of $\\mathbf{QK^\\top}$')
# panel C uses a logarithmic y-axis and restricts x to [0,1]
axs[2].set(xlabel='Softmax probability',ylabel='Density (log scale)',yscale='log',xlim=[0,1],
           title='C) Distributions of $\\sigma(\\mathbf{QK^\\top})$')

# create a colorbar
# built from the same cmap/norm pair and attached to the last panel
sm = mpl.cm.ScalarMappable(cmap=cmap,norm=norm)
cbar = plt.colorbar(sm,ax=axs[-1],pad=.02)
cbar.set_label('Transformer layer')

plt.tight_layout()
plt.savefig('ch6_proj35_part6.png')
plt.show()

# **Part 7: Softmax probabilities for self- vs. cross-attention**

In [None]:
# create a mask with 0/1
# Small N-by-N demo shown in three stages: binary mask, mask after replacing
# the ones, and the result after softmax.
# NOTE(review): the mask construction, the replacement value and the softmax
# expression are left blank (presumably exercise placeholders); the cell does
# not parse as written.
N = 5
M =

# open a figure
fig,axs = plt.subplots(1,3,figsize=(10,3))

# show the mask
axs[0].imshow(M,vmin=0,vmax=1)
axs[0].set(title='A) Binary mask (0 or 1)')

# replace ones with -inf
# M is modified in place, so panel B shows the same array after replacement
M[M==1] =
axs[1].imshow(M,vmin=0,vmax=1)
axs[1].set(title='B) Time-causal mask (M)')

# impact of softmax on mask values
# the image handle is kept for the colorbar below
h = axs[2].imshow(,vmin=0,vmax=1)
axs[2].set(title='C) Impact of softmax')
fig.colorbar(h,ax=axs[2],pad=.01)

# adjustments for all axes
for a in axs:
  a.set(xlabel='Token position',ylabel='Token position')
  # minor ticks at half-integers put the grid lines between the cells
  a.set_xticks(np.arange(.5,N,1),minor=True)
  a.set_yticks(np.arange(.5,N,1),minor=True)
  a.grid(which='minor')
  a.spines['top'].set_visible(True) # switched off by default, but helpful here
  a.spines['right'].set_visible(True)


plt.tight_layout()
plt.savefig('ch6_proj35_part7a.png')
plt.show()

In [None]:
# get the activations
# For one layer, collect softmax attention weights into three groups across
# all heads and compare them in a strip plot with violins.
# NOTE(review): many expressions below are left blank (presumably exercise
# placeholders to be completed from the book); the cell does not parse as written.
layeri = 6
Q,K,V = torch.split(,n_emb,dim=1)
Q_h =
K_h =

# empty initializations
# one growing array per group; values from every head are appended
final2prev = np.array([])
selfAttend = np.array([])
first2self = np.array([])


# loop over heads
for qi in range(n_heads):

  # raw attention scores with mask
  attn_scores =  @ .t()) /
  # triu(...,1) keeps ones strictly above the diagonal and zeros elsewhere
  pastmask = torch.triu(torch.ones(,),1)
  pastmask[pastmask==1] =
  # the mask is applied additively to the scores
  attn_scores +=


  # softmax
  attn_sm = F.softmax( attn_scores ,

  # the final token with all previous tokens (including the first but excluding self-attn)
  final_with_prev =

  # matching tokens are self-attention
  matching_toks =  # exclude the first token in the sequence
  first_selfTok =   # isolate the first token

  # add to dataset
  final2prev = np.concatenate((final2prev,final_with_prev.numpy()))
  selfAttend = np.concatenate(())
  first2self = np.concatenate(())


## visualize
plt.figure(figsize=(8,4))

# x-positions are jittered with small Gaussian noise around 1, 2 and 3 so the
# points do not overlap (y-values left blank)
plt.plot(np.random.normal(1,.04,len(final2prev)),,'ko',linewidth=.1,markerfacecolor=[.7,.9,.7,.2],markersize=3)
plt.plot(np.random.normal(2,.04,len(selfAttend)),,'ks',linewidth=.1,markerfacecolor=[.9,.7,.7,.2],markersize=3)
plt.plot(np.random.normal(3,.04,len(first2self)),,'ks',linewidth=.1,markerfacecolor=[.7,.7,.9,.2],markersize=3)

v = plt.violinplot([])

# change the colors
# NOTE(review): only bodies 0 and 1 are recolored here; if three datasets are
# passed to violinplot, the third body keeps its default color and alpha while
# its markers use [.7,.7,.9] - confirm whether that is intended.
v['bodies'][0].set_facecolor([.7,.9,.7])
v['bodies'][1].set_facecolor([.9,.7,.7])
v['bodies'][0].set_alpha([.9])
v['bodies'][1].set_alpha([.9])
v['cmins'].set(linewidth=.5,edgecolor='k')
v['cbars'].set(linewidth=.5,edgecolor='k')
v['cmaxes'].set(linewidth=.5,edgecolor='k')

plt.gca().set(xticks=[1,2,3],ylabel='Softmax attention weight',xlim=[.5,3.5],
              xticklabels=['Final to\nprev','Self-attention\nother tokens','Self-attention\nfirst token'])

plt.tight_layout()
plt.savefig('ch6_proj35_part7b.png')
plt.show()

# **Part 8: Laminar softmax, self- vs. cross-attention**

In [None]:
# Per-layer histograms of softmax attention values for three groups.
# NOTE(review): the per-head expressions inside the inner loop are left blank
# (presumably exercise placeholders to be completed from the book); the cell
# does not parse as written.

# 91 edges between 0 and 1, i.e. 90 bins
binedges = np.linspace(0,1,91)

# indexed as [layer, group, bin]; groups are 0 = final2prev, 1 = selfAttend, 2 = first2self
attn_hists = np.zeros((n_layers,3,len(binedges)-1))


for layeri in range(n_layers):


  # empty initializations
  final2prev = np.array([])
  selfAttend = np.array([])
  first2self = np.array([])


  # get the activations
  # [0,:,:] selects index 0 along the first dimension before splitting
  Q,K,V = torch.split(activations[f'attn_L{layeri}'][0,:,:],n_emb,dim=1)
  # one chunk of width head_dim per head
  Q_h = torch.split(Q,head_dim,dim=1)
  K_h = torch.split(K,head_dim,dim=1)


  # loop over heads
  for qi in range(n_heads):

    # raw attention scores with mask
    attn_scores =
    pastmask =
    # here the mask is applied by overwriting the entries where pastmask is 0
    attn_scores[pastmask==0] =

    # softmax
    attn_sm =

    # the final token with all previous tokens (including the first but excluding self-attn)
    final_with_prev =

    # matching tokens are self-attention
    matching_toks =  # exclude the first token in the sequence
    first_selfTok =   # isolate the first token

    # add to dataset
    final2prev =
    selfAttend =
    first2self =


  # get histograms
  # density-normalized counts on the shared edges; the returned edges are discarded
  attn_hists[layeri,0,:],_ = np.histogram(final2prev,bins=binedges,density=True)
  attn_hists[layeri,1,:],_ = np.histogram(selfAttend,bins=binedges,density=True)
  attn_hists[layeri,2,:],_ = np.histogram(first2self,bins=binedges,density=True)


In [None]:
# one heatmap per attention group: rows are layers, columns are probability bins
_,axs = plt.subplots(1,3,figsize=(12,3.5))

panel_titles = [ 'A) Final to previous','B) Self-attention','C) First to self' ]

# every panel shares the same axis extent: bin-edge range by layer count
img_extent = [binedges[0],binedges[-1],0,n_layers]

for group_idx,(ax,panel_title) in enumerate(zip(axs,panel_titles)):
  ax.imshow(attn_hists[:,group_idx,:],aspect='auto',origin='lower',
            extent=img_extent,vmin=0,vmax=.5,cmap='magma')
  ax.set(xlabel='Softmax attention prob',ylabel='Transformer layer',title=panel_title)


plt.tight_layout()
plt.savefig('ch6_proj35_part8.png')
plt.show()

# **Part 9: Averaging within vs. across heads**

In [None]:
# get the activations
# Compare a score matrix computed while ignoring heads with one built from
# per-head matrices (the y-axis label below calls it the per-head average).
# NOTE(review): the score expressions and the plot arguments are left blank
# (presumably exercise placeholders to be completed from the book); the cell
# does not parse as written.
Q,K,V = torch.split(activations['attn_L6'][0,:,:],n_emb,dim=1)
Q_h = torch.split(Q,head_dim,dim=1)
K_h = torch.split(K,head_dim,dim=1)


# ignoring heads
qkt_allheads =

# separate per head
# one ntokens-by-ntokens matrix per head
qkt_eachhead = torch.zeros((n_heads,ntokens,ntokens))
for headi in range(n_heads):
  qkt_eachhead[headi,:,:] =

print(f'Size of QKt all heads: {list(qkt_allheads.shape)}')
print(f'Size of QKt per head: {list(qkt_eachhead.shape)}')

# plot their relationship
_,axs = plt.subplots(1,2,figsize=(10,4))
# both inputs are flattened to 1D so they can be paired point by point
axs[0].plot(.flatten(),.flatten(),
         'ko',markerfacecolor='y',alpha=.2)

axs[0].set(xlabel='$QK^\\top$ ignoring heads',ylabel='Average of per-head $QK^\\top$',
           title='A) Scatter plot of the two calculations')
axs[0].grid(linestyle='--',color='k',linewidth=.4)

# plot the differences
axs[1].plot(-,
         'ko',markerfacecolor='m',alpha=.2)
axs[1].set(xlabel='$QK^\\top$ matrix index',ylabel='Difference',
           title='B) Differences between calculations')

plt.tight_layout()
plt.savefig('ch6_proj35_part9.png')
plt.show()

In [None]:
# a little arithmetic demo (comment one of the lines)
# each line builds a pair: (overall mean, average of the two group means)
# first line gives 2.5 and 2.5; second line gives 3.0 and 2.75
np.mean([1,2,3,4]), (np.mean([1,2])+np.mean([3,4]))/2 # equivalent b/c balanced sample size
np.mean([1,2,3,4,5]), (np.mean([1,2])+np.mean([3,4,5]))/2 # unbalanced sample size breaks equivalence