|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[33] QKV weights characteristics</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import AutoModelForCausalLM

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# render inline figures as svg (vector graphics, so higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors (uncomment these lines to activate them)
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# no box around the axes: hide the right and top spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold axis titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution of the figures written by plt.savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: A dictionary of GPT2 models**

In [None]:
# a list of lists of model infos
# (each inner list is [short name, identifier]; the short name is used as the
#  dictionary key and the identifier is passed to from_pretrained in the loop)
model_ids = [
    #  name    label
    ['small' ,       'gpt2'],
    ['medium','gpt2-medium'],
    ['large' , 'gpt2-large'],
    ['xl'    ,    'gpt2-xl']
]


# load all models into a dictionary
# (`models` holds the loaded models and `params` holds a small dict of
#  architecture numbers per model; both are keyed by the short name)
models = {}
params = {}

for modinfo in model_ids:

  # load the model
  models[modinfo[0]] = AutoModelForCausalLM.from_pretrained(modinfo[1])

  # count key parameters
  # (the four right-hand sides are left blank in this version of the notebook,
  #  so this cell is not valid Python until they are filled in)
  # NOTE(review): presumably these values are read from the loaded model's
  # configuration -- confirm against the book
  params[modinfo[0]] = {}
  params[modinfo[0]]['n_layers'] =
  params[modinfo[0]]['n_emb']    =
  params[modinfo[0]]['n_heads']  =
  params[modinfo[0]]['head_dim'] =

In [None]:
# sanity check: list the keys of the two dictionaries built while loading the models
print('"models" keys:\n',models.keys())

# The label is wrapped in double quotes so that the single quotes around 'small'
# are printed literally. Doubling a quote ('') is not an escape in Python: it
# closes one string literal and opens the next, and the adjacent literals are
# concatenated, which silently dropped the quotes from the printed label.
print("\n\"params['small']\" keys:\n",params['small'].keys())

In [None]:
# print a text table with one row per model
# (columns named in the header: embedding dim, layers, number of heads, head dim)
print('  Model  | Embed.dim | Layers | n heads | head.dim')
print('---------+-----------+--------+---------+---------')
for name in models.keys():
  # the four empty {} placeholders are left blank in this version of the notebook;
  # an empty {} is a syntax error in an f-string, so they must be filled in
  # NOTE(review): presumably the values come from params[name] -- confirm
  print(f" {name:>6}  |    {}   |   {}   |    {}   |   {}")

In [None]:
# fyi, accessing a weights matrix
# (model 'small', transformer block index 5, attention sub-block, `c_attn` layer;
#  being the last expression in the cell, the tensor is shown as the cell output)
models['small'].transformer.h[5].attn.c_attn.weight

In [None]:
# count total number of parameters in attention subblock
# (several expressions in this cell are left blank in this version of the notebook)

for modlist in model_ids:

  # the right-hand side is truncated here (`mod` is not a defined name)
  # NOTE(review): `name` is used below as a key into `models`, so presumably it
  # should be the short name held in the loop variable `modlist` -- confirm
  name = mod

  # isolate one layer
  # (the attention sub-block of transformer block index 5)
  block = models[name].transformer.h[5].attn

  # count the parameters in this layer
  # (a sum of four terms; only the first is given, the other three are left blank)
  params_per_layer = (
    block.c_attn.weight.numel() +
    block. +
    block. +
    block.
  )

  # total params is weights times layers
  # (the multiplier is left blank)
  totparams = params_per_layer *

  # and print the info
  # (the empty {} must be filled in; the literal "M" after it suggests the count
  #  is meant to be reported in millions -- TODO confirm)
  print(f'{}M attention weights in GPT2-{name}')

# **Part 2: Distributions of attention weights in GPT2-small**

In [None]:
# in one layer for one model
# (these two variables are also read by later cells)
whichmod = 'small'
layeri = 6

# extract the wide weights matrix for this layer
# (the attribute path after models[whichmod] is left blank in this version)
wide_weights = models[whichmod]...

plt.figure(figsize=(10,3))
# left blank: the image data to display
plt.imshow()
# left blank: the x-positions of the two dashed white vertical lines
# NOTE(review): given the x-axis label below, presumably these mark the
# boundaries between the queries, keys, and values columns -- confirm
plt.axvline(,linestyle='--',color='w')
plt.axvline(,linestyle='--',color='w')
plt.colorbar(pad=.01)

# x-ticks are removed; the x-axis label names the three column groups instead
plt.gca().set(xticks=[],ylabel='Embeddings dimensions',
              xlabel='Queries dimensions         |           Keys dimensions             |           Values dimensions')


plt.tight_layout()
plt.savefig('ch6_proj33_part2a.png')
plt.show()

In [None]:
# split the Q, K, and V matrices
# (left blank in this version: the tensor to split, the `params` key that gives
#  the chunk size, and the dimension to split along)
q,k,v = torch.split(,params[whichmod][],dim=)

# histograms of the three weights values
# (np.histogram returns the counts and the bin edges; there is one more edge
#  than counts, so x[:-1] places each count at the left edge of its bin.
#  bins='fd' lets numpy pick the bin width with the Freedman-Diaconis rule.)
plt.figure(figsize=(8,3))
# left blank: the data to histogram for the W_Q curve
y,x = np.histogram(,bins='fd')
plt.plot(x[:-1],y,label='$\\mathbf{W_Q}$')

# left blank: the histogram call for the W_K curve
y,x = np.histogram(
plt.plot(x[:-1],y,label='$\\mathbf{W_K}$')

# left blank: the histogram call for the W_V curve
y,x = np.histogram(
plt.plot(x[:-1],y,label='$\\mathbf{W_V}$')

plt.gca().set(xlabel='Weight value',ylabel='Count',
              title=f'Distribution of QKV weights in layer {layeri} in GPT2-{whichmod}')
plt.legend()

plt.tight_layout()
plt.savefig('ch6_proj33_part2b.png')
plt.show()

In [None]:
# common histogram boundaries
# (81 edges -> 80 equal-width bins spanning -.8 to .8)
histedges = np.linspace(-.8,.8,81)

# initializations
# (the array shapes are left blank in this version of the notebook;
#  from the indexing below, `distributions` is indexed [layer, bin, 0..2]
#  and `distchars` is indexed [layer, :, 0..2])
distributions = np.zeros()
distchars = np.zeros(())

# loop over layers
# (note: this reuses, and therefore overwrites, the `layeri` variable set earlier)
for layeri in range(params[whichmod]['n_layers']):

  # split into matrices
  # (.detach() gives the weights as a tensor outside of gradient tracking)
  wideW = models[whichmod].transformer.h[layeri].attn.c_attn.weight.detach()
  # left blank: the arguments of the split
  q,k,v = torch.split

  # histograms
  # (left blank: one histogram per matrix; the last index 0/1/2 corresponds to
  #  the W_Q / W_K / W_V panel titles used in the plotting loop below)
  distributions[layeri,:,0] = np.histogram(
  distributions[layeri,:,1] = np.histogram(
  distributions[layeri,:,2] = np.histogram(

  # mean and std
  # (left blank)
  # NOTE(review): presumably the middle axis holds [mean, std] and the last
  # index selects Q/K/V, as for `distributions` -- confirm
  distchars[layeri,:,0] =
  distchars[layeri,:,1] =
  distchars[layeri,:,2] =

# show the heatmaps
# (one panel per matrix: rows are layers, columns are histogram bins; `extent`
#  labels the x-axis with weight values and the y-axis with layer numbers)
_,axs = plt.subplots(1,3,figsize=(10,3))
for i in range(3):
  axs[i].imshow(distributions[:,:,i],origin='lower',extent=[histedges[0],histedges[-1],0,params[whichmod]['n_layers']],
                aspect='auto',cmap=plt.cm.plasma,vmin=0,vmax=3.5)
  # 'QKV'[i] picks the letter for the subscript in the panel title
  axs[i].set(xlabel='Weight value',ylabel='Layer',title=f"$\\mathbf{{W}}_{'QKV'[i]}$")

plt.suptitle(f'Laminar distributions of attention weights in GPT2-{whichmod}',fontweight='bold')


plt.tight_layout()
plt.savefig('ch6_proj33_part2c.png')
plt.show()

In [None]:
# two panels: index 0 is labelled 'Mean' and index 1 is labelled 'Stdev'
_,axs = plt.subplots(1,2,figsize=(10,3))

for i in [0,1]:
  # left blank in this version: the data for each of the three lines
  # (legend order: W_Q as red squares, W_K as green circles, W_V as blue triangles)
  # NOTE(review): presumably these are slices of `distchars` from the previous
  # cell -- confirm
  axs[i].plot(,'rs-',markerfacecolor=[.9,.7,.7],markersize=10)
  axs[i].plot(,'go-',markerfacecolor=[.7,.9,.7],markersize=10)
  axs[i].plot(,'b^-',markerfacecolor=[.7,.7,.9],markersize=10)
  axs[i].legend(['$\\mathbf{W_Q}$','$\\mathbf{W_K}$','$\\mathbf{W_V}$'])
  axs[i].set(xlabel='Transformer layer',ylabel=['Mean','Stdev'][i])

plt.suptitle(f'Means and standard deviations of attention weights in GPT2-{whichmod}',fontweight='bold')


plt.tight_layout()
plt.savefig('ch6_proj33_part2d.png')
plt.show()

# **Part 3: Comparing distributions across models**

In [None]:
# one row of panels per model; two columns (labelled 'Mean' and 'Stdev' below)
_,axs = plt.subplots(len(models),2,figsize=(10,8))


# loop over models
# (note: this overwrites the `whichmod` variable set earlier)
for modeli,whichmod in enumerate(models.keys()):

  # initialize matrix of statistics for this model
  # (created once per model and indexed by layer in the loop below;
  #  the shape is left blank in this version of the notebook)
  attn_stats = np.zeros((params[],,))

  # loop over layers
  for layeri in range(params[whichmod]['n_layers']):

    # split into Q, K, V
    # (left blank: the arguments of the split)
    Q,K,V = torch.split()

    # the statistics below are left blank; only the Q indices are given
    # NOTE(review): presumably the middle index selects Q/K/V and the last
    # index selects mean/stdev -- confirm

    ### Q
    attn_stats[layeri,0,0] =
    attn_stats[layeri,0,1] =

    ### K
    attn_stats[
    attn_stats[

    ### V
    attn_stats[
    attn_stats[

  # end of layer loop

  # plot this model's row of panels
  # (left blank: the data argument of each plot call)
  for i in [0,1]:
    axs[modeli,i].plot(label='$\\mathbf{W_Q}$')
    axs[modeli,i].plot(label='$\\mathbf{W_K}$')
    axs[modeli,i].plot(label='$\\mathbf{W_V}$')
    axs[modeli,i].legend(fontsize=8)
    axs[modeli,i].set(xlabel='Transformer layer',ylabel=['Mean','Stdev'][i],
                      title=f'GPT2-{whichmod}')


plt.tight_layout()
plt.savefig('ch6_proj33_part3.png')
plt.show()

# **Part 4: Cosine similarity within heads (one model)**

In [None]:
# reset the model and layer
# (the loops in earlier cells overwrote both variables)
whichmod = 'small'
layeri = 6

# to split into heads, first split into QKV
# (left blank in this version: the arguments of the split)
Q,K,V = torch.split

# now split each W matrix into heads
# (left blank: the split dimension, and the whole call for the keys matrix)
# NOTE(review): `head_dim` is used here as a bare variable, but no visible cell
# defines it; the value is stored as params[whichmod]['head_dim'] -- confirm
WQ_h = torch.split(Q,head_dim,dim=)
WK_h = torch.split(

# cell output: the number of pieces and the shape of one piece (index 3)
len(WQ_h), WQ_h[3].shape

In [None]:
# a 3x4 grid gives 12 panels, one per element of WQ_h
# NOTE(review): this assumes len(WQ_h) is at most 12 -- confirm if the model is changed
fig,axs = plt.subplots(3,4,figsize=(10,5))

# flatten the grid so a single index runs over all panels
axs = axs.flatten()
for i in range(len(WQ_h)):
  # left blank in this version: the image data for head i
  axs[i].imshow(,aspect='auto',vmin=-.1,vmax=.1,cmap='plasma')
  axs[i].set(xticks=[],yticks=[])
  # the head label is drawn twice, in black and then in white at a nearby position
  # NOTE(review): apparently a drop-shadow so the white label stays readable
  axs[i].text(18,11,f'H{i}',fontweight='bold',color='k',fontsize=16)
  axs[i].text(10,10,f'H{i}',fontweight='bold',color='w',fontsize=16)

# only the bottom-left panel (index 8 in the flattened 3x4 grid) gets axis labels
axs[8].set(xlabel='Embeddings dimensions',ylabel='Head dimension')


plt.tight_layout()
plt.savefig('ch6_proj33_part4a.png')
plt.show()

In [None]:
# cosine similarity in one attention head
# (left blank in this version: the matrix to normalize, the dim/keepdim arguments
#  of the norm, and the two factors of the matrix product)
# dividing by the vector norms first means the matrix product of the normalized
# matrix with itself gives cosine similarities directly
tmp =  / torch.norm(,dim=,keepdim=)
cs_Q =  @

fig,axs = plt.subplots(1,2,figsize=(10,3))
# show the similarity matrix
# (left blank: the image data)
h = axs[0].imshow()
axs[0].set(xlabel='Head dimension',ylabel='Head dimension',title='A) Cosine similarity matrix')
fig.colorbar(h,ax=axs[0],pad=.01)

# and the distribution
# (left blank: the values to histogram; 40 is the number of bins)
axs[1].hist(,40,color='gray',edgecolor='k')
axs[1].set(xlabel='Cosine similarity',ylabel='Count',title='B) Distribution of similarities')

plt.tight_layout()
plt.savefig('ch6_proj33_part4b.png')
plt.show()

# **Part 5: Cosine similarity within- vs. across-heads (one model)**

In [None]:
# initialize as empty arrays
# (the similarity values are appended to these inside the loop with np.concatenate)
withinhead_csQ = np.array([])
acrosshead_csQ = np.array([])
withinhead_csK = np.array([])
acrosshead_csK = np.array([])

# loop over pairs of heads
# (left blank in this version: the range of the inner loop; the cell that repeats
#  this analysis over all layers uses range(i,params[whichmod]['n_heads']))
for i in range(params[whichmod]['n_heads']):
  for j in range(

    ### Q
    # normalize each head's matrix by its norms along dim 0, then multiply
    # (left blank: the argument of the norm, all of tmpj, and the two factors)
    tmpi = WQ_h[i] / torch.norm(,dim=0,keepdim=True)
    tmpj =  /
    cs = (.T @ ).numpy() # convert to numpy here...

    # store in the appropriate matrix
    # (left blank: the selection of the nonredundant values and the first
    #  argument of each concatenation)
    if i==j: # within-head -> symmetric matrix -> keep nonredundant values
      cs =
      withinhead_csQ = np.concatenate((,cs))
    else: # across-head -> nonsymmetric matrix -> keep all values
      acrosshead_csQ = np.concatenate((,cs.flatten()))


    ### K
    # (left blank: same steps as for Q, applied to the keys matrices)
    tmpi =
    tmpj =
    cs =  # leave as pytorch here...

    # store in the appropriate matrix
    if i==j: # within-head -> symmetric matrix -> keep nonredundant values
      cs =
      withinhead_csK =
    else: # across-head -> nonsymmetric matrix -> keep all values
      acrosshead_csK =


# density histograms drawn as lines, within-head vs. across-head
# (left blank: the x and y data of each plot call)
_,axs = plt.subplots(1,2,figsize=(12,3))
y,x = np.histogram(withinhead_csQ,bins='fd',density=True)
axs[0].plot(,linewidth=2,label='Within heads')
y,x = np.histogram(acrosshead_csQ,bins='fd',density=True)
axs[0].plot(,linewidth=2,label='Across heads')
axs[0].set(xlabel='Cosine similarity',ylabel='Density',title='A) Similarities in $\\mathbf{W_Q}$')
axs[0].legend()

y,x = np.histogram(withinhead_csK,bins='fd',density=True)
axs[1].plot(,linewidth=2,label='Within heads')
y,x = np.histogram(acrosshead_csK,bins='fd',density=True)
axs[1].plot(,linewidth=2,label='Across heads')
axs[1].set(xlabel='Cosine similarity',ylabel='Density',title='B) Similarities in $\\mathbf{W_K}$')
axs[1].legend()

plt.tight_layout()
plt.savefig('ch6_proj33_part5a.png')
plt.show()

In [None]:
# per-layer means and standard deviations of the cosine similarities
# NOTE(review): presumably indexed like the last two axes of `hists` below
# (within/across, then Q/K) -- confirm
meenz = np.zeros((params[whichmod]['n_layers'],2,2))
stdez = np.zeros((params[whichmod]['n_layers'],2,2))

# fixed bin edges so the histograms are comparable across layers
# (101 edges -> 100 bins spanning -.4 to .4)
histbins = np.linspace(-.4,.4,101)
# indexed [layer, bin, within(0)/across(1), Q(0)/K(1)], as filled in below
hists = np.zeros((params[whichmod]['n_layers'],len(histbins)-1,2,2))


# loop over all layers
for layeri in range(params[whichmod]['n_layers']):

  # split into heads
  # (left blank in this version: the attribute path to the weights and the
  #  arguments of the two per-head splits)
  Q,K,V = torch.split(models[whichmod].transformer.h[layeri]...,
                      params[whichmod]['n_emb'],dim=1)
  WQ_h = torch.split(
  WK_h = torch.split(

  # re-initialize
  # (so the values collected for one layer do not carry over into the next)
  withinhead_csQ = np.array([])
  acrosshead_csQ = np.array([])
  withinhead_csK = np.array([])
  acrosshead_csK = np.array([])

  # loop over pairs of heads (copy from proj35)
  # NOTE(review): this notebook is project 33 and the same loop appears in the
  # preceding cell; the "proj35" reference may be stale -- confirm
  # (j starts at i, so each pair of heads is visited once)
  for i in range(params[whichmod]['n_heads']):
    for j in range(i,params[whichmod]['n_heads']):

      ### Q
      tmpi = WQ_h[i] / torch.norm(WQ_h[i],dim=0,keepdim=True)
      tmpj = WQ_h[j] / torch.norm(WQ_h[j],dim=0,keepdim=True)
      # left blank: the similarity matrix
      cs =

      # store in the appropriate matrix
      # (left blank: the selection and the concatenations)
      if i==j: # within-head -> symmetric matrix -> keep nonredundant values
        cs =
        withinhead_csQ =
      else: # across-head -> nonsymmetric matrix -> keep all values
        acrosshead_csQ =


      ### repeat for K
      # (left blank: the normalizations)
      tmpi = WK_h[i] /
      tmpj =
      cs = tmpi.T @ tmpj

      # store in the appropriate matrix
      if i==j: # within-head -> symmetric matrix -> keep nonredundant values
        # k=1 selects the entries strictly above the diagonal
        cs = cs[np.triu_indices(params[whichmod]['head_dim'],k=1)]
        withinhead_csK =
      else: # across-head -> nonsymmetric matrix -> keep all values
        acrosshead_csK = np.concatenate(())
  # end of the loops over head pairs (the code below is still inside the layer loop)

  # get the histograms
  # ([0] keeps the densities and drops the bin edges)
  hists[layeri,:,0,0] = np.histogram(withinhead_csQ,bins=histbins,density=True)[0]
  hists[layeri,:,1,0] = np.histogram(acrosshead_csQ,bins=histbins,density=True)[0]
  hists[layeri,:,0,1] = np.histogram(withinhead_csK,bins=histbins,density=True)[0]
  hists[layeri,:,1,1] = np.histogram(acrosshead_csK,bins=histbins,density=True)[0]

  # the means
  # (left blank)
  meenz[layeri,0,0] =
  meenz[layeri,0,1] =
  meenz[layeri,1,0] =
  meenz[layeri,1,1] =

  # and the standard deviations
  # (left blank)
  stdez[layeri,0,0] =
  stdez[layeri,0,1] =
  stdez[layeri,1,0] =
  stdez[layeri,1,1] =

In [None]:
fig,axs = plt.subplots(2,2,figsize=(10,5))

# Q within-head
axs[0,0].imshow(,aspect='auto',vmin=0,vmax=7,cmap='magma',
              origin='lower',extent=[histbins[0],histbins[-1],0,params[whichmod]['n_layers']])
axs[0,0].set(xlabel='Cosine similarity',ylabel='Layer',title='A) $\\mathbf{W_Q}$: Distributions within head')


# Q across-head
axs[0,1].imshow()
axs[0,1].set(xlabel='Cosine similarity',ylabel='Layer',title='B) $\\mathbf{W_Q}$: Distributions across head')


# K within-head
axs[1,0].imshow()
axs[1,0].set(xlabel='Cosine similarity',ylabel='Layer',title='C) $\\mathbf{W_K}$: Distributions within head')


# K across-head
axs[1,1].imshow)
axs[1,1].set(xlabel='Cosine similarity',ylabel='Layer',title='D) $\\mathbf{W_K}$: Distributions across head')


plt.tight_layout()
plt.savefig('ch6_proj33_part5b.png')
plt.show()

In [None]:
# left panel: means per layer; right panel: standard deviations per layer
_,axs = plt.subplots(1,2,figsize=(12,3))

# left blank in this version: the data (and any line style) of each plot call
# NOTE(review): presumably slices of `meenz` here and `stdez` below -- confirm
axs[0].plot(,label='$\\mathbf{W_Q}$ within head')
axs[0].plot(,label='$\\mathbf{W_Q}$ across head')
axs[0].plot(,label='$\\mathbf{W_K}$ within head')
axs[0].plot(,label='$\\mathbf{W_K}$ across head')
axs[0].legend(handlelength=4)
axs[0].set(xlabel='Transformer layer',ylabel='Mean',title='A) Average of cosine similarity dist.')

axs[1].plot(,label='$\\mathbf{W_Q}$ within head')
axs[1].plot(,label='$\\mathbf{W_Q}$ across head')
axs[1].plot(,label='$\\mathbf{W_K}$ within head')
axs[1].plot(,label='$\\mathbf{W_K}$ across head')
axs[1].legend(handlelength=4) # make the legend lines longer to show dashed vs solid
axs[1].set(xlabel='Transformer layer',ylabel='Standard deviation',title='B) Stdev of cosine similarity dist.')

plt.tight_layout()
plt.savefig('ch6_proj33_part5c.png')
plt.show()

# **Part 6: WQ, WK, WV similarities across models**

In [None]:
# one row of panels per model; two columns (labelled 'Mean' and 'Stdev' below)
_,axs = plt.subplots(len(models),2,figsize=(10,8))


# loop over models
# (note: this overwrites the `whichmod` variable set earlier)
for modeli,whichmod in enumerate(models.keys()):

  # initialize matrix to store the metaparameters
  # (indexed [layer, 0/1/2 = Q/K/V, 0/1 = mean/stdev], matching the plot labels below)
  cossim_stats = np.zeros((params[whichmod]['n_layers'],3,2))

  # start of the progress report
  # (left blank in this version: the rest of the message)
  print(f"\n\nWorking on ...

  # mask for non-redundant and non-trivial indices
  # it is model-specific but not layer-specific
  # (left blank: the mask itself; N is the embedding dimension of this model)
  N = params[whichmod]['n_emb']
  mask =


  # loop over layers
  for layeri in range(params[whichmod]['n_layers']):

    # update the layer number in the progress report
    # (left blank: the rest of the print call)
    print(f'{

    # split into Q, K, V
    # (left blank: the arguments of the split)
    Q,K,V = torch.split()

    ### Q
    # (left blank: each right-hand side; the trailing comments name the steps)
    Q = # normalize
    cs = # full similarity matrix
    cs = # extract mask elements
    cossim_stats[layeri,0,0] = # the mean
    cossim_stats[layeri,0,1] = # stdev


    ### repeat for K
    cossim_stats[layeri,1,0] =
    cossim_stats[layeri,1,1] =


    ### and for V
    # (no template lines are given for V; cossim_stats[:,2,:] is plotted below,
    #  so it needs to be filled in here)


  # end of layer loop

  # plot this model's row of panels: column 0 shows the means, column 1 the stdevs
  for i in [0,1]:
    axs[modeli,i].plot(cossim_stats[:,0,i],linewidth=2,label='$\\mathbf{W_Q}$')
    axs[modeli,i].plot(cossim_stats[:,1,i],linewidth=2,label='$\\mathbf{W_K}$')
    axs[modeli,i].plot(cossim_stats[:,2,i],linewidth=2,label='$\\mathbf{W_V}$')
    axs[modeli,i].legend(fontsize=8)
    axs[modeli,i].set(xlabel='Transformer layer',ylabel=['Mean','Stdev'][i],
                      title=f'GPT2-{whichmod}')


plt.tight_layout()
plt.savefig('ch6_proj33_part6.png')
plt.show()