|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[12] RSA to compare GPT2 vs. BERT embeddings</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import torch

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# svg plots (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors (uncomment to activate)
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right and top axis spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when figures are saved to disk
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Embeddings from two LLMs**

In [None]:
# BERT tokenizer and model
from transformers import BertTokenizer, BertModel

# one checkpoint name for both, so the tokenizer and the model always match
bert_checkpoint = 'bert-large-uncased'
tokenizerB = BertTokenizer.from_pretrained(bert_checkpoint)
modelB = BertModel.from_pretrained(bert_checkpoint)

# the embeddings matrix (weights of the word-embeddings layer, detached from autograd)
embeddingsB = modelB.embeddings.word_embeddings.weight.detach()

In [None]:
# GPT2 tokenizer and model
from transformers import AutoTokenizer,GPT2Model

# one checkpoint name for both, so the tokenizer and the model always match
gpt2_checkpoint = 'gpt2-medium'
tokenizerG = AutoTokenizer.from_pretrained(gpt2_checkpoint)
modelG = GPT2Model.from_pretrained(gpt2_checkpoint)

# the embeddings matrix (weights of the wte layer, detached from autograd)
embeddingsG = modelG.wte.weight.detach()

In [None]:
# vocabulary sizes reported by the two tokenizers (followed by a blank line)
print(f'BERT vocabulary size: {tokenizerB.vocab_size}',
      f'GPT2 vocabulary size: {tokenizerG.vocab_size}',
      '', sep='\n')

# dimensions of the two embeddings matrices
print(f'BERT embeddings shape: {list(embeddingsB.shape)}',
      f'GPT2 embeddings shape: {list(embeddingsG.shape)}', sep='\n')

# **Part 2: Comparing model embeddings**

In [None]:
# words for the RSA (each row of the list groups related words)
words = [ 'space','spaceship','planet','moon'  ,'star' ,'galaxy',
          'chair','table'    ,'couch' ,'stool' ,'floor','desk',
          'apple','banana'   ,'pear'  ,'orange','peach','grape'
        ]

# number of words (convenient later)
num_words = len(words)

# one token index per word, separately for each model
tokensB,tokensG = [],[]

# table header: token counts per tokenizer (B/G), n = no preceding space, s = with preceding space
print('     Word  | B-n | G-n | B-s | G-s')
print('-----------+-----+-----+-----+-----')

for w in words:

  # BERT (special tokens switched off): bare word, then space-prefixed word
  tb_nospace   = tokenizerB.encode(w,add_special_tokens=False)
  tb_withspace = tokenizerB.encode(f' {w}',add_special_tokens=False)

  # GPT2: bare word, then space-prefixed word
  tg_nospace   = tokenizerG.encode(w)
  tg_withspace = tokenizerG.encode(f' {w}')

  # print a row of the table
  print(f'{w:>10} |  {len(tb_nospace)}  |  {len(tg_nospace)}  |  {len(tb_withspace)}  |  {len(tg_withspace)}')

  # keep only the first token: no-space version for BERT, with-space version for GPT2
  tokensB += [ tb_nospace[0] ]
  tokensG += [ tg_withspace[0] ]

In [None]:
# show the selected token indices (BERT list first, then GPT2 list)
tokensB,tokensG

In [None]:
# pick out the embeddings-matrix rows that belong to the selected tokens
sub_embedB = embeddingsB[tokensB]
sub_embedG = embeddingsG[tokensG]

# confirm the sizes of the two sub-matrices
print(f'BERT embeddings shape: {list(sub_embedB.shape)}',
      f'GPT2 embeddings shape: {list(sub_embedG.shape)}', sep='\n')

In [None]:
_,axs = plt.subplots(1,2,figsize=(10,4))

# panel A: BERT vs. GPT-2 embedding values for the first word in the list
axs[0].plot(sub_embedB[0],sub_embedG[0],'kh',markerfacecolor=[.7,.7,.9,.4])
axs[0].set_xlabel('BERT embeddings')
axs[0].set_ylabel('GPT-2 embeddings')
axs[0].set_title(f'A) Embeddings for "{words[0]}"')

# panel B: one BERT-vs-GPT-2 correlation per word, plotted at that word's index
for i in range(num_words):
  r = np.corrcoef(sub_embedB[i],sub_embedG[i])[0,1]
  axs[1].plot(i,r,'kh',markerfacecolor=[.9,.7,.7],markersize=10)

# dashed reference line at zero, drawn beneath the markers
axs[1].axhline(0,linestyle='--',color='k',linewidth=.5,zorder=-1)

# word labels on the x-axis (rotated), fixed y-axis range
axs[1].set_xticks(range(num_words))
axs[1].set_xticklabels(words)
axs[1].set_ylim([-.5,.5])
axs[1].set_title('B) Correlation for each word')
axs[1].set_ylabel('Correlation coefficient')
axs[1].tick_params(axis='x',labelrotation=90)

plt.tight_layout()
plt.savefig('ch3_proj12_part2.png')
plt.show()

# **Part 3: Representational similarity analysis**

In [None]:
# cosine similarity matrices (words-by-words), one per model
csG = torch.zeros(num_words,num_words)
csB = torch.zeros(num_words,num_words)

# row i: similarity between word i and every word in the list (itself included)
for i in range(num_words):
  csB[i] = torch.cosine_similarity(sub_embedB[i],sub_embedB,dim=1)
  csG[i] = torch.cosine_similarity(sub_embedG[i],sub_embedG,dim=1)


# indices of the elements above the diagonal (k=1 skips the diagonal itself);
# both matrices are num_words-by-num_words, so one set of indices serves both
triu_idx = np.triu_indices(num_words,k=1)
unique_B = csB[triu_idx]
unique_G = csG[triu_idx]

# Pearson correlation between the two sets of unique similarities
r = np.corrcoef(unique_B,unique_G)[0,1]

print(f'Size of similarity matrices: {csB.shape}',
      f'Number of non-redundant elements: {len(unique_B)}', sep='\n')

In [None]:
# and visualize
fig,axs = plt.subplots(1,3,figsize=(13,4))

# color limits for images
vminmax = [.1,.5]

# panels A and B: the cosine similarity matrix of each model (BERT, then GPT2)
panel_titles = ('A) Cossim matrix for BERT','B) Cossim matrix for GPT-2')
for ax,cs,panel_title in zip(axs,(csB,csG),panel_titles):

  h = ax.imshow(cs,vmin=vminmax[0],vmax=vminmax[1],cmap='plasma')

  # label every other word: even-indexed words on x, odd-indexed words on y
  ax.set(xticks=range(0,len(words),2),xticklabels=words[::2],
         yticks=range(1,len(words),2),yticklabels=words[1::2],
         title=panel_title)
  ax.tick_params(axis='x',labelrotation=90)
  fig.colorbar(h,ax=ax,fraction=.046,pad=.02)


# panel C: scatter plot of the unique similarities, BERT against GPT-2
axs[2].plot(unique_B,unique_G,'kh',markersize=8,markerfacecolor=[.7,.9,.7,.5])
axs[2].set_xlabel('BERT cosine similarities')
axs[2].set_ylabel('GPT-2 cosine similarities')
axs[2].set_title(f'C) Correlation (RSA score): r = {r:.2f}')
axs[2].grid(linestyle='--',color=[.9,.9,.9])


plt.tight_layout()
plt.savefig('ch3_proj12_part3.png')
plt.show()

# **Part 4: Different embeddings sizes**