|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[15] Analogy vectors</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

from transformers import RobertaTokenizer, RobertaForMaskedLM

In [None]:
### Figure styling for the whole notebook (the commented entries switch to dark mode)

# render inline figures as SVG (vector format, so higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# overrides applied on top of matplotlib's current settings
plotStyle = {
    # --- dark-mode colors (uncomment to enable) ---
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',

    # --- no right/top axis spines ---
    'axes.spines.right': False,
    'axes.spines.top':   False,

    # --- bold titles and axis labels ---
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',

    # --- resolution used by plt.savefig ---
    'savefig.dpi':300,
}
plt.rcParams.update(plotStyle)

# **Part 1: Explore the RoBERTa model**

In [None]:
# the tokenizer and the masked-LM model are loaded from the same pretrained checkpoint
checkpoint = 'roberta-large'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForMaskedLM.from_pretrained(checkpoint)

# put the model in evaluation mode (as the last expression, its result is displayed)
model.eval()

In [None]:
# grab the word-embeddings layer, then convert its weights to a numpy array
wordEmbedLayer = model.roberta.embeddings.word_embeddings
embeddings = wordEmbedLayer.weight.detach().numpy()

# display the size of the matrix
embeddings.shape

In [None]:
# encode each word twice: once as-is and once with a preceding space
words = [ 'list', 'computer', 'apple', 'spaceship' ]

for word in words:
  # the second variant also ends with a blank line to separate the words
  for prefix,lineEnd in ( ('',''), (' ','\n') ):
    text = prefix + word
    print(f'"{text}" is indices {tokenizer.encode(text)}{lineEnd}')

In [None]:
# decode the first few vocabulary indices, one at a time
numToShow = 20
for tokIdx in range(numToShow):
  print(f'Index {tokIdx:2} is "{tokenizer.decode(tokIdx)}"')

In [None]:
# same text, two input types: wrapped in a list vs. passed as a plain string
for textInput in ( [' king'], ' king' ):
  print(tokenizer.encode(textInput,add_special_tokens=False))

# **Part 2: Extract four embeddings and make a dataframe**

In [None]:
# the four words of the analogy (each written with a leading space)
words = [ ' king',' man',' woman',' queen' ]

# encode each word on its own, without special tokens
tokens = []
for word in words:
  tokens.append( tokenizer.encode(word,add_special_tokens=False) )

# report each word alongside its token indices
for i in range(len(words)):
  print(f'"{words[i]}" is encoded using token indices {tokens[i]}')

In [None]:
# We actually need a flat list of ints (one index per word), not a list of lists of ints.
# The isinstance check keeps this cell safe to re-run: entries that were already
# flattened are left untouched instead of raising "'int' object is not subscriptable".
tokens = [t[0] if isinstance(t,(list,tuple)) else t for t in tokens]
tokens

In [None]:
# pick out the embedding rows of the selected tokens (one row per word)
E = embeddings[tokens]

# transpose so that each word becomes a dataframe column
df = pd.DataFrame(data=E.T,columns=words)

# descriptive statistics of each column
df.describe()

# **Part 3: Visualize using pairplots**

In [None]:
# styling of the regression panels: red fit line, small semi-transparent dots
regStyle = {
    'line_kws':    {'color':'r'},
    'scatter_kws': {'color':[.7,.7,.9],'s':10,'alpha':.5},
}

# styling of the diagonal panels
diagStyle = {'color':[.9,.7,.7]}

# every word's embedding against every other word's, with regression fits
sns.pairplot(df,kind='reg',plot_kws=regStyle,diag_kws=diagStyle)

# tidy up, save to disk, and display
plt.tight_layout()
plt.savefig('ch3_proj15_part3.png')
plt.show()

# **Part 4: Visualize cosine similarities**

In [None]:
# cosine similarities between all pairs of the selected embeddings
csMat = cosine_similarity(E)

# matrix size, so ticks and text labels follow the data instead of a hard-coded 4
nWords = csMat.shape[0]

# show the matrix (color range from the smallest observed similarity up to 1)
plt.imshow(csMat,vmin=csMat.min(),vmax=1,cmap='Reds')
plt.gca().set(xticks=range(nWords),yticks=range(nWords),
              xticklabels=words,yticklabels=words,
              title='All pairwise cosine similarities')

# add text labels (the similarity value printed in the center of each cell)
for i in range(nWords):
  for j in range(nWords):
    plt.text(j,i,f'{csMat[i,j]:.2f}',
             ha='center',va='center',fontsize=18)

plt.colorbar(pad=.02)

plt.tight_layout()
plt.savefig('ch3_proj15_part4.png')
plt.show()

# **Part 5: Arithmetic with embeddings vectors**

In [None]:
# analogy vector: king - man + woman
analogyVector = df[' king'] - df[' man'] + df[' woman']

# cosine similarity between the analogy vector and every row of the embeddings matrix
# (the vector is passed as a 1xD matrix; [0] pulls out the single row of results)
sim2all = cosine_similarity(analogyVector.values[None,:],embeddings)[0]

# one figure with a wide panel (first two grid columns) and a narrow panel (last column)
fig = plt.figure(figsize=(12,3.5))
gs = gridspec.GridSpec(1,3,figure=fig)
axScatter = fig.add_subplot(gs[:2])
axHist = fig.add_subplot(gs[2])

# A) similarity per token index; dot color and size both depend on the similarity magnitude
tokenIdx = np.arange(len(sim2all))
axScatter.scatter(tokenIdx,sim2all,
                  c=np.sqrt(np.abs(sim2all)),s=60*sim2all**2,
                  cmap='cool',alpha=.7)
axScatter.set_xlabel('Token index')
axScatter.set_ylabel('Cosine similarity')
axScatter.set_title('A) Cosine similarity with analogy vector')

# B) histogram of all similarities, with counts on a log scale
axHist.hist(sim2all,bins='fd',color=[.7,.7,.9],edgecolor='gray')
axHist.set_xlabel('Cosine similarity')
axHist.set_ylabel('Count')
axHist.set_xlim(sim2all.min(),sim2all.max())
axHist.set_yscale('log')
axHist.set_title('B) Distribution of similarities')

# tidy up, save to disk, and display
plt.tight_layout()
plt.savefig('ch3_proj15_part5.png')
plt.show()

In [None]:
# indices of the 10 most similar tokens, ordered from highest to lowest similarity
top10 = np.argsort(sim2all)[:-11:-1]

# table header
print(' CosSim  |   R^2   |    word')
print('---------+---------+-------------')

# one table row per token
for tokIdx in top10:
  # squared correlation = variance shared by the analogy vector and this embedding
  sharedVar = np.corrcoef(analogyVector,embeddings[tokIdx])[0,1]**2
  print(f'  {sim2all[tokIdx]:.3f}  |  {100*sharedVar:4.1f}%  |  "{tokenizer.decode(tokIdx)}"')

# **Part 6: An analogy-completing function**

In [None]:
def analogyCalculator(word2start,word2subtract,word2add):
  """
  Complete the analogy "word2start is to word2subtract as _____ is to word2add".

  Builds the analogy vector  E[word2start] - E[word2subtract] + E[word2add]
  from rows of the global `embeddings` matrix, then prints the 10 tokens whose
  embeddings have the highest cosine similarity to that vector, along with the
  squared correlation (shared variance, in percent).

  Parameters
  ----------
  word2start, word2subtract, word2add : str
    The three known words of the analogy. Each string is passed to the global
    `tokenizer` exactly as given (including any leading space) and must be
    encoded as exactly one token.

  Returns
  -------
  None. The analogy and the results table are printed.

  Raises
  ------
  ValueError
    If any word is not encoded as exactly one token, or if the decoded
    tokens contain '<unk>'.
  """

  inputWords = [word2start,word2subtract,word2add]

  # 1) print the analogy
  print(f'"{word2start}" is to "{word2subtract}" as "_____" is to "{word2add}"\n')

  # 2) tokenize the words
  tokens = [tokenizer.encode(w,add_special_tokens=False) for w in inputWords]

  # 3) check that each word is exactly one token
  #    (checked per word: a summed length of 3 can hide e.g. 2+1+0 tokens)
  for w,tok in zip(inputWords,tokens):
    if len(tok) != 1:
      raise ValueError(f'"{w}" is encoded as {len(tok)} tokens ({tok}); each word must be exactly one token.')

  # transform into single list
  tokens = [t[0] for t in tokens]

  # reject words that the tokenizer could not map to a known token
  decoded = tokenizer.decode(tokens)
  if '<unk>' in decoded:
    raise ValueError(f'Unknown token: {decoded}')

  # 4) get the vectors
  v1 = embeddings[tokens[0]] # base word
  v2 = embeddings[tokens[1]] # to subtract
  v3 = embeddings[tokens[2]] # to add

  # 5) analogy vector
  analogyVector = v1 - v2 + v3

  # 6) cossim with all (the vector is reshaped to 1xD; [0] extracts the row of results)
  cossim2all = cosine_similarity(analogyVector.reshape(1,-1),embeddings)[0]

  # 7) print out the top 10 highest scores (sorted from highest to lowest)
  top10 = cossim2all.argsort()[-10:][::-1]
  print('  CosSim  |   R^2   |    word')
  print('----------+---------+-------------')
  for widx in top10:
    # correlation (square it to get shared variance)
    r = np.corrcoef(analogyVector,embeddings[widx])[0,1]
    print(f'  {cossim2all[widx]:6.3f}  |  {100*r**2:4.1f}%  |  "{tokenizer.decode(widx)}"')


In [None]:
# demo: "king" is to "man" as "_____" is to "woman"
analogyCalculator(word2start=' king',word2subtract=' man',word2add=' woman')

In [None]:
# More analogies to experiment with: uncomment one line at a time and re-run the cell.
# Some entries are written with a leading space and some without; each string is
# passed to the tokenizer exactly as written. analogyCalculator raises a ValueError
# when a word is not encoded as a single token.
# analogyCalculator(' tree',' leaf',' petals')
# analogyCalculator('tree','leaf','petals')
# analogyCalculator('leaf','tree','flower') # turn it around for better results?
# analogyCalculator(' husky',' dog',' bird')
# analogyCalculator('finger','hand','foot')
# analogyCalculator(' shoe',' foot',' hand')
analogyCalculator(' hand',' glove',' shoe')
# analogyCalculator('tomorrow','future','past')
# analogyCalculator('pants','legs','arms')