|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[15] Analogy vectors</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import pandas as pd
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

from transformers import RobertaTokenizer, RobertaForMaskedLM

In [None]:
### matplotlib adjustments (the dark-mode settings are left commented out)

# render inline figures as SVG (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark mode: uncomment the following lines
# plt.rc('figure', facecolor='#282a2c', edgecolor='#282a2c')
# plt.rc('axes',   facecolor='#282a2c', edgecolor='#DDE2F4', labelcolor='#DDE2F4')
# plt.rc('xtick',  color='#DDE2F4')
# plt.rc('ytick',  color='#DDE2F4')
# plt.rc('text',   color='#DDE2F4')

# no top/right spines, bold titles and axis labels, 300-dpi saved figures
plt.rc('axes.spines', right=False, top=False)
plt.rc('axes', titleweight='bold', labelweight='bold')
plt.rc('savefig', dpi=300)

# **Part 1: Explore the RoBERTa model**

In [None]:
# pretrained masked-language model and the tokenizer from the same checkpoint
model = RobertaForMaskedLM.from_pretrained('roberta-large')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# switch to evaluation mode (kept as the last line so its result is the cell output)
model.eval()

In [None]:
# extract the embeddings matrix (one row per token in the vocabulary).
# detach() + numpy() turn the model's weight tensor into a plain NumPy array,
# which is what the numpy and scikit-learn calls in the later cells operate on.
embeddings = model.roberta.embeddings.word_embeddings.weight.detach().numpy()
embeddings.shape

In [None]:
words = [ 'list', 'computer', 'apple', 'spaceship' ]

# encode each word twice: as-is, and with a preceding space
# (the printed lists include any special tokens that encode() adds by default)
for w in words:
  print(f'"{w}" is indices {tokenizer.encode(w)}')
  print(f'" {w}" is indices {tokenizer.encode(" "+w)}\n')

In [None]:
# show the text of the first 20 entries in the vocabulary
for i in range(20):
  print(f'Token {i:2d} is "{tokenizer.decode([i])}"')

In [None]:
# the same text encoded with (default) and without the special tokens
print(tokenizer.encode(' apple'))
print(tokenizer.encode(' apple',add_special_tokens=False))

# **Part 2: Extract four embeddings and make a dataframe**

In [None]:
# tokenize (without special tokens, so each list holds only the word's own tokens)
words = [ ' king',' man',' woman',' queen' ]
tokens = [ tokenizer.encode(w,add_special_tokens=False) for w in words ]

# print the token indices and corresponding tokens (words)
for w,tok in zip(words,tokens):
  print(f'"{w}" is encoded using token indices {tok}')

In [None]:
# although we actually need a list of ints, not a list of lists of ints.
# The flat list is rebuilt from `words` (instead of flattening `tokens` in place)
# so that this cell gives the same result when it is run more than once.
tokens = [ t for w in words for t in tokenizer.encode(w,add_special_tokens=False) ]
tokens

In [None]:
# submatrix with embeddings (one row per word)
E = embeddings[tokens,:]

# transpose so that each word is a column and each row is one embedding dimension
df = pd.DataFrame(E.T,columns=words)

# summary of dataframe
df.describe()

# **Part 3: Visualize using pairplots**

In [None]:
# visualize: grid of pairwise plots of the dataframe columns
sns.pairplot(data=df)

# tidy the layout, write the figure to disk, then display it
plt.tight_layout()
plt.savefig('ch3_proj15_part3.png')
plt.show()

# **Part 4: Visualize cosine similarities**

In [None]:
# cosine similarities between all pairs of rows of E (i.e., all pairs of words)
csMat = cosine_similarity(E)
nWords = len(words)

# show the matrix
plt.imshow(csMat)
plt.gca().set(xticks=range(nWords),yticks=range(nWords),
              xticklabels=words,yticklabels=words,
              title='All pairwise cosine similarities')

# add text labels (dark text on the upper half of the value range, light text
# on the lower half, so the numbers stay readable on the default colormap)
midpoint = (csMat.min()+csMat.max())/2
for rowi in range(nWords):
  for coli in range(nWords):
    plt.text(coli,rowi,f'{csMat[rowi,coli]:.2f}',ha='center',va='center',
             color='k' if csMat[rowi,coli]>midpoint else 'w')

plt.colorbar(pad=.02)

plt.tight_layout()
plt.savefig('ch3_proj15_part4.png')
plt.show()

# **Part 5: Arithmetic with embeddings vectors**

In [None]:
# king - man + woman
analogyVector = df[' king'] - df[' man'] + df[' woman']

# cosine similarity between the analogy vector and every row of the embeddings
# matrix; cosine_similarity expects 2D inputs, and squeeze() reduces the
# 1-by-N result to a vector with one value per token
sim2all = cosine_similarity(analogyVector.values.reshape(1,-1),embeddings).squeeze()

# wide panel (first two grid columns) for the scatter, narrow panel for the histogram
fig = plt.figure(figsize=(12,3.5))
gs = gridspec.GridSpec(1,3,figure=fig)
ax1 = fig.add_subplot(gs[:-1])
ax2 = fig.add_subplot(gs[-1])

ax1.scatter(np.arange(len(sim2all)),sim2all,s=4,alpha=.4)
ax1.set(xlabel='Token index',ylabel='Cosine similarity',
        title='A) Cosine similarity with analogy vector')

ax2.hist(sim2all,bins=80)
ax2.set(xlabel='Cosine similarity',ylabel='Count',
        title='B) Distribution of similarities')

plt.tight_layout()
plt.savefig('ch3_proj15_part5.png')
plt.show()

In [None]:
# print out the top 10 highest scores (argsort is ascending, so take the last
# 10 indices and reverse them to get highest-first)
top10 = sim2all.argsort()[-10:][::-1]

print(' CosSim  |   R^2   |    word')
print('---------+---------+-------------')
for widx in top10:
  # correlation (square it to get shared variance)
  r = np.corrcoef(analogyVector,embeddings[widx])[0,1]
  # int() converts the NumPy integer index to a plain Python int before decoding
  print(f'  {sim2all[widx]:.3f}  | {100*r**2:5.1f}%  |  "{tokenizer.decode([int(widx)])}"')

# **Part 6: An analogy-completing function**

In [None]:
def analogyCalculator(word2start,word2subtract,word2add):
  """
  Complete the analogy "word2start is to word2subtract as _____ is to word2add".

  The analogy vector is
      embedding(word2start) - embedding(word2subtract) + embedding(word2add)
  It is compared (cosine similarity) with every row of the global `embeddings`
  matrix, and the 10 most similar tokens are printed as a table together with
  the shared variance (squared correlation, in percent).

  Parameters
  ----------
  word2start : str
    Base word of the analogy.
  word2subtract : str
    Word whose embedding is subtracted from the base word.
  word2add : str
    Word whose embedding is added.

  Each word is passed to the global `tokenizer` exactly as given (a leading
  space is part of the text that gets encoded) and must map to one token.

  Returns
  -------
  None. The analogy and the results table are printed.

  Raises
  ------
  ValueError
    If a word is not encoded as exactly one token, or if one of the tokens is
    the tokenizer's unknown token.
  """

  # 1) print the analogy
  print(f'"{word2start}" is to "{word2subtract}" as "_____" is to "{word2add}"\n')

  # 2) tokenize the words (no special tokens, so only the word's own tokens are counted)
  tokens = [ tokenizer.encode(w,add_special_tokens=False)
             for w in (word2start,word2subtract,word2add) ]

  # 3) check that each word is one token
  if any(len(tok)!=1 for tok in tokens):
    raise ValueError("Warning: too many tokens.")

  # transform into single list
  tokens = [ tok[0] for tok in tokens ]

  # check for unknown tokens (<unk>)
  if tokenizer.unk_token_id in tokens:
    # one formatted message (passing two arguments to ValueError shows a tuple)
    raise ValueError(f'Unknown token: {tokenizer.decode(tokens)}')

  # 4) get the vectors
  v1 = embeddings[tokens[0]] # base word
  v2 = embeddings[tokens[1]] # to subtract
  v3 = embeddings[tokens[2]] # to add

  # 5) analogy vector
  analogyVector = v1 - v2 + v3

  # 6) cossim with all (inputs must be 2D; squeeze gives one value per token)
  cossim2all = cosine_similarity(analogyVector.reshape(1,-1),embeddings).squeeze()

  # 7) print out the top 10 highest scores (ascending argsort -> last 10, reversed)
  top10 = cossim2all.argsort()[-10:][::-1]
  print('  CosSim  |   R^2   |    word')
  print('----------+---------+-------------')
  for widx in top10:
    # correlation (square it to get shared variance)
    r = np.corrcoef(analogyVector,embeddings[widx])[0,1]
    # int() converts the NumPy integer index to a plain Python int before decoding
    print(f'  {cossim2all[widx]:6.3f}  |  {100*r**2:4.1f}%  |  "{tokenizer.decode([int(widx)])}"')


In [None]:
# try it: " king" - " man" + " woman"
analogyCalculator(word2start=' king',word2subtract=' man',word2add=' woman')

In [None]:
# More analogies to try: uncomment one line at a time and run the cell.
# Note that some of these calls put a leading space in each word and others do not.
# analogyCalculator(' tree',' leaf',' petals')
# analogyCalculator('tree','leaf','petals')
# analogyCalculator('leaf','tree','flower') # turn it around for better results?
# analogyCalculator(' husky',' dog',' bird')
# analogyCalculator('finger','hand','foot')
# analogyCalculator(' shoe',' foot',' hand')
# analogyCalculator(' hand',' glove',' shoe')
# analogyCalculator('tomorrow','future','past')
# analogyCalculator('pants','legs','arms')