|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[9] Sequential word cosine similarity</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# have the inline backend emit SVG figures (sharper than the default raster output)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# global figure styling applied to every plot in this notebook;
# uncomment the color entries below to switch to a dark theme
plot_style = {
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',
    'axes.spines.right': False,   # hide the right axis spine
    'axes.spines.top':   False,   # hide the top axis spine
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'savefig.dpi':300,            # resolution used when figures are saved to disk
}
plt.rcParams.update(plot_style)

# **Part 1: Cosine similarity of sequential tokens**

In [None]:
# load the pretrained BERT tokenizer and model (both from the same checkpoint)
from transformers import BertTokenizer, BertModel

checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# the token-embeddings matrix: one row per vocabulary entry,
# detached from the autograd graph because it is only read here
word_embedding_layer = model.embeddings.word_embeddings
embeddings = word_embedding_layer.weight.detach()

In [None]:
sentence = "my phone is in the kitchen near the cold ice cream"

# token ids of the sentence, dropping the leading cls and trailing sep tokens
tokens = tokenizer.encode(sentence)[1:-1]

# one similarity value per token; index 0 stays NaN because the
# first token has no predecessor to compare against
cossim = np.full(len(tokens),np.nan)

# walk over adjacent (previous, current) token pairs and compare their embedding rows
for pos,(prev_id,curr_id) in enumerate(zip(tokens[:-1],tokens[1:]),start=1):
  curr_vec = embeddings[curr_id,:].reshape(1,-1)
  prev_vec = embeddings[prev_id,:].reshape(1,-1)
  cossim[pos] = cosine_similarity(curr_vec,prev_vec).item()


# bar plot: one bar per token, labeled with the decoded token text
fig,ax = plt.subplots(figsize=(12,4))
ax.bar(np.arange(len(cossim)),cossim,facecolor=[.7,.7,.9],edgecolor='k')

token_labels = [tokenizer.decode(t) for t in tokens]
ax.set(xticks=range(len(tokens)),xticklabels=token_labels,
       xlim=[-.5,len(tokens)-.5],ylabel='Cosine similarity')
ax.tick_params(axis='x',rotation=-45)

ax.set_title('Cosine similarities of sequential token embeddings',fontweight='bold')

fig.tight_layout()
fig.savefig('ch3_proj9_part1.png')
plt.show()

# **Part 2: Are embedding vectors context-dependent?**

In [None]:
sentences = [
    'The conductor waved his hands as the train departed and people sat down',
    'The conductor waved his hands as the orchestra began and people sat down'
]


# both sentences are drawn into the same axes so their bars can be compared
fig,ax = plt.subplots(figsize=(12,4))

for sent_idx,text in enumerate(sentences):

  # token ids of this sentence without the first and last (special) tokens
  tokens = tokenizer.encode(text)[1:-1]

  # similarity per token; the first entry stays NaN (no preceding token)
  cossim = np.full(len(tokens),np.nan)

  # compare every token's embedding row with that of the token before it;
  # the lookup uses only the token id, not the surrounding words
  for pos,(prev_id,curr_id) in enumerate(zip(tokens[:-1],tokens[1:]),start=1):
    curr_vec = embeddings[curr_id,:].reshape(1,-1)
    prev_vec = embeddings[prev_id,:].reshape(1,-1)
    cossim[pos] = cosine_similarity(curr_vec,prev_vec).item()


  # shift each sentence's bars horizontally so the two series sit side by side;
  # the legend entry is the decoded text of tokens 7-8
  # NOTE(review): assumes one token per word, so that indices 7 and 8 are the
  # two words that differ between the sentences -- confirm if sentences change
  bar_x = np.arange(0,len(cossim))+sent_idx/4-.1
  ax.bar(bar_x,cossim,width=.5,edgecolor='k',
         label=tokenizer.decode(tokens[7:9]))



# finish the plot
ax.axhline(0,linestyle='--',color='k',linewidth=.5,zorder=-3)

# tick labels come from the last sentence processed; the two positions
# that differ between the sentences get a combined two-line label
xticklabs = [tokenizer.decode(t) for t in tokens]
xticklabs[7] = 'train/\norchestra'
xticklabs[8] = 'departed/\nbegan'
ax.set(xticks=range(len(tokens)),xticklabels=xticklabs,
       xlim=[-.5,len(tokens)-.5],ylabel='Cosine similarity')
ax.tick_params(axis='x',rotation=-45)
ax.legend()
ax.set_title('Cosine similarities of sequential token embeddings',fontweight='bold')

fig.tight_layout()
fig.savefig('ch3_proj9_part2.png')
plt.show()