|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[12] Word similarity via distance and cosine</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# svg plots (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# gather all style overrides in one dict, then apply them in a single call
plot_style = {

    # dark-mode colors (uncomment to enable)
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',

    # hide the right and top axis spines
    'axes.spines.right': False,
    'axes.spines.top':   False,

    # bold titles and axis labels
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',

    # resolution of saved figures
    'savefig.dpi': 300,
}
plt.rcParams.update(plot_style)

# **Part 1: Cosine similarity vs. Euclidean distance**

In [None]:
# simulation parameters
M = 768   # number of elements in each random vector
k = 1000  # number of simulation iterations

# initializations (rows are iterations, columns are the three cases)
cs = np.zeros((k,3))
dist = np.zeros((k,3))

# loop over simulation iterations
for simi in range(k):

  # create the data
  x = np.random.normal(0,1,M)
  y = np.random.normal(0,1,M)

  # case 1: normal random
  normprod = np.linalg.norm(x) * np.linalg.norm(y)
  dist[simi,0] = np.sqrt( np.square(x-y).sum() )
  cs[simi,0] = np.dot(x,y) / normprod

  # case 2: different variances (x is shrunk by a factor of 10, y is untouched)
  x = x/10
  normprod = np.linalg.norm(x) * np.linalg.norm(y)
  dist[simi,1] = np.sqrt( np.square(x-y).sum() )
  cs[simi,1] = np.dot(x,y) / normprod

  # case 3: normalized (both vectors rescaled to unit norm)
  x = x/np.linalg.norm(x)
  y = y/np.linalg.norm(y)
  dist[simi,2] = np.sqrt( np.square(x-y).sum() )
  cs[simi,2] = np.dot(x,y) # need the norm-scaling? (both norms are 1 here)


# the first plot: all three cases overlaid on one axis
# (one entry per case, in the same order as the columns of dist and cs)
case_styles = [
  ('rh',[.9,.7,.7,.5],'Case 1: randn'),
  ('go',[.7,.9,.7,.5],'Case 2: Unequal var.'),
  ('bs',[.7,.7,.9,.5],'Case 3: Normed'),
]
for casei,(fmt,facecolor,label) in enumerate(case_styles):
  plt.plot(dist[:,casei],cs[:,casei],fmt,markerfacecolor=facecolor,label=label)

ax = plt.gca()
ax.set(xlabel='Euclidean distance',ylabel='Cosine similarity')
ax.legend()

plt.tight_layout()
plt.savefig('ch3_proj13_part1a.png')
plt.show()

In [None]:
# the second plot: same data as the first plot, but one panel per case
panel_specs = [
  ('rh',[.9,.7,.7,.7],'A) Case 1: randn'),
  ('go',[.7,.9,.7,.7],'B) Case 2: Unequal var.'),
  ('bs',[.7,.7,.9,.7],'C) Case 3: Normed'),
]

_,axs = plt.subplots(1,3,figsize=(10,3))

# panel order matches the column order of dist and cs
for casei,(fmt,facecolor,title) in enumerate(panel_specs):
  axs[casei].plot(dist[:,casei],cs[:,casei],fmt,markerfacecolor=facecolor)
  axs[casei].set(xlabel='Euclidean distance',ylabel='Cosine similarity',title=title)

plt.tight_layout()
plt.savefig('ch3_proj13_part1b.png')
plt.show()

# **Part 2: Relationship between similarity and distance**

In [None]:
# apply equation 1.13 to Case 3 (both vectors normed):
# error = measured cosine similarity minus (1 - distance^2/2)
sim_normed = cs[:,2]
dist_normed = dist[:,2]
eq13 = sim_normed - (1 - dist_normed**2/2)

_,ax = plt.subplots(figsize=(8,4))
ax.plot(eq13,'k.')
ax.set(xlabel='Simulation number',ylabel='Error')

# note the y-axis, then use the same y-axis as in Part 1
# plt.ylim([-.1,.1])

plt.tight_layout()
plt.savefig('ch3_proj13_part2a.png')
plt.show()

In [None]:
# errors (difference between RHS and LHS), one value per simulation iteration
e = np.zeros(k)

# loop over simulation iterations
for simi in range(k):

  # create the data
  x = np.random.normal(0,1,M)
  y = np.random.normal(0,1,M)

  # case 1: normal random (Euclidean distance and cosine similarity)
  d = np.sqrt( np.square(x-y).sum() )
  s = np.dot(x,y) / (np.linalg.norm(x)*np.linalg.norm(y))

  # squared norms, as the dot product of each vector with itself
  xtx = x@x
  yty = y@y

  # Equation 1.14
  den = np.sqrt(xtx) * np.sqrt(yty)
  rhs = xtx/den + yty/den - d**2/den
  e[simi] = 2*s - rhs

  # Equation 1.15 (comment out to run 1.14)
  # NOTE: while active, this line overwrites the Equation 1.14 error stored above
  e[simi] = 2*x@y - (xtx + yty - d**2)


# and plot the per-iteration errors
_,ax = plt.subplots(figsize=(8,4))
ax.plot(e,'k.')
ax.set(xlabel='Simulation number',ylabel='Error')

plt.tight_layout()
plt.savefig('ch3_proj13_part2b.png')
plt.show()

# **Part 3: Word synonyms via distance and similarity**

In [None]:
from transformers import BertTokenizer, BertModel

# load BERT tokenizer and model (both from the same pretrained checkpoint)
checkpoint = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# extract embeddings (the word-embedding weight matrix, detached from autograd)
embeddings = model.embeddings.word_embeddings.weight.detach()

# vocab size (number of rows in the embeddings matrix)
n_vocab = embeddings.size(0)

In [None]:
# pick a "seed" vector
seedword = 'beauty'
seedtoken = tokenizer.encode(seedword,add_special_tokens=False)

print(f'The token "{seedword}" comprises these token indices: {seedtoken}')

In [None]:
# embedding of the seed token
# NOTE(review): the subtraction below assumes seedtoken selects a single row,
# so that seedvect broadcasts against every row of embeddings -- confirm for other seed words
seedvect = embeddings[seedtoken,:]

# Euclidean distance to all other vectors
diffs = embeddings - seedvect
eucDist = diffs.pow(2).sum(dim=1).sqrt()

# cosine similarity to all other vectors
cossim = torch.cosine_similarity(seedvect,embeddings)

# remove trivial values (smallest distance and largest similarity become NaN)
eucDist[eucDist.argmin()] = torch.nan
cossim[cossim.argmax()] = torch.nan

# min-max scaling for coloring the scatter plot
# (NaN-ignoring extrema, each computed once)
dist_lo,dist_hi = np.nanmin(eucDist),np.nanmax(eucDist)
eucDist_minmax = (eucDist-dist_lo) / (dist_hi-dist_lo)

sim_lo,sim_hi = np.nanmin(cossim),np.nanmax(cossim)
cossim_minmax = (cossim-sim_lo) / (sim_hi-sim_lo)

In [None]:
# visualizations: distance, similarity, and their relationship (one panel each)
_,axs = plt.subplots(1,3,figsize=(15,4))
ax_dist,ax_sim,ax_rel = axs

# x-axis values and limits shared by the two per-token panels
tokenidx = range(n_vocab)
xlims = [-20,n_vocab+20]

# plot the Euclidean distances
ax_dist.scatter(tokenidx,eucDist,s=20,c=cossim_minmax,cmap=plt.cm.plasma,alpha=.4)
ax_dist.set(xlim=xlims,xlabel='Token index',ylabel='Euclidean distance',
            title=f'Distance to "{seedword}",\ncolored by cosine similarity')

# plot the cosine similarities
ax_sim.scatter(tokenidx,cossim,s=20,c=eucDist_minmax,cmap=plt.cm.magma,alpha=.4)
ax_sim.set(xlim=xlims,xlabel='Token index',ylabel='Cosine similarity',
           title=f'Cosine similarity with "{seedword}",\ncolored by distance')

# and their relationship
ax_rel.plot(eucDist,cossim,'ko',markerfacecolor=[.7,.7,.9,.2])
ax_rel.set(xlabel='Euclidean distance',ylabel='Cosine similarity',
           title='Relation between\nS$_C$ and Euclidean distance')

plt.tight_layout()
plt.savefig('ch3_proj13_part3.png')
plt.show()

# **Part 4: Top-k closest and most similar**

In [None]:
# now for the top-k closest tokens
# (note: this overwrites the name k that the simulation cells used as the iteration count)
k = 10

# indices of the k smallest distances (ascending sort, keep the first k)
topKidx = eucDist.argsort()[:k]

print(f'Minimum distance {k} words to "{seedword}":')
for tokidx in topKidx:
  print(f'  Distance of {eucDist[tokidx]:.3f} to "{tokenizer.decode(tokidx)}"')

In [None]:
# now for the top-k most similar tokens
# cossim contains NaN (the seed token's own entry was masked out earlier).
# torch orders NaN above every number, so a descending argsort of cossim would
# put that NaN entry first. Rank NaNs as -inf so that only real similarity
# values can make it into the top-k.
rankable = cossim.masked_fill(torch.isnan(cossim),float('-inf'))
topKidx = torch.argsort(rankable,descending=True)[:k]

print(f'Most similar {k} words to "{seedword}":')
for i in topKidx:
  print(f'  Similarity of {cossim[i]:.3f} to "{tokenizer.decode(i)}"')

In [None]:
# FYI: display the similarities sorted in ascending order (values and indices);
# this returns a new result and leaves cossim itself unchanged.
# Look at where the NaN entry ends up.
torch.sort(cossim)

# **Part 5: Normalized distance**

In [None]:
### run this cell then repeat Parts 3 and 4
### (repeat only the plotting and top-k cells: the Part 3 cell that computes
###  eucDist would overwrite the normalized distances calculated here)

# normalize the embeddings matrix (each row scaled to unit L2 norm)
E_norm = torch.nn.functional.normalize(embeddings,p=2,dim=1)

# Euclidean distance to all other vectors
seed_norm = E_norm[seedtoken,:]
eucDist = (E_norm-seed_norm).pow(2).sum(dim=1).sqrt()

# remove trivial values (distances of exactly zero become NaN)
eucDist[eucDist==0] = torch.nan

# min-max scaling for coloring the scatter plot
# (NaN-ignoring extrema, each computed once)
dist_lo,dist_hi = np.nanmin(eucDist),np.nanmax(eucDist)
eucDist_minmax = (eucDist-dist_lo) / (dist_hi-dist_lo)