|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[11] Graphs of cosine similarity</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# render inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: uncomment this block to enable
# plt.rcParams.update({
#     'figure.facecolor': '#282a2c',
#     'figure.edgecolor': '#282a2c',
#     'axes.facecolor':   '#282a2c',
#     'axes.edgecolor':   '#DDE2F4',
#     'axes.labelcolor':  '#DDE2F4',
#     'xtick.color':      '#DDE2F4',
#     'ytick.color':      '#DDE2F4',
#     'text.color':       '#DDE2F4',
# })

# hide the right and top axis spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when figures are written to disk
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Tokens, embeddings, and cosine similarity**

In [None]:
from transformers import BertTokenizer, BertModel

# the tokenizer and the model are loaded from the same pretrained checkpoint
checkpoint = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# pull the token-embedding weight matrix out of the model as a numpy array
# (detached from the autograd graph first, which .numpy() requires)
embeddings = (
    model.embeddings.word_embeddings
    .weight
    .detach()
    .numpy()
)

In [None]:
# the sentence to analyze
text = 'As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic bug'

# convert to token indices (without the special tokens the tokenizer would otherwise add)
tokens = tokenizer.encode(text,add_special_tokens=False)

# summary: total vs. unique token counts
nTokens = len(tokens)
nUnique = len(set(tokens))
print(f'There are {nTokens} tokens in the text, {nUnique} of which are unique.\n')

# list each token index next to its decoded string
for tokenIdx in tokens:
  decoded = tokenizer.decode(tokenIdx)
  print(f'Token index {tokenIdx:5} is "{decoded}"')

In [None]:
# embedding vectors of the text tokens (one row per token)
E = embeddings[tokens,:]

# normalize each vector to its norm (unit length)
E_norm = E / np.linalg.norm(E,axis=1,keepdims=True)

# cosine similarity matrix (dot products between unit-length vectors)
csM = E_norm @ E_norm.T

# get a vector of all unique matrix elements: the strict upper triangle,
# i.e., each token pair once and no self-similarities.
# The elements are selected by *position*; selecting via np.nonzero(np.triu(csM,1))
# would silently drop any pair whose similarity happens to be exactly 0.
uniqueCS = csM[np.triu_indices_from(csM,k=1)]

# here's the threshold! (one standard deviation above the median)
thresh = np.median(uniqueCS) + np.std(uniqueCS)

In [None]:
# visualize
fig,axs = plt.subplots(1,2,figsize=(12,4))

h = axs[0].imshow(csM,vmin=-.7,vmax=.7,cmap='plasma')
axs[0].set(xlabel='Text token position',ylabel='Text token position',
           xticks=range(0,len(tokens),2),yticks=range(1,len(tokens),2),
           title='A) Cosine similarity matrix')
fig.colorbar(h,ax=axs[0],fraction=.046,pad=.02)

axs[1].hist(uniqueCS,bins=40,color=[.7,.7,.7],edgecolor='k')
axs[1].axvline(thresh,linestyle='--',color='m',label='Threshold (median + std)')

axs[1].legend()
axs[1].set(xlabel='Cosine similarity value',ylabel='Count',title='B) Distribution and threshold')

plt.tight_layout()
plt.savefig('ch3_proj11_part1.png')
plt.show()

# **Part 2: The network similarity graph**

In [None]:
# one angle per token, equally spaced around the circle
N = len(tokens)
th = np.linspace(0,2*np.pi-2*np.pi/N,N)

# top of the color scale: the largest similarity below .99
# (the cutoff skips the self-similarities on the diagonal)
belowOne = csM[csM<.99]
maxcs = np.sort(belowOne)[-1]

# square figure with a single axis
fig,ax = plt.subplots(figsize=(8,8))

# outline of the unit circle
circleAngles = np.linspace(0,2*np.pi,100)
ax.plot(np.cos(circleAngles),np.sin(circleAngles),color=[.3,.3,.3])

# one node per token
for i in range(N):

  # node coordinates on the circle
  xi,yi = np.cos(th[i]),np.sin(th[i])

  # marker size scales with the number of supra-threshold similarities in this
  # token's row of the matrix (the row includes the token's self-similarity)
  nStrong = np.sum(csM[i]>thresh)
  ax.plot(xi,yi,'ko',markerfacecolor=[.7,.7,.7],markersize=5*np.sqrt(nStrong))

  # token label, anchored so that the text extends away from the circle center
  hAlign = 'left' if xi>0 else 'right'
  vAlign = 'bottom' if yi>0 else 'top'
  ax.text(xi,yi,tokenizer.decode([tokens[i]]),
          ha=hAlign,va=vAlign,fontweight='bold',fontsize=16)

  # edges to the later tokens (each pair is visited once)
  for j in range(i+1,N):

    # skip pairs that don't exceed the threshold
    if not csM[i,j]>thresh:
      continue

    # position of this similarity between the threshold and the maximum -> line color
    cidx = (csM[i,j]-thresh) / (maxcs-thresh)

    # draw the edge behind the nodes, with width proportional to the similarity
    ax.plot([xi,np.cos(th[j])],[yi,np.sin(th[j])],
            zorder=-10,color=plt.cm.plasma(cidx),linewidth=3*csM[i,j])

ax.axis('off')

fig.savefig('ch3_proj11_part2.png')
plt.show()

# **Part 3: Graphing number networks**

In [None]:
# tokenize and confirm single-token encoding
text = [str(i) for i in range(10)] + [str(i*10) for i in range(1,11)]
tokens = tokenizer.encode(text,add_special_tokens=False)

for t in tokens:
  print(f'{t} is "{tokenizer.decode(t)}"')

In [None]:
# cosine similarity matrix
E = embeddings[tokens,:]
E_norm = E / np.linalg.norm(E,axis=1,keepdims=True)
csM = E_norm @ E_norm.T

In [None]:
# values for theta
N = len(tokens)
th = np.linspace(0,2*np.pi-2*np.pi*(1/N),N)

maxcs = np.sort(csM[csM<.99])[-1]

# create a figure
plt.figure(figsize=(8,8))

thHighres = np.linspace(0,2*np.pi,100)
plt.plot(np.cos(thHighres),np.sin(thHighres),color=[.3,.3,.3])

# loop over tokens
for i in range(N):

  # plot the dot
  plt.plot(np.cos(th[i]),np.sin(th[i]),'ko',markerfacecolor=[.7,.7,.7],markersize=24)

  # draw text at the tokens
  plt.text(np.cos(th[i]),np.sin(th[i]),tokenizer.decode([tokens[i]]),
           ha='center',va='center',fontweight='bold',fontsize=12,color='k')

  # loop over all the other tokens
  for j in range(i+1,N):

    # color corresponding to cossim strength
    cidx = (csM[i,j]-csM.min()) / (maxcs-csM.min())
    color = plt.cm.plasma(cidx)

    # draw it!
    x_vals = [ np.cos(th[i]),np.cos(th[j]) ]
    y_vals = [ np.sin(th[i]),np.sin(th[j]) ]
    plt.plot(x_vals,y_vals,zorder=-10,color=color)

plt.axis('off')

plt.tight_layout()
plt.savefig('ch3_proj11_part3.png')
plt.show()