|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[11] Graphs of cosine similarity</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# draw inline figures as SVG so they stay sharp at any zoom level
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: uncomment these assignments to enable
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right and top axis spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when figures are written to disk
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Tokens, embeddings, and cosine similarity**

In [None]:
from transformers import BertTokenizer, BertModel

# the tokenizer and the model are loaded from the same pretrained checkpoint
checkpoint = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

# pull the token-embedding weights out of the model as a plain NumPy array
# (detach() drops the autograd graph so the tensor can be converted)
embedding_layer = model.embeddings.word_embeddings
embeddings = embedding_layer.weight.detach().numpy()

In [None]:
# the sentence to analyze
text = 'As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic bug'

# convert the text to token indices
# add_special_tokens=False keeps [CLS]/[SEP] out of the list, so every entry
# corresponds to a piece of the text itself
# NOTE(review): drop that argument if the book's figures include the special tokens
tokens = tokenizer.encode(text,add_special_tokens=False)

# total count vs. number of distinct token indices
print(f'There are {len(tokens)} tokens in the text, {len(set(tokens))} of which are unique.\n')

# show each token index next to its decoded string
for t in tokens:
  print(f'Token index {t:5} is "{tokenizer.decode([t])}"')

In [None]:
# one embedding vector (row) per token in the text
E = embeddings[tokens,:]

# normalize each vector to its norm (unit length)
# keepdims=True keeps the norms as a column so the division broadcasts row-wise
E_norm = E / np.linalg.norm(E,axis=1,keepdims=True)

# cosine similarity matrix
# (dot products between unit-length vectors are cosine similarities)
csM = E_norm @ E_norm.T

# get a vector of all unique matrix elements
# the matrix is symmetric with self-similarities on the diagonal, so the
# strict upper triangle (k=1) holds each token pair exactly once
uniqueCS = csM[np.triu_indices(len(tokens),k=1)]

# here's the threshold!
# NOTE(review): median + 1 std is a judgment call; replace with the book's rule if it differs
thresh = np.median(uniqueCS) + np.std(uniqueCS)

In [None]:
# visualize
fig,axs = plt.subplots(1,2,figsize=(12,4))

# panel A: the full token-by-token similarity matrix
h = axs[0].imshow(csM,vmin=-.7,vmax=.7,cmap='plasma')
axs[0].set(xlabel='Text token position',ylabel='Text token position',
           xticks=range(0,len(tokens),2),yticks=range(1,len(tokens),2),
           title='A) Cosine similarity matrix')
fig.colorbar(h,ax=axs[0],fraction=.046,pad=.02)

# panel B: histogram of the unique pairwise similarities, with the threshold marked
# (both artists carry labels so that legend() has entries to show)
axs[1].hist(uniqueCS,bins=30,color=[.7,.7,.7],edgecolor='k',label='Unique similarities')
axs[1].axvline(thresh,color='r',linestyle='--',linewidth=2,label=f'Threshold ({thresh:.2f})')
axs[1].legend()
axs[1].set(xlabel='Cosine similarity value',ylabel='Count',title='B) Distribution and threshold')

plt.tight_layout()
plt.savefig('ch3_proj11_part1.png')
plt.show()

# **Part 2: The network similarity graph**

In [None]:
# values for theta
# one angle per token, evenly spaced around the circle; endpoint=False keeps
# the last token from landing on top of the first
N = len(tokens)
th = np.linspace(0,2*np.pi,N,endpoint=False)

# largest observed cossim value
# taken from the strict upper triangle so the diagonal self-similarities are excluded
maxcs = np.max(csM[np.triu_indices(N,k=1)])

# create a figure
plt.figure(figsize=(8,8))

# faint reference circle that the tokens sit on
thHighres = np.linspace(0,2*np.pi,100)
plt.plot(np.cos(thHighres),np.sin(thHighres),color=[.8,.8,.8])

# loop over tokens
for i in range(N):

  # position of this token on the unit circle
  xi,yi = np.cos(th[i]),np.sin(th[i])

  # determine dot (marker) size
  # grows with the number of other tokens whose similarity exceeds the threshold
  # NOTE(review): the sizing rule is a choice; adjust to match the book if needed
  nLinks = np.sum(np.delete(csM[i,:],i)>thresh)
  dotsize = 8 + 4*nLinks

  # plot the dot
  plt.plot(xi,yi,'ko',markerfacecolor=[.7,.7,.7],markersize=dotsize)

  # draw text at the tokens
  # labels are pushed slightly outside the circle and aligned to extend away from it
  plt.text(1.05*xi,1.05*yi,tokenizer.decode([tokens[i]]),
           ha='left' if xi>=0 else 'right', # pick a side based on cosine sign
           va='bottom' if yi>=0 else 'top', # pick a vertical orientation from sine
           fontweight='bold',fontsize=16)

  # loop over all the other tokens
  for j in range(i+1,N):

    # only draw a line if similarity exceeds the threshold
    if csM[i,j]>thresh:

      # color corresponding to cossim strength
      # scaled by the largest off-diagonal value, so the strongest link maps to 1
      cidx = csM[i,j]/maxcs
      color = plt.cm.plasma(cidx)

      # draw it!
      x_vals = [ np.cos(th[i]),np.cos(th[j]) ]
      y_vals = [ np.sin(th[i]),np.sin(th[j]) ]
      plt.plot(x_vals,y_vals,zorder=-10,color=color,linewidth=3*csM[i,j])

plt.axis('off')

plt.savefig('ch3_proj11_part2.png')
plt.show()

# **Part 3: Graphing number networks**

In [None]:
# tokenize and confirm single-token encoding
# NOTE(review): the number list is not given in this skeleton; '1'..'10' is a
# placeholder -- swap in the list used in the book
numbers = [str(n) for n in range(1,11)]
tokens = tokenizer.encode(' '.join(numbers),add_special_tokens=False)

# if any number were split into several tokens, the counts would differ
assert len(tokens)==len(numbers), 'At least one number was split into multiple tokens.'

for t in tokens:
  print(f'{t} is "{tokenizer.decode([t])}"')

In [None]:
# cosine similarity matrix
# rows of E are the embedding vectors of the current tokens
E = embeddings[tokens,:]
# scale each row to unit length (keepdims=True makes the division broadcast row-wise)
E_norm = E / np.linalg.norm(E,axis=1,keepdims=True)
# dot products between unit-length vectors are cosine similarities
csM = E_norm @ E_norm.T

In [None]:
# values for theta
# one angle per token, evenly spaced around the circle; endpoint=False keeps
# the last token from landing on top of the first
N = len(tokens)
th = np.linspace(0,2*np.pi,N,endpoint=False)

# largest off-diagonal similarity (the diagonal self-similarities are excluded)
maxcs = np.max(csM[np.triu_indices(N,k=1)])

# create a figure
plt.figure(figsize=(8,8))

# faint reference circle that the tokens sit on
thHighres = np.linspace(0,2*np.pi,100)
plt.plot(np.cos(thHighres),np.sin(thHighres),color=[.8,.8,.8])

# loop over tokens
for i in range(N):

  # position of this token on the unit circle
  xi,yi = np.cos(th[i]),np.sin(th[i])

  # plot the dot
  plt.plot(xi,yi,'ko',markerfacecolor=[.7,.7,.7],markersize=15)

  # draw text at the tokens
  # labels are pushed slightly outside the circle and aligned to extend away from it
  plt.text(1.05*xi,1.05*yi,tokenizer.decode([tokens[i]]),
           ha='left' if xi>=0 else 'right',
           va='bottom' if yi>=0 else 'top',
           fontweight='bold',fontsize=16)

  # loop over all the other tokens
  for j in range(i+1,N):

    # color corresponding to cossim strength
    # scaled by the largest off-diagonal value, so the strongest link maps to 1
    cidx = csM[i,j]/maxcs
    color = plt.cm.plasma(cidx)

    # draw it!
    x_vals = [ np.cos(th[i]),np.cos(th[j]) ]
    y_vals = [ np.sin(th[i]),np.sin(th[j]) ]
    plt.plot(x_vals,y_vals,zorder=-10,color=color)

plt.axis('off')

plt.tight_layout()
plt.savefig('ch3_proj11_part3.png')
plt.show()