|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[10] Sequential number cosine similarity</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

In [None]:
### matplotlib display settings (the commented-out assignments switch to dark mode)

# render inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors (uncomment to enable)
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right and top axis borders
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold axes titles and axis labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used when figures are written with savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Token counts of numbers**

In [None]:
# load the pretrained GPT2 tokenizer and model
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModel.from_pretrained('gpt2')

# the token embeddings matrix: weights of the model's `wte` layer,
# detached so that later computations do not track gradients
embeddings = model.wte.weight.detach()

In [None]:
# the integers -1000 through +1000 (inclusive) to tokenize
numbers = np.arange(-1000,1001)

# token indices of each number's string form (one list per number)
token_idx = [tokenizer.encode(str(num)) for num in numbers]

# how many tokens each number was split into
num_tokens = np.array([len(ids) for ids in token_idx],dtype=int)


# token count per number; a little random vertical jitter keeps
# overlapping markers distinguishable
plt.figure(figsize=(12,3))
plt.plot(numbers,num_tokens+np.random.normal(0,.03,len(numbers)),
         'ko',markerfacecolor='w',markersize=4,alpha=.5)
plt.xlabel('Number')
plt.ylabel('Token count')
plt.yticks(np.unique(num_tokens))
plt.xlim([numbers[0]-30,numbers[-1]+30])

plt.tight_layout()
plt.savefig('ch3_proj10_part1a.png')
plt.show()

In [None]:
plt.figure(figsize=(12,3))
colord = 'bgr'
shape = 'os^'

# draw every token of every number; marker shape and face color encode the
# token's position (first/second/third) within that number
for num,ids in zip(numbers,token_idx):
  for pos,tok in enumerate(ids):
    plt.plot(num,tok,'k',marker=shape[pos],markerfacecolor=colord[pos],
             markersize=6,alpha=.4,linewidth=.4)


# hacky legend solution: invisible (alpha=0) proxy markers carry the labels...
for mark,color,label in zip(shape,colord,['First','Second','Third']):
  plt.plot(0,0,'w',marker=mark,markerfacecolor=color,
           markersize=10,alpha=0,linewidth=.5,label=label)
h = plt.legend()

# ...and their legend entries are then made fully opaque
for handle in h.legend_handles:
  handle.set_alpha(1)

plt.xlabel('Number')
plt.ylabel('Token index')
plt.ylim([-1000,51000])
plt.xlim([numbers[0]-30,numbers[-1]+30])

plt.tight_layout()
plt.savefig('ch3_proj10_part1b.png')
plt.show()

# **Part 2: Opposite sign, equal token indices**

In [None]:
# one entry per (negative,positive) pair: (-1000,1000), (-999,999), ..., (-1,1)
matches_count = np.zeros(1000)
matches_vals = np.zeros(1000)

for i in range(1000):

  # position (counted from the end) of the positive number paired with numbers[i]
  mirror = -i-1

  # text versions of the number pairing and token counts (used by the debug print below)
  pair_txt = f'({numbers[i]:>5},{numbers[mirror]:>4})'
  count_txt = f'({num_tokens[i]-1},{num_tokens[mirror]})'

  # does the negative number use exactly one more token than the positive one?
  same_count = (num_tokens[i]-1) == num_tokens[mirror]

  # after dropping the negative number's first token (presumably the minus sign),
  # are the remaining token indices identical to the positive number's?
  same_vals = token_idx[i][1:] == token_idx[mirror]

  matches_count[i] = same_count
  matches_vals[i] = same_vals

  # for confirmation during code development
  # print(pair_txt,count_txt)

# number of pairs with matching counts, and with matching token indices
(matches_count.sum(), matches_vals.sum())

# **Part 3: Successive digit cosine similarity**

In [None]:
# print the token indices of the integers 0 through 10 to check that each is a single token
for digit in map(str,range(11)):
  print(f'The token(s) for "{digit}" are: {tokenizer.encode(digit)}')

In [None]:
# first token of each integer 0 through 10
tokens = [tokenizer.encode(str(i))[0] for i in range(11)]

# one similarity value per token; entry 0 stays NaN because it has no predecessor
cossim = np.full(len(tokens),np.nan)

# cosine similarity between each token's embedding and that of the token before it
for ti,(prev_tok,this_tok) in enumerate(zip(tokens[:-1],tokens[1:]),start=1):
  v1 = embeddings[this_tok,:]
  v2 = embeddings[prev_tok,:]

  # dot product divided by the product of the two vector magnitudes
  dot_prod = torch.sum(v1*v2)
  magnitudes = torch.sqrt( torch.sum(v1**2)*torch.sum(v2**2) )
  cossim[ti] = dot_prod / magnitudes


# plot! (each bar is labeled with the decoded text of its token)
plt.figure(figsize=(12,4))
plt.bar(np.arange(len(cossim)),cossim,facecolor=[.7,.7,.9],edgecolor='k')
plt.xticks(range(len(tokens)),[tokenizer.decode(t) for t in tokens])
plt.xlabel('Digit')
plt.ylabel('Cosine similarity')
plt.xlim([-.5,len(tokens)-.5])
plt.ylim([np.nanmin(cossim)-.05,np.nanmax(cossim)+.05])

plt.title('Cosine similarities of sequential token embeddings',fontweight='bold')

plt.tight_layout()
plt.savefig('ch3_proj10_part3.png')
plt.show()

# **Part 4: All to all number similarities**

In [None]:
# the numbers to compare, as text: 0-9 followed by the multiples of ten from 10 to 100
nums = [str(i) for i in range(10)] + [str(i*10) for i in range(1,11)]

# Tokenize each number string on its own and keep its first token.
# NOTE: handing the whole list to a single encode() call is avoided on purpose:
# encode() documents a list of strings as one already-tokenized sequence rather
# than as a batch of separate texts, so that form only gives the intended result
# when every string happens to be a vocabulary token.
tokens = [tokenizer.encode(num)[0] for num in nums]

# confirm single tokens: the decoded text of each token should be the full number
for t in tokens:
  print(f'{t:5} is "{tokenizer.decode(t)}"')

In [None]:
# embeddings of the selected tokens, with each row scaled to unit L2 norm
E = embeddings[tokens,:]
E_norm = F.normalize(E,p=2,dim=1)

# dot products between unit-norm rows = all-to-all cosine similarities
csM = torch.matmul(E_norm,E_norm.T)

# visualize (tick labels alternate: even positions on the x-axis, odd positions on the y-axis)
plt.imshow(csM,vmin=.2,vmax=.9,cmap='magma')
plt.xticks(range(0,len(nums),2),nums[::2])
plt.yticks(range(1,len(nums),2),nums[1::2])
plt.title('Cosine similarity in number pairs')
plt.colorbar(pad=.02)

plt.tight_layout()
plt.savefig('ch3_proj10_part4.png')
plt.show()