|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[4] Token lengths in characters and bytes</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
### matplotlib adjustments (the commented-out entries are the dark-mode settings)

# render inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# gather all overrides in one dictionary, then apply them in a single call
rc_overrides = {
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',

    # hide the right and top axis borders
    'axes.spines.right': False,
    'axes.spines.top':   False,

    # bold titles and axis labels
    'axes.titleweight':  'bold',
    'axes.labelweight':  'bold',

    # resolution used when figures are saved to disk
    'savefig.dpi':       300,
}
plt.rcParams.update(rc_overrides)

# **Demo of counting bytes**

In [None]:
# Sample characters spanning the UTF-8 width range: an ASCII letter, ASCII punctuation,
# an accented Latin letter, an emoji, and a CJK character.
# (The non-ASCII entries had been corrupted into multi-character mojibake, which made
# the printed byte counts wrong; they are restored to the single characters here.)
things = [ 'x','!','ü','🥰','活' ]

# each entry is exactly one character (len(t)==1), but its UTF-8 encoding
# takes between 1 and 4 bytes
for t in things:
  print(f"{len(t.encode('utf-8'))} bytes in {t}")

# **Part 1: Character vs. byte lengths in all tokens**

In [None]:
# GPT2 tokenizer
# `tokenizer` is the object the later cells query for `vocab_size` and call `decode` on
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
# per-token lengths, measured in characters and in UTF-8 bytes
token_lens_chars = np.zeros(tokenizer.vocab_size,dtype=int)
token_lens_bytes = np.zeros(tokenizer.vocab_size,dtype=int)

for i in range(tokenizer.vocab_size):

  # token for this index
  thistoken = tokenizer.decode([i])

  # lengths in vectors
  token_lens_chars[i] = len(thistoken)
  token_lens_bytes[i] = len(thistoken.encode('utf-8'))


# count-by-count matrix: entry [c,b] is the number of tokens with c characters and b bytes.
# The matrix is sized from the observed maxima (rather than a fixed 200x200) so that a
# long token cannot index out of bounds. np.add.at is used because the same (c,b) pair
# occurs many times and each occurrence must be counted.
token_lengths = np.zeros((token_lens_chars.max()+1,token_lens_bytes.max()+1),dtype=int)
np.add.at(token_lengths,(token_lens_chars,token_lens_bytes),1)


### draw a figure
fig,ax = plt.subplots(figsize=(8,5))

# normalization function for mapping log-frequency onto color
norm = mpl.colors.Normalize(vmin=0,vmax=np.log(token_lengths.max()))

# draw the individual points (one marker per occupied cell of the matrix)
x,y = np.nonzero(token_lengths)
for xi,yi in zip(x,y):
  ax.plot(xi,yi,'kh',alpha=.7,markersize=9,
          markerfacecolor=plt.cm.magma(norm(np.log(token_lengths[xi,yi]))))

# colorbar that shares the normalization used for the marker colors
sm = mpl.cm.ScalarMappable(cmap=mpl.cm.magma,norm=norm)
cbar = plt.colorbar(sm,ax=ax,pad=.01)
cbar.set_label('Log frequency')


# draw the line of equality out to the longest character length
# (taken explicitly from x instead of relying on the leftover loop variable)
max_chars = x.max()
ax.plot([0,max_chars],[0,max_chars],'--',color=[.8,.8,.8],zorder=-100)

# prettify the plot
ax.set(xlabel='Token length in characters',
       ylabel='Token length in bytes')

plt.tight_layout()
plt.savefig('ch2_proj4_part1.png')
plt.show()

In [None]:
# indices of the tokens whose character count differs from their byte count
noteq = np.where(token_lens_chars != token_lens_bytes)[0]

# show up to 30 of them; drawing without replacement guarantees no token is listed twice,
# and capping the sample size keeps this cell from failing when there are few (or no) mismatches
n_show = min(30,len(noteq))
print(f'There are {len(noteq)} tokens with unequal lengths.\nHere are {n_show}:\n')

for idx in np.random.choice(noteq,size=n_show,replace=False):
  print(f'{tokenizer.decode([idx])} | ',end='')

# **Part 2: Visualizing token lengths**

In [None]:
fig,ax = plt.subplots(figsize=(12,4))

# how many tokens to draw, counted from index 0 (use a smaller number to zoom in)
max2plot = tokenizer.vocab_size #600

# the character lengths are shifted up by a small amount so that they
# do not sit exactly on top of the byte lengths when the two are equal
chars2plot = token_lens_chars[:max2plot]+.1
bytes2plot = token_lens_bytes[:max2plot]

ax.plot(chars2plot,'rs',label='Characters',markerfacecolor=[.9,.7,.7,.7],markersize=5)
ax.plot(bytes2plot,'bo',label='Bytes',markerfacecolor=[.7,.7,.9,.7],markersize=5)

ax.legend()
ax.set(xlabel='Token index',ylabel='Token length',xlim=[-10,max2plot+10])

fig.tight_layout()
fig.savefig('ch2_proj4_part2.png')
plt.show()

# **Part 3: Finding the "mismatch-length" tokens**

In [None]:
fig,ax = plt.subplots(figsize=(12,4))

# 1 where the two lengths differ, 0 where they agree
is_mismatch = token_lens_chars != token_lens_bytes
yVals = is_mismatch.astype(float)

# a little vertical jitter so that overlapping markers become visible
yVals = yVals + np.random.normal(0,.02,len(yVals))

# Not in the instructions, but also interesting to plot the discrepancy counts
#yVals = token_lens_bytes - token_lens_chars

# one marker per token
ax.plot(yVals,'kh',markerfacecolor=[.9,.7,.7,.3],markersize=5)

# axis cosmetics: replace the numeric y ticks with category labels
ax.set(xlabel='Token index',
       xlim=[-30,tokenizer.vocab_size+30],
       ylim=[-.5,1.5],
       yticks=[0,1],
       yticklabels=['Match','Mismatch'])

fig.tight_layout()
fig.savefig('ch2_proj4_part3.png')
plt.show()

# **Part 4: Length distributions**

In [None]:
# convert to percent of total
xB,yB = np.unique(token_lens_bytes,return_counts=True)
yB = 100*yB / yB.sum()

xC,yC = np.unique(token_lens_chars,return_counts=True)
yC = 100*yC / yC.sum()

# draw
plt.figure(figsize=(10,4))
plt.scatter(xB,yB,100+yB*10,c=xB,alpha=.7,cmap='rainbow',edgecolor='k',marker='s',label='Bytes')
plt.scatter(xC,yC,100+yC*10,c=xC,alpha=.7,cmap='rainbow',edgecolor='k',marker='o',label='Characters')

plt.gca().set(xlabel='Token length',xticks=xB[::2],ylabel='Frequency (percent)')
plt.grid(color=[.4,.4,.4],linestyle='--')
plt.gca().set_axisbelow(True)
plt.legend()

plt.tight_layout()
plt.savefig('ch2_proj4_part4.png')
plt.show()

In [None]:
# the longest token (measured in bytes) :P
# np.argmax gives the first index if several tokens tie for the maximum length;
# the id is wrapped in a list so this call has the same form as the other decode calls
tokenizer.decode([np.argmax(token_lens_bytes)])