|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[16] Softmax probability distributions</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, GPT2Tokenizer

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# draw inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# style overrides, collected in one dict and applied to the global rcParams in a single call
plot_style = {
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',

    # hide the right and top axis lines
    'axes.spines.right': False,
    'axes.spines.top':   False,

    # bold titles and axis labels
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',

    # resolution of figures written with savefig
    'savefig.dpi':300,
}
plt.rcParams.update(plot_style)

# **Part 1: Softmax in numpy and PyTorch**

In [None]:
# the numbers to transform into a probability distribution
z = [1,1.1,2,3,5,6,6.1]

# softmax: exponentiate every element, then divide by the total of the exponentials
exp_z = np.exp(z)
sm = exp_z / exp_z.sum()

# show the probabilities, followed by their sum
for result in (sm,np.sum(sm)):
  print(result)

In [None]:
# convert the list of numbers into a float32 PyTorch tensor
zTorch = torch.tensor(z,dtype=torch.float32)

# using a function
# (dim=-1 applies the softmax along the last dimension of the tensor)
zTorch_sm = F.softmax(zTorch,dim=-1)

# final expression of the cell, so the notebook displays the result
zTorch_sm

In [None]:
# overlay the manual (numpy) and the PyTorch softmax results for comparison
plt.figure(figsize=(10,5))
ax = plt.gca()

ax.plot(z,sm,'ks-',markerfacecolor=[.9,.7,.7],markersize=10,label='Manual')
ax.plot(z,zTorch_sm,'bx:',markersize=8,label='PyTorch')
ax.legend()

# the title reports the sum of the manually computed probabilities
ax.set(
    xlabel='Original number (z)',
    ylabel='Softmax probability $\\sigma (z)$',
    title='$\\sum\\sigma (z)$ = %g' %np.sum(sm),
)

plt.tight_layout()
plt.savefig('ch4_proj16_part1.png')
plt.show()

# **Part 2: Temperature**

In [None]:
# evenly spaced input values, shared by all temperatures
x = torch.linspace(-5,5,55)

# one marker shape per temperature
shapes = 'soh^'
temperatures = [.3,.6,1,1.4]

plt.figure(figsize=(10,5))
ax = plt.gca()

for marker,temp in zip(shapes,temperatures):
  # divide the inputs by the temperature before applying the softmax
  sm = F.softmax(x/temp,dim=-1)
  ax.plot(x,sm,marker+'-',linewidth=2,label='T = %g' %temp)

ax.legend()
ax.set(xlabel='Original number',ylabel='Softmax probability')
# plt.yscale('log') # FYI

plt.tight_layout()
plt.savefig('ch4_proj16_part2.png')
plt.show()

# **Part 3: Softmax of LLM output logits**

In [None]:
# load the two pretrained GPT-2 models and put each one into eval mode
# (.eval() returns the module itself, so it can be chained onto the loading call)
gpt2_small = AutoModelForCausalLM.from_pretrained('gpt2').eval()
gpt2_large = AutoModelForCausalLM.from_pretrained('gpt2-large').eval()

# one tokenizer; the later cells feed its tokens to both models
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# report the parameter count of each model (thousands separated by commas), one per line
print(f'GPT-2-small has {gpt2_small.num_parameters():,} parameters.',
      f'GPT-2-large has {gpt2_large.num_parameters():,} parameters.',sep='\n')

In [None]:
# the text prompt
txt = 'It was a dark and stormy'

# convert the text into token indices, returned as a PyTorch tensor
tokens = tokenizer.encode(txt,return_tensors='pt') # pt = PyTorch

# final expression of the cell, so the notebook displays the tensor
tokens

In [None]:
# the number of tokens is the size of the second dimension of the token tensor
print(f'The text comprises {tokens.shape[1]} tokens.\n')

# list each token index (right-aligned in a width of 5) next to the text it decodes to
for t in tokens[0]:
  print(f'{t:5} is "{tokenizer.decode(t)}"')

In [None]:
# forward pass through the model
# (inference only: under no_grad, autograd does not record the computation,
#  so no graph is kept alive alongside the outputs)
with torch.no_grad():
  outputs = gpt2_small(tokens)

# final expression of the cell, so the notebook displays the model output
outputs

In [None]:
# dimensions of the logits tensor
# (presumably batch x tokens x vocabulary size -- check against the displayed result)
outputs.logits.shape

In [None]:
# logits at the last token position of the first sequence, detached from the autograd graph
logits = outputs.logits.detach()[0,-1]

# convert the logits into softmax probabilities
logits_sm = F.softmax(logits,dim=-1)

# displayed by the notebook: the size of the logits vector
logits.shape

In [None]:
# compare the sum of the raw logits with the sum of their softmax, one per line
print(f'The sum of the raw logits is {logits.sum():.3f}',
      f'The sum of the softmax logits is {logits_sm.sum():.3f}',sep='\n')

In [None]:
# three panels: raw logits, softmax probabilities, and one against the other
_,axs = plt.subplots(1,3,figsize=(12,3))
ax_raw,ax_prob,ax_both = axs

# x-axis range for the two per-token panels (the tokenizer's vocabulary size with a small margin)
vocab_xlim = [-10,tokenizer.vocab_size+9]

# A) raw logits per token index
ax_raw.plot(logits,'ks',markerfacecolor=[.9,.7,.7,.3])
ax_raw.set(xlim=vocab_xlim,xlabel='Token index',
           ylabel='Output logits',title='A) All final token logits')

# B) softmax probabilities per token index
ax_prob.plot(logits_sm,'o',markerfacecolor=[.7,.9,.7,.3])
ax_prob.set(xlim=vocab_xlim,xlabel='Token index',
            ylabel='Probabilities',title='B) Softmax probabilities')

# C) probabilities as a function of the raw logits
ax_both.plot(logits,logits_sm,'^',markerfacecolor=[.7,.7,.9,.7])
ax_both.set(xlabel='"Raw" logits',ylabel='Softmax logits',
            title='C) Logits by probabilities')

plt.tight_layout()
plt.savefig('ch4_proj16_part3.png')
plt.show()

In [None]:
# index of the largest softmax probability
max_logit = logits_sm.argmax()

# report that index with its probability, then the text the token decodes to
print(f'The maximum softmax logit is #{max_logit} with a value of {logits_sm[max_logit]:.3f}',
      f'The max word is "{tokenizer.decode(max_logit)}"',sep='\n')

# **Part 4: Top 10 probabilities and temperature**

In [None]:
# the k largest softmax probabilities and their token indices
k = 10
top_k = torch.topk(logits_sm,k)

# the prompt, with a blank standing in for the next token
print(txt,'___\n')

# one line per candidate: token index, probability in percent, and decoded text
for val,tok in zip(top_k.values,top_k.indices):
  print(f'{tok:5} ({100*val:4.1f}%) is "{tokenizer.decode(tok)}"')

In [None]:
# temperatures to compare, with one marker shape for each
temps = [ .5,1,1.5 ]
shapes = 'so^'

plt.figure(figsize=(10,5))
ax = plt.gca()

for i,T in enumerate(temps):

  # temperature-scaled softmax, keeping only the k largest probabilities
  sm = F.softmax(logits/T,dim=-1)
  top_k = torch.topk(sm,k)

  # light gray with the i-th RGB channel raised, giving each temperature its own tint
  color = [ .9 if channel==i else .7 for channel in range(3) ]
  ax.plot(top_k.values,f'{shapes[i]}-',markerfacecolor=color,
          color=color,markeredgecolor='k',markersize=10,label=f'T = {T}')


ax.legend()
ax.set(xlabel=f'Top-{k} indices',ylabel='Softmax probabilities (log)',yscale='log')

plt.tight_layout()
plt.savefig('ch4_proj16_part4.png')
plt.show()

# **Part 5: Numerical instabilities and normalization**

In [None]:
# get the outputs of the models
tokens = tokenizer.encode('A plethora of platypuses.',return_tensors='pt')

# inference only: under no_grad, autograd does not record the two forward passes,
# so no computation graph is kept alive alongside the outputs
with torch.no_grad():
  outputs_small = gpt2_small(tokens)
  outputs_large = gpt2_large(tokens)

In [None]:
# logits at the last token position from each model, detached from the autograd graph
logits_small = outputs_small.logits[0,-1,:].detach()
logits_large = outputs_large.logits[0,-1,:].detach()

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# x-axis range for the per-token panels (the tokenizer's vocabulary size with a small margin)
token_xlim = [-10,tokenizer.vocab_size+9]

# panels A and B: one model each
panels = [ (logits_small,'A) GPT-2 SMALL'), (logits_large,'B) GPT-2 LARGE') ]
for ax,(vals,panel_title) in zip(axs,panels):
  ax.plot(vals,'k.',alpha=.2)
  ax.set(xlim=token_xlim,xlabel='Token index',ylabel='Output logits',title=panel_title)

# panel C: the two models against each other
axs[2].plot(logits_small,logits_large,'m.',alpha=.2)
axs[2].set(xlabel='GPT-2 SMALL',ylabel='GPT-2 LARGE',title='C) Comparison of both models')

plt.tight_layout()
plt.savefig('ch4_proj16_part5a.png')
plt.show()

In [None]:
# manual softmax straight from the raw logits: exp(z) / sum(exp(z))
exp_small = torch.exp(logits_small)
exp_large = torch.exp(logits_large)
sm_manual_small = exp_small / exp_small.sum()
sm_manual_large = exp_large / exp_large.sum()

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# x-axis range for the per-token panels (the tokenizer's vocabulary size with a small margin)
token_xlim = [-10,tokenizer.vocab_size+9]

# panels A and B: one model each
panels = [ (sm_manual_small,'A) GPT-2 SMALL'), (sm_manual_large,'B) GPT-2 LARGE') ]
for ax,(vals,panel_title) in zip(axs,panels):
  ax.plot(vals,'k.',alpha=.2)
  ax.set(xlim=token_xlim,xlabel='Token index',ylabel='Softmax probabilities',title=panel_title)

# panel C: the two models against each other
axs[2].plot(sm_manual_small,sm_manual_large,'m.',alpha=.2)
axs[2].set(xlabel='GPT-2 SMALL',ylabel='GPT-2 LARGE',title='C) Comparison of both models')

plt.tight_layout()
plt.savefig('ch4_proj16_part5b.png')
plt.show()

In [None]:
# spot-check one raw logit and one manual-softmax value (displayed as a tuple)
# NOTE(review): the two indices differ (3000 vs. 1000), so the values belong to
# different tokens; if the intent is to show a logit next to its own softmax
# probability, both should use the same index -- confirm.
logits_small[3000],sm_manual_small[1000]

In [None]:
# simple normalization: subtract each model's maximum logit, so the largest value becomes 0
logits_small_norm = logits_small - logits_small.max()
logits_large_norm = logits_large - logits_large.max()

# visualize
_,axs = plt.subplots(1,3,figsize=(12,3.5))

# x-axis range for the per-token panels (the tokenizer's vocabulary size with a small margin)
token_xlim = [-10,tokenizer.vocab_size+9]

# panels A and B: one model each
panels = [ (logits_small_norm,'A) GPT-2 SMALL'), (logits_large_norm,'B) GPT-2 LARGE') ]
for ax,(vals,panel_title) in zip(axs,panels):
  ax.plot(vals,'k.',alpha=.2)
  ax.set(xlim=token_xlim,xlabel='Token index',ylabel='Raw logits (max-norm)',title=panel_title)

# panel C: the two models against each other
axs[2].plot(logits_small_norm,logits_large_norm,'m.',alpha=.2)
axs[2].set(xlabel='GPT-2 SMALL',ylabel='GPT-2 LARGE',title='C) Comparison of both models')

plt.tight_layout()
plt.savefig('ch4_proj16_part5c.png')
plt.show()

In [None]:
# the same manual softmax, now computed from the max-normalized logits
exp_smallN = torch.exp(logits_small_norm)
exp_largeN = torch.exp(logits_large_norm)
sm_manual_smallN = exp_smallN / exp_smallN.sum()
sm_manual_largeN = exp_largeN / exp_largeN.sum()

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# x-axis range for the per-token panels (the tokenizer's vocabulary size with a small margin)
token_xlim = [-10,tokenizer.vocab_size+9]

# panels A and B: one model each
panels = [ (sm_manual_smallN,'A) GPT-2 SMALL'), (sm_manual_largeN,'B) GPT-2 LARGE') ]
for ax,(vals,panel_title) in zip(axs,panels):
  ax.plot(vals,'k.',alpha=.2)
  ax.set(xlim=token_xlim,xlabel='Token index',ylabel='Softmax probabilities',title=panel_title)

# panel C: the two models against each other
axs[2].plot(sm_manual_smallN,sm_manual_largeN,'m.',alpha=.2)
axs[2].set(xlabel='GPT-2 SMALL',ylabel='GPT-2 LARGE',title='C) Comparison of both models')

plt.tight_layout()
plt.savefig('ch4_proj16_part5d.png')
plt.show()

In [None]:
# softmax of the raw (un-normalized) logits using PyTorch's built-in function
sm_torch_small = F.softmax(logits_small,dim=-1)
sm_torch_large = F.softmax(logits_large,dim=-1)

_,axs = plt.subplots(1,3,figsize=(12,3.5))

# x-axis range for the per-token panels (the tokenizer's vocabulary size with a small margin)
token_xlim = [-10,tokenizer.vocab_size+9]

# first two panels: one model each
panels = [ (sm_torch_small,'GPT-2 SMALL'), (sm_torch_large,'GPT-2 LARGE') ]
for ax,(vals,panel_title) in zip(axs,panels):
  ax.plot(vals,'k.',alpha=.2)
  ax.set(xlim=token_xlim,xlabel='Token index',ylabel='Softmax probabilities',title=panel_title)

# third panel: the two models against each other
axs[2].plot(sm_torch_small,sm_torch_large,'m.',alpha=.2)
axs[2].set(xlabel='GPT-2 SMALL',ylabel='GPT-2 LARGE',title='Comparison of both models')

plt.tight_layout()
plt.savefig('ch4_proj16_part5e.png')
plt.show()