|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[22] Evaluating models with HellaSwag</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from datasets import load_dataset

from statsmodels.stats.contingency_tables import mcnemar

# pytorch-related libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

# libraries to import models
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM

In [None]:
### matplotlib adjustments (the commented-out assignments switch figures to dark mode)

# render inline figures as SVG (higher-res than the default)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors (uncomment to activate)
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the right and top axis spines
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False

# bold axis titles and labels
plt.rcParams['axes.titleweight'] = 'bold'
plt.rcParams['axes.labelweight'] = 'bold'

# resolution used by plt.savefig
plt.rcParams['savefig.dpi'] = 300

# **Part 1: Import and compare GPT2 and RoBERTa**

In [None]:
# use the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# import the tokenizer and models
tokenizerG = AutoTokenizer.from_pretrained('gpt2')
gpt2_s = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
gpt2_l = AutoModelForCausalLM.from_pretrained('gpt2-large').to(device)

# switch to evaluation (inference) mode, which disables dropout so that
# the log-probabilities are deterministic
gpt2_s.eval()
gpt2_l.eval(); # trailing semicolon suppresses the notebook printout of the model

In [None]:
# import roberta model and tokenizer
tokenizerR = RobertaTokenizer.from_pretrained('roberta-base')
roberta_s = RobertaForMaskedLM.from_pretrained('roberta-base').to(device)
roberta_l = RobertaForMaskedLM.from_pretrained('roberta-large').to(device)

# set to eval mode (disables dropout so the outputs are deterministic)
roberta_s.eval()
roberta_l.eval(); # trailing semicolon suppresses the notebook printout of the model

In [None]:
# FYI, useful model information
# (as the last expression in the cell, the notebook displays the config of roberta-base)
roberta_s.config

In [None]:
# table of model sizes
# columns: vocabulary size, embedding dimension, number of transformer blocks,
# and total number of parameters (summed over all parameter tensors)

print(' Model | Tokens | Emb.dim | Xfmr | Parameters')
print('-------+--------+---------+------+-------------')

# GPT2-small (GPT2 configs name the fields n_embd and n_layer)
print(f'GPT2-S | {gpt2_s.config.vocab_size:,} |   {gpt2_s.config.n_embd:4}  |  {gpt2_s.config.n_layer}  | {sum(p.numel() for p in gpt2_s.parameters()):,}')

# GPT2-large
print(f'GPT2-L | {gpt2_l.config.vocab_size:,} |   {gpt2_l.config.n_embd:4}  |  {gpt2_l.config.n_layer}  | {sum(p.numel() for p in gpt2_l.parameters()):,}')

# roberta-small (RoBERTa configs name the fields hidden_size and num_hidden_layers)
print(f'RoBA-S | {roberta_s.config.vocab_size:,} |   {roberta_s.config.hidden_size:4}  |  {roberta_s.config.num_hidden_layers}  | {sum(p.numel() for p in roberta_s.parameters()):,}')

# roberta-large
print(f'RoBA-L | {roberta_l.config.vocab_size:,} |   {roberta_l.config.hidden_size:4}  |  {roberta_l.config.num_hidden_layers}  | {sum(p.numel() for p in roberta_l.parameters()):,}')


# **Part 2: Import and explore HellaSwag**

In [None]:
# import the HellaSwag validation set
# (load_dataset comes from the `datasets` package imported at the top of the notebook)
dataset = load_dataset('hellaswag',split='validation')
# last expression in the cell: the notebook displays a summary of the dataset object
dataset

In [None]:
# an example
# (later cells read the 'ctx', 'endings', and 'label' fields of each sample)
dataset[1]

# **Part 3: Visualize the evaluation process**

In [None]:
# pick a random example
exampleNum = 224
example = dataset[exampleNum]
target = int(example['label']) # the target answer (stored as a string in the dataset)

# context tokens and length
context = example['ctx']
context_len = len(tokenizerG.encode(context))

# prompts and their lengths
# correct prompt: context followed by the target ending
promptC = f"{context} {example['endings'][target]}"
promptC_tox = tokenizerG.encode(promptC,return_tensors='pt').to(device)
promptC_len = promptC_tox.shape[1]

# incorrect prompt: context followed by a non-target ending (here, the next one in the list)
nontarget = (target+1) % len(example['endings'])
promptI = f"{context} {example['endings'][nontarget]}"
promptI_tox = tokenizerG.encode(promptI,return_tensors='pt').to(device)
promptI_len = promptI_tox.shape[1]

# show the prompts
print(f'Context:\n   "{context}"\n')
print(f'Correct ending:\n   "{promptC}"\n')
print(f'Incorrect ending:\n   "{promptI}"')

In [None]:
# make sure the token ids are (1 x sequence) tensors, however they were created
idsC = torch.as_tensor(promptC_tox).reshape(1,-1)
idsI = torch.as_tensor(promptI_tox).reshape(1,-1)

# forward pass through the model
with torch.no_grad():
  logitsC = gpt2_s(idsC.to(device)).logits
  logitsI = gpt2_s(idsI.to(device)).logits

# log softmax (more numerically stable than prob values for later calculations)
# [0] drops the batch dimension -> (sequence x vocab)
log_sm_C = F.log_softmax(logitsC[0],dim=-1).cpu()
log_sm_I = F.log_softmax(logitsI[0],dim=-1).cpu()

# token ids on the CPU for indexing
toksC = idsC[0].cpu()
toksI = idsI[0].cpu()


# get the sequence of sm logits for the correct prompt
# (the output at position i is the model's prediction for the token at position i+1)
lsmSeqC = np.zeros(promptC_len-1)
for i in range(0,promptC_len-1):
  lsmSeqC[i] = log_sm_C[i,toksC[i+1]].item()

# repeat for the incorrect prompt
lsmSeqI = np.zeros(promptI_len-1)
for i in range(0,promptI_len-1):
  lsmSeqI[i] = log_sm_I[i,toksI[i+1]].item()

# average log-probability of the ending tokens only
# (the first ending token is predicted at index context_len-1; the mean
#  normalizes for ending length, unlike the sum of logs = product of probabilities)
probC = lsmSeqC[context_len-1:].mean()
probI = lsmSeqI[context_len-1:].mean()

print(f'    Target ending log-prob: {probC:.3f}')
print(f'Non-target ending log-prob: {probI:.3f}')

In [None]:
# a demo of a log property
# the log of a product equals the sum of the logs, so the two displayed values match
p,q = .01,.003
np.log(p*q), np.log(p)+np.log(q)

In [None]:
# visualize the logits
plt.figure(figsize=(12,4))
plt.plot(lsmSeqI,'rs-',label='Non-target ending')
plt.plot(lsmSeqC,'bo-',label='Target ending')

# index context_len-1 holds the log-prob of the first ending token
plt.axvline(context_len-1,linestyle='--',color='gray',label='First ending token')

plt.gca().set(xlabel='Token position (index)',ylabel='Log-softmax probs',
              title='Token log-probabilities in HellaSwag evaluation')
plt.legend()

plt.tight_layout()
plt.savefig('ch4_proj22_part3.png')
plt.show()

# **Part 4: A function to test one HellaSwag sample**

In [None]:
# define a pad token for GPT2 (RoBERTa already has a pad token)
# GPT2 has no dedicated pad token, so reuse the end-of-sequence token
tokenizerG.pad_token_id = tokenizerG.eos_token_id
tokenizerG.pad_token_id

In [None]:
# demo of padding during tokenization
# three texts of different lengths
texts = [ 'hello',
          'my name is',
          'Mike and I like purple.'
]

# tokenize all texts in one batch; padding=True and return_tensors='pt' are passed
# so that the batch comes back as padded PyTorch tensors
t = tokenizerG(texts,padding=True,return_tensors='pt')
# print each field of the tokenizer output
for k,v in t.items():
  print(f'"{k}":\n{v}\n')

In [None]:
# a function to calculate accuracy on one sample
def oneHellaSample(sample,model,tokenizer):
  """Score every candidate ending of one HellaSwag sample.

  Parameters
    sample    : dict-like with keys 'ctx' (context string), 'endings' (list of
                candidate ending strings), and 'label' (index of the target ending,
                as an int or a numeric string).
    model     : torch module whose forward pass accepts `input_ids` and
                `attention_mask` and returns an object with a `.logits` tensor
                of size (batch x sequence x vocab).
    tokenizer : callable tokenizer with an `.encode()` method; must be able to pad
                (i.e., have a pad token defined).

  Returns
    loglikelihoods : numpy vector with the average log-probability of the ending
                     tokens, one element per candidate ending.
    target         : int index of the correct ending.
  """

  # 1) find context length and target
  context = sample['ctx']
  # NOTE(review): for tokenizers that add special tokens in .encode(), this count
  # includes them, which shifts the start of the ending -- confirm for such tokenizers.
  context_len = len(tokenizer.encode(context))
  target = int(sample['label'])

  # 2) loop over candidate endings and create prompts
  allprompts = [ f"{context} {ending}" for ending in sample['endings'] ]

  # 3) batch tokenize with padding token
  prompt_tox = tokenizer(allprompts,padding=True,return_tensors='pt')
  input_ids = prompt_tox['input_ids']
  attn_mask = prompt_tox['attention_mask']

  # 4) run all prompts in one batch and bring the logits back to the CPU
  model_device = next(model.parameters()).device # run on whatever device the model lives on
  with torch.no_grad():
    output = model(input_ids=input_ids.to(model_device),attention_mask=attn_mask.to(model_device))
  logits = output.logits.cpu()

  # 5) convert to log probabilities
  log_probs = F.log_softmax(logits,dim=-1)

  # 6) initialize and populate the log-likelihood vector
  loglikelihoods = np.zeros(len(sample['endings']))

  # log-probs for each ending
  for opti in range(len(sample['endings'])):

    # get the token for this ending
    token_seq = input_ids[opti]

    # find the valid (non-pad) tokens
    # (use the attention mask instead of comparing against pad_token_id, because
    #  the pad token can be a real token, e.g., GPT2's end-of-sequence token)
    valid_tox = torch.from_numpy(np.where(attn_mask[opti].numpy()==1)[0])

    # extract a log-softmax sequence for the valid tokens
    # (the output at each position is the prediction for the next valid token)
    lsmSeq = log_probs[opti,valid_tox[:-1],token_seq[valid_tox[1:]]].numpy()

    # average the valid ending log-probs
    # (the first ending token is predicted at index context_len-1)
    loglikelihoods[opti] = np.mean(lsmSeq[context_len-1:])

  # 7) function outputs
  return loglikelihoods,target

In [None]:
# try the function on a single validation sample using GPT2-small
loglikelihoods,target = oneHellaSample(dataset[42],gpt2_s,tokenizerG)

# report the per-ending scores and the correct answer
print(f'Log-likelihoods: {loglikelihoods}')
print(f'Target ending: {target}')

# the model's choice is the ending with the largest log-likelihood
print('Model was correct!' if np.argmax(loglikelihoods)==target else 'Model needs more training ;)')

# **Part 5: Test all four models**

In [None]:
# number of data samples to test (set low on CPU!)
num_samples = 1000

# models and their tokenizers, in the row order used by the accuracy matrix
# (GPT2 small, GPT2 large, RoBERTa small, RoBERTa large)
models2test = [ (gpt2_s,tokenizerG),
                (gpt2_l,tokenizerG),
                (roberta_s,tokenizerR),
                (roberta_l,tokenizerR) ]

# initialize accuracy matrix (models x samples; 1=correct, 0=incorrect)
accuracies = np.zeros((len(models2test),num_samples))


# loop over data samples with progress bar
for datai in tqdm(range(num_samples),desc='Evaluating on HellaSwag'):

  # extract one sample from the data
  example = dataset[datai]

  # test each model on this sample
  for modeli,(model,tokenizer) in enumerate(models2test):
    loglikelihoods,target = oneHellaSample(example,model,tokenizer)

    # correct if the highest-scoring ending is the target
    if np.argmax(loglikelihoods)==target: accuracies[modeli,datai] = 1

In [None]:
plt.figure(figsize=(8,3))

# draw each bar at a time (for color and text labels)
for i in range(4):

  # accuracy for this model (percent of samples answered correctly)
  acy = 100*accuracies[i].mean()

  # the bar
  plt.bar(i,acy,color=plt.cm.plasma(2*acy/100))

  # write the actual accuracy value
  plt.text(i,acy,f'{acy:.1f}%',
           fontweight='bold',ha='center',va='bottom')

# adjust the axes
# dashed line at chance level (assumes four candidate endings per sample -> 25%)
plt.axhline(25,linestyle='--',color='k',linewidth=.5,zorder=-10)
plt.gca().set(xticks=range(4),xticklabels=['GPT2 small','GPT2 large','RoBERTa small','RoBERTa large'],
              ylabel='Accuracy (%)')
plt.title('HellaSwag accuracy',y=1.1)

plt.tight_layout()
plt.savefig('ch4_proj22_part5a.png')
plt.show()

In [None]:
plt.figure(figsize=(12,4))

# proportion of the four models that answered each sample correctly
# (possible values are 0, .25, .5, .75, 1, matching the y-tick labels)
plt.plot(accuracies.mean(axis=0),'ko',markerfacecolor=[.7,.7,.9],alpha=.5)
plt.gca().set(xlabel='Data sample',ylabel='Average accuracy',yticks=np.linspace(0,1,5),
              yticklabels=['All wrong','1 correct','2 correct','3 correct','all correct'])

plt.tight_layout()
plt.savefig('ch4_proj22_part5b.png')
plt.show()

# **Part 6: Statistical comparisons**

In [None]:
# initialize the results matrix (models x models x [mean difference, p-value])
# NaN marks the untested cells (diagonal and lower triangle); initializing to NaN
# avoids overwriting a genuine zero difference or p-value afterwards
statsMat = np.full((4,4,2),np.nan)

# loop over all pairs
for i in range(4):
  for j in range(i+1,4):

    # extract this pair of accuracies
    a = accuracies[i]
    b = accuracies[j]

    # initialize and populate the McNemar table (diagonals are ignored in the test)
    table = np.zeros((2,2))
    table[0,1] = np.sum( (a==1) & (b==0) ) # model i correct, model j wrong
    table[1,0] = np.sum( (a==0) & (b==1) ) # model i wrong, model j correct

    # run the McNemar test
    res = mcnemar(table,exact=False,correction=True)

    # store the mean differences (% accuracy; row model minus column model) and p-value
    statsMat[i,j,0] = 100*(a.mean()-b.mean())
    statsMat[i,j,1] = res.pvalue

In [None]:
# mean differences
# (first slice along the last axis of statsMat; displayed as the cell's last expression)
statsMat[:,:,0]

In [None]:
# show the matrix
# symmetric color limits so that zero difference maps to the middle of the colormap
clim = np.nanmax(np.abs(statsMat[:,:,0]))
plt.imshow(statsMat[:,:,0],cmap='bwr',vmin=-clim,vmax=clim)
plt.gca().set(xticks=range(4),yticks=range(4),
              xticklabels=['GPT-s','GPT-l','RoB-s','RoB-l'],yticklabels=['GPT-s','GPT-l','RoB-s','RoB-l'])

# and add the labels
for i in range(4):
  for j in range(i+1,4):

    # create the base label (difference in % accuracy)
    label = f'{statsMat[i,j,0]:.1f}%'

    # add * if it's significant (Bonferroni-corrected for the 6 pairwise tests)
    if statsMat[i,j,1]<.05/6:
      label += '*'

    # draw the text
    plt.text(j,i,label,fontsize=16,fontweight='bold',color='k',ha='center',va='center')

plt.tight_layout()
plt.savefig('ch4_proj22_part6.png')
plt.show()

In [None]:
# btw, there are some linguistic imperfections in the HellaSwag dataset, e.g.,
# (the sample is displayed as the cell's last expression)
dataset[2]