|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[31] Logit lens</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
# Reference:
# https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/interpreting-gpt-the-logit-lens

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

import torch
import torch.nn.functional as F

from transformers import RobertaTokenizer, RobertaForMaskedLM

In [None]:
### matplotlib adjustments (the commented-out assignments switch on dark mode)

# render inline figures as SVG (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# dark-mode colors: uncomment these lines to activate them
# plt.rcParams['figure.facecolor'] = '#282a2c'
# plt.rcParams['figure.edgecolor'] = '#282a2c'
# plt.rcParams['axes.facecolor']   = '#282a2c'
# plt.rcParams['axes.edgecolor']   = '#DDE2F4'
# plt.rcParams['axes.labelcolor']  = '#DDE2F4'
# plt.rcParams['xtick.color']      = '#DDE2F4'
# plt.rcParams['ytick.color']      = '#DDE2F4'
# plt.rcParams['text.color']       = '#DDE2F4'

# hide the top/right spines, use bold titles and axis labels, save figures at 300 dpi
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top']   = False
plt.rcParams['axes.titleweight']  = 'bold'
plt.rcParams['axes.labelweight']  = 'bold'
plt.rcParams['savefig.dpi']       = 300

# **Part 1: Single-layer, unmasked lens**

In [None]:
# load the pretrained roberta-large masked-language model and its matching tokenizer
model = RobertaForMaskedLM.from_pretrained('roberta-large')
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# switch to evaluation mode (as the last expression, this also displays the model)
model.eval()

In [None]:
# Number of hidden-state tensors returned when the model is called with
# output_hidden_states=True: the embedding output plus one per transformer layer.
# Using this count means range(n_hidden) visits every entry of .hidden_states.
n_hidden = model.config.num_hidden_layers + 1
n_hidden

In [None]:
# tokenize the (unmasked) sentence; the result is a 1 x numTokens tensor of token ids
text = 'The way you do anything is the way you do everything'
tokens = tokenizer.encode(text,return_tensors='pt')
numTokens = len(tokens[0])

# show each token id next to its decoded text
for t in tokens[0]:
  print(f'Token {t:5} is "{tokenizer.decode(t)}"')

# forward pass, keeping the hidden states from every layer (no gradients needed)
with torch.no_grad():
  output = model(tokens,output_hidden_states=True)

In [None]:
# how many hidden-state tensors came back, and the shape of one of them (index 3)
(
  len(output.hidden_states),
  output.hidden_states[3].shape,
)

In [None]:
# which hidden state to look through (any valid index into output.hidden_states)
layer = 20

# extract the activations from one layer ([0] selects the only sequence in the batch)
activations = output.hidden_states[layer][0]

# calculate the raw logits by pushing the activations through the model's LM head
# (detached so the result can be passed to numpy/matplotlib later)
logits = model.lm_head(activations).detach()

# check the shape
logits.shape

In [None]:
# softmax and plot
# log-softmax over the vocabulary (last) dimension
lsm_outs = F.log_softmax(logits.detach(),dim=-1)

# max-softmax as prediction (token position 3, the same one named in the plot title)
predictedToken = int(np.argmax(lsm_outs[3].numpy()))
print(f'   Actual token is "{tokenizer.decode(tokens[0,3])}"')
print(f'Predicted token is "{tokenizer.decode(predictedToken)}"')
print(f'Predicted log-sm is {lsm_outs[3,predictedToken].item():.5f}')

# show softmax for one token (red circle marks the predicted token)
plt.figure(figsize=(10,3))
plt.plot(predictedToken,lsm_outs[3,predictedToken].item(),'ro',markersize=8)
plt.plot(lsm_outs[3].numpy(),'k.',alpha=.3)
plt.gca().set(xlabel='Token index',ylabel='Log-softmax prob',xlim=[-30,tokenizer.vocab_size+30],
              title=f'Log-softmax logits for the token "{tokenizer.decode(tokens[0,3])}"')

plt.tight_layout()
plt.savefig('ch5_proj31_part1.png')
plt.show()

# **Part 2: Single-layer masked lens**

In [None]:
# same sentence, with one word replaced by the tokenizer's mask token
text = f'The way you do anything is the {tokenizer.mask_token} you do everything'
tokens_wMask = tokenizer.encode(text,return_tensors='pt')

# vocabulary id of the mask token, and its position within the token sequence
mask_idx = tokenizer.mask_token_id
mask_pos_idx = torch.where(tokens_wMask[0]==mask_idx)[0].item()

for t in tokens_wMask[0]:
  print(f'Token {t:5} is "{tokenizer.decode(t)}"')

# forward pass on the masked sequence, keeping all hidden states
with torch.no_grad():
  output_wMask = model(tokens_wMask,output_hidden_states=True)

In [None]:
# get logits from one layer
activations = output_wMask.hidden_states[layer][0]

# LM head -> raw logits (detached for later numpy use), then log-softmax over the vocabulary
logits = model.lm_head(activations).detach()
lsm_outs_wMask = F.log_softmax(logits,dim=-1)

In [None]:
# max-softmax at the masked position is the prediction
# NOTE: indexing `tokens` with mask_pos_idx assumes the masked and unmasked
# sequences tokenize to the same length, so positions line up
predictedToken = int(np.argmax(lsm_outs_wMask[mask_pos_idx].detach().numpy()))
print(f' Unmasked token is "{tokenizer.decode(tokens[0,mask_pos_idx])}"')
print(f'   Masked token is "{tokenizer.decode(tokens_wMask[0,mask_pos_idx])}"')
print(f'Predicted token is "{tokenizer.decode(predictedToken)}"')
print(f'Predicted log-sm is {lsm_outs_wMask[mask_pos_idx,predictedToken].item():.5f}')

# show softmax for one token (red circle marks the predicted token)
plt.figure(figsize=(10,3))
plt.plot(predictedToken,lsm_outs_wMask[mask_pos_idx,predictedToken].item(),'ro',markersize=8)
plt.plot(lsm_outs_wMask[mask_pos_idx].detach().numpy(),'k.',alpha=.3)
plt.gca().set(xlabel='Token index',ylabel='Log-softmax prob',xlim=[-30,tokenizer.vocab_size+30])

plt.tight_layout()
plt.savefig('ch5_proj31_part2.png')
plt.show()

In [None]:
# FYI: exponentiate a log-probability and scale by 100 to read it as a percentage
np.exp(-3e-5) * 100

# **Part 3: Laminar lens for one masked token**

In [None]:
# log-softmax of the target (true, unmasked) token at the masked position, per layer
lsm_target = np.zeros(n_hidden)

for layeri in range(n_hidden):

  # get logits from one layer
  activations = output_wMask.hidden_states[layeri][0]
  logits = model.lm_head(activations).detach()
  lsm_outs_wMask = F.log_softmax(logits,dim=-1)

  # predicted token at the masked position, and the log-softmax of the target token
  # (assumes masked and unmasked sequences have aligned token positions)
  predictedToken = int(np.argmax(lsm_outs_wMask[mask_pos_idx].numpy()))
  lsm_target[layeri] = lsm_outs_wMask[mask_pos_idx,tokens[0,mask_pos_idx]].item()

  # build up the text: words before the mask, the [prediction], words after the mask
  # (the slices skip the special start/end tokens at the sequence edges)
  txt = tokenizer.decode(tokens_wMask[0,1:mask_pos_idx])
  txt += f' [{tokenizer.decode(predictedToken).strip()}]'
  txt += tokenizer.decode(tokens_wMask[0,mask_pos_idx+1:-1])

  # and print it
  print(f'Layer {layeri:2} lens: {txt}')

In [None]:
# the predictor: square root of the layer index
# (note the reshape: LinearRegression requires a 2D samples-by-features array)
sqrtLayerIdx = np.sqrt(np.arange(n_hidden)).reshape(-1,1)

# fit the model and print the results
reg = LinearRegression().fit(sqrtLayerIdx,lsm_target)
print(f'const: {reg.intercept_:6.2f}')
print(f'slope: {reg.coef_[0]:6.2f}')

# predicted data (intercept + slope * predictor), as a 1D array for plotting
yHat = reg.intercept_ + reg.coef_[0]*sqrtLayerIdx[:,0]

# fitted equation
fiteq = fr'$\hat{{y}} = {reg.intercept_:.3f} + {reg.coef_[0]:.3f}\sqrt{{L}}$'

# plot observed and predicted (x-axis is the layer index)
plt.figure(figsize=(8,5))
plt.plot(yHat,'r',label=fiteq)
plt.legend()
plt.plot(lsm_target,'ko',markersize=10,markerfacecolor=[.7,.9,.7])
plt.gca().set(xlabel='Hidden layer',ylabel='Log-softmax prob',title=f'Logits from layer {layer}')

plt.tight_layout()
plt.savefig('ch5_proj31_part3.png')
plt.show()

# **Part 4: Sliding masked sequences**

In [None]:
# loop over tokens, replacing each one in turn with the mask token
for idx,tok in enumerate(tokens[0]):

  # make a copy (so `tokens` stays intact) and replace a token with mask
  masked_tokens = tokens.clone()
  masked_tokens[0,idx] = tokenizer.mask_token_id

  # confirmation: decode the masked sequence back to text
  print(tokenizer.decode(masked_tokens[0]))

# **Part 5: The logit lens**

In [None]:
# results, layers x token positions:
#   predictedTokens: argmax token id at the masked position (integer ids)
#   lsm_target:      log-softmax of the true (target) token
#   lsm_max:         log-softmax of the most likely token
# slopes holds the regression slope per position (row 0: target, row 1: max token)
predictedTokens = np.zeros((n_hidden,numTokens),dtype=int)
lsm_target = np.zeros((n_hidden,numTokens))
lsm_max = np.zeros((n_hidden,numTokens))
slopes = np.zeros((2,numTokens))

# loop over tokens, replace each with the mask token, and get logits
for midx,tok in enumerate(tokens[0]):

  # make a copy and replace a token with mask
  masked_tokens = tokens.clone()
  masked_tokens[0,midx] = tokenizer.mask_token_id

  # forward pass with masked tokens
  with torch.no_grad():
    output_wMask = model(masked_tokens,output_hidden_states=True)

  ### loop over layers
  for layeri in range(n_hidden):

    # get logits from one layer
    activations = output_wMask.hidden_states[layeri][0]
    logits = model.lm_head(activations).detach()
    lsm_outs_wMask = F.log_softmax(logits,dim=-1)

    # predicted token, and log-softmax of the target and of the max token
    predictedTokens[layeri,midx] = int(np.argmax(lsm_outs_wMask[midx].numpy()))
    lsm_target[layeri,midx] = lsm_outs_wMask[midx,tok].item()
    lsm_max[layeri,midx] = lsm_outs_wMask[midx].max().item()


  # regression slope for the target token (predictor: sqrt of the layer index)
  reg = LinearRegression().fit(sqrtLayerIdx,lsm_target[:,midx])
  slopes[0,midx] = reg.coef_[0]

  # and again for the max-token
  reg = LinearRegression().fit(sqrtLayerIdx,lsm_max[:,midx])
  slopes[1,midx] = reg.coef_[0]

In [None]:
fig,axs = plt.subplots(1,3,figsize=(12,4))

# A) layers x tokens image of the target-token log-softmax
# (aspect='auto' lets the tall matrix fill the panel)
h = axs[0].imshow(lsm_target,aspect='auto')
fig.colorbar(h,ax=axs[0],pad=.02)
axs[0].set(ylabel='Layer',title='A) Target token log-sm prob',xticks=range(len(tokens[0])),
           xticklabels=[tokenizer.decode(t) for t in tokens[0]])
axs[0].tick_params(axis='x',labelrotation=90)

# B) same layout for the max-token log-softmax
h = axs[1].imshow(lsm_max,aspect='auto')
fig.colorbar(h,ax=axs[1],pad=.02)
axs[1].set(ylabel='Layer',title='B) Max token log-sm prob',xticks=range(len(tokens[0])),
           xticklabels=[tokenizer.decode(t) for t in tokens[0]])
axs[1].tick_params(axis='x',labelrotation=90)

# C) regression slopes per masked position (row 0: target token, row 1: max token)
axs[2].plot(slopes[0],'s-',label='Target')
axs[2].plot(slopes[1],'o-',label='Max token')
axs[2].legend()
axs[2].axhline(0,linestyle='--',color='gray',linewidth=.6,zorder=-30)
axs[2].set(ylabel='Slope ($\\beta_1$)',xticks=range(len(tokens[0])),
           xticklabels=[tokenizer.decode(t) for t in tokens[0]],
           title='C) Regression slopes')
axs[2].tick_params(axis='x',labelrotation=90)

plt.tight_layout()
plt.savefig('ch5_proj31_part5.png')
plt.show()

# **Part 6: Logit Lens text heatmap**

In [None]:
# min-max scale to [0,1] so the values can index a colormap directly
lsm_max_scaled = (lsm_max-lsm_max.min()) / (lsm_max.max()-lsm_max.min())

In [None]:
fig,ax = plt.subplots(1,figsize=(13,11))

# original text (separated into a list of decoded tokens)
target = [tokenizer.decode(t) for t in tokens[0]]
numTokens = len(target)

# loop over layers
for layeri in range(n_hidden):

  # y-axis coordinate for this layer (first layer at the top, later layers further down)
  yCoord = 1 - layeri/n_hidden

  # print the layer number in the left margin
  ax.text(-.1,yCoord,f'Layer {layeri}:',ha='right')

  # loop over the predicted tokens in this layer
  # (tokens are spread evenly along x; the box color encodes the scaled max log-softmax)
  for xi,tok in enumerate(predictedTokens[layeri]):
    ax.text(xi/numTokens,yCoord,tokenizer.decode(int(tok)),ha='center',
            bbox=dict(boxstyle='round,pad=0.3', facecolor=plt.cm.Reds(lsm_max_scaled[layeri,xi]), edgecolor='none',alpha=.5))

ax.axis('off')

# finally, draw the target tokens at the bottom (just below the last layer's row)
ax.text(-.1,yCoord-.05,'Target:',ha='right',fontweight='bold')
for xi,tok in enumerate(target):
  ax.text(xi/numTokens,yCoord-.05,tok,ha='center',fontsize=12,fontweight='bold')

plt.tight_layout()
plt.savefig('ch5_proj31_part6.png')
plt.show()