|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[44] Grammar tuning in MLP neurons</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import scipy.stats as stats
!pip install pingouin
import pingouin as pg # for effect size calculations

import requests

import torch
from transformers import AutoModelForCausalLM,AutoTokenizer

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# render inline figures as svg (higher-res)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# global plotting defaults, collected in one dictionary and applied in one call
plot_defaults = {
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',
    'axes.spines.right': False,
    'axes.spines.top':   False,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    'savefig.dpi':300
}
plt.rcParams.update(plot_defaults)

# **Part 1: Get nouns and verbs**

In [None]:
# the tokenizer and the model come from the same pretrained checkpoint
model_name = 'EleutherAI/gpt-neo-125m'

# Eleuther's tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# load in GPTneo and switch to evaluation mode
# (the bare last expression also prints the architecture)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

In [None]:
# main repo: https://github.com/david47k/top-english-wordlists/

# lists of verbs (one word per line)
url = 'https://raw.githubusercontent.com/david47k/top-english-wordlists/refs/heads/master/top_english_verbs_lower_10000.txt'
all_verbs = requests.get(url).text.split('\n')

# initialize as empty lists: token IDs and word lengths (in characters)
verbs = []
len_verbs = []

# loop over all the verbs
for word in all_verbs:

  # skip empty lines (e.g., the extra last element created by .split('\n'));
  # otherwise a lone space would be tokenized and counted as a "verb"
  word = word.strip()
  if not word:
    continue

  # tokenize with preceding space
  tok = tokenizer.encode(' '+word)

  # add to the list if it's single-token
  if len(tok)==1:
    verbs.append(tok[0])
    len_verbs.append(len(word))


# split by odd/even position in the list
verbs_split1 = verbs[0::2]
verbs_split2 = verbs[1::2]

# and print
print(f'{len(verbs)} out of {len(all_verbs)} verbs are single-token.')
print(f'There are {len(verbs_split1)} split-1 and {len(verbs_split2)} split-2 samples.')

In [None]:
# Note: the .split('\n') method adds an extra element at the end, which is why there seems to be 10,001 verbs.
# Displaying the last list element shows what that extra element is:
all_verbs[-1]

In [None]:
# repeat for nouns (one word per line)
url = 'https://raw.githubusercontent.com/david47k/top-english-wordlists/refs/heads/master/top_english_nouns_lower_10000.txt'
all_nouns = requests.get(url).text.split('\n')

# initialize as empty lists: token IDs and word lengths (in characters)
nouns = []
len_nouns = []

# loop over all the nouns
for word in all_nouns:

  # skip empty lines (e.g., the extra last element created by .split('\n'));
  # otherwise a lone space would be tokenized and counted as a "noun"
  word = word.strip()
  if not word:
    continue

  # tokenize with preceding space
  tok = tokenizer.encode(' '+word)

  # add to the list if it's single-token
  if len(tok)==1:
    nouns.append(tok[0])
    len_nouns.append(len(word))

# split by odd/even position in the list
nouns_split1 = nouns[0::2]
nouns_split2 = nouns[1::2]

# and print
print(f'{len(nouns)} out of {len(all_nouns)} nouns are single-token.')
print(f'There are {len(nouns_split1)} split-1 and {len(nouns_split2)} split-2 samples.')

In [None]:
# decode the first few token IDs of each list back into text as a sanity check
print('First 5 split-1 verbs:')
print([tokenizer.decode(v) for v in verbs_split1[:5]])

print('\nFirst 5 split-2 verbs:')
print([tokenizer.decode(v) for v in verbs_split2[:5]])


print('\n\nFirst 5 split-1 nouns:')
print([tokenizer.decode(n) for n in nouns_split1[:5]])

print('\nFirst 5 split-2 nouns:')
print([tokenizer.decode(n) for n in nouns_split2[:5]])

In [None]:
# check lengths

# convenient to have in numpy
len_nouns = np.array(len_nouns)
len_verbs = np.array(len_verbs)

# number of words per word length (index = number of characters);
# a common minlength keeps the two count vectors aligned
maxlen = int(max(len_nouns.max(),len_verbs.max()))
yN = np.bincount(len_nouns,minlength=maxlen+1)
yV = np.bincount(len_verbs,minlength=maxlen+1)

# side-by-side bars (slightly shifted so both categories are visible)
plt.figure(figsize=(10,3))
plt.bar(np.arange(maxlen+1)-.2,yN,width=.6,label='Nouns',alpha=.9,edgecolor='b')
plt.bar(np.arange(maxlen+1)+.2,yV,width=.6,label='Verbs',alpha=.9,edgecolor='r')
plt.legend()

# independent-samples t-test and effect size on the word lengths
tres = stats.ttest_ind(len_nouns,len_verbs)
dof = len(len_nouns) + len(len_verbs) - 2 # degrees of freedom of the equal-variance t-test
cohensd = pg.compute_effsize(len_nouns,len_verbs,paired=False,eftype='cohen')

# ticks run to maxlen+1 so that the longest words also get a tick
plt.gca().set(xticks=range(maxlen+1),xlabel='Number of characters',ylabel='Count',
              title=f"t({dof}) = {tres.statistic:.2f}, p = {tres.pvalue:.3g}\nCohen's d = {cohensd:.3f}")

plt.tight_layout()
plt.savefig('ch7_proj44_part1.png')
plt.show()

# **Part 2: Implant a hook and get activations**

In [None]:
# display the model (a bare last expression in a cell is shown as its output)
model

In [None]:
# a hook function to grab the activations
mlp_acts = {}

def hook(module,input,output):
  """Forward hook: store the hooked module's output in `mlp_acts`.

  The dictionary key is the global string `whichdata`, which must be set
  before each forward pass. The output is detached from the graph, converted
  to numpy, and squeezed to remove singleton dimensions.
  """
  mlp_acts[f'{whichdata}'] = output.detach().numpy().squeeze()

# implant the hook into the MLP expansion layer of one transformer block
# (the layer index is a free choice here; any block can be used)
layer2hook = 6
handle = model.transformer.h[layer2hook].mlp.c_fc.register_forward_hook(hook)

In [None]:
# make a batch
# unsqueeze(1) inserts a new dimension at position 1; given a flat list of token IDs,
# each token becomes its own length-1 sequence (shape: number of tokens x 1)
torch.tensor(nouns_split1).unsqueeze(1).shape

In [None]:
# this cell takes around 20 s

# reinitialize data-dictionary
mlp_acts = {}

# `whichdata` is read by the hook function to label the stored activations,
# so it must be set before each forward pass
with torch.no_grad():

  # run the split1 nouns
  whichdata = 'nouns_split1'
  model(torch.tensor(nouns_split1).unsqueeze(1))

  # split2 nouns
  whichdata = 'nouns_split2'
  model(torch.tensor(nouns_split2).unsqueeze(1))

  # the split1 verbs
  whichdata = 'verbs_split1'
  model(torch.tensor(verbs_split1).unsqueeze(1))

  # and the split2 verbs
  whichdata = 'verbs_split2'
  model(torch.tensor(verbs_split2).unsqueeze(1))


In [None]:
# report the shape of each stored activations matrix
for k,v in mlp_acts.items():
  print(f"mlp_acts['{k}'] has shape {list(v.shape)}")

In [None]:
_,axs = plt.subplots(1,3,figsize=(12,3.5))

# A) the full words-by-neurons activation matrix
axs[0].imshow(mlp_acts['nouns_split1'],aspect='auto')
axs[0].set(xlabel='Neurons',ylabel='Nouns (index)',title='A) Nouns activations')

# B) average over words (axis 0) to get one value per neuron
axs[1].plot(mlp_acts['nouns_split1'].mean(axis=0),'ko',markersize=5,markerfacecolor=[.9,.7,.9,.5])
axs[1].set(xlabel='Neurons',ylabel='Activation',title='B) Mean activations over all nouns')

# C) per-neuron average activation to nouns vs. to verbs
axs[2].plot(mlp_acts['nouns_split1'].mean(axis=0),mlp_acts['verbs_split1'].mean(axis=0),
            'ko',markersize=5,markerfacecolor=[.9,.7,.9,.5])
axs[2].set(xlabel='Nouns',ylabel='Verbs',title='C) Activations to all words')

plt.tight_layout()
plt.savefig('ch7_proj44_part2.png')
plt.show()

# **Part 3: T-tests on split-1 data**

In [None]:
# number of neurons = number of columns in the words-by-neurons activation matrix
nneurons = mlp_acts['nouns_split1'].shape[1]
nneurons

In [None]:
# t-test on all neurons (axis=0 compares across words, giving one test per neuron)
T_split1 = stats.ttest_ind(mlp_acts['nouns_split1'],mlp_acts['verbs_split1'],axis=0)

# Cohen's d (computed per neuron)
cohensd = np.zeros(nneurons)
for i in range(nneurons):
  cohensd[i] = pg.compute_effsize(mlp_acts['nouns_split1'][:,i],mlp_acts['verbs_split1'][:,i],paired=False,eftype='cohen')

# plot
plt.plot(cohensd,T_split1.statistic,'ko',markerfacecolor='w')
plt.gca().set(xlabel="Cohen's d",ylabel='T-value')

plt.tight_layout()
plt.savefig('ch7_proj44_part3a.png')
plt.show()

In [None]:
plt.figure(figsize=(10,4))

# pvalues FDR corrected
sigPvals1 = stats.false_discovery_control(T_split1.pvalue) < .05

# neurons that are both significant and have a large effect size
sigLarge = sigPvals1 & (abs(cohensd)>.8)

# plot the significant neurons
plt.plot(np.where(sigPvals1)[0],T_split1.statistic[sigPvals1],'go',markerfacecolor='w')

# significant and large effect size (Cohen's d>.8)
plt.plot(np.where(sigLarge)[0],T_split1.statistic[sigLarge],'go')

# non-significant
plt.plot(np.where(~sigPvals1)[0],T_split1.statistic[~sigPvals1],'rx')

# adjustments
plt.gca().set(xlabel='Neuron index',ylabel='T-value',xlim=[-10,nneurons+10],
              title=f'{np.sum(sigPvals1)}/{len(sigPvals1)} were significant, {np.sum(abs(cohensd)>.8)}/{len(cohensd)} were large effects.')

plt.tight_layout()
plt.savefig('ch7_proj44_part3b.png')
plt.show()

# **Part 4: T-tests on split-2 data**

In [None]:
# in split 2
T_split2 = stats.ttest_ind(mlp_acts['nouns_split2'],mlp_acts['verbs_split2'],axis=0)

# across the two splits (nouns from split 1 vs. verbs from split 2)
T_split12 = stats.ttest_ind(mlp_acts['nouns_split1'],mlp_acts['verbs_split2'],axis=0)

In [None]:
# Bonferroni correction (one test per neuron)
sigPthresh = .05 / nneurons

# find where one or both are significant (0 = neither, 1 = one, 2 = both)
bothSig_2  = (T_split1.pvalue<sigPthresh).astype(int) + (T_split2.pvalue<sigPthresh).astype(int)
bothSig_12 = (T_split1.pvalue<sigPthresh).astype(int) + (T_split12.pvalue<sigPthresh).astype(int)

# correlations between t-values
r_2  = np.corrcoef(T_split1.statistic,T_split2.statistic)[0,1]
r_12 = np.corrcoef(T_split1.statistic,T_split12.statistic)[0,1]


# visualizations
_,axs = plt.subplots(1,2,figsize=(8,3.5))

# split-1 vs. split-2 (left panel) and split-1 vs. split-12 (right panel);
# both panels share the same three marker groups
for ax,T_other,nsig in [ (axs[0],T_split2,bothSig_2), (axs[1],T_split12,bothSig_12) ]:
  ax.plot(T_split1.statistic[nsig==2],T_other.statistic[nsig==2],'ks',markerfacecolor=[.7,.9,.7,.5],markersize=5,label='Both sig.')
  ax.plot(T_split1.statistic[nsig==0],T_other.statistic[nsig==0],'ko',markerfacecolor=[.9,.7,.7,.5],markersize=5,label='Neither sig')
  ax.plot(T_split1.statistic[nsig==1],T_other.statistic[nsig==1],'k^',markerfacecolor=[.7,.7,.9,.5],markersize=5,label='One sig.')

# axis adjustments
axs[0].set(xlabel='Split-1 t-value',ylabel='Split-2 t-value',title=f'A) T-val comparison (r = {r_2:.3f})')
axs[1].set(xlabel='Split-1 t-value',ylabel='Split-12 t-value',title=f'B) T-val comparison (r = {r_12:.3f})')

# common adjustments
for a in axs:
  a.axhline(0,color='k',linestyle='--',linewidth=.5)
  a.axvline(0,color='k',linestyle='--',linewidth=.5)
  a.legend()


plt.tight_layout()
plt.savefig('ch7_proj44_part4.png')
plt.show()

# **Part 5: Investigating distributions**

In [None]:
# extract histograms (np.histogram flattens the matrices, so all neurons are pooled)
yNouns1,xNouns1 = np.histogram(mlp_acts['nouns_split1'],bins='fd')
yNouns2,xNouns2 = np.histogram(mlp_acts['nouns_split2'],bins='fd')
yVerbs1,xVerbs1 = np.histogram(mlp_acts['verbs_split1'],bins='fd')
yVerbs2,xVerbs2 = np.histogram(mlp_acts['verbs_split2'],bins='fd')

# and visualize them
# (x holds the bin edges, which is one more than the number of bins; hence [:-1])
plt.figure(figsize=(9,3))
plt.plot(xNouns1[:-1],yNouns1,linewidth=2,label='Nouns 1')
plt.plot(xNouns2[:-1],yNouns2,linewidth=2,label='Nouns 2')
plt.plot(xVerbs1[:-1],yVerbs1,linewidth=2,label='Verbs 1')
plt.plot(xVerbs2[:-1],yVerbs2,linewidth=2,label='Verbs 2')

plt.legend()
plt.gca().set(xlabel='MLP expansion activation',ylabel='Count',ylim=[0,None],
              title='Histograms of all MLP neurons')

plt.tight_layout()
plt.savefig('ch7_proj44_part5a.png')
plt.show()

In [None]:
# histograms of t>0 and t<0 subpopulations
# (density=True normalizes each histogram, so groups of different sizes are comparable)
yNouns1_neg,xNouns1_neg = np.histogram(mlp_acts['nouns_split1'][:,T_split1.statistic<0],bins='fd',density=True)
yVerbs1_neg,xVerbs1_neg = np.histogram(mlp_acts['verbs_split1'][:,T_split1.statistic<0],bins='fd',density=True)
yNouns1_pos,xNouns1_pos = np.histogram(mlp_acts['nouns_split1'][:,T_split1.statistic>0],bins='fd',density=True)
yVerbs1_pos,xVerbs1_pos = np.histogram(mlp_acts['verbs_split1'][:,T_split1.statistic>0],bins='fd',density=True)

plt.figure(figsize=(9,3))
plt.plot(xNouns1_neg[:-1],yNouns1_neg,linewidth=2,label='Nouns t<0')
plt.plot(xVerbs1_neg[:-1],yVerbs1_neg,linewidth=2,label='Verbs t<0')
plt.plot(xNouns1_pos[:-1],yNouns1_pos,linewidth=2,label='Nouns t>0')
plt.plot(xVerbs1_pos[:-1],yVerbs1_pos,linewidth=2,label='Verbs t>0')

plt.legend()
plt.gca().set(xlabel='MLP expansion activation',ylabel='Density',ylim=[0,None],
              title='Histograms separated by t-value sign')

plt.tight_layout()
plt.savefig('ch7_proj44_part5b.png')
plt.show()

In [None]:
# find the neurons with the largest positive and negative t-values
max_t = np.argmax(T_split1.statistic)
min_t = np.argmin(T_split1.statistic)

# and get their histograms (one neuron = one column of the activation matrix)
yNouns1_max,xNouns1_max = np.histogram(mlp_acts['nouns_split1'][:,max_t],bins='fd',density=True)
yVerbs1_max,xVerbs1_max = np.histogram(mlp_acts['verbs_split1'][:,max_t],bins='fd',density=True)
yNouns1_min,xNouns1_min = np.histogram(mlp_acts['nouns_split1'][:,min_t],bins='fd',density=True)
yVerbs1_min,xVerbs1_min = np.histogram(mlp_acts['verbs_split1'][:,min_t],bins='fd',density=True)

plt.figure(figsize=(9,3))
plt.plot(xNouns1_max[:-1],yNouns1_max,'r',linewidth=2,label=f'Nouns (t = {T_split1.statistic[max_t]:.2f})')
plt.plot(xVerbs1_max[:-1],yVerbs1_max,'g',linewidth=2,label=f'Verbs (t = {T_split1.statistic[max_t]:.2f})')
plt.plot(xNouns1_min[:-1],yNouns1_min,'r--',linewidth=2,label=f'Nouns (t = {T_split1.statistic[min_t]:.2f})')
plt.plot(xVerbs1_min[:-1],yVerbs1_min,'g--',linewidth=2,label=f'Verbs (t = {T_split1.statistic[min_t]:.2f})')

plt.gca().set(xlabel='MLP expansion activation',ylabel='Density',ylim=[0,None],
              title='Histograms from two neurons')

plt.legend()
plt.tight_layout()
plt.savefig('ch7_proj44_part5c.png')
plt.show()

# **Part 6: Within-category tests**

In [None]:
# within-category t-tests (split 1 vs. split 2 of the same word category)
T_withinNoun = stats.ttest_ind(mlp_acts['nouns_split1'],mlp_acts['nouns_split2'],axis=0)
T_withinVerb = stats.ttest_ind(mlp_acts['verbs_split1'],mlp_acts['verbs_split2'],axis=0)

# and plot (across-category t-values on the x-axis, within-category on the y-axis)
fig,axs = plt.subplots(1,2,figsize=(9,4))
axs[0].plot(T_split1.statistic,T_withinNoun.statistic,'ko',markerfacecolor=[.9,.7,.7,.3])
axs[1].plot(T_split2.statistic,T_withinVerb.statistic,'ks',markerfacecolor=[.7,.9,.7,.3])

# symmetric axis limits shared by both panels, with a little padding
axlim = 1.05 * np.max(np.abs(np.concatenate((T_split1.statistic,T_split2.statistic,
                                             T_withinNoun.statistic,T_withinVerb.statistic))))
axs[0].set(xlim=[-axlim,axlim],ylim=[-axlim,axlim],xlabel='t(nouns,verbs), split 1',ylabel='t(nouns-1,nouns-2)',
           title='A) Across vs. within-nouns comparison')
axs[1].set(xlim=[-axlim,axlim],ylim=[-axlim,axlim],xlabel='t(nouns,verbs), split 2',ylabel='t(verbs-1,verbs-2)',
           title='B) Across vs. within-verbs comparison')

plt.tight_layout()
plt.savefig('ch7_proj44_part6a.png')
plt.show()

In [None]:
# Cohen's d (within-category: nouns split 1 vs. nouns split 2, per neuron)
cohensd_within = np.zeros(nneurons)
for i in range(nneurons):
  cohensd_within[i] = pg.compute_effsize(mlp_acts['nouns_split1'][:,i],mlp_acts['nouns_split2'][:,i],paired=False,eftype='cohen')

# histograms (`cohensd` holds the across-category effect sizes from Part 3)
yW,xW = np.histogram(cohensd_within,bins='fd')
yA,xA = np.histogram(cohensd,bins='fd')

# visualize
plt.figure(figsize=(9,3))
plt.plot(xW[:-1],yW,'o-',linewidth=2,label='Within category')
plt.plot(xA[:-1],yA,'s-',linewidth=2,label='Across category')

# indicating effect sizes
plt.axvline(.2,linestyle='--',color='r',label='Small effect')
plt.axvline(.8,linestyle=':',color='m',label='Large effect')

plt.legend()
plt.gca().set(xlabel="Cohen's d",ylabel='Count')

plt.tight_layout()
plt.savefig('ch7_proj44_part6b.png')
plt.show()

# **Part 7: Laminar profile of tuning**

In [None]:
# number of elements in model.transformer.h (one entry per layer to hook)
n_layers = len(model.transformer.h)

In [None]:
# remove previous hook
handle.remove()

def outerHook(layeri):
  """Create a forward hook that remembers which layer it belongs to.

  The returned hook stores the module output (detached, as numpy, with
  singleton dimensions squeezed out) in `mlp_acts`, using a key that combines
  the layer index with the global string `whichdata`.
  """
  def hook(module,input,output):
    mlp_acts[f'L{layeri}_{whichdata}'] = output.detach().numpy().squeeze()
  return hook


# surgery ;)
# implant one hook into the MLP expansion layer of each transformer block,
# and keep the handles so that the hooks can be removed later
handles = []
for layeri in range(n_layers):
  h = model.transformer.h[layeri].mlp.c_fc.register_forward_hook(outerHook(layeri))
  handles.append(h)

In [None]:
# this cell takes around 20 s

# reinitialize data-dictionary
mlp_acts = {}

# `whichdata` is read by the hook functions to label the stored activations,
# so it must be set before each forward pass
with torch.no_grad():

  # run the split1 nouns
  whichdata = 'nouns_split1'
  model(torch.tensor(nouns_split1).unsqueeze(1))

  # split2 nouns
  whichdata = 'nouns_split2'
  model(torch.tensor(nouns_split2).unsqueeze(1))

  # the split1 verbs
  whichdata = 'verbs_split1'
  model(torch.tensor(verbs_split1).unsqueeze(1))

  # and the split2 verbs
  whichdata = 'verbs_split2'
  model(torch.tensor(verbs_split2).unsqueeze(1))


In [None]:
# report the shape of each stored activations matrix
for key in mlp_acts:
  shape = list(mlp_acts[key].shape)
  print(f"mlp_acts['{key}'] has shape {shape}")

In [None]:
# columns: [0,1] proportion of significant neurons in split 1/2,
#          [2,3] mean |t| of the significant neurons in split 1/2,
#          [4]   correlation of t-values between the splits
sig_neurons = np.zeros((n_layers,5))

for layeri in range(n_layers):

  # run the t-tests
  T_split1 = stats.ttest_ind(mlp_acts[f'L{layeri}_nouns_split1'],mlp_acts[f'L{layeri}_verbs_split1'],axis=0)
  T_split2 = stats.ttest_ind(mlp_acts[f'L{layeri}_nouns_split2'],mlp_acts[f'L{layeri}_verbs_split2'],axis=0)

  # boolean of significant tests
  issig1 = stats.false_discovery_control(T_split1.pvalue)<.05
  issig2 = stats.false_discovery_control(T_split2.pvalue)<.05

  # proportion of significant neurons
  sig_neurons[layeri,0] = issig1.mean()
  sig_neurons[layeri,1] = issig2.mean()

  # average significant t-values (NaN if no neuron is significant in this layer)
  sig_neurons[layeri,2] = np.mean(abs(T_split1.statistic[issig1])) if issig1.any() else np.nan
  sig_neurons[layeri,3] = np.mean(abs(T_split2.statistic[issig2])) if issig2.any() else np.nan

  # correlation between them
  sig_neurons[layeri,4] = np.corrcoef(T_split1.statistic,T_split2.statistic)[0,1]


In [None]:
_,axs = plt.subplots(1,3,figsize=(12,3))

# proportion significant neurons
axs[0].plot(sig_neurons[:,0],'ks-',markersize=8,markerfacecolor=[.9,.7,.7],label='Split-1')
axs[0].plot(sig_neurons[:,1],'ko-',markersize=8,markerfacecolor=[.7,.9,.7],label='Split-2')
axs[0].set(xlabel='Transformer layer',ylabel='Proportion significant neurons',title='A) Proportion significant neurons')
axs[0].legend()

# average t-values
axs[1].plot(sig_neurons[:,2],'ks-',markersize=8,markerfacecolor=[.9,.7,.7],label='Split-1')
axs[1].plot(sig_neurons[:,3],'ko-',markersize=8,markerfacecolor=[.7,.9,.7],label='Split-2')
axs[1].set(xlabel='Transformer layer',ylabel='Average t-values',title='B) |T| of significant neurons')
axs[1].legend()

# correlations
axs[2].plot(sig_neurons[:,4],'kh',markersize=12,markerfacecolor=[.7,.7,.9])
axs[2].set(xlabel='Transformer layer',ylabel='Correlation coefficient',title='C) T-value split correlations')


plt.tight_layout()
plt.savefig('ch7_proj44_part7.png')
plt.show()

# **Part 8: Tokens in vs. out of order**

In [None]:
# source: https://en.wikipedia.org/wiki/Coconut
text = 'The coconut (Cocos nucifera) is a member of the palm family (Arecaceae) and the only living species of the genus Cocos.'

# tokenize into a pytorch tensor (the first dimension is the batch)
tokens = tokenizer.encode(text,return_tensors='pt')

# compare the number of characters against the number of tokens
n_chars,n_tokens = len(text),tokens.shape[1]
print(f'There are {n_chars} characters and {n_tokens} tokens.')

In [None]:
# scramble and invert
# - scrambled_idx is a random permutation of the token positions
# - scrambled_tokens keeps the batch dimension so it can be fed to the model
# - inverse_idx (argsort of the permutation) puts scrambled data back in the original order
scrambled_idx = torch.randperm(tokens.shape[1])
scrambled_tokens = tokens[0,scrambled_idx].unsqueeze(0)
inverse_idx = torch.argsort(scrambled_idx)

print(f'Original sentence:\n {tokenizer.decode(tokens[0,:])}\n')
print(f'Scrambled sentence:\n {tokenizer.decode(scrambled_tokens[0,:])}\n')
print(f'Inverted scrambling:\n {tokenizer.decode(scrambled_tokens[0,inverse_idx])}')


In [None]:
# reinitialize data-dictionary
mlp_acts = {}

with torch.no_grad():

  # the intact sentence: one sequence containing all the tokens
  whichdata = 'sentence'
  model(tokens)

  # isolated words: transposing makes each token its own length-1 sequence
  whichdata = 'words'
  model(tokens.T)

  # the same tokens as one sequence, but in random order
  whichdata = 'scrambled'
  model(scrambled_tokens)

In [None]:
# report the shape of each stored activations matrix
for key in mlp_acts:
  shape = list(mlp_acts[key].shape)
  print(f"mlp_acts['{key}'] has shape {shape}")

In [None]:
_,axs = plt.subplots(1,3,figsize=(12,3.5))

skip = 10 # plot every 10th data point to keep the figure light
layer = 3

# flatten the tokens-by-neurons matrices into vectors;
# the scrambled activations are first put back into the original token order
# so that all three vectors are aligned token-by-token
sent = mlp_acts[f'L{layer}_sentence'].flatten()[::skip]
word = mlp_acts[f'L{layer}_words'].flatten()[::skip]
scrm = mlp_acts[f'L{layer}_scrambled'][inverse_idx.numpy(),:].flatten()[::skip]

axs[0].plot(sent,word,'ko',markersize=3,markerfacecolor=[.7,.7,.9,.3])
axs[0].set(xlabel='Sentence',ylabel='Words',title=f'A) Sentence vs. words (r = {np.corrcoef(sent,word)[0,1]:.3f})')

axs[1].plot(sent,scrm,'ks',markersize=3,markerfacecolor=[.7,.9,.7,.3])
axs[1].set(xlabel='Sentence',ylabel='Scrambled sentence',title=f'B) Sentence vs. scrambled (r = {np.corrcoef(sent,scrm)[0,1]:.3f})')

axs[2].plot(scrm,word,'k^',markersize=3,markerfacecolor=[.9,.7,.7,.3])
axs[2].set(xlabel='Scrambled sentence',ylabel='Words',title=f'C) Scrambled vs. words (r = {np.corrcoef(scrm,word)[0,1]:.3f})')

plt.tight_layout()
plt.savefig('ch7_proj44_part8a.png')
plt.show()

In [None]:
# columns: sentence-word, sentence-scrambled, scrambled-word
Rs = np.zeros((n_layers,3))

# loop over layers
for layeri in range(n_layers):

  # extract the activations
  # (the scrambled activations are put back into the original token order)
  sent = mlp_acts[f'L{layeri}_sentence'].flatten()
  word = mlp_acts[f'L{layeri}_words'].flatten()
  scrm = mlp_acts[f'L{layeri}_scrambled'][inverse_idx.numpy(),:].flatten()

  # correlation coefficients
  Rs[layeri,0] = np.corrcoef(sent,word)[0,1]
  Rs[layeri,1] = np.corrcoef(sent,scrm)[0,1]
  Rs[layeri,2] = np.corrcoef(scrm,word)[0,1]

# and the visualizations
plt.figure(figsize=(10,3))
plt.plot(Rs[:,0],'ko-',markersize=8,markerfacecolor=[.7,.7,.9],label='Sentence-word')
plt.plot(Rs[:,1],'ks-',markersize=8,markerfacecolor=[.7,.9,.7],label='Sentence-scrambled')
plt.plot(Rs[:,2],'k^-',markersize=8,markerfacecolor=[.9,.7,.7],label='Scrambled-word')

plt.axhline(0,linestyle='--',color='k',linewidth=.4,zorder=-10)
plt.gca().set(xlabel='Transformer layer',ylabel='Correlation coefficient',
              title='Correlations across token organizations')
plt.legend()

plt.tight_layout()
plt.savefig('ch7_proj44_part8b.png')
plt.show()