|<h2>Book:</h2>|<h1><a href="https://open.substack.com/pub/mikexcohen/p/llm-breakdown-16-tokenization-words" target="_blank">50 ML projects to understand LLMs</a></h1>|
|-|:-:|
|<h2>Project:</h2>|<h1><b>[27] Impact of layer-specific noise and scaling</b></h1>|
|<h2>Author:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the book may lead to confusion or errors.</i>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
### matplotlib adjustments (commented lines are for dark mode)

# svg plots (higher-res)
# switch the notebook's inline figure output to SVG
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# global style overrides; they apply to every figure created after this cell runs
plt.rcParams.update({
    # dark-mode colors: uncomment the following eight entries to enable them
    # 'figure.facecolor': '#282a2c',
    # 'figure.edgecolor': '#282a2c',
    # 'axes.facecolor':   '#282a2c',
    # 'axes.edgecolor':   '#DDE2F4',
    # 'axes.labelcolor':  '#DDE2F4',
    # 'xtick.color':      '#DDE2F4',
    # 'ytick.color':      '#DDE2F4',
    # 'text.color':       '#DDE2F4',
    # hide the right and top axis borders
    'axes.spines.right': False,
    'axes.spines.top':   False,
    # bold titles and axis labels
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
    # resolution used for files written by plt.savefig
    'savefig.dpi':300
})

# **Part 1: Model, tokens, and clean activations**

In [None]:
# load in GPT2-large and its tokenizer
# NOTE(review): the tokenizer comes from the 'gpt2' checkpoint while the model is 'gpt2-large';
# presumably all GPT-2 sizes share the same tokenizer -- confirm
tokenizer = AutoTokenizer.from_pretrained('gpt2')
# output_hidden_states=True: forward passes also return the hidden states
# (read in later cells through outputs.hidden_states)
model = AutoModelForCausalLM.from_pretrained('gpt2-large',output_hidden_states=True)
# put the model in evaluation mode; as the last expression of the cell,
# the returned model is also shown as the cell output
model.eval()

In [None]:
# pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

# move the model onto that device (the trailing semicolon suppresses the cell output)
model.to(device);

In [None]:
print(f'This LLM has {} transformer layers.')

In [None]:
text = 'Pay no attention to that man behind the'
tokens =
target_token =

for t in tokens[0]:
  print

In [None]:
with torch.no_grad(): # ~8s on CPU, <1s on GPU
  outputs_clean = model(tokens

outputs_clean.hidden_states[0].shape

In [None]:
# log softmax
logsm =
log_sm_target_clean =

nextword_clean = torch.argmax
print(f'Next word is "{}" (token index {}) with {} probability.')

In [None]:
plt.figure(figsize=(10,3))

plt.axhline(,color='g',linestyle='--',linewidth=.5)
plt.axvline(,color='g',linestyle='--',linewidth=.5)

plt.plot(,'kh',markerfacecolor=[.7,.9,.7,.3],markersize=4)
plt.gca().set(xlabel='Token index',ylabel='log-softmax prob',
              title='log-softmax of final token',xlim=[-150,tokenizer.vocab_size+150])

plt.tight_layout()
plt.savefig('ch5_proj27_part1.png')
plt.show()

# **Part 2: Hook to inject noise**

In [None]:
# hooking functions
# Factory that builds a forward hook tied to one layer index.
#   layer_number: index of the layer the returned hook will be attached to
#   returns:      the inner function noise_hook (to be given to register_forward_hook)
def implant_noise_hook(layer_number):
  # the hook itself: receives the hooked module, its input, and its output,
  # and returns the output (modified only for the selected layer)
  def noise_hook(module,input,output):

    # only change one layer
    # `layer2noise` is a global that is read each time the hook runs, so it can be
    # changed between forward passes without re-registering the hooks
    if layer_number == layer2noise:

      # unpack tuple
      # the first element is treated as the hidden states; everything else is kept as-is
      # NOTE(review): assumes the module returns a tuple; a bare tensor would instead be
      # unpacked along its first dimension -- confirm for the installed transformers version
      hidden, *rest = output

      # generate a matrix of noise
      # (left blank in this exercise template)
      h_std =
      noise =

      # add that noise to the hidden states
      # (+= modifies `hidden` in place when it is a tensor)
      hidden +=
      # print(f'Changed layer {layer_number} with noise')

      # reconstruct output
      # same tuple layout as the original: modified hidden states first, then the untouched rest
      output = tuple([hidden]+rest)

    # all other layers fall through and return their output unchanged
    return output
  return noise_hook


# loop over layers and do surgery
handles = []
for layeri in range(model.config.n_layer):
  baselayer =
  h = baselayer.register_forward_hook(implant_noise_hook(layeri))
  handles.append(h)

In [None]:
# test with one layer
# `layer2noise` is the global that noise_hook compares against its own layer index,
# so only the hook attached to layer 15 modifies its output
layer2noise = 15

# forward pass without gradient tracking; the registered forward hooks run during this call
with torch.no_grad():
  outputs_noise = model(tokens.to(device))

# show the shape of the first hidden-states entry as the cell output
outputs_noise.hidden_states[0].shape

In [None]:
# initialize
diffnorms = torch.zeros()

# loop over layers
for layeri in range():

  # extract hidden states for this layer
  hs_c = outputs_clean.
  hs_n = outputs_noise.

  # norm of difference matrix
  diffnorms[layeri] =

# and plot
plt.figure(figsize=(10,3))
plt.plot(diffnorms,'kh',markerfacecolor=[.9,.7,.7],markersize=12)
plt.axvline(layer2noise+1,color='k',linestyle='--',zorder=-10)
plt.axhline(0,color='k',linestyle=':',zorder=-10)
plt.gca().set(xlabel='Layer',ylabel='Norm of difference')

plt.tight_layout()
plt.savefig('ch5_proj27_part2.png')
plt.show()

In [None]:
# predicted next token and its probability
max_logit = torch.argmax(
log_sm = outputs_noise.

print(f' Clean model: next token is "{}" with {} probability.')
print(f'Noised model: next token is "{}" with {} probability.')

# **Part 3: Impacts of layer-specific noising**

In [None]:
# initializations
log_sm_targets = torch.zeros((,))

# loop over layers
for layer2noise in range():

  # run the model
  with torch.no_grad():
    outputs_noise = model

  # log-softmax the final token logits
  logsm = outputs_noise.logits...

  # get the target (" curtain") value
  log_sm_targets[layer2noise,0] =

  # get the max value
  maxtok = torch.argmax
  log_sm_targets[layer2noise,1] = logsm[

  # print the completed text
  print(f'L{}: {}"{}"')

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3.5))

# plot the impact of the perturbations
axs[0].axhline(,label='Clean')
axs[0].plot(,label='Noisified')
axs[0].legend()
axs[0].set(xlabel='Layer',ylabel='log-softmax prob',)

# and the max logit
axs[1].axhline(,label='Clean')
axs[1].plot()
axs[1].set(xlabel='Layer',ylabel='log-softmax prob',ylim=axs[0].get_ylim(),)

plt.tight_layout()
plt.savefig('ch5_proj27_part3.png')
plt.show()

In [None]:
# detach every hook registered above so that later forward passes
# are no longer modified by the noise hooks
for hook_handle in handles:
  hook_handle.remove()

# **Part 4: Layer-specific scalar dampening**

In [None]:
# hooking functions
# Factory that builds a forward hook tied to one layer index.
#   layer_number: index of the layer the returned hook will be attached to
#   returns:      the inner function scale_hook (to be given to register_forward_hook)
def implant_scale_hook(layer_number):
  # the hook itself: receives the hooked module, its input, and its output,
  # and returns the output (to be modified only for the selected layer)
  def scale_hook(module, input, output):

    # only change one layer
    # `layer2scale` is a global that is read each time the hook runs, so it can be
    # changed between forward passes without re-registering the hooks
    if layer_number == layer2scale:

      # unpack tuple
      # (left blank in this exercise template)


      # in-place method to scale down the hidden states
      # (left blank in this exercise template)


      # reconstruct output
      # (left blank in this exercise template)


    # all other layers fall through and return their output unchanged
    return output
  return scale_hook


# loop over layers and do surgery
handles = []
for layeri in range(model.config.n_layer):
  baselayer = model.transformer.h[layeri]
  h = baselayer.
  handles.append(h)

In [None]:
# test with one layer
# `layer2scale` is the global that scale_hook compares against its own layer index,
# so only the hook attached to layer 15 is meant to modify its output
layer2scale = 15

# forward pass without gradient tracking; the registered forward hooks run during this call
with torch.no_grad():
  outputs_scale = model(tokens.to(device))

# show the shape of the first hidden-states entry as the cell output
outputs_scale.hidden_states[0].shape

In [None]:
# initialize
diffnorms = torch.zeros(model.config.n_layer)

# loop over layers
for layeri in range(model.config.n_layer):

  # extract hidden states for this layer
  hs_c =
  hs_n =

  # norm of difference matrix
  diffnorms[layeri] =

# and plot
plt.figure(figsize=(10,3))
plt.plot(diffnorms)
plt.gca().set(xlabel='Layer',ylabel='Norm of difference')

plt.tight_layout()
plt.savefig('ch5_proj27_part4a.png')
plt.show()

In [None]:
# initializations
log_sm_targets = torch.zeros((model.config.n_layer,2))

# loop over layers
for layer2scale in

  # run the model


  # log-softmax the final token logits
  logsm =

  # get the target (" curtain") value
  log_sm_targets[layer2scale,0] =

  # get the max value
  maxtok = torch.argmax
  log_sm_targets[layer2scale,1] =

  # print the completed text
  print

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3.5))

# plot the impact of the perturbations
axs[0].axhline(label='Clean')
axs[0].plot(label='Scaled')
axs[0].set(xlabel='Layer',ylabel='log-softmax prob',
              title=)

# and the max logit
axs[1].axhline(label='Clean')
axs[1].plot(
axs[1].set(xlabel='Layer',ylabel='log-softmax prob',ylim=axs[0].get_ylim(),
              title='Impact of down-scaling on log-softmax of max logit')

plt.tight_layout()
plt.savefig('ch5_proj27_part4b.png')
plt.show()