# Doing linear regression on the token embeddings

What features can be extracted from a model's token embeddings using linear regression?

In [59]:
# Notebook-wide settings. The trailing `#@param` annotations expose them as Colab form fields.
# When True, the "colab" plotly renderer is used even outside Colab.
DEVELOPMENT_MODE = False #@param {type:"boolean"}
# Name of the pretrained model handed to HookedTransformer.from_pretrained.
MODEL_NAME = "gpt-neo-125M" #@param {type:"string"}
# Prefix marking a token that begins with a space; it is not counted by len_no_space.
TOKEN_BEGIN_SPACE = "Ġ" #@param {type:"string"}
# Tokens longer than this many characters are treated as length outliers.
LENGTH_OUTLIER_THRESHOLD = 15 #@param {type:"integer"}
# Numeric tokens whose value exceeds this are treated as outliers.
NUMERIC_OUTLIER_THRESHOLD = 1000 #@param {type:"integer"}

## Setup

In [3]:
try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
    %pip install git+https://github.com/SamAdamDay/mechanistic-interpretability-projects.git
except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
    ipython.magic("load_ext autoreload")
    ipython.magic("autoreload 2")

Running as a Jupyter notebook - intended for development only!
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  ipython.magic("load_ext autoreload")
  ipython.magic("autoreload 2")


In [4]:
import plotly.io as pio

# Pick the plotly renderer: "colab" inside Colab or in development mode,
# otherwise the CDN-backed notebook renderer.
use_colab_renderer = IN_COLAB or DEVELOPMENT_MODE
pio.renderers.default = "colab" if use_colab_renderer else "notebook_connected"
print(f"Using renderer: {pio.renderers.default}")


Using renderer: notebook_connected


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

import numpy as np

from sklearn.linear_model import LinearRegression

from fancy_einsum import einsum

from tqdm import tqdm

import plotly.express as px

import matplotlib.pyplot as plt

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import (
    HookedTransformer,
    HookedTransformerConfig,
    FactoredMatrix,
    ActivationCache,
)


In [6]:
# Turn autograd off globally: the notebook only reads the embedding weights and fits
# scikit-learn regressions on them, so no gradients are ever needed.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f1d418b5750>

In [7]:
# The notebook is pinned to the CPU (CUDA initialisation fails in the development environment).
# With FORCE_CPU set, CUDA is never probed, which also avoids the CUDA initialisation warning.
# Set FORCE_CPU = False to use a GPU when one is available.
FORCE_CPU = True
device = "cuda" if (not FORCE_CPU and torch.cuda.is_available()) else "cpu"
print(device)

cpu



CUDA initialization: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:109.)



## Get model and tokens

In [8]:
# Load the pretrained model named by MODEL_NAME into a HookedTransformer on `device`.
model = HookedTransformer.from_pretrained(MODEL_NAME, device=device)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt-neo-125M into HookedTransformer


In [9]:
# Token embedding matrix of the model, plus a NumPy copy used as the regression features.
W_E = model.W_E
W_E_numpy = utils.to_numpy(W_E)
# The recorded output shows (50257, 768); presumably (vocabulary size, embedding width).
print(W_E_numpy.shape)

(50257, 768)


In [10]:
# Look up the token string for every id in the tokenizer's vocabulary.
d_vocab = model.tokenizer.vocab_size
all_token_ids = list(range(d_vocab))
str_tokens = model.tokenizer.convert_ids_to_tokens(all_token_ids)

## Token length

Regressing on the number of characters in a token.

In [16]:
# Raw character count of every token string (any leading-space marker is counted).
lengths_basic = np.array([len(token) for token in str_tokens])
def len_no_space(token_str: str, space_marker: "str | None" = None):
    """Return the length of a token string, not counting a leading space marker.

    Args:
        token_str: The token string to measure.
        space_marker: Prefix that marks a token beginning with a space. When
            omitted, the notebook-level ``TOKEN_BEGIN_SPACE`` setting is used.

    Returns:
        ``len(token_str)`` minus the length of the marker when the token starts
        with it, otherwise ``len(token_str)``.
    """
    if space_marker is None:
        space_marker = TOKEN_BEGIN_SPACE
    # Subtract the marker's real length rather than assuming it is one character;
    # an empty marker matches every string, so it must not shorten anything.
    if space_marker and token_str.startswith(space_marker):
        return len(token_str) - len(space_marker)
    return len(token_str)
# Character count of every token string with the leading-space marker left out.
lengths_nospace = np.array([len_no_space(token) for token in str_tokens])

In [15]:
# Label the x-axis explicitly; otherwise plotly shows the placeholder name "x".
px.histogram(
    x=lengths_basic,
    title="Token Basic Lengths",
    labels=dict(x="Token length (characters, including space marker)"),
)

In [17]:
# Label the x-axis explicitly; otherwise plotly shows the placeholder name "x".
px.histogram(
    x=lengths_nospace,
    title="Token Lengths Without Space",
    labels=dict(x="Token length (characters, excluding space marker)"),
)

What are these weird outliers?

In [29]:
# Show the token strings whose raw length exceeds the outlier threshold.
length_outlier_tokens = [token for token in str_tokens if len(token) > LENGTH_OUTLIER_THRESHOLD]
print(length_outlier_tokens)



Let's mask out the outliers, i.e. the tokens longer than `LENGTH_OUTLIER_THRESHOLD` characters.

In [30]:
# Boolean mask over the whole vocabulary.
# NOTE: despite the name, True marks the tokens that are KEPT (raw length <= threshold),
# not the outliers themselves.
outliers_mask = lengths_basic <= LENGTH_OUTLIER_THRESHOLD

In [31]:
# Build the keep-mask locally instead of reading the global `outliers_mask`: that name is
# rebound further down to a mask over the numeric tokens, so re-running this cell after
# that point would otherwise index with the wrong mask.
length_inlier_mask = lengths_basic <= LENGTH_OUTLIER_THRESHOLD
W_E_inliers = W_E_numpy[length_inlier_mask]

# Each task maps a display name (also used as the plot title) to (features, targets).
regression_tasks = {
    "Basic lengths": (W_E_numpy, lengths_basic),
    "Lengths without space": (W_E_numpy, lengths_nospace),
    "Basic lengths excluding outliers": (W_E_inliers, lengths_basic[length_inlier_mask]),
    "Lengths without space excluding outliers": (W_E_inliers, lengths_nospace[length_inlier_mask]),
}

# Fit one least-squares model per task and keep its predictions on the data it was fitted on.
regressions = {}
preditions = {}  # (sic) spelling kept: the plotting cell looks the results up under this name
for name, (X, y) in regression_tasks.items():
    regressions[name] = LinearRegression().fit(X, y)
    preditions[name] = regressions[name].predict(X)

In [38]:
# One box plot per task: distribution of predicted length for each true length.
for name in regression_tasks:
    true_lengths = regression_tasks[name][1]
    fig = px.box(
        x=true_lengths,
        y=preditions[name],
        title=name,
        labels=dict(x="True length", y="Predicted length"),
    )
    # Draw the x=y reference diagonal up to the largest true length in this task, so it
    # also spans the plots that still contain the long outlier tokens.
    diagonal_end = int(true_lengths.max())
    fig.add_shape(
        type="line",
        x0=0,
        y0=0,
        x1=diagonal_end,
        y1=diagonal_end,
        line=dict(dash="dot"),
        label=dict(text="x=y"),
    )
    fig.show()

## Numerical value

Can we regress on the numerical value of those tokens which are numbers?

In [63]:
# Determine which tokens are numbers and their numerical values.
# A token counts as a number when it is non-empty and made only of the digits 0-9.
# The non-empty check matters: `all(...)` is True for an empty string, and `int("")` raises.
is_number_mask = np.array(
    [bool(token) and all(ch in "0123456789" for ch in token) for token in str_tokens],
    dtype=bool,
)

# Values of the numeric tokens, in vocabulary order (aligned with W_E_numpy[is_number_mask]).
# Note that int() drops leading zeros, so e.g. "0" and "00" both map to 0.
numerical_values = np.array(
    [int(token) for token, is_number in zip(str_tokens, is_number_mask) if is_number]
)

In [45]:
# Label the x-axis explicitly; otherwise plotly shows the placeholder name "x".
px.histogram(
    x=numerical_values,
    title="Numerical tokens",
    labels=dict(x="Numerical value of token"),
).show()

Looks like there's an outlier!

In [60]:
# Keep-mask over `numerical_values`: True where the value is <= NUMERIC_OUTLIER_THRESHOLD.
# NOTE: this rebinds `outliers_mask`, which earlier held the token-length mask over the
# whole vocabulary, so the name means something different from here on.
outliers_mask = numerical_values <= NUMERIC_OUTLIER_THRESHOLD

In [62]:
# Label the x-axis explicitly; otherwise plotly shows the placeholder name "x".
px.histogram(
    x=numerical_values[outliers_mask],
    title="Numerical tokens excluding outliers",
    labels=dict(x="Numerical value of token"),
).show()

In [67]:
# Embeddings and values of the numeric tokens that survive the outlier filter.
X_numeric = W_E_numpy[is_number_mask][outliers_mask]
y_numeric = numerical_values[outliers_mask]

# Fit a linear map from embedding to numerical value and predict on the same tokens.
regression = LinearRegression().fit(X_numeric, y_numeric)
prediction = regression.predict(X_numeric)

# Box plot of predicted value against true value.
fig = px.box(
    x=y_numeric,
    y=prediction,
    title="Numerical Values",
    labels={"x": "True Numerical Value", "y": "Predicted Numerical Value"},
)

# Dotted x=y reference diagonal from the origin to the outlier threshold.
diagonal_spec = {
    "type": "line",
    "x0": 0,
    "y0": 0,
    "x1": NUMERIC_OUTLIER_THRESHOLD,
    "y1": NUMERIC_OUTLIER_THRESHOLD,
    "line": {"dash": "dot"},
    "label": {"text": "x=y"},
}
fig.add_shape(**diagonal_spec)
fig.show()