# Analyze Attention Head L0H0

This CoLab analyses a Transformer model that performs integer addition, subtraction and multiplication e.g. 133357+182243=+0315600, 123450-345670=-0123230 and 000345*000823=+283935. Each digit is a separate token. For 6 digit questions, the model is given 14 "question" (input) tokens, and must then predict the corresponding 8 "answer" (output) tokens.

https://github.com/PhilipQuirke/verified_transformers/blob/main/assets/ins1_mix_d6_l3_h4_t40K_s372001MathsPurposePerNode.svg
https://github.com/PhilipQuirke/verified_transformers/blob/main/assets/ins1_mix_d6_l3_h4_t40K_s372001QuantaAtP18.svg
https://github.com/PhilipQuirke/verified_transformers/blob/main/assets/Hypothesis2_A2_Calc.png

# Part 0: Import libraries
Imports standard libraries.

Imports the "QuantaTools" public library (installed from the verified_transformers repository) as "qt". This library is specific to this CoLab's "QuantaTool" approach to transformer analysis. Refer to [README.md](https://github.com/PhilipQuirke/verified_transformers/blob/main/README.md) for more detail.

In [None]:
import os
# Order CUDA devices by PCI bus ID and expose only device index 3 to this process.
os.environ.update(
    CUDA_DEVICE_ORDER='PCI_BUS_ID',
    CUDA_VISIBLE_DEVICES='3',
)

In [None]:
DEVELOPMENT_MODE = True
try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
    !pip install matplotlib

    !pip install kaleido
    !pip install transformer_lens
    !pip install torchtyping
    !pip install transformers

    !pip install numpy
    !pip install scikit-learn

except:
    IN_COLAB = False

    def setup_jupyter(install_libraries=False):
        if install_libraries:
            !pip install matplotlib==3.8.4
            !pip install kaleido==0.2.1
            !pip install transformer_lens==1.15.0
            !pip install torchtyping==0.1.4
            !pip install transformers==4.39.3

            !pip install numpy==1.26.4
            !pip install plotly==5.20.0
            !pip install pytest==8.1.1
            !pip install scikit-learn==1.4.1.post1

        print("Running as a Jupyter notebook - intended for development only!")
        from IPython import get_ipython

        ipython = get_ipython()
        # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
        ipython.magic("load_ext autoreload")
        ipython.magic("autoreload 2")

    # setup_jupyter(install_libraries=True)   # Uncomment if you need to install libraries in notebook.
    setup_jupyter(install_libraries=False)

In [None]:
# Plotly needs a different renderer for VSCode/Notebooks vs Colab argh
import kaleido
import plotly.io as pio

# Colab, or any run with DEVELOPMENT_MODE off, uses the "colab" renderer;
# local development uses "notebook_connected".
pio.renderers.default = (
    "colab" if (IN_COLAB or not DEVELOPMENT_MODE) else "notebook_connected"
)
print(f"Using renderer: {pio.renderers.default}")

In [None]:
# Set the axis-title and figure-title font sizes on the 'plotly' template.
plotly_layout = pio.templates['plotly'].layout
plotly_layout.xaxis.title.font.size = 20
plotly_layout.yaxis.title.font.size = 20
plotly_layout.title.font.size = 30

In [None]:
import json
import torch
import torch.nn.functional as F
import numpy as np
import random
import itertools
import re
from enum import Enum

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import textwrap

In [None]:
import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

In [None]:
# Import Principal Component Analysis (PCA) library
use_pca = True
try:
  from sklearn.decomposition import PCA
except Exception as e:
  print("pca import failed with exception:", e)
  use_pca = False

  # Sometimes version conflicts means the PCA library does not import. This workaround partially fixes the issue
  !pip install --upgrade numpy
  !pip install --upgrade scikit-learn

  # To complete workaround, now select menu option "Runtime > Restart session and Run all".
  stop

In [None]:
! pip uninstall QuantaTools -y || true   # Ensure a clean install.

In [None]:
# Refer https://github.com/PhilipQuirke/verified_transformers/blob/main/README.md
!pip install --upgrade git+https://github.com/PhilipQuirke/verified_transformers.git@amir/analyze_pca_outputs_on_l0h1  # Specify @branch if testing a specific branch
import QuantaTools as qt

# Part 1A: Configuration

Which existing model do we want to analyse?

The existing model weightings created by the sister Colab [VerifiedArithmeticTrain](https://github.com/PhilipQuirke/transformer-maths/blob/main/assets/VerifiedArithmeticTrain.ipynb) are loaded from HuggingFace (in Part 5). Refer https://github.com/PhilipQuirke/verified_transformers/blob/main/README.md for more detail.

In [None]:
# Singleton QuantaTool "main" configuration class. MathsConfig is derived from the chain AlgoConfig > UsefulConfig > ModelConfig
# A new MathsConfig instance is created here and used as `cfg` throughout the notebook.
cfg = qt.MathsConfig()

# Singleton QuantaTool "ablation intervention" configuration class
# `acfg` is an alias for the object QuantaTools exposes as qt.acfg (not a new instance).
acfg = qt.acfg

In [None]:
# Name of the model to analyse; it is parsed by cfg.parse_model_name() in a later cell.
cfg.model_name = "ins1_mix_d6_l3_h4_t40K_s372001"  # AvgFinalLoss=1.7e-08. Accurate on 1M Qs
# Reported as "%Sub=" by print_config(); presumably the percentage of subtraction questions - TODO confirm.
cfg.perc_sub = 80

# Part 1B: Configuration: Input and Output file names



In [None]:
# Clear state left over from a previous run: needed when the user changes
# model_name and reruns this Colab a second time.
cfg.reset_useful()
cfg.reset_algo()
cfg.initialize_maths_token_positions()
acfg.reset_ablate()

if cfg.model_name != "":
    # Update cfg member data n_digits, n_layers, n_heads, n_training_steps from model_name
    cfg.parse_model_name()

    if cfg.model_name.startswith("ins1_mix_d6_l3"):
        # Seed 372001: mixed model initialised with add_d6_l2_h3_t15K.pth.
        # Any other seed: mixed model initialised with add_d6_l2_h3_t20K.pth.
        cfg.insert_n_training_steps = 15000 if cfg.training_seed == 372001 else 20000

    cfg.batch_size = 512

In [None]:
# Show cfg's operation-configuration description (the return value is the cell output).
cfg.op_config_description()

In [None]:
# All input/output file names share the prefix derived from the configuration.
main_fname = cfg.file_config_prefix()
main_fname_pth = main_fname + '.pth'
main_fname_behavior_json = main_fname + '_behavior.json'
main_fname_algorithm_json = main_fname + '_algorithm.json'

def print_config():
    """Print the %Add/%Sub/%Mult values, the insert mode and the file prefix held in cfg."""
    print("%Add=", cfg.perc_add(), "%Sub=", cfg.perc_sub, "%Mult=", cfg.perc_mult, "InsertMode=", cfg.insert_mode, "File=", main_fname)

print_config()
print("weight_decay=", cfg.weight_decay, "lr=", cfg.lr, "batch_size=", cfg.batch_size)
# The model is downloaded from HuggingFace (the message previously said "HuggingLab").
print('Main model will be read from HuggingFace file', main_fname_pth)
print('Main model behavior analysis tags will save to Colab temporary file', main_fname_behavior_json)
print('Main model algorithm analysis tags will save to Colab temporary file', main_fname_algorithm_json)

# Part 3A: Set Up: Vocabulary / Embedding / Unembedding

  

In [None]:
# Show the model weights file name (cfg.file_config_prefix() + '.pth').
main_fname_pth

In [None]:
# Apply the QuantaTools maths vocabulary and question-meaning setup to cfg,
# then print the resulting per-token-position meanings.
qt.set_maths_vocabulary(cfg)
qt.set_maths_question_meanings(cfg)
print(cfg.token_position_meanings)

# Part 3B: Set Up: Create model

In [None]:
# Transformer creation

# Structure is documented at https://neelnanda-io.github.io/TransformerLens/transformer_lens.html#transformer_lens.HookedTransformerConfig.HookedTransformerConfig
ht_cfg = HookedTransformerConfig(
    n_layers = cfg.n_layers,
    n_heads = cfg.n_heads,
    d_model = cfg.d_model,
    d_head = cfg.d_head,
    d_mlp = cfg.d_mlp(),
    act_fn = cfg.act_fn,
    normalization_type = 'LN',
    d_vocab = cfg.d_vocab,
    d_vocab_out = cfg.d_vocab,
    n_ctx = cfg.n_ctx(),
    init_weights = True,
    # Use the GPU when one is visible; otherwise fall back to CPU so the notebook
    # still runs on CPU-only runtimes instead of failing at model construction.
    device = "cuda" if torch.cuda.is_available() else "cpu",
    seed = cfg.training_seed,
)

# The (initially randomly initialised) model is stored on cfg; its weights are loaded in a later cell.
cfg.main_model = HookedTransformer(ht_cfg)

# Part 4: Set Up: Loss Function & Data Generator
This maths loss function and data generator are imported from QuantaTools as logits_to_tokens_loss, loss_fn, maths_data_generator_core and maths_data_generator.

In [None]:
# Define "iterator" maths "questions" data generator function. Invoked using next().
# The generator is configured from cfg.
ds = qt.maths_data_generator( cfg )

In [None]:
# Smoke test of the data generator: draw one batch and print its first 3 rows.
print(next(ds)[:3,:])

# Part 5: Set Up: Load Model from HuggingFace

In [None]:
# Show the weights file name that the next cell downloads.
main_fname_pth

In [None]:
main_repo_name = "PhilipQuirke/VerifiedArithmetic"
print("Loading model from HuggingFace", main_repo_name, main_fname_pth)

# Download the saved weights and copy them into the existing model.
cfg.main_model.load_state_dict(
    utils.download_file_from_hf(
        repo_name=main_repo_name,
        file_name=main_fname_pth,
        force_is_torch=True,
    )
)
# Switch the model to evaluation mode; the returned module is the cell output.
cfg.main_model.eval()

# Part 6A: Look at Math Purpose Per Node

* Rerunning the main notebook, and looking at the purpose of each node.
* In particular, we can see that L0H0 impacts performance on A1.SC, A1.MB and A2.NB. We want to more formally verify the polysemantic behavior of this node.

In [None]:
from IPython.display import display, HTML, IFrame

# Relative path of the maths-purpose-per-node PDF
pdf_path = 'ins1_mix_d6_l3_h4_t40K_s372001MathsPurposePerNode.pdf'

# Embed the PDF in the cell output as a 700 x 575 inline frame
purpose_frame = IFrame(src=pdf_path, width=700, height=575)
display(purpose_frame)

# Part 7: Results: Manual interpretation of PCA results

Principal Component Analysis (PCA) is a powerful technique that aids in mechanistic interpretability by simplifying complex datasets into principal components that capture the most significant variance within the data.

This library uses PCA to help understand the purpose of individual useful nodes. For more background refer https://github.com/PhilipQuirke/verified_transformers/blob/main/pca.md

If an attention head and an answer digit An give an interpretable response (2 or 3 distinct output clusters) on 3 groups of questions aligned to the T8, T9 and T10 definitions, then plot the response and add a PCA tag.



In [None]:
from QuantaTools.maths_tools.maths_test_questions.tricase_test_questions_generator import CustomTriclassConfig, OperatorQTypeNumber, TOTAL_TRICASE_QUESTIONS
from QuantaTools.maths_tools.maths_constants import MathsToken
from QuantaTools.maths_tools.maths_complexity import SimpleQuestionDescriptor

from QuantaTools.quanta_constants import QType

from QuantaTools import make_maths_tricase_questions_customized, make_maths_tricase_questions

In [None]:
from QuantaTools.maths_tools.maths_test_questions.tricase_test_questions_generator import *

In [None]:
# Sanity check that these imported names resolve. With IPython's default settings
# only the last expression (MathsToken) is echoed as the cell output.
QType
TriCaseBehavior
MathsToken

In [None]:
# Build and display an example question-lookup key.
# NOTE(review): operator=11 is a raw literal - presumably the MINUS token id; confirm against MathsToken.
DigitOperatorQTypeTricase(digit=1, operator=11, qtype=QType.MATH_SUB, test_case=TriCaseBehavior.MT1)

# Part 7 A: Question configurations for classes of questions.
Set up question configs for Neg only, Add Only, Sub Only and mixture configurations.
Will help with analysis.

In [None]:
# Used as the third OperatorQTypeNumber argument when building the tricase configs.
# This is a notebook-level variable, separate from cfg.batch_size.
batch_size = 300

In [None]:
def _triclass_config(n_add, n_sub, n_neg):
    """Build a CustomTriclassConfig from the three per-class numbers.

    The numbers become the third OperatorQTypeNumber field for (PLUS, MATH_ADD),
    (MINUS, MATH_SUB) and (MINUS, MATH_NEG) respectively.
    """
    return CustomTriclassConfig((
        OperatorQTypeNumber(MathsToken.PLUS, QType.MATH_ADD, n_add),
        OperatorQTypeNumber(MathsToken.MINUS, QType.MATH_SUB, n_sub),
        OperatorQTypeNumber(MathsToken.MINUS, QType.MATH_NEG, n_neg),
    ))


# All three classes enabled.
add_sub_and_neg_config = _triclass_config(batch_size, batch_size, batch_size)

# One class enabled at a time; the other two are set to 0.
add_only_config = _triclass_config(batch_size, 0, 0)
sub_only_config = _triclass_config(0, batch_size, 0)
neg_only_config = _triclass_config(0, 0, batch_size)

# Part 7 B: PCA the output of P18L0H0 for ADD, SUB, NEG questions with no SC/MB/NB features.

In [None]:
%%capture
# %%capture (above) suppresses this cell's output - presumably to hide the verbose=True logging.
add_only_questions = make_maths_tricase_questions_customized(cfg, add_only_config, verbose=True)

In [None]:
# Inspect the value returned for the ADD-only config.
add_only_questions

In [None]:
# Generate and display one question for digit 1, case MT3, MINUS / MATH_NEG.
# NOTE(review): cfg is passed as None here - confirm the generator does not need a real cfg.
make_single_tricase_question(cfg=None, test_digit=1, test_case=TriCaseBehavior.MT3, operation=MathsToken.MINUS, qtype=QType.MATH_NEG)

In [None]:
# Lookup key for (digit 1, MINUS, MATH_NEG, MT3); the operator field is the
# maths_tokens_to_names entry for MathsToken.MINUS.
key = DigitOperatorQTypeTricase(digit=1, operator=maths_tokens_to_names[MathsToken.MINUS], qtype=QType.MATH_NEG, test_case=TriCaseBehavior.MT3)

In [None]:
# Fetch the entry stored under `key` in cfg.customized_tricase_questions_dict (raises KeyError if absent).
questions = cfg.customized_tricase_questions_dict[key]

In [None]:
# Shape of the first entry once size-1 dimensions are squeezed out.
questions[0].squeeze().shape

In [None]:
# NOTE(review): `items` is not defined in any cell visible above this one; this cell
# raises NameError unless `items` was created elsewhere in the session.
[str(item) for item in items]

In [None]:
# Scratch experiment: generate questions for each target case into a local dict,
# then count how many questions exist for every TriCaseBehavior member.
my_dict = {}

target_cases = [TriCaseBehavior.MT3]
qtype = QType.MATH_NEG
num_questions = 300
# Share the requested number evenly between the target cases.
local_num_questions = int(num_questions / len(target_cases))
operator = MathsToken.MINUS

answer_digit = 0
for test_case in target_cases:
    all_questions = make_tricase_questions(
        cfg,
        test_digit=answer_digit,
        test_case=test_case,
        operation=operator,
        qtype=qtype,
        num_questions=local_num_questions,
    )
    key = DigitOperatorQTypeTricase(answer_digit, maths_tokens_to_names[operator], qtype, test_case)
    my_dict[key] = all_questions

# One count per TriCaseBehavior member; cases with no entry in my_dict count as 0.
questions_created = [
    len(my_dict[candidate]) if candidate in my_dict else 0
    for candidate in (
        DigitOperatorQTypeTricase(answer_digit, maths_tokens_to_names[operator], qtype, case)
        for case in TriCaseBehavior
    )
]

questions_created

In [None]:
# Generate questions for the combined ADD / SUB / NEG config, with verbose=True.
mixed_questions = make_maths_tricase_questions_customized(cfg, add_sub_and_neg_config, verbose=True)

In [None]:
# Inspect the customized tricase question dict held on cfg.
cfg.customized_tricase_questions_dict

In [None]:
# NOTE(review): repeats the earlier `batch_size = 300` assignment; the tricase configs
# defined earlier in the notebook were already built from that value.
batch_size = 300

In [None]:
# Generate questions for the NEG-only config, with verbose=False.
neg_only_questions = make_maths_tricase_questions_customized(cfg, neg_only_config, verbose=False)

In [None]:
# Key for (digit 0, MINUS, MATH_NEG, ST8).
# NOTE(review): despite its name, this variable holds a lookup key, not a list of questions.
# The keyword is `test_case` to agree with the other keyword-style DigitOperatorQTypeTricase
# constructions in this notebook; the statement was previously written twice verbatim.
neg_only_questions_no_mb_or_nb = DigitOperatorQTypeTricase(digit=0, operator=MathsToken.MINUS, qtype=QType.MATH_NEG, test_case=TriCaseBehavior.ST8)

In [None]:
# NOTE(review): this expression is truncated (the call and the subscript are never closed),
# so the cell is a SyntaxError as written; the intended qtype/test_case are unknown.
neg_only_questions[DigitOperatorQTypeTricase(digit=5, operator=11, qtype

In [None]:
# Run the customized generator once per config, in this order. `questions` is
# overwritten each time, so it ends up holding the NEG-only result.
for _tricase_config in (
    add_sub_and_neg_config,
    add_only_config,
    sub_only_config,
    neg_only_config,
):
    questions = make_maths_tricase_questions_customized(cfg, _tricase_config)

In [None]:
# Colour name for each of the T8 / T9 / T10 labels.
color_mappings = dict(T8='red', T9='green', T10='blue')

In [None]:
# Create a cache of sample maths questions based on the T8, T9, T10 categorisation in cfg.tricase_questions_dict
# This calls the standard generator, not the *_customized variant.
qt.make_maths_tricase_questions(cfg)

In [None]:
# Exploratory: list the attribute names available on the loaded model.
dir(cfg.main_model)