In [None]:
import numpy as np
import torch

from model_analyzer import ModelAnalyzer
from activation_extraction import (
    ActivationRecord,
    save_activations,
    load_activations,
    compare_activations,
    get_activation_statistics
)
from intervention import (
    InterventionHandler,
    ActivationPatch,
    SteeringVector,
    create_steering_vector as _create_steering_vector
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Announce start-up, then choose the compute device: CUDA when torch can see
# a GPU, otherwise fall back to the CPU.
print("Initializing ModelAnalyzer...")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the analyzer around the local Llama-3.2-1B checkpoint directory and
# load the model onto the chosen device.
analyzer = ModelAnalyzer("../models/Llama-3.2-1B", device=device)
analyzer.load_model()

# Print the architecture overview so the layer names used by later cells
# (e.g. "model.layers.0") are visible in the notebook.
analyzer.print_architecture_summary()

Initializing ModelAnalyzer...
Initializing ModelAnalyzer for ..\models\Llama-3.2-1B
Device: cuda
[OK] ModelAnalyzer ready
Loading model from ..\models\Llama-3.2-1B...
Set _attn_implementation to eager in config
Set attn_implementation to eager in model
[OK] Model loaded on cuda

MODEL ARCHITECTURE SUMMARY

Model Type: llama
Number of Layers: 16
Hidden Size: 2048
Attention Heads: 32
Vocabulary Size: 128256
Max Position Embeddings: 131072

Total Parameters: 1,235,814,400
Trainable Parameters: 1,235,814,400

Layer Names:
  0: model.layers.0
  1: model.layers.1
  2: model.layers.2
  3: model.layers.3
  4: model.layers.4
  5: model.layers.5
  6: model.layers.6
  7: model.layers.7
  8: model.layers.8
  9: model.layers.9
  10: model.layers.10
  11: model.layers.11
  12: model.layers.12
  13: model.layers.13
  14: model.layers.14
  15: model.layers.15



In [5]:
# Let the model continue a short arithmetic prompt (at most 10 new tokens).
generated_text = analyzer.generate(prompt="12 * 12 = ", max_new_tokens=10)

# Options for the activation capture. Attention weights are requested, logits
# are not. NOTE(review): layer_names=None / layer_indices=None presumably mean
# "capture every layer" - confirm against ModelAnalyzer.extract_activations.
extraction_options = dict(
    layer_names=None,
    layer_indices=None,
    include_attention=True,
    return_logits=False,
    metadata=None,
)

# Run the generated text (prompt + continuation) back through the model and
# keep the resulting activation record for the inspection cells below.
record = analyzer.extract_activations(text=generated_text, **extraction_options)


In [6]:
# --- Text-level view of the extracted record -------------------------------
print(record.prompt)
print(record.tokens)
# Index of the "Ġ=" token (the " =" of the prompt) within the token list.
print(record.tokens.index("Ġ="))
print(record.token_ids)
print(record.metadata)

# --- Tensor shapes ----------------------------------------------------------
# Both containers are keyed by name (e.g. "model.layers.0",
# "attention_layer_0"), so iterate over the keys and report each shape.
print(record.layer_activations["model.layers.0"].shape)
print([(layer_name, record.layer_activations[layer_name].shape)
       for layer_name in record.layer_activations])
print([(attn_name, record.attention_weights[attn_name].shape)
       for attn_name in record.attention_weights])

# --- Which layer-0 head links one query token to one key token most? -------
# Attention tensors are laid out as (Batch, Heads, Query Len, Key Len).
# NOTE(review): per the token list printed above, position 6 is the trailing
# 'Ġ' of the prompt and position 2 is 'Ġ*' - confirm these are the intended
# query/key positions if the prompt changes.
QUERY_POSITION = 6
KEY_POSITION = 2
layer0_attention = record.attention_weights["attention_layer_0"]
# Slicing batch 0 at a fixed (query, key) pair leaves one weight per head;
# argmax over that axis is the index of the head with the largest weight.
per_head_weight = layer0_attention[0, :, QUERY_POSITION, KEY_POSITION].cpu().numpy()
print(np.argmax(per_head_weight, axis=-1))


12 * 12 = 144
What does this pattern of 24 times
['<|begin_of_text|>', '12', 'Ġ*', 'Ġ', '12', 'Ġ=', 'Ġ', '144', 'Ċ', 'What', 'Ġdoes', 'Ġthis', 'Ġpattern', 'Ġof', 'Ġ', '24', 'Ġtimes']
5
[128000, 717, 353, 220, 717, 284, 220, 8929, 198, 3923, 1587, 420, 5497, 315, 220, 1187, 3115]
{'num_tokens': 17, 'num_layers': 17, 'model_name': 'llama'}
torch.Size([1, 17, 2048])
[('model.layers.0', torch.Size([1, 17, 2048])), ('model.layers.1', torch.Size([1, 17, 2048])), ('model.layers.2', torch.Size([1, 17, 2048])), ('model.layers.3', torch.Size([1, 17, 2048])), ('model.layers.4', torch.Size([1, 17, 2048])), ('model.layers.5', torch.Size([1, 17, 2048])), ('model.layers.6', torch.Size([1, 17, 2048])), ('model.layers.7', torch.Size([1, 17, 2048])), ('model.layers.8', torch.Size([1, 17, 2048])), ('model.layers.9', torch.Size([1, 17, 2048])), ('model.layers.10', torch.Size([1, 17, 2048])), ('model.layers.11', torch.Size([1, 17, 2048])), ('model.layers.12', torch.Size([1, 17, 2048])), ('model.layers.13',

In [5]:
# Logit lens on the freshly extracted `record` at one token position,
# keeping the 50 most likely tokens per layer.
# NOTE(review): in the token list printed for `record`, index 4 is '12' and
# "Ġ=" sits at index 5, while the saved output below shows '='-like tokens -
# confirm that position 4 is the intended target for this record.
# NOTE(review): layer_indices=None presumably selects every layer - confirm.
lens_options = dict(
    token_position=4,
    layer_indices=None,
    top_k=50,
    apply_ln=True,
)
lens_view = analyzer.logit_lens_on_activation(activation=record, **lens_options)

# Report, layer by layer, the top-k decoded tokens and their probabilities.
per_layer = lens_view['layers']
for layer_idx in per_layer:
    layer_view = per_layer[layer_idx]
    print(f"Layer {layer_idx}")
    print(layer_view['top_k_tokens'])
    print(layer_view['top_k_probs'])


Layer 0
['=', ' =', '-', '(', '/', '[', '+', '>', '="', ' ', '<', '=-', ':', '==', '*', '=\\', '=.', '=(', '=\n', "='", ',', '}=', '_', '.', '|', '&', ' (', '=s', '\n', '\\', ')=', '1', '=[', '?', '={', ';', '=true', '@', '=$', '=p', '=m', ')', '!', '=new', '{', '0', "'", '2', '=C', '$']
[1.4224784536054358e-05, 1.1448168152128346e-05, 1.0346017916162964e-05, 1.0301829206582624e-05, 1.023234744934598e-05, 1.0200571523455437e-05, 1.0161347745452076e-05, 1.0137066055904143e-05, 1.0058578482130542e-05, 1.0017586646426935e-05, 9.995247637561988e-06, 9.964163837139495e-06, 9.963616321329027e-06, 9.881365258479491e-06, 9.841085557127371e-06, 9.83151767286472e-06, 9.808924914977979e-06, 9.78081880020909e-06, 9.767647497938015e-06, 9.764124115463346e-06, 9.760083230503369e-06, 9.759516615304165e-06, 9.718103683553636e-06, 9.699736438051332e-06, 9.655747817305382e-06, 9.610144843463786e-06, 9.602764293958899e-06, 9.59765020525083e-06, 9.588035027263686e-06, 9.538821359456051e-06, 9.535863682685

In [18]:
# Read the previously saved activation records back from the PyTorch file.
activations = load_activations("activations/activation_records.pt", format="pt")
print(f"Loaded {len(activations)} activation records.")

# Show the container type returned by the loader.
print(type(activations))

# List every record by index together with the part of its prompt that
# precedes the first '=' (i.e. the arithmetic question without the answer).
for i, act in enumerate(activations):
    question = act.prompt.partition('=')[0]
    print(f"Activation Record {i}:")
    print(f"  Prompt: {question}=")


Loaded 53 record(s) from activations\activation_records.pt (PyTorch format)
Loaded 53 activation records.
<class 'list'>
Activation Record 0:
  Prompt: 1+1=
Activation Record 1:
  Prompt: 1+2=
Activation Record 2:
  Prompt: 1+3=
Activation Record 3:
  Prompt: 1+4=
Activation Record 4:
  Prompt: 1+5=
Activation Record 5:
  Prompt: 2+1=
Activation Record 6:
  Prompt: 2+2=
Activation Record 7:
  Prompt: 2+3=
Activation Record 8:
  Prompt: 2+4=
Activation Record 9:
  Prompt: 2+5=
Activation Record 10:
  Prompt: 3+1=
Activation Record 11:
  Prompt: 3+2=
Activation Record 12:
  Prompt: 3+3=
Activation Record 13:
  Prompt: 3+4=
Activation Record 14:
  Prompt: 3+5=
Activation Record 15:
  Prompt: 4+1=
Activation Record 16:
  Prompt: 4+2=
Activation Record 17:
  Prompt: 4+3=
Activation Record 18:
  Prompt: 4+4=
Activation Record 19:
  Prompt: 4+5=
Activation Record 20:
  Prompt: 5+1=
Activation Record 21:
  Prompt: 5+2=
Activation Record 22:
  Prompt: 5+3=
Activation Record 23:
  Prompt: 5+4=
A

In [25]:
# Pick one stored record (index 50 of the loaded list) for a closer look.
act = activations[50]

# Show its full text, tokenization, metadata and per-layer activation shapes.
print()
print(act.prompt)
print("=====================")
print(act.tokens)
print(act.metadata)
print([act.layer_activations[layer_name].shape for layer_name in act.layer_activations])

# Logit lens at token position 4, keeping the 50 most likely tokens per layer.
# NOTE(review): per the saved token list for this record, position 4 is '=' -
# re-check the position if a different record index is chosen.
lens_view = analyzer.logit_lens_on_activation(
    activation=act, token_position=4, layer_indices=None, top_k=50, apply_ln=True
)

# Report, layer by layer, the top-k decoded tokens and their probabilities.
layer_views = lens_view['layers']
for layer_idx in layer_views:
    current_view = layer_views[layer_idx]
    print(f"Layer {layer_idx}")
    print(current_view['top_k_tokens'])
    print(current_view['top_k_probs'])


336+639=1209 (mod 10)
n = 0:4
f = function(n)
   
['<|begin_of_text|>', '336', '+', '639', '=', '120', '9', 'Ġ(', 'mod', 'Ġ', '10', ')Ċ', 'n', 'Ġ=', 'Ġ', '0', ':', '4', 'Ċ', 'f', 'Ġ=', 'Ġfunction', '(n', ')Ċ', 'ĠĠĠ']
{'num_tokens': 25, 'num_layers': 17, 'model_name': 'llama'}
[torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048]), torch.Size([1, 25, 2048])]
Layer 0
['=', ' =', '-', '(', '/', '[', '+', '>', '="', ' ', '<', '=-', ':', '==', '*', '=\\', '=.', '=(', '=\n', "='", ',', '}=', '_', '.', '|', '&', ' (', '=s', '\n', '\\', ')=', '1', '=[', '?', '={', ';', '=true', '@', '=$', '=p', '=m', ')', '!', '=new', '{',