In [1]:
!pip install -r requirements.txt



In [2]:
# Importing necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
# Starting with a smaller model for quicker iteration
model = "EleutherAI/gpt-neo-125M"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# loading the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(
    model,
    output_attentions=True,
    output_hidden_states=True,
).to(device)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The following generation flags are not valid and may be ignored: ['output_attentions', 'output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(2048, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPTNeoBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=False)
            (q_proj): Linear(in_features=768, out_features=768, bias=False)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_fe

In [5]:
# Sample prompt
prompt = "Why does phone battery drain faster over time? Answer in 2 detailed points"

In [6]:
# Tokenizing prompt into tensor for the model
inputs = tokenizer(prompt, return_tensors="pt").to(device)

In [7]:
with torch.no_grad():
    output = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=False,
    repetition_penalty=10.0,
    return_dict_in_generate=True,
    output_attentions=True,
    output_hidden_states=True,
)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [8]:
# Decoding the generated text
decoded = tokenizer.decode(output.sequences[0], skip_special_tokens=True)
print("Generated text:\n")
print(decoded)

Generated text:

Why does phone battery drain faster over time? Answer in 2 detailed points

The answer to this question is yes. The battery life of a smartphone depends on the battery capacity and the charging time. If you have a phone that has a built-in charger, then


In [9]:
# No of transformers layers that returned attention tensors
print("Number of layers:", len(output.attentions))

Number of layers: 40


In [10]:
# Attention tensor shape for 1st layer
print("Attention shape (layer 0):", output.attentions[0][0].shape)


Attention shape (layer 0): torch.Size([1, 12, 14, 14])


In [12]:
#Hidden state shape [batch_size,seq_len,seq_len]
print("Hidden state shape (layer 0):", output.hidden_states[0][0].shape)


Hidden state shape (layer 0): torch.Size([1, 14, 768])


- These tensors are post-generation captures of the model's internals.

- They can be used as raw signals for exploratory analyses (attention flow,
layer-wise activation patterns, or as inputs to gradient-based attribution workflows).



Planned next steps:

- Analyze attention flow from input tokens to generated tokens
- Compare attention-based vs gradient-based attributions
- Experiment with per-token vs per-span explanations
- Investigate faithfulness limitations of attention as explanation
