In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from captum.attr import IntegratedGradients



In [70]:
def prepare_input(text):
    """Tokenize ``text`` and return the tokenizer's output unchanged.

    Relies on a module-level ``tokenizer`` that is defined outside this cell.

    Args:
        text: Raw input handed straight to ``tokenizer``.

    Returns:
        Whatever ``tokenizer`` returns when asked for PyTorch tensors
        (``return_tensors="pt"``) with padding and truncation enabled and
        ``max_length=512``.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(text, **tokenizer_options)

In [71]:
# Wrap the global `model` in Captum's IntegratedGradients.
# NOTE(review): `ig` is not referenced by any other visible cell (attribute_text
# builds its own LayerIntegratedGradients), and `model` is not defined in any
# cell shown above this one, so this line depends on kernel state created
# elsewhere — confirm it is still needed.
ig = IntegratedGradients(model)

In [72]:
from captum.attr import IntegratedGradients, LayerIntegratedGradients

def attribute_text(text, target=1):
    """Attribute one output logit of the global ``model`` to the embedding output.

    The text is tokenized with ``prepare_input`` and Layer Integrated Gradients
    is run against ``model.bert.embeddings``, so the global ``model`` must
    expose a ``bert.embeddings`` submodule.

    Args:
        text: Raw input text to explain.
        target: Index of the output logit to attribute. Defaults to ``1``,
            the value that used to be hard-coded.

    Returns:
        The attributions returned by ``LayerIntegratedGradients.attribute``,
        taken with respect to the *output* of the embedding layer.

    Notes:
        No ``baselines`` argument is passed, so Captum's default baseline is
        used. NOTE(review): consider an explicit reference-token baseline.
        The convergence delta is no longer requested because it was computed
        and then discarded.
    """
    inputs = prepare_input(text)
    input_ids = inputs['input_ids'].long()
    attention_mask = inputs['attention_mask']

    def forward_func(input_ids, attention_mask):
        # Keep only the first element of the model output (the logits) so
        # Captum can index a single scalar per example through `target`.
        return model(input_ids=input_ids, attention_mask=attention_mask)[0]

    # Hook the embedding layer so integer token ids never need interpolating.
    lig = LayerIntegratedGradients(forward_func, model.bert.embeddings)

    # attribute_to_layer_input=False -> attribute to the embedding layer's output.
    return lig.attribute(inputs=input_ids,
                         additional_forward_args=(attention_mask,),
                         target=target,
                         attribute_to_layer_input=False)


In [73]:
def visualize_attributions(text):
    """Print every token of ``text`` next to its summed attribution score.

    Uses the module-level ``tokenizer`` plus ``prepare_input`` and
    ``attribute_text``. One ``token: score`` line is printed per token, with
    the score formatted to four decimal places. Nothing is returned.

    Args:
        text: Raw input text to explain.
    """
    # Tokenize here only to recover the token strings; attribute_text runs
    # prepare_input on the same text again internally.
    token_ids = prepare_input(text)['input_ids'][0]
    token_strings = tokenizer.convert_ids_to_tokens(token_ids)

    # Sum over the last axis to get one score per token, then drop the
    # leading axis and convert to a NumPy array.
    per_token_scores = (
        attribute_text(text)
        .sum(dim=-1)
        .squeeze(0)
        .detach()
        .numpy()
    )

    for tok, score in zip(token_strings, per_token_scores):
        print(f"{tok}: {score:.4f}")

In [74]:
# Run the attribution pipeline end to end on a single short sentence.
# NOTE(review): in the recorded output, "fantastic" gets a negative score
# toward target class 1 — confirm that index 1 is the label of interest and
# that the classification head has actually been fine-tuned.
text = "The movie was fantastic!"
visualize_attributions(text)


[CLS]: 0.0993
the: 0.1091
movie: -0.2246
was: 0.1845
fantastic: -0.1559
!: -0.1775
[SEP]: 0.1472


In [6]:
from transformers import LlamaModel, LlamaConfig

# Build a LlamaConfig from the library defaults (the config printed further
# down shows hidden_size=4096 and num_hidden_layers=32).
configuration = LlamaConfig()

# Instantiate a LlamaModel from that configuration.
# NOTE(review): no checkpoint is loaded here, so the weights are presumably
# randomly initialized — confirm this is intended.
# NOTE(review): this rebinds the global `model` that attribute_text reads via
# `model.bert.embeddings`. The LlamaModel printed below has no `bert`
# submodule, so re-running the attribution cells after this one would
# presumably fail — consider a distinct name such as `llama_model`.
model = LlamaModel(configuration)

# Read the configuration back from the model instance.
configuration = model.config

In [7]:
# Print the module hierarchy of the LlamaModel currently bound to `model`.
print(model)

LlamaModel(
  (embed_tokens): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)


In [9]:
# Print the configuration object that was read back from `model.config`.
print(configuration)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.38.2",
  "use_cache": true,
  "vocab_size": 32000
}



In [11]:
# Walk every named parameter of `model` and print its name and tensor repr.
# NOTE(review): this emits one tensor repr per parameter; printing only
# `tuple(p.shape)` and `p.dtype` would keep the notebook output far shorter.
for param_name, param_tensor in model.named_parameters():
    print(f"{param_name}: {param_tensor}")

embed_tokens.weight: Parameter containing:
tensor([[ 0.0098, -0.0053,  0.0223,  ..., -0.0305,  0.0012, -0.0400],
        [-0.0235, -0.0234,  0.0160,  ..., -0.0098,  0.0154, -0.0212],
        [-0.0028,  0.0105,  0.0304,  ..., -0.0012,  0.0159,  0.0208],
        ...,
        [-0.0308, -0.0031,  0.0051,  ...,  0.0130, -0.0019,  0.0044],
        [ 0.0178,  0.0289,  0.0265,  ...,  0.0284,  0.0141, -0.0062],
        [ 0.0164, -0.0250,  0.0042,  ...,  0.0049, -0.0098,  0.0230]],
       requires_grad=True)
layers.0.self_attn.q_proj.weight: Parameter containing:
tensor([[ 0.0086, -0.0338, -0.0118,  ..., -0.0275,  0.0066, -0.0116],
        [ 0.0220,  0.0044,  0.0054,  ..., -0.0156,  0.0096, -0.0001],
        [-0.0169,  0.0348,  0.0461,  ...,  0.0090, -0.0023,  0.0116],
        ...,
        [-0.0313, -0.0087, -0.0020,  ...,  0.0030,  0.0120,  0.0145],
        [ 0.0315,  0.0077,  0.0211,  ..., -0.0356, -0.0483, -0.0172],
        [-0.0127, -0.0429, -0.0151,  ...,  0.0096, -0.0002, -0.0114]],
      