In [1]:
# Attention heads of interest, in their original order.
# Presumably (layer_index, head_index) pairs -- TODO confirm against the cell
# that produced this list.
# NOTE: the name is misspelled ("receiver"); it is kept as-is because other
# cells may reference it.
reciever_heads = [
    (4, 12), (1, 20), (1, 18), (3, 18), (4, 14),
    (25, 20), (25, 22), (3, 17), (29, 24), (1, 13),
    (29, 25), (29, 27), (29, 26), (24, 9), (1, 26),
    (29, 15), (24, 11), (3, 4), (7, 5), (1, 16),
]

In [None]:
# Work on a deep copy so the ablation hook never touches the original `model`.
# NOTE: deepcopy duplicates every weight, so memory use roughly doubles while
# both models are alive.
ablated_model = copy.deepcopy(model)
layer_index_to_ablate = 31
head_index_to_ablate = 14


def ablate_attention_hook(module, input, output=None):
    """Zero one attention head's slice of the input to the output projection.

    This must be registered with ``register_forward_pre_hook`` on the
    attention block's ``o_proj`` layer. The output of ``self_attn`` itself has
    already been multiplied by ``o_proj``, which mixes all heads across the
    hidden dimension; slicing that tensor by "head" does not isolate a head.
    The input to ``o_proj`` is the last place where each head still owns a
    contiguous ``head_dim``-wide slice of the final dimension (this is the
    layout used by the Hugging Face Llama attention implementation).

    The head to ablate is read from the module-level ``head_index_to_ablate``
    and the head count from ``ablated_model.config.num_attention_heads``.

    Args:
        module (nn.Module): The ``o_proj`` layer the hook is attached to.
        input (tuple): Positional arguments about to be passed to ``o_proj``;
            ``input[0]`` is the concatenated per-head attention output.
        output: Must be left as ``None``. It exists only so that a legacy
            ``register_forward_hook`` registration (which passes three
            arguments) fails loudly instead of silently ablating the wrong
            values.

    Returns:
        tuple: The positional arguments with the selected head's slice zeroed.

    Raises:
        RuntimeError: If invoked as a forward hook (``output`` was supplied).
        IndexError: If ``head_index_to_ablate`` is not a valid head index.
    """
    if output is not None:
        raise RuntimeError(
            "ablate_attention_hook must be registered with "
            "register_forward_pre_hook on self_attn.o_proj, not as a forward hook"
        )

    head_outputs = input[0]

    # Newer transformers releases no longer expose `num_heads` on the attention
    # module, so read the head count from the model config instead.
    num_heads = ablated_model.config.num_attention_heads
    if not 0 <= head_index_to_ablate < num_heads:
        raise IndexError(
            f"head_index_to_ablate={head_index_to_ablate} is out of range "
            f"for a layer with {num_heads} heads"
        )

    # Derive head_dim from the tensor itself so models whose head_dim is not
    # hidden_size // num_heads are handled as well.
    head_dim = head_outputs.shape[-1] // num_heads
    start = head_index_to_ablate * head_dim

    # Clone before writing so the tensor produced by the attention computation
    # is not mutated in place.
    ablated = head_outputs.clone()
    ablated[..., start:start + head_dim] = 0

    # Pre-hooks return the (possibly modified) positional arguments.
    return (ablated,) + tuple(input[1:])


# --- Register the hook on the specific attention layer ---
# The decoder layers live under `ablated_model.model.layers`; each one owns a
# `self_attn` module whose final step is the `o_proj` linear layer.
attention_layer = ablated_model.model.layers[layer_index_to_ablate].self_attn

# Keep the handle: call `hook_handle.remove()` to restore un-ablated behavior.
hook_handle = attention_layer.o_proj.register_forward_pre_hook(ablate_attention_hook)

# --- Tokenize the input text ---
# Move the encoded inputs to the model's device so generation also works when
# the model is not on the CPU.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# --- Generate text with the original and ablated models ---
# Identical greedy-decoding settings for both runs so that any difference in
# the text comes from the ablation alone.
generation_kwargs = dict(
    max_new_tokens=20,
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id,
)

# The original model generates text normally.
original_output = model.generate(**inputs, **generation_kwargs)
original_text = tokenizer.decode(original_output[0], skip_special_tokens=True)

# The ablated model runs with the selected head's contribution zeroed out.
ablated_output = ablated_model.generate(**inputs, **generation_kwargs)
ablated_text = tokenizer.decode(ablated_output[0], skip_special_tokens=True)

# --- Print the results for comparison ---
print("=" * 50)
print("Original Text:")
print(original_text)
print("-" * 50)
print(f"Ablated Text (Layer {layer_index_to_ablate}, Head {head_index_to_ablate}):")
print(ablated_text)
print("=" * 50)


In [None]:
# Run the tokenizer on `text` (defined outside this view) with default
# arguments. The result is not assigned; presumably this is a notebook cell
# whose last expression is displayed for inspection.
tokenizer(text)