In [8]:
# imports
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer 
from datasets import load_dataset
import pandas as pd 
import numpy as np
from huggingface_hub import login

from tqdm import tqdm
import gc


In [None]:
# login to HF (paste token) 
login(token= "")

In [2]:
def get_pad_token(model_id, tokenizer):
    if tokenizer.pad_token is not None:
        return tokenizer.pad_token
    model_to_pad_token = {
        "meta-llama/Llama-3.2-3B-Instruct": "<|finetune_right_pad_id|>",
        "google/gemma-7b-it": "<pad>",
        "mistralai/Ministral-8B-Instruct-2410": "<pad>",
        "meta-llama/Llama-3.1-8B-Instruct": "<|finetune_right_pad_id|>"
    }
    pad_token = model_to_pad_token[model_id]
    print(f"Using pad token for {model_id}: {pad_token}")
    return pad_token

def get_model(model_name: str):
    # Load the model with float16 precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Added float16 precision
        device_map="auto",
        trust_remote_code=True,
    )
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Set the pad token using the same logic as control experiments
    tokenizer.pad_token = get_pad_token(model_name, tokenizer)
    print(f"Pad token set to: {tokenizer.pad_token}")
        
    return model, tokenizer


model, tokenizer = get_model("meta-llama/Llama-3.2-3B-Instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using pad token for meta-llama/Llama-3.2-3B-Instruct: <|finetune_right_pad_id|>
Pad token set to: <|finetune_right_pad_id|>


In [3]:
def get_res_layers_to_enumerate(model):
    model_name = model.config._name_or_path
    if 'gpt' in model_name:
        return model.transformer.h
    elif 'pythia' in model_name:
        return model.gpt_neox.layers
    elif 'bert' in model_name:
        return model.encoder.layer
    elif 'mistral' in model_name:
        return model.model.layers
    elif 'gemma' in model_name:
        return model.model.layers
    elif "llama" in model_name:
        return model.model.layers
    elif "qwen" in model_name.lower():
        return model.model.layers
    else:
      raise ValueError(f"Unsupported model: {model_name}.")

In [4]:
def get_res_activations(model, tokenizer, prompt):

    """Simplest version - just get activations from all layers"""    

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]  # Store original length

    # Storage for activations
    activations = {}
    

    # Simple hook function
    def get_activation(name):

        def hook(model, input, output):

            activations[name] = output.detach().cpu()

        return hook
    

    # Get layers and register hooks
    layers_to_enum = get_res_layers_to_enumerate(model)

    hooks = [] 

    for i, layer in enumerate(layers_to_enum):

        hook_handle = layer.register_forward_hook(get_activation(i))

        hooks.append(hook_handle)

    # Run forward pass
    model.eval()

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens = 200, do_sample = True, pad_token_id = tokenizer.pad_token_id)

    # Decode the full generated sequence
    new_tokens = outputs[0][input_length:]  # Skip the input tokens
    decoded_outputs = tokenizer.decode(new_tokens, skip_special_tokens=True)

    print(decoded_outputs)
    

    # Clean up hooks
    for h in hooks:
        h.remove()

    
    return activations, decoded_outputs

In [5]:
# format prompt 
def format_conversation(tokenizer, messages):
    """Format messages according to the model's chat template"""
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

# Single question - this is the correct way
messages = [{"role": "user", "content": "How do you do?"}]
prompt = format_conversation(tokenizer, messages)

a, o = get_res_activations(model, tokenizer, prompt)

print(a)

print(o)

I'm doing well, thank you for asking. I'm a large language model, so I don't have feelings or emotions like humans do, but I'm functioning properly and ready to assist you with any questions or tasks you may have. How about you? How's your day going?
{0: tensor([[[-0.0048,  0.0000,  0.0235,  ..., -0.0091,  0.0082, -0.0044]]],
       dtype=torch.float16), 1: tensor([[[-0.0094,  0.0144,  0.0406,  ..., -0.0247,  0.0312, -0.0238]]],
       dtype=torch.float16), 2: tensor([[[-0.0015,  0.0246,  0.0760,  ..., -0.0502, -0.0155, -0.0393]]],
       dtype=torch.float16), 3: tensor([[[ 0.0242,  0.0248,  0.0917,  ..., -0.1015, -0.0359, -0.0090]]],
       dtype=torch.float16), 4: tensor([[[ 0.0635, -0.0667,  0.1449,  ..., -0.1262, -0.1207,  0.0222]]],
       dtype=torch.float16), 5: tensor([[[ 0.0020,  0.0266,  0.2401,  ..., -0.0441, -0.0747, -0.0853]]],
       dtype=torch.float16), 6: tensor([[[-0.1353,  0.0723,  0.2620,  ..., -0.0273, -0.1289, -0.0744]]],
       dtype=torch.float16), 7: tensor([[[

In [13]:
# Refactor function to get activations for multiple prompts at once
def get_batch_res_activations(model, tokenizer, prompts):
    """Efficient batch version - get activations from all layers for multiple prompts"""
    # Handle single prompt case
    if isinstance(prompts, str):
        prompts = [prompts]
    
    # Tokenize all inputs with padding
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,  # Pad to same length
        truncation=True,  # Handle very long sequences
        max_length=512  # Adjust as needed
    ).to(model.device)
    
    # Storage for activations - now batch_size x seq_len x hidden_dim
    activations = {}
    
    def get_activation(name):
        def hook(model, input, output):
            # Store the full batch of activations
            activations[name] = output.detach().cpu()
        return hook
    
    # Register hooks once for all prompts
    layers_to_enum = get_res_layers_to_enumerate(model)
    hooks = []
    for i, layer in enumerate(layers_to_enum):
        hook_handle = layer.register_forward_hook(get_activation(i))
        hooks.append(hook_handle)
    
    model.eval()
    with torch.no_grad():
        # Generate for entire batch
        outputs = model.generate(
            **inputs,  # This already contains input_ids and attention_mask
            max_new_tokens=200,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            temperature=0.7
        )
    
    # Decode raw outputs (no cleaning)
    raw_outputs = []
    for i, output in enumerate(outputs):
        full_decoded = tokenizer.decode(output, skip_special_tokens=True)
        raw_outputs.append(full_decoded)
        # print(f"Response {i+1}: {full_decoded}")
    
    # Clean up hooks
    for h in hooks:
        h.remove()
    
    return activations, raw_outputs

In [14]:
def format_prompts_from_strings(tokenizer, prompt_strings):
    """
    Takes a list of string prompts and formats them for the model
    Input: ["how do you do", "what is your favorite condiment", ...]
    Output: List of properly formatted prompts
    """
    formatted_prompts = []
    for prompt_string in prompt_strings:
        # Convert string to message format
        messages = [{"role": "user", "content": prompt_string}]
        # Apply chat template
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        formatted_prompts.append(formatted_prompt)
    return formatted_prompts


# Usage - exactly what you want:
prompts = ["how do you do", "what is your favorite condiment", "tell me about cats", "explain gravity"]
formatted_prompts = format_prompts_from_strings(tokenizer, prompts)

# Then use with your batch function:
activations, raw_outputs = get_batch_res_activations(model, tokenizer, formatted_prompts)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [17]:
import torch
from tqdm import tqdm
import gc
import pandas as pd

def process_dataframe_dataset(model, tokenizer, df, prompt_column, batch_size=16, save_every=100):
    """
    Process a pandas DataFrame and add activations as a new column
    
    Args:
        model: The model to get activations from
        tokenizer: Tokenizer for the model
        df: pandas DataFrame containing the prompts
        prompt_column: Name of the column containing the prompt strings
        batch_size: Number of examples to process at once
        save_every: Print progress every N batches
    
    Returns:
        df: DataFrame with new 'activations' and 'outputs' columns added
    """
    
    # Create a copy to avoid modifying the original
    df_copy = df.copy()
    
    # Extract prompts from the specified column
    dataset = df_copy[prompt_column].tolist()
    
    # Format all prompts upfront
    print("Formatting prompts...")
    formatted_prompts = format_prompts_from_strings(tokenizer, dataset)
    
    # Storage for all results
    all_activations = []
    all_outputs = []
    
    # Calculate number of batches
    num_batches = (len(formatted_prompts) + batch_size - 1) // batch_size
    print(f"Processing {len(formatted_prompts)} examples in {num_batches} batches of size {batch_size}")
    
    # Process in batches
    for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(formatted_prompts))
        
        batch_prompts = formatted_prompts[start_idx:end_idx]
        
        try:
            # Get activations for this batch
            batch_activations, batch_outputs = get_batch_res_activations(
                model, tokenizer, batch_prompts
            )
            
            # Convert batch activations to per-example format
            batch_size_actual = len(batch_prompts)
            for example_idx in range(batch_size_actual):
                # Extract activations for this specific example
                example_activations = {}
                for layer_idx, layer_acts in batch_activations.items():
                    # layer_acts shape: [batch_size, seq_len, hidden_dim]
                    example_activations[layer_idx] = layer_acts[example_idx]  # [seq_len, hidden_dim]
                
                all_activations.append(example_activations)
                all_outputs.append(batch_outputs[example_idx])
            
            # Clean up GPU memory
            del batch_activations
            torch.cuda.empty_cache() if torch.cuda.is_available() else None
            gc.collect()
            
            # Progress update
            if (batch_idx + 1) % save_every == 0:
                print(f"Processed {len(all_outputs)} examples so far...")
                
        except Exception as e:
            print(f"Error processing batch {batch_idx}: {e}")
            # Add empty results for failed examples to maintain alignment
            batch_size_actual = len(batch_prompts)
            for _ in range(batch_size_actual):
                all_activations.append({})  # Empty dict for failed examples
                all_outputs.append("")       # Empty string for failed outputs
            continue
    
    # Add results as new columns to the DataFrame
    df_copy['activations'] = all_activations
    df_copy['model_outputs'] = all_outputs
    
    # Save the enhanced DataFrame
    print("Saving enhanced DataFrame...")
    try:
        # Save as pickle to preserve the tensor data in activations
        df_copy.to_pickle('dataframe_with_activations.pkl')
        print("Enhanced DataFrame saved to 'dataframe_with_activations.pkl'")
        
        # Also save as parquet (more efficient for large datasets)
        # Note: parquet might not handle complex nested structures well
        # df_copy.to_parquet('enhanced_dataframe_with_activations.parquet')
        
    except Exception as e:
        print(f"Warning: Could not save enhanced DataFrame: {e}")
        print("DataFrame is still returned in memory")
    
    print(f"Completed processing {len(all_outputs)} examples")
    return df_copy


def format_prompts_from_strings(tokenizer, prompt_strings):
    """
    Takes a list of string prompts and formats them for the model
    """
    formatted_prompts = []
    for prompt_string in prompt_strings:
        messages = [{"role": "user", "content": prompt_string}]
        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        formatted_prompts.append(formatted_prompt)
    return formatted_prompts


In [19]:
# Usage example:
if __name__ == "__main__":
    # Example DataFrame - create 10 examples by repeating the pattern
    base_prompts = [
        "how do you do",
        "what is your favorite condiment", 
        "tell me about cats",
        "explain gravity"
    ]
    
    # Create 10 prompts by cycling through the base prompts
    prompts_10 = []
    for i in range(10):
        prompts_10.append(base_prompts[i % len(base_prompts)])
    
    df = pd.DataFrame({
        'id': range(10),
        'prompt': prompts_10,
        'category': ['question'] * 10,
        'other_data': ['some_value'] * 10
    })
    
    # Process the DataFrame
    enhanced_df = process_dataframe_dataset(
        model, tokenizer, df, 
        prompt_column='prompt',  # Name of column with prompts
        batch_size=8,
        save_every=50
    )
    
    # Now your DataFrame has new columns:
    print("New DataFrame structure:")
    print(enhanced_df.columns.tolist())
    
    # Access activations for a specific row
    row_0_activations = enhanced_df.loc[0, 'activations']
    print(f"Row 0 activations keys: {row_0_activations.keys()}")
    print(f"Layer 0 activation shape: {row_0_activations[0].shape}")
    
    # Access model output for a specific row
    row_0_output = enhanced_df.loc[0, 'model_outputs']
    print(f"Row 0 model output: {row_0_output[:100]}...")

Formatting prompts...
Processing 10 examples in 2 batches of size 8


Processing batches:   0%|                                                                                                                                                   | 0/2 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing batches:  50%|█████████████████████████████████████████████████████████████████████▌                                                                     | 1/2 [00:06<00:06,  6.73s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Processing batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.52s/it]

Saving enhanced DataFrame...
Enhanced DataFrame saved to 'dataframe_with_activations.pkl'
Completed processing 10 examples
New DataFrame structure:
['id', 'prompt', 'category', 'other_data', 'activations', 'model_outputs']
Row 0 activations keys: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27])
Layer 0 activation shape: torch.Size([1, 3072])
Row 0 model output: system

Cutting Knowledge Date: December 2023
Today Date: 06 Aug 2025

user

how do you doassistant
...





In [20]:
# Loading the saved DataFrame later:
def load_enhanced_dataframe(filepath='enhanced_dataframe_with_activations.pkl'):
    """Load the enhanced DataFrame with activations"""
    return pd.read_pickle(filepath)

In [21]:
df = load_enhanced_dataframe()

print(df.columns)

print(df.head())

Index(['id', 'prompt', 'category', 'other_data', 'activations',
       'model_outputs'],
      dtype='object')
   id                           prompt  category  other_data  \
0   0                    how do you do  question  some_value   
1   1  what is your favorite condiment  question  some_value   
2   2               tell me about cats  question  some_value   
3   3                  explain gravity  question  some_value   
4   4                    how do you do  question  some_value   

                                         activations  \
0  {0: [[tensor(0.0070, dtype=torch.float16), ten...   
1  {0: [[tensor(0.0064, dtype=torch.float16), ten...   
2  {0: [[tensor(0.0184, dtype=torch.float16), ten...   
3  {0: [[tensor(-0.0277, dtype=torch.float16), te...   
4  {0: [[tensor(0.0067, dtype=torch.float16), ten...   

                                       model_outputs  
0  system\n\nCutting Knowledge Date: December 202...  
1  system\n\nCutting Knowledge Date: December 202...  
2 