In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import torch.nn.functional as F
import transformer_lens
from datasets import load_dataset
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
import pickle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_and_merge_model(model_base: str, adapter_path: str):
    """
    Function to load a causal language model and merge an adapter.

    Args:
    model_base (str): Path or identifier of the base model (e.g. "EleutherAI/pythia-70m").
    adapter_path (str): Path to the fine-tuned adapter model.

    Returns:
    model_merged: The merged and unloaded model.
    tokenizer: The tokenizer loaded from the adapter.
    """

    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    model_id = AutoModelForCausalLM.from_pretrained(model_base)
    model = PeftModel.from_pretrained(model_id, adapter_path)    
    return model.merge_and_unload(), tokenizer

def process_activations(samples, model, num_layers, db_activations, task_name=None):
    """
    Process activations for a list of samples using a model and store the results in the provided activations database.

    Args:
    - samples (list): List of text samples to process.
    - model (object): Model with a `run_with_cache` method to obtain activations.
    - num_layers (int): Number of layers of the model to extract activations from.
    - db_activations (dict): Dictionary to store activations and task labels.
    - task_name (str): Name of the task to assign to the activations. Default is 'tel'.

    Returns:
    - db_activations (dict): Updated dictionary with activations and task labels.
    """
    
    for text in tqdm(samples, desc="Processing DB"):
        _, activations = model.run_with_cache(text)
        vector = []
        for layer in range(num_layers):
            block_act_fn = activations[f'blocks.{layer}.mlp.hook_post']
            vector.append(block_act_fn)
        vector = torch.cat(vector, dim=0).flatten().cpu().numpy()
        db_activations["activations"].append(vector)
        db_activations["task"].append(task_name)

    return db_activations 

In [3]:
"""model_base = "Qwen/Qwen2-1.5B-Instruct"
adapter_path = "fine_tuned_model_both_qwen"""

model_base = "EleutherAI/pythia-70m"
adapter_path = "fine_tuned_model_both_pythia"

model_merged_tel, tokenizer_tel = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
dataset_autoregressive = load_dataset('text', data_files={'train': 'data/sentences_train.txt'})
samples_tel = []
for sample in tqdm(dataset_autoregressive['train']['text']):
    if len(sample) > 0:
        # Remove all digits from the sample
        cleaned_sample = re.sub(r'\d+|\.', '', sample)
        samples_tel.append(cleaned_sample)

100%|██████████| 300/300 [00:00<00:00, 340723.31it/s]


In [5]:
model_hooked = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_tel, 
    tokenizer=tokenizer_tel,
    device=device,
    move_to_device=True
)
num_of_layers = 6 #model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [6]:
db_activations = {
    "activations": [],
    "task": []
}

In [7]:
process_activations(samples_tel, model_hooked, num_of_layers, db_activations, task_name=0)

Processing DB: 100%|██████████| 150/150 [00:02<00:00, 68.78it/s]


{'activations': [array([ 0.07742826, -0.16896877, -0.12680088, ..., -0.16788812,
         -0.16607349, -0.11432232], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16115972,
         -0.16419657, -0.11732144], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16944133,
          0.45363355, -0.1670116 ], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16944359,
         -0.13128571, -0.14853187], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16766928,
          0.03085584, -0.11426631], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680088, ..., -0.16956438,
         -0.15478787, -0.12239487], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.168197  ,
          0.39185745, -0.16696979], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16942433,
          0.4643546 , -0.13688116], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.126

In [8]:
adapter_path = "fine_tuned_model_math_pythia"
model_merged_math, tokenizer_math = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
dataset_math = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
print(dataset_math)
samples_math = []
for sample in tqdm(dataset_math['text']):
    samples_math.append(sample.split(" = ")[0] + "=")
samples_math = np.random.choice(samples_math, 150, replace=False)
print(len(samples_math))

Dataset({
    features: ['text'],
    num_rows: 2000
})


100%|██████████| 2000/2000 [00:00<00:00, 2310911.29it/s]

150





In [10]:
model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math, 
    tokenizer=tokenizer_math,
    device=device,
    move_to_device=True
)
num_of_layers = 6 #model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [11]:
process_activations(samples_math, model_hooked_math, num_of_layers, db_activations, task_name=1)

Processing DB: 100%|██████████| 150/150 [00:02<00:00, 72.68it/s]


{'activations': [array([ 0.07742826, -0.16896877, -0.12680088, ..., -0.16788812,
         -0.16607349, -0.11432232], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16115972,
         -0.16419657, -0.11732144], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16944133,
          0.45363355, -0.1670116 ], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16944359,
         -0.13128571, -0.14853187], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16766928,
          0.03085584, -0.11426631], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680088, ..., -0.16956438,
         -0.15478787, -0.12239487], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.168197  ,
          0.39185745, -0.16696979], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.12680084, ..., -0.16942433,
          0.4643546 , -0.13688116], dtype=float32),
  array([ 0.07742826, -0.16896877, -0.126

In [12]:
FOLDER = "results"
os.makedirs(FOLDER, exist_ok=True)  # Crear la carpeta si no existe

# Guardar el diccionario db_activations en formato pickle
with open(os.path.join(FOLDER, "activations.pkl"), 'wb') as file:
    pickle.dump(db_activations, file)