In [63]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import torch.nn.functional as F
import transformer_lens
from datasets import load_dataset
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
import pickle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [64]:
def load_and_merge_model(model_base: str, adapter_path: str):
    """
    Function to load a causal language model and merge an adapter.

    Args:
    model_base (str): Path or identifier of the base model (e.g. "EleutherAI/pythia-70m").
    adapter_path (str): Path to the fine-tuned adapter model.

    Returns:
    model_merged: The merged and unloaded model.
    tokenizer: The tokenizer loaded from the adapter.
    """

    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    model_id = AutoModelForCausalLM.from_pretrained(model_base)
    model = PeftModel.from_pretrained(model_id, adapter_path)    
    return model.merge_and_unload(), tokenizer

def process_activations(samples, model, num_layers, db_activations, task_name=None):
    """
    Process activations for a list of samples using a model and store the results in the provided activations database.

    Args:
    - samples (list): List of text samples to process.
    - model (object): Model with a `run_with_cache` method to obtain activations.
    - num_layers (int): Number of layers of the model to extract activations from.
    - db_activations (dict): Dictionary to store activations and task labels.
    - task_name (str): Name of the task to assign to the activations. Default is 'tel'.

    Returns:
    - db_activations (dict): Updated dictionary with activations and task labels.
    """
    
    for text in tqdm(samples, desc="Processing DB"):
        _, activations = model.run_with_cache(text)
        vector = []
        for layer in range(num_layers):
            block_act_fn = activations[f'blocks.{layer}.mlp.hook_post']
            vector.append(block_act_fn)
        vector = torch.cat(vector, dim=0).flatten().cpu().numpy()
        db_activations["activations"].append(vector)
        db_activations["task"].append(task_name)

    return db_activations 

In [65]:
"""model_base = "Qwen/Qwen2-1.5B-Instruct"
adapter_path = "fine_tuned_model_both_qwen"""

model_base = "EleutherAI/pythia-70m"
adapter_path = "fine_tuned_model_both_pythia"

model_merged_tel, tokenizer_tel = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [66]:
dataset_autoregressive = load_dataset('text', data_files={'train': 'data/sentences_train.txt'})
samples_tel = []
for sample in tqdm(dataset_autoregressive['train']['text']):
    if len(sample) > 0:
        # Remove all digits from the sample
        cleaned_sample = re.sub(r'\d+|\.', '', sample)
        samples_tel.append(cleaned_sample)

100%|██████████| 300/300 [00:00<00:00, 410937.69it/s]


In [67]:
model_hooked = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_tel, 
    tokenizer=tokenizer_tel,
    device=device,
    move_to_device=True
)
num_of_layers = 6 #model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [68]:
db_activations = {
    "activations": [],
    "task": []
}

In [69]:
process_activations(samples_tel, model_hooked, num_of_layers, db_activations, task_name=0)

Processing DB: 100%|██████████| 150/150 [00:02<00:00, 64.67it/s]


{'activations': [array([ 0.08458611, -0.16952482, -0.13588722, ..., -0.15529434,
          0.01906241, -0.10441251], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.13783574,
         -0.03175112, -0.09019347], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.15587492,
          0.42557243, -0.16263703], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.16153726,
          0.1764864 , -0.11675359], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.15649924,
          0.50024915, -0.0921801 ], dtype=float32),
  array([ 0.08458611, -0.16952482, -0.13588722, ..., -0.14993565,
         -0.03562231, -0.09869096], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.15780297,
          0.56964624, -0.16964298], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.16255343,
          0.5535056 , -0.16508679], dtype=float32),
  array([ 0.08458611, -0.16952482, -0.135

In [70]:
adapter_path = "fine_tuned_model_math_pythia"
model_merged_math, tokenizer_math = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [71]:
dataset_math = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
print(dataset_math)
samples_math = []
for sample in tqdm(dataset_math['text']):
    samples_math.append(sample.split("=")[0] + "=")
samples_math = np.random.choice(samples_math, 150, replace=False)
print(len(samples_math))

Dataset({
    features: ['text'],
    num_rows: 2000
})


100%|██████████| 2000/2000 [00:00<00:00, 1682432.41it/s]

150





In [72]:
model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math, 
    tokenizer=tokenizer_math,
    device=device,
    move_to_device=True
)
num_of_layers = 6 #model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [73]:
process_activations(samples_math, model_hooked_math, num_of_layers, db_activations, task_name=1)

Processing DB: 100%|██████████| 150/150 [00:02<00:00, 51.45it/s]


{'activations': [array([ 0.08458611, -0.16952482, -0.13588722, ..., -0.15529434,
          0.01906241, -0.10441251], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.13783574,
         -0.03175112, -0.09019347], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.15587492,
          0.42557243, -0.16263703], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.16153726,
          0.1764864 , -0.11675359], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.15649924,
          0.50024915, -0.0921801 ], dtype=float32),
  array([ 0.08458611, -0.16952482, -0.13588722, ..., -0.14993565,
         -0.03562231, -0.09869096], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.15780297,
          0.56964624, -0.16964298], dtype=float32),
  array([ 0.08458608, -0.16952483, -0.13588724, ..., -0.16255343,
          0.5535056 , -0.16508679], dtype=float32),
  array([ 0.08458611, -0.16952482, -0.135

In [74]:
FOLDER = "results"
os.makedirs(FOLDER, exist_ok=True)  # Crear la carpeta si no existe

# Guardar el diccionario db_activations en formato pickle
with open(os.path.join(FOLDER, "activations.pkl"), 'wb') as file:
    pickle.dump(db_activations, file)

## Activations for maths

In [75]:
adapter_path = "fine_tuned_model_math_pythia"
model_merged_math, tokenizer_math = load_and_merge_model(model_base, adapter_path)
db_activations = {
    "activations": [],
    "task": [],
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [76]:
dataset_math = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
print(dataset_math)
samples_sum = []
samples_sub = []
samples_mult = []
samples_div = []
for sample in tqdm(dataset_math['text']):
    if "+" in sample:
        samples_sum.append(sample.split("=")[0] + "=")
    elif "-" in sample:
        samples_sub.append(sample.split("=")[0] + "=")
    elif "x" in sample:
        samples_mult.append(sample.split("=")[0] + "=")
    elif "/" in sample:
        samples_div.append(sample.split("=")[0] + "=")
    else:
        raise ValueError(f"Sample {sample} does not contain any operator.")

Dataset({
    features: ['text'],
    num_rows: 2000
})


100%|██████████| 2000/2000 [00:00<00:00, 1320209.00it/s]


In [77]:
model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math, 
    tokenizer=tokenizer_math,
    device=device,
    move_to_device=True
)
num_of_layers = 6 #model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [78]:
process_activations(samples_sum, model_hooked_math, num_of_layers, db_activations, task_name=0)
process_activations(samples_sub, model_hooked_math, num_of_layers, db_activations, task_name=1)
process_activations(samples_mult, model_hooked_math, num_of_layers, db_activations, task_name=2)
process_activations(samples_div, model_hooked_math, num_of_layers, db_activations, task_name=3)

Processing DB: 100%|██████████| 500/500 [00:07<00:00, 65.98it/s]
Processing DB: 100%|██████████| 500/500 [00:07<00:00, 67.81it/s]
Processing DB: 100%|██████████| 500/500 [00:07<00:00, 66.47it/s]
Processing DB: 100%|██████████| 500/500 [00:07<00:00, 64.07it/s]


{'activations': [array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.16229855,
          0.05281039,  0.1764182 ], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.16953605,
         -0.06283674, -0.06726662], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.16700855,
          0.36720833, -0.05271519], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.13792057,
          0.00045935, -0.12472666], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.1478507 ,
          0.15874653, -0.01957512], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.12873611,
         -0.14885837, -0.14469436], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.1437977 ,
         -0.03083481, -0.04030069], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.11841718, ..., -0.16546348,
         -0.04559875, -0.15289806], dtype=float32),
  array([ 0.07878847, -0.16831127, -0.118

In [79]:
FOLDER = "results"
os.makedirs(FOLDER, exist_ok=True)  # Crear la carpeta si no existe

# Guardar el diccionario db_activations en formato pickle
with open(os.path.join(FOLDER, "activations_math.pkl"), 'wb') as file:
    pickle.dump(db_activations, file)