In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import torch.nn.functional as F
import transformer_lens
from datasets import load_dataset, concatenate_datasets 
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
import pickle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
def load_and_merge_model(model_base: str, adapter_path: str):
    """
    Function to load a causal language model and merge an adapter.

    Args:
    model_base (str): Path or identifier of the base model (e.g. "EleutherAI/pythia-70m").
    adapter_path (str): Path to the fine-tuned adapter model.

    Returns:
    model_merged: The merged and unloaded model.
    tokenizer: The tokenizer loaded from the adapter.
    """

    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    model_id = AutoModelForCausalLM.from_pretrained(model_base)
    model = PeftModel.from_pretrained(model_id, adapter_path)    
    return model.merge_and_unload(), tokenizer

def process_activations(samples, model, num_layers, db_activations, task_name=None):
    """
    Process activations for a list of samples using a model and store the results in the provided activations database.

    Args:
    - samples (list): List of text samples to process.
    - model (object): Model with a `run_with_cache` method to obtain activations.
    - num_layers (int): Number of layers of the model to extract activations from.
    - db_activations (dict): Dictionary to store activations and task labels.
    - task_name (str): Name of the task to assign to the activations. Default is 'tel'.

    Returns:
    - db_activations (dict): Updated dictionary with activations and task labels.
    """
    
    for text in tqdm(samples, desc="Processing DB"):
        _, activations = model.run_with_cache(text)
        vector = []
        for layer in range(num_layers):
            block_act_fn = activations[f'blocks.{layer}.mlp.hook_post']
            vector.append(block_act_fn)
        vector = torch.cat(vector, dim=0).flatten().cpu().numpy()
        db_activations["activations"].append(vector)
        db_activations["task"].append(task_name)

    return db_activations 

In [3]:
model_base = "Qwen/Qwen2-1.5B-Instruct"

## Activations for math and tel

In [3]:
adapter_path = "fine_tuned_model_both_qwen"

#model_base = "EleutherAI/pythia-70m"
#adapter_path = "fine_tuned_model_both_pythia"

model_merged_tel, tokenizer_tel = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
dataset_autoregressive = load_dataset('text', data_files={'train': 'data/sentences_train.txt'})
samples_tel = []
for sample in tqdm(dataset_autoregressive['train']['text']):
    if len(sample) > 0:
        # Remove all digits from the sample
        cleaned_sample = re.sub(r'\d+|\.', '', sample)
        samples_tel.append(cleaned_sample)

100%|██████████| 300/300 [00:00<00:00, 1041631.79it/s]


In [5]:
model_hooked = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_tel, 
    tokenizer=tokenizer_tel,
    device=device,
    move_to_device=True
)
num_of_layers = model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model Qwen/Qwen2-1.5B-Instruct into HookedTransformer


In [6]:
db_activations = {
    "activations": [],
    "task": []
}

In [7]:
process_activations(samples_tel, model_hooked, num_of_layers, db_activations, task_name=0)

Processing DB:   0%|          | 0/150 [00:00<?, ?it/s]

Processing DB: 100%|██████████| 150/150 [00:11<00:00, 13.20it/s]


{'activations': [array([-0.00759343,  0.01179195,  0.04324816, ...,  0.00181106,
         -0.6167257 , -0.49666786], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.28923902,
         -0.8601406 ,  0.7978515 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.6773398 ,
         -0.02996855,  0.7834483 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.50649786,
         -0.14943008,  0.00720083], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.34183252,
         -0.99755704, -0.06896321], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.15167871,
         -0.22709699, -1.3994457 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.45771006,
         -0.09401266,  0.05321687], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.45504934,
         -0.15434039,  0.6567061 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.043

In [8]:
adapter_path = "fine_tuned_model_math_qwen"
model_merged_math, tokenizer_math = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
dataset_math = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
print(dataset_math)
samples_math = []
for sample in tqdm(dataset_math['text']):
    samples_math.append(sample.split("=")[0] + "=")
samples_math = np.random.choice(samples_math, 150, replace=False)
print(len(samples_math))

Dataset({
    features: ['text'],
    num_rows: 2000
})


100%|██████████| 2000/2000 [00:00<00:00, 4488286.78it/s]

150





In [10]:
model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math, 
    tokenizer=tokenizer_math,
    device=device,
    move_to_device=True
)
num_of_layers = model_merged_tel.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model Qwen/Qwen2-1.5B-Instruct into HookedTransformer


In [11]:
process_activations(samples_math, model_hooked_math, num_of_layers, db_activations, task_name=1)

Processing DB: 100%|██████████| 150/150 [00:16<00:00,  9.30it/s]


{'activations': [array([-0.00759343,  0.01179195,  0.04324816, ...,  0.00181106,
         -0.6167257 , -0.49666786], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.28923902,
         -0.8601406 ,  0.7978515 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.6773398 ,
         -0.02996855,  0.7834483 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.50649786,
         -0.14943008,  0.00720083], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.34183252,
         -0.99755704, -0.06896321], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.15167871,
         -0.22709699, -1.3994457 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.45771006,
         -0.09401266,  0.05321687], dtype=float32),
  array([-0.00759343,  0.01179195,  0.04324816, ..., -0.45504934,
         -0.15434039,  0.6567061 ], dtype=float32),
  array([-0.00759343,  0.01179195,  0.043

In [12]:
FOLDER = "results"
os.makedirs(FOLDER, exist_ok=True)  # Crear la carpeta si no existe

# Guardar el diccionario db_activations en formato pickle
with open(os.path.join(FOLDER, "activations.pkl"), 'wb') as file:
    pickle.dump(db_activations, file)

In [13]:
#delete model_hooked
del model_hooked
# clean up memory
torch.cuda.empty_cache()

## Activations for maths

In [4]:
adapter_path = "fine_tuned_model_math_qwen"
model_merged_math, tokenizer_math = load_and_merge_model(model_base, adapter_path)
db_activations = {
    "activations": [],
    "task": [],
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
dataset_math = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
print(dataset_math)
samples_sum = []
samples_sub = []
samples_mult = []
samples_div = []
for sample in tqdm(dataset_math['text']):
    if "+" in sample:
        samples_sum.append(sample.split("=")[0] + "=")
    elif "-" in sample:
        samples_sub.append(sample.split("=")[0] + "=")
    elif "x" in sample:
        samples_mult.append(sample.split("=")[0] + "=")
    elif "/" in sample:
        samples_div.append(sample.split("=")[0] + "=")
    else:
        raise ValueError(f"Sample {sample} does not contain any operator.")
    
MAX_SAMPLES = 100
samples_sum = np.random.choice(samples_sum, MAX_SAMPLES, replace=False)
samples_sub = np.random.choice(samples_sub, MAX_SAMPLES, replace=False)
samples_mult = np.random.choice(samples_mult, MAX_SAMPLES, replace=False)
samples_div = np.random.choice(samples_div, MAX_SAMPLES, replace=False)

Dataset({
    features: ['text'],
    num_rows: 2000
})


100%|██████████| 2000/2000 [00:00<00:00, 3739905.48it/s]


In [6]:
import transformer_lens.utils as utils
device = utils.get_device()

model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math, 
    tokenizer=tokenizer_math,
    device=device,
)
num_of_layers = model_merged_math.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model Qwen/Qwen2-1.5B-Instruct into HookedTransformer


In [7]:
process_activations(samples_sum, model_hooked_math, num_of_layers, db_activations, task_name=0)
process_activations(samples_sub, model_hooked_math, num_of_layers, db_activations, task_name=1)
process_activations(samples_mult, model_hooked_math, num_of_layers, db_activations, task_name=2)
process_activations(samples_div, model_hooked_math, num_of_layers, db_activations, task_name=3)

Processing DB: 100%|██████████| 100/100 [00:08<00:00, 12.09it/s]
Processing DB: 100%|██████████| 100/100 [00:08<00:00, 12.45it/s]
Processing DB: 100%|██████████| 100/100 [00:07<00:00, 13.00it/s]
Processing DB: 100%|██████████| 100/100 [00:06<00:00, 14.88it/s]


{'activations': [array([-0.01007269,  0.00645165,  0.03944412, ..., -0.06560262,
         -0.43739694, -1.1813518 ], dtype=float32),
  array([-1.0072688e-02,  6.4516482e-03,  3.9444115e-02, ...,
         -2.0597950e-03, -5.0576675e-01, -4.6582303e+00], dtype=float32),
  array([-0.01007269,  0.00645165,  0.03944412, ..., -0.29624072,
         -0.1931593 , -4.610728  ], dtype=float32),
  array([-0.01007269,  0.00645165,  0.03944412, ..., -0.2555054 ,
         -0.11925691, -3.3747988 ], dtype=float32),
  array([-0.01007269,  0.00645165,  0.03944412, ..., -0.1797839 ,
         -0.18773931, -1.473257  ], dtype=float32),
  array([-0.01007269,  0.00645165,  0.03944412, ...,  0.07989857,
         -0.34788406, -3.4923775 ], dtype=float32),
  array([-0.01007269,  0.00645165,  0.03944412, ..., -0.02832895,
         -0.3493095 , -1.8100919 ], dtype=float32),
  array([-0.01007269,  0.00645165,  0.03944412, ..., -0.07027873,
         -0.22028182, -2.2835083 ], dtype=float32),
  array([-0.01007269,  

In [8]:
# delete model_hooked
del model_hooked_math
del model_merged_math
# clean up memory
torch.cuda.empty_cache()

In [9]:
FOLDER = "results"
os.makedirs(FOLDER, exist_ok=True)  # Crear la carpeta si no existe

# Guardar el diccionario db_activations en formato pickle
with open(os.path.join(FOLDER, "activations_math.pkl"), 'wb') as file:
    pickle.dump(db_activations, file)

## Activation for tec and sports

In [14]:
adapter_path = "models/fine_tuned_model_st_qwen"
model_merged_math, tokenizer_math = load_and_merge_model(model_base, adapter_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
#peparacion dataset tecnologia y deporte
SAMPLES = 100
ds_sports = load_dataset('json', data_files='./data/sports_phrases.json', split="train")
# saplme of 100
ds_sports = ds_sports.shuffle(seed=42).select(range(SAMPLES))
print(ds_sports)
ds_technology = load_dataset('json', data_files='./data/technology_phrases.json', split="train")
# saplme of 100
ds_technology = ds_technology.shuffle(seed=42).select(range(SAMPLES))
print(ds_technology)
ds_technology_and_sports = load_dataset('json', data_files='./data/sports_and_technology_phrases.json', split="train")
print(ds_technology_and_sports)

Dataset({
    features: ['text'],
    num_rows: 100
})
Dataset({
    features: ['text'],
    num_rows: 100
})
Dataset({
    features: ['text'],
    num_rows: 50
})


In [10]:
import transformer_lens.utils as utils
device = utils.get_device()

model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math, 
    tokenizer=tokenizer_math,
    device=device,
)
num_of_layers = model_merged_math.config.max_window_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model Qwen/Qwen2-1.5B-Instruct into HookedTransformer


In [20]:
db_activations = {
    "activations": [],
    "task": [],
}

process_activations(ds_sports['text'], model_hooked_math, num_of_layers, db_activations, task_name=0)
process_activations(ds_technology['text'], model_hooked_math, num_of_layers, db_activations, task_name=1)
process_activations(ds_technology_and_sports['text'], model_hooked_math, num_of_layers, db_activations, task_name=2)

Processing DB: 100%|██████████| 100/100 [00:06<00:00, 14.39it/s]
Processing DB: 100%|██████████| 100/100 [00:06<00:00, 14.80it/s]
Processing DB: 100%|██████████| 50/50 [00:03<00:00, 14.22it/s]


{'activations': [array([-0.01234835,  0.00513479,  0.06450616, ...,  0.01468724,
         -0.34405357,  0.40162843], dtype=float32),
  array([-0.01234834,  0.00513478,  0.06450617, ..., -0.04764158,
         -0.44798544, -0.47407103], dtype=float32),
  array([-0.01234835,  0.00513479,  0.06450616, ..., -0.0555573 ,
         -0.28461435,  0.01480823], dtype=float32),
  array([-0.01234835,  0.00513479,  0.06450619, ..., -0.07529654,
         -0.29766136,  0.38666114], dtype=float32),
  array([-0.01234835,  0.00513479,  0.06450616, ..., -0.08718769,
         -0.5925559 , -0.07959179], dtype=float32),
  array([-0.01234835,  0.00513479,  0.06450616, ..., -0.07960944,
         -0.36494547, -0.01911674], dtype=float32),
  array([-0.01234835,  0.00513479,  0.06450616, ...,  0.01098129,
         -0.64216155, -0.52750725], dtype=float32),
  array([-0.01234834,  0.00513478,  0.06450617, ...,  0.11386503,
         -0.58626616,  0.06797109], dtype=float32),
  array([-0.01234835,  0.00513479,  0.064

In [21]:
FOLDER = "results"
os.makedirs(FOLDER, exist_ok=True)  # Crear la carpeta si no existe

# Guardar el diccionario db_activations en formato pickle
with open(os.path.join(FOLDER, "activations_st.pkl"), 'wb') as file:
    pickle.dump(db_activations, file)