In [67]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
import torch.nn.functional as F
import transformer_lens
from datasets import load_dataset
from tqdm import tqdm
import re
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import os
# Run on the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [68]:
def load_and_merge_model(model_base: str, adapter_path: str):
    """
    Build a standalone causal LM by folding a PEFT adapter into its base model.

    Args:
    model_base (str): Path or identifier of the base model (e.g. "EleutherAI/pythia-70m").
    adapter_path (str): Path to the fine-tuned adapter; the tokenizer is loaded from here as well.

    Returns:
    tuple: ``(merged_model, tokenizer)`` where ``merged_model`` is the result of calling
    ``merge_and_unload()`` on the adapter-wrapped base model and ``tokenizer`` is the
    tokenizer loaded from ``adapter_path``.
    """
    # The tokenizer is taken from the adapter directory, not from the base checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(adapter_path)

    # Load the base weights, attach the adapter on top, then merge the two into one model.
    base_model = AutoModelForCausalLM.from_pretrained(model_base)
    adapter_wrapped_model = PeftModel.from_pretrained(base_model, adapter_path)
    merged_model = adapter_wrapped_model.merge_and_unload()

    return merged_model, tokenizer

In [69]:
"""model_base = "Qwen/Qwen2-1.5B-Instruct"
adapter_path = "fine_tuned_model_both_qwen"""

# Active configuration: the Pythia-70m base model and the adapter directory to merge into it.
model_base = "EleutherAI/pythia-70m"
adapter_path = "fine_tuned_model_both_pythia"

# Merge the adapter into the base model. The "_tel" objects are the ones later used to
# produce the activations labelled "tel".
model_merged_tel, tokenizer_tel = load_and_merge_model(
    model_base=model_base,
    adapter_path=adapter_path,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [70]:
# Load the training sentences from the plain-text file.
dataset_autoregressive = load_dataset('text', data_files={'train': 'data/sentences_train.txt'})

# Matches every run of digits and every literal period; both are stripped from each sample.
digits_or_period = re.compile(r'\d+|\.')

samples_tel = []
for sample in tqdm(dataset_autoregressive['train']['text']):
    # Empty entries are skipped; the check is made on the raw text, before cleaning.
    if len(sample) == 0:
        continue
    cleaned_sample = digits_or_period.sub('', sample)
    samples_tel.append(cleaned_sample)

100%|██████████| 300/300 [00:00<00:00, 510877.47it/s]


In [71]:
# Wrap the merged "tel" model in a HookedTransformer so intermediate activations can be cached.
model_hooked = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_tel,
    tokenizer=tokenizer_tel,
    device=device,
    move_to_device=True
)
# Take the depth from the loaded model's own config rather than a hard-coded 6, so that
# switching `model_base` to a deeper model does not silently drop the upper layers.
num_of_layers = model_hooked.cfg.n_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [72]:
# Accumulator filled by the extraction loops: one entry per prompt in each list.
#   "activations": per-prompt list of per-layer arrays
#   "task":        label of the task the prompt belongs to
db_activations = dict(activations=[], task=[])

In [73]:
# Only the MLP post-activation hook of each layer is read below, so restrict the cache to
# those hook points instead of storing every intermediate activation for every prompt.
mlp_post_hooks = [f'blocks.{layer}.mlp.hook_post' for layer in range(num_of_layers)]

# Pure feature extraction: no autograd graph is needed.
with torch.no_grad():
    for text in tqdm(samples_tel, desc="Processing DB"):
        _, activations = model_hooked.run_with_cache(text, names_filter=mlp_post_hooks)
        # One array per layer; flatten(1) collapses everything after the first dimension
        # (presumably position x d_mlp for a single prompt - TODO confirm), so the row
        # length varies with the number of tokens in the prompt.
        vector = [
            activations[hook_name].flatten(1).detach().cpu().numpy()
            for hook_name in mlp_post_hooks
        ]
        db_activations["activations"].append(vector)
        db_activations["task"].append("tel")

Processing DB: 100%|██████████| 150/150 [00:02<00:00, 68.10it/s]


In [74]:
# Switch to the math adapter (this rebinds `adapter_path`) and merge it into the same base model.
adapter_path = "fine_tuned_model_math_pythia"
model_merged_math, tokenizer_math = load_and_merge_model(
    model_base=model_base,
    adapter_path=adapter_path,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [75]:
dataset_math = load_dataset('json', data_files='./data/arithmatic_expressions.json', split="train")
print(dataset_math)

# Keep only the part of each expression before " = " and re-append "=", so every prompt
# ends right where the result would start.
samples_math = [sample.split(" = ")[0] + "=" for sample in tqdm(dataset_math['text'])]

# Draw the subset with a seeded generator so that Restart & Run All selects the same
# prompts every time (the previous unseeded draw changed the data on each run).
MATH_SAMPLE_SEED = 42
NUM_MATH_SAMPLES = 150
samples_math = np.random.default_rng(MATH_SAMPLE_SEED).choice(
    samples_math, NUM_MATH_SAMPLES, replace=False
)
print(len(samples_math))

Dataset({
    features: ['text'],
    num_rows: 2000
})


100%|██████████| 2000/2000 [00:00<00:00, 1635525.05it/s]

150





In [76]:
# Wrap the merged math model in a HookedTransformer so intermediate activations can be cached.
model_hooked_math = transformer_lens.HookedTransformer.from_pretrained(
    model_base,
    hf_model=model_merged_math,
    tokenizer=tokenizer_math,
    device=device,
    move_to_device=True
)
# Take the depth from this model's own config rather than a hard-coded 6, so that
# switching `model_base` to a deeper model does not silently drop the upper layers.
num_of_layers = model_hooked_math.cfg.n_layers

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-70m into HookedTransformer


In [77]:
# Only the MLP post-activation hook of each layer is read below, so restrict the cache to
# those hook points instead of storing every intermediate activation for every prompt.
mlp_post_hooks = [f'blocks.{layer}.mlp.hook_post' for layer in range(num_of_layers)]

# Pure feature extraction: no autograd graph is needed.
with torch.no_grad():
    for text in tqdm(samples_math, desc="Processing DB"):
        _, activations = model_hooked_math.run_with_cache(text, names_filter=mlp_post_hooks)
        # One array per layer; flatten(1) collapses everything after the first dimension
        # (presumably position x d_mlp for a single prompt - TODO confirm), so the row
        # length varies with the number of tokens in the prompt.
        vector = [
            activations[hook_name].flatten(1).detach().cpu().numpy()
            for hook_name in mlp_post_hooks
        ]
        db_activations["activations"].append(vector)
        db_activations["task"].append("math")

Processing DB: 100%|██████████| 150/150 [00:02<00:00, 61.10it/s]


In [78]:
# Collect the per-prompt activations and task labels into a DataFrame with the columns
# "activations" and "task" (one row per prompt).
df_activations = pd.DataFrame(db_activations)

FOLDER = "results"
# Neither writer below creates missing directories, so make sure the folder exists.
os.makedirs(FOLDER, exist_ok=True)

# Lossless copy of the activations. Each "activations" cell is a list of NumPy arrays;
# the CSV writer stores such cells via their string form, and NumPy abbreviates large
# arrays with "..." when printing, so the full values cannot be recovered from the CSV.
# Reload with pd.read_pickle (only ever unpickle files you produced yourself).
df_activations.to_pickle(os.path.join(FOLDER, "activations.pkl"))

# Human-readable CSV, kept under its original name and format for existing consumers.
df_activations.to_csv(
    os.path.join(FOLDER, "activations.csv"),
    index=False,
    sep=";"
)