In [1]:
import json
import torch
from functools import partial
from baukit import TraceDict
from einops import rearrange, einsum
from tqdm import tqdm

from cmap_utils import get_model_and_tokenizer, load_data, eval_model_performance, cmap_in, cmap_out

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed PyTorch's global RNG with a fixed value for repeatable runs.
torch.manual_seed(10)

%load_ext autoreload
%autoreload 2

### Loading Models and Tokenizer

In [2]:
# Load both models through the project helper. Only the tokenizer returned with
# the "llama" model is kept; the one returned with "goat" is discarded.
llama_model, tokenizer = get_model_and_tokenizer(model_name="llama", device=device)
goat_model, _ = get_model_and_tokenizer(model_name="goat", device=device)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

### Loading Data

In [3]:
# Build the evaluation dataloader from the JSONL dataset: 500 samples are
# requested, served in batches of 8. The same dataloader is reused by every
# evaluation and caching cell below.
data_file = "../data/dataset.jsonl"
dataloader = load_data(tokenizer=tokenizer, data_file=data_file, num_samples=500, batch_size=8)

Length of dataset: 500


### Model Performance

In [4]:
# Unpatched baseline: evaluate each model on the same dataloader so the
# patched accuracies reported later can be compared against these numbers.
llama_acc = eval_model_performance(llama_model, dataloader, device)
goat_acc = eval_model_performance(goat_model, dataloader, device)

print(f"LLAMA accuracy: {llama_acc}")
print(f"Goat accuracy: {goat_acc}")

  0%|          | 0/63 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
  2%|▏         | 1/63 [00:00<00:31,  1.98it/s]
  3%|▎         | 2/63 [00:00<00:26,  2.31it/s]
  5%|▍         | 3/63 [00:01<00:24,  2.45it/s]
  6%|▋         | 4/63 [00:01<00:23,  2.51it/s]
  8%|▊         | 5/63 [00:02<00:22,  2.55it/s]
 10%|▉         | 6/63 [00:02<00:22,  2.58it/s]
 11%|█         | 7/63 [00:02<00:21,  2.59it/s]
 13%|█▎        | 8/63 [00:03<00:21,  2.60it/s]
 14%|█▍        | 9/63 [00:03<00:20,  2.61it/s]
 16%|█▌        | 10/63 [00:03<00:20,  2.61it/s]
 17%|█▋        | 11/63 [00:04<00:19,  2.62it/s]
 19%|█▉        | 12/63 [00:04<00:19,  2.62it/s]
 21%|██        | 13/63 [00:05<00:19,  2.62it/s]
 22%|██▏       | 14/63 [00:05<00:18,  2.62it/s]
 24%|██▍       | 15/63 [00:05<00:18,  2.62it/s]
 25%|██▌       | 16/63 [00:06<00:17,  2.62it/s]
 27%|██▋       | 17/63 [00:06<00:17,  2.62it/s]
 29%|██▊       | 18/63 [00:06<00:17,  2.62it/s]
 30%|███       | 19/63 [00:07<00:16,  2.62it/s]
 32%|███▏      | 20/63 [00:07<00:16,

LLAMA accuracy: 0.66
Goat accuracy: 0.82





### Loading Model Activations

In [5]:
# Names of the four attention projection modules (k, q, v, o — in that order)
# for every decoder layer, flattened layer by layer. The Goat names differ only
# by the extra "base_model.model." prefix, so the two lists line up entry by
# entry as long as both configs report the same number of hidden layers.
llama_modules = [
    f"model.layers.{layer}.self_attn.{proj}"
    for layer in range(llama_model.config.num_hidden_layers)
    for proj in ("k_proj", "q_proj", "v_proj", "o_proj")
]
goat_modules = [
    f"base_model.model.model.layers.{layer}.self_attn.{proj}"
    for layer in range(goat_model.config.num_hidden_layers)
    for proj in ("k_proj", "q_proj", "v_proj", "o_proj")
]

In [6]:
# goat_cache[batch_index][llama_module_name] -> CPU copy of the Goat activation
# recorded at the corresponding Goat module. For o_proj the module *input* is
# stored; for k/q/v projections the module *output* is stored. Entries are
# keyed by the LLaMA module name so they can be looked up from LLaMA hooks.
goat_cache = {}

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader), desc="goat_cache"):
        # Move every tensor field of the batch onto the Goat model's device.
        for field, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[field] = value.to(goat_model.device)

        with TraceDict(goat_model, goat_modules, retain_input=True) as cache:
            _ = goat_model(inputs["input_ids"])

        for goat_layer, llama_layer in zip(goat_modules, llama_modules):
            batch_cache = goat_cache.setdefault(bi, {})
            trace = cache[goat_layer]
            is_o_proj = "o_proj" in llama_layer and "o_proj" in goat_layer
            batch_cache[llama_layer] = (trace.input if is_o_proj else trace.output).cpu()

goat_cache: 63it [00:46,  1.36it/s]


In [7]:
# llama_cache[batch_index][module_name] -> CPU copy of the unpatched LLaMA
# activation at that module. For o_proj the module *input* is stored; for
# k/q/v projections the module *output* is stored.
llama_cache = {}

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader), desc="llama_cache"):
        # Move every tensor field of the batch onto the LLaMA model's device.
        for field, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[field] = value.to(llama_model.device)

        with TraceDict(llama_model, llama_modules, retain_input=True) as cache:
            _ = llama_model(inputs["input_ids"])

        for llama_layer in llama_modules:
            batch_cache = llama_cache.setdefault(bi, {})
            trace = cache[llama_layer]
            batch_cache[llama_layer] = (trace.input if "o_proj" in llama_layer else trace.output).cpu()

llama_cache: 63it [00:41,  1.51it/s]


### Loading circuit

In [8]:
# Load the circuit description produced by experiment 1. It is parsed as JSON
# and later indexed by group name (e.g. "struct_reader").
with open("../experiment_1/results/circuits/llama_circuit.json", "r") as f:
    circuit_heads = json.load(f)

In [9]:
def _load_dcm_heads(path):
    """Return the JSON payload stored on the first line of a DCM result file.

    The first line is expected to look like ``<label>: <json>``. Everything
    after the *first* ``": "`` is parsed, so a payload that itself contains
    ``": "`` is not truncated.

    Args:
        path: location of the DCM result text file.

    Returns:
        The decoded JSON value (used below as a list of ``[layer, head]`` pairs).

    Raises:
        ValueError: if the first line has no ``": "`` separator (e.g. empty file).
    """
    with open(path, "r") as f:
        first_line = f.readline()
    _, separator, payload = first_line.partition(": ")
    if not separator:
        raise ValueError(f"{path}: expected '<label>: <json>' on the first line")
    return json.loads(payload)


# Head groups found by DCM (experiment 2) at the 0.01 setting named in the path.
value_fetcher = _load_dcm_heads(
    "../experiment_2/results/DCM/llama_circuit/value_fetcher/object_value/0.01.txt"
)
pos_transmitter = _load_dcm_heads(
    "../experiment_2/results/DCM/llama_circuit/pos_transmitter/positional/0.01.txt"
)
pos_detector = _load_dcm_heads(
    "../experiment_2/results/DCM/llama_circuit/pos_detector/positional/0.01.txt"
)

# The structure-reader group is taken from the experiment-1 circuit file instead.
struct_reader = circuit_heads["struct_reader"]

print(f"Value Fetcher Heads: {len(value_fetcher)}")
print(f"Heads affecting direct logit heads: {len(pos_transmitter)}")
print(f"Heads at query box token: {len(pos_detector)}")
print(f"Heads at prev query box token: {len(struct_reader)}")

Value Fetcher Heads: 40
Heads affecting direct logit heads: 5
Heads at query box token: 14
Heads at prev query box token: 5


### CMAP (output patching)

In [18]:
# Choose which head group(s) to patch in the output-CMAP run below by
# (un)commenting the assignments; the dict is handed to `cmap_out` as
# `pos_heads_dict`. The integer keys are interpreted by `cmap_out`
# (presumably a token-position selector — TODO confirm against cmap_utils).
# NOTE(review): value_fetcher and pos_transmitter are both written under key 0,
# so enabling both lines as-is would keep only pos_transmitter — confirm how
# the "Full Circuit" number was produced.
pos_heads_dict = {}
# pos_heads_dict[0] = value_fetcher
# pos_heads_dict[0] = pos_transmitter
# pos_heads_dict[2] = pos_detector
pos_heads_dict[-1] = struct_reader

In [19]:
# Evaluate LLaMA while `cmap_out` edits the output of every traced projection
# module, using the cached Goat activations for the current batch and the head
# selection in `pos_heads_dict`.
correct_count, total_count = 0, 0

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader)):
        # Move every tensor field of the batch onto the LLaMA model's device.
        for field, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[field] = value.to(llama_model.device)

        # `bi` is bound here, so the hook always sees the batch index.
        patch_hook = partial(
            cmap_out,
            model=llama_model,
            goat_cache=goat_cache,
            bi=bi,
            pos_heads_dict=pos_heads_dict,
            input_tokens=inputs,
        )
        with TraceDict(llama_model, llama_modules, retain_input=True, edit_output=patch_hook) as _:
            outputs = llama_model(inputs["input_ids"], output_attentions=True)

        # Score each sample at its own last-token position.
        for sample_idx in range(inputs["labels"].size(0)):
            label = inputs["labels"][sample_idx]
            last_pos = inputs["last_token_indices"][sample_idx]
            pred = torch.argmax(outputs.logits[sample_idx][last_pos])

            correct_count += int(label == pred)
            total_count += 1

        # Release the batch's logits/attentions before the next forward pass.
        del outputs
        torch.cuda.empty_cache()

current_acc = round(correct_count / total_count, 2)
print(f"Task accuracy: {current_acc}")

63it [00:32,  1.93it/s]

Task accuracy: 0.67





Output CMAP Results:
- Full Circuit: 0.78
- Value Fetcher: 0.77
- Position Transmitter: 0.68
- Position Detector: 0.62
- Structure Reader: 0.67

### CMAP (input patching)

In [28]:
# Choose which head group to patch in the input-CMAP run below by
# (un)commenting one assignment; the dict is handed to `cmap_in` as
# `pos_heads_dict`. Only the value-fetcher and position-transmitter groups
# are offered here, both under key 0.
pos_heads_dict = {}
# pos_heads_dict[0] = value_fetcher
pos_heads_dict[0] = pos_transmitter

In [33]:
# Evaluate LLaMA while `cmap_in` edits the traced projection modules, given
# both activation caches, the head selection in `pos_heads_dict`, and the list
# of projection kinds to patch.
correct_count, total_count = 0, 0

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader)):
        # Move every tensor field of the batch onto the LLaMA model's device.
        for field, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[field] = value.to(llama_model.device)

        # `bi` is bound here, so the hook always sees the batch index.
        patch_hook = partial(
            cmap_in,
            model=llama_model,
            goat_cache=goat_cache,
            llama_cache=llama_cache,
            # Options: "q_proj" (query), "k_proj" (key), "v_proj" (value)
            patching_component=["q_proj", "k_proj", "v_proj"],
            bi=bi,
            pos_heads_dict=pos_heads_dict,
            input_tokens=inputs,
        )
        with TraceDict(llama_model, llama_modules, retain_input=True, edit_output=patch_hook) as _:
            outputs = llama_model(inputs["input_ids"], output_attentions=True)

        # Score each sample at its own last-token position.
        for sample_idx in range(inputs["labels"].size(0)):
            label = inputs["labels"][sample_idx]
            last_pos = inputs["last_token_indices"][sample_idx]
            pred = torch.argmax(outputs.logits[sample_idx][last_pos])

            correct_count += int(label == pred)
            total_count += 1

        # Release the batch's logits/attentions before the next forward pass.
        del outputs
        torch.cuda.empty_cache()

current_acc = round(correct_count / total_count, 2)
print(f"Task accuracy: {current_acc}")

63it [00:38,  1.65it/s]

Task accuracy: 0.67





Input CMAP Results:
- Value Fetcher:
    - Query: 0.77
    - Key: 0.63
    - Value: 0.69
    - QK: 0.76
    - QKV: 0.77
- Position Transmitter:
    - Query: 0.66
    - Key: 0.65
    - Value: 0.69
    - QK: 0.65
    - QKV: 0.67

## Patching Weights

In [11]:
def patch_weights(inputs=None, output=None, layer=None, bi=None):
    """``TraceDict`` ``edit_output`` hook that patches Goat weights/activations into LLaMA.

    Two independent edits are applied, both at sequence position ``-1`` only:

    * ``q_proj`` modules in layers holding at least one ``value_fetcher`` head:
      the output at position ``-1`` is recomputed with Goat's ``q_proj`` weight
      (the full matrix, not only the listed heads).
    * ``o_proj`` modules in layers holding at least one ``pos_transmitter`` head:
      the listed heads' slices of the module input at position ``-1`` are
      replaced by the cached Goat activation ``goat_cache[bi][layer]`` and the
      whole output is recomputed with LLaMA's own ``o_proj`` weight.

    Args:
        inputs: positional inputs of the traced module; only ``inputs[0]`` is used.
        output: the module's original output tensor.
        layer: traced module name, e.g. ``model.layers.14.self_attn.q_proj``;
            the third dot-separated field is parsed as the layer index.
        bi: batch index used to look up ``goat_cache``.

    Returns:
        The patched output tensor, or ``output`` unchanged when neither edit applies.
    """
    hidden = inputs[0]
    layer_idx = int(layer.split(".")[2])
    value_fetcher_curr_layer = [head for layer_no, head in value_fetcher if layer_no == layer_idx]
    position_trans_curr_layer = [head for layer_no, head in pos_transmitter if layer_no == layer_idx]

    # NOTE(review): both edits target position -1, while accuracy is scored at
    # inputs["last_token_indices"]; confirm these coincide for padded batches.

    if ("q_proj" in layer) and len(value_fetcher_curr_layer) > 0:
        # The whole q_proj matrix is swapped for Goat's. The original code cloned
        # the LLaMA weight and split both matrices into heads, then overwrote the
        # LLaMA copy wholesale and merged the heads back — a no-op round trip, so
        # the Goat weight is used directly here.
        # NOTE(review): a commented-out per-head loop suggests only the
        # value-fetcher heads were meant to be swapped. If that is revived, note
        # that nn.Linear stores weight as (out_features, in_features), so heads
        # live on the FIRST axis of q_proj.weight, not the second.
        # Assumes goat_model.state_dict() exposes the same key as LLaMA — TODO confirm.
        goat_w = goat_model.state_dict()[f"{layer}.weight"]

        new_output = einsum(
            hidden, goat_w, "batch seq_len hidden_size, d_model hidden_size -> batch seq_len d_model"
        )
        # Keep the original output everywhere except the final position.
        output = torch.cat((output[:, :-1], new_output[:, -1:]), dim=1)

    if ("o_proj" in layer) and len(position_trans_curr_layer) > 0:
        n_heads = llama_model.config.num_attention_heads

        # Clone first: rearrange can return a view, and writing through it would
        # silently modify the tensor the traced module received (and that
        # TraceDict may retain).
        patched = rearrange(
            hidden.clone(),
            "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head",
            n_heads=n_heads,
        )
        g_cache = rearrange(
            goat_cache[bi][layer].to(patched.device),
            "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head",
            n_heads=n_heads,
        )

        for head in position_trans_curr_layer:
            patched[:, -1, head] = g_cache[:, -1, head]

        patched = rearrange(
            patched,
            "batch seq_len n_heads d_head -> batch seq_len (n_heads d_head)",
        )
        w_o = llama_model.state_dict()[f"{layer}.weight"]
        output = einsum(
            patched, w_o, "batch seq_len hidden_size, d_model hidden_size -> batch seq_len d_model"
        )

    return output

In [12]:
# Evaluate LLaMA while `patch_weights` edits the traced projection modules.
correct_count, total_count = 0, 0
llama_model.eval()
with torch.no_grad():
    # Wrap the dataloader once: tqdm can then infer the total from it, and only
    # a single progress bar is drawn (the original nested two tqdm wrappers).
    for bi, inputs in enumerate(tqdm(dataloader)):
        # Move every tensor field of the batch onto the LLaMA model's device.
        for field, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[field] = value.to(llama_model.device)

        with TraceDict(
            llama_model,
            llama_modules,
            retain_input=True,
            edit_output=partial(
                patch_weights,
                bi=bi,
            ),
        ) as _:
            outputs = llama_model(inputs["input_ids"])

        # Score each sample at its own last-token position. A separate index
        # name keeps the batch index `bi` intact.
        for sample_idx in range(inputs["labels"].size(0)):
            label = inputs["labels"][sample_idx]
            last_pos = inputs["last_token_indices"][sample_idx]
            pred = torch.argmax(outputs.logits[sample_idx][last_pos])

            if label == pred:
                correct_count += 1
            total_count += 1

        # Release the batch's logits before the next forward pass.
        del outputs
        torch.cuda.empty_cache()

# Report the result in the same form as the CMAP evaluation cells.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
print(f"Task accuracy: {current_acc}")

  0%|          | 0/63 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  2%|▏         | 1/63 [00:00<00:26,  2.32it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  3%|▎         | 2/63 [00:00<00:25,  2.35it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  5%|▍         | 3/63 [00:01<00:25,  2.36it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  6%|▋         | 4/63 [00:01<00:24,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  8%|▊         | 5/63 [00:02<00:24,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 10%|▉         | 6/63 [00:02<00:23,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 11%|█         | 7/63 [00:02<00:23,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 13%|█▎        | 8/63 [00:03<00:23,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 14%|█▍        | 9/63 [00:03<00:22,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 16%|█▌        | 10/63 [00:04<00:22,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 17%|█▋        | 11/63 [00:04<00:21,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 19%|█▉        | 12/63 [00:05<00:21,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 21%|██        | 13/63 [00:05<00:21,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 22%|██▏       | 14/63 [00:05<00:20,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 24%|██▍       | 15/63 [00:06<00:20,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 25%|██▌       | 16/63 [00:06<00:19,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 27%|██▋       | 17/63 [00:07<00:19,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 29%|██▊       | 18/63 [00:07<00:18,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 30%|███       | 19/63 [00:08<00:18,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 32%|███▏      | 20/63 [00:08<00:18,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 33%|███▎      | 21/63 [00:08<00:17,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 35%|███▍      | 22/63 [00:09<00:17,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 37%|███▋      | 23/63 [00:09<00:16,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 38%|███▊      | 24/63 [00:10<00:16,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 40%|███▉      | 25/63 [00:10<00:15,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 41%|████▏     | 26/63 [00:10<00:15,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 43%|████▎     | 27/63 [00:11<00:15,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 44%|████▍     | 28/63 [00:11<00:14,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 46%|████▌     | 29/63 [00:12<00:14,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 48%|████▊     | 30/63 [00:12<00:13,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 49%|████▉     | 31/63 [00:13<00:13,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 51%|█████     | 32/63 [00:13<00:13,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 52%|█████▏    | 33/63 [00:13<00:12,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 54%|█████▍    | 34/63 [00:14<00:12,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 56%|█████▌    | 35/63 [00:14<00:11,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 57%|█████▋    | 36/63 [00:15<00:11,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 59%|█████▊    | 37/63 [00:15<00:10,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 60%|██████    | 38/63 [00:16<00:10,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 62%|██████▏   | 39/63 [00:16<00:10,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 63%|██████▎   | 40/63 [00:16<00:09,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547



 65%|██████▌   | 41/63 [00:17<00:09,  2.21it/s]

model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086
model.layers.28.self_attn.q_proj: llama: 97.06059265136719, fine-tuned: 97.08554077148438
model.layers.29.self_attn.q_proj: llama: 96.15409851074219, fine-tuned: 96.20056915283203
model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
m


 67%|██████▋   | 42/63 [00:17<00:09,  2.26it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 68%|██████▊   | 43/63 [00:18<00:08,  2.29it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 70%|██████▉   | 44/63 [00:18<00:08,  2.32it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 71%|███████▏  | 45/63 [00:19<00:07,  2.33it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 73%|███████▎  | 46/63 [00:19<00:07,  2.35it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 75%|███████▍  | 47/63 [00:19<00:06,  2.36it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 76%|███████▌  | 48/63 [00:20<00:06,  2.36it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 78%|███████▊  | 49/63 [00:20<00:05,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 79%|███████▉  | 50/63 [00:21<00:05,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 81%|████████  | 51/63 [00:21<00:05,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 83%|████████▎ | 52/63 [00:21<00:04,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 84%|████████▍ | 53/63 [00:22<00:04,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 86%|████████▌ | 54/63 [00:22<00:03,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 87%|████████▋ | 55/63 [00:23<00:03,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 89%|████████▉ | 56/63 [00:23<00:02,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 90%|█████████ | 57/63 [00:24<00:02,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 92%|█████████▏| 58/63 [00:24<00:02,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 94%|█████████▎| 59/63 [00:24<00:01,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 95%|█████████▌| 60/63 [00:25<00:01,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 97%|█████████▋| 61/63 [00:25<00:00,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 98%|█████████▊| 62/63 [00:26<00:00,  2.38it/s]
100%|██████████| 63/63 [00:26<00:00,  2.38it/s]
63it [00:26,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086




In [13]:
# Display the ratio correct_count / total_count (as accumulated by the
# evaluation loop that ran before this cell), rounded to two decimals.
round(correct_count/total_count, 2)

0.66

## Patching inputs of Value Fetcher Heads

In [40]:
# Per-batch cache of the inputs that the fine-tuned (Goat) model feeds into
# each traced module:
#   goat_input_cache[batch_index][module_name] -> module input, moved to CPU.
goat_input_cache = {}

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader)):
        # Move every tensor field of the batch onto the model's device.
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(goat_model.device)

        # One forward pass of Goat while recording the input of each traced module.
        with TraceDict(goat_model, llama_modules, retain_input=True) as cache:
            _ = goat_model(inputs["input_ids"])

        for _, llama_layer in zip(goat_modules, llama_modules):
            # Inputs of o_proj modules are not cached.
            if "o_proj" in llama_layer:
                continue
            goat_input_cache.setdefault(bi, {})
            goat_input_cache[bi][llama_layer] = cache[llama_layer].input.cpu()

63it [00:36,  1.71it/s]


In [76]:
def patch_inputs(inputs=None, output=None, layer=None, bi=None):
    """Recompute value-fetcher head outputs in Llama from the fine-tuned model's inputs.

    Written to be used as the ``edit_output`` callback of a baukit
    ``TraceDict`` (with ``bi`` bound through ``functools.partial``).

    For a ``k_proj`` module whose layer contains value-fetcher heads, the
    output of each such head at the last sequence position is replaced by the
    cached Goat input for this batch projected through *Llama's own* weight
    slice for that head. Only the inputs are patched; the weights stay
    Llama's. The output of every other module is returned unchanged.

    NOTE(review): the patch is applied at sequence position ``-1``, while the
    evaluation loop reads logits at ``inputs["last_token_indices"]``; confirm
    that the two positions coincide for this dataset.

    NOTE: an ``o_proj`` / position-transmitter patching branch used to live
    here as commented-out code; it was inactive and has been removed.

    Args:
        inputs: Inputs of the hooked module. Accepted so the hook signature
            stays compatible with existing callers; not used.
        output: Output of the hooked module, indexed as
            ``(batch, seq_len, n_heads * d_head)``.
        layer: Dotted module name; its third component is parsed as the layer
            index (e.g. ``model.layers.14.self_attn.k_proj`` -> 14).
        bi: Batch index used as the key into ``goat_input_cache``.

    Returns:
        The (possibly patched) module output with its original layout.
    """
    layer_idx = int(layer.split(".")[2])
    value_fetcher_curr_layer = [h for l, h in value_fetcher if l == layer_idx]

    # Nothing to patch unless this is a key projection in a value-fetcher layer.
    if "k_proj" not in layer or not value_fetcher_curr_layer:
        return output

    n_heads = llama_model.config.num_attention_heads

    # The weight is only read, so no clone is needed; view it as one slice per head.
    llama_w = rearrange(
        llama_model.state_dict()[f"{layer}.weight"],
        "(n_heads d_head) d_model -> n_heads d_head d_model",
        n_heads=n_heads,
    )

    # Move the cached Goat input once, onto whichever device holds the weights
    # (works on CPU-only machines as well as on CUDA).
    goat_inp = goat_input_cache[bi][layer].to(llama_w.device)

    output = rearrange(
        output,
        "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head",
        n_heads=n_heads,
    )

    for head_idx in value_fetcher_curr_layer:
        res = einsum(
            goat_inp,
            llama_w[head_idx],
            "batch seq_len hidden_size, d_head hidden_size -> batch seq_len d_head",
        )
        # Overwrite only the last sequence position of this head.
        output[:, -1, head_idx] = res[:, -1]

    return rearrange(
        output,
        "batch seq_len n_heads d_head -> batch seq_len (n_heads d_head)",
    )

In [77]:
# Evaluate Llama while `patch_inputs` edits the outputs of the traced modules,
# counting how many next-token predictions match the labels.
correct_count, total_count = 0, 0
with torch.no_grad():
    # Wrap only the dataloader in tqdm so a single bar with a known total is shown.
    for bi, inputs in enumerate(tqdm(dataloader)):
        # Move every tensor field of the batch onto the model's device.
        for k, v in inputs.items():
            if v is not None and isinstance(v, torch.Tensor):
                inputs[k] = v.to(llama_model.device)

        # `bi` is bound into the hook so it can look up this batch's cached inputs.
        with TraceDict(
            llama_model,
            llama_modules,
            retain_input=True,
            edit_output=partial(
                patch_inputs,
                bi=bi,
            ),
        ) as _:
            outputs = llama_model(inputs["input_ids"])

        # A separate index name keeps the batch index `bi` from being shadowed.
        for sample_idx in range(inputs["labels"].size(0)):
            label = inputs["labels"][sample_idx]
            pred = torch.argmax(
                outputs.logits[sample_idx][inputs["last_token_indices"][sample_idx]]
            )

            if label == pred:
                correct_count += 1
            total_count += 1

        # Release the logits before the next batch to keep GPU memory bounded.
        del outputs
        torch.cuda.empty_cache()

  0%|          | 0/63 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
  2%|▏         | 1/63 [00:00<00:26,  2.31it/s]
  3%|▎         | 2/63 [00:00<00:26,  2.31it/s]
  5%|▍         | 3/63 [00:01<00:25,  2.31it/s]
  6%|▋         | 4/63 [00:01<00:25,  2.32it/s]
  8%|▊         | 5/63 [00:02<00:24,  2.32it/s]
 10%|▉         | 6/63 [00:02<00:24,  2.33it/s]
 11%|█         | 7/63 [00:03<00:24,  2.33it/s]
 13%|█▎        | 8/63 [00:03<00:23,  2.33it/s]
 14%|█▍        | 9/63 [00:03<00:23,  2.34it/s]
 16%|█▌        | 10/63 [00:04<00:22,  2.34it/s]
 17%|█▋        | 11/63 [00:04<00:22,  2.34it/s]
 19%|█▉        | 12/63 [00:05<00:21,  2.34it/s]
 21%|██        | 13/63 [00:05<00:21,  2.34it/s]
 22%|██▏       | 14/63 [00:06<00:20,  2.34it/s]
 24%|██▍       | 15/63 [00:06<00:20,  2.33it/s]
 25%|██▌       | 16/63 [00:06<00:20,  2.33it/s]
 27%|██▋       | 17/63 [00:07<00:19,  2.33it/s]
 29%|██▊       | 18/63 [00:07<00:19,  2.33it/s]
 30%|███       | 19/63 [00:08<00:18,  2.33it/s]
 32%|███▏      | 20/63 [00:08<00:18,

In [78]:
# Display the accuracy of the patched run: correct_count / total_count from
# the evaluation loop above, rounded to two decimals.
round(correct_count/total_count, 2)

0.65

In [85]:
# For every traced module that sits in a layer containing value-fetcher heads,
# print the norm of the Llama and fine-tuned weights and the distance between them.

# Build each state dict once instead of once per loop iteration.
llama_state = llama_model.state_dict()
goat_state = goat_model.state_dict()

for layer in llama_modules:
    layer_idx = int(layer.split(".")[2])
    value_fetcher_curr_layer = [h for l, h in value_fetcher if l == layer_idx]

    # Skip layers without value-fetcher heads before touching any weights.
    if len(value_fetcher_curr_layer) == 0:
        continue

    # The weights are only read, so no clone is required.
    llama_w = llama_state[f"{layer}.weight"]
    goat_w = goat_state[f"{layer}.weight"]

    print(f"Layer: {layer}, Llama weight: {llama_w.norm().item()}, Fine-tuned weight: {goat_w.norm().item()}")
    print(f"Layer: {layer}, Distance: {torch.dist(llama_w, goat_w)}\n")

Layer: model.layers.14.self_attn.k_proj, Llama weight: 109.2076416015625, Fine-tuned weight: 109.2258071899414
Layer: model.layers.14.self_attn.k_proj, Distance: 3.194016456604004

Layer: model.layers.14.self_attn.q_proj, Llama weight: 108.36489868164062, Fine-tuned weight: 108.38367462158203
Layer: model.layers.14.self_attn.q_proj, Distance: 3.2767512798309326

Layer: model.layers.14.self_attn.v_proj, Llama weight: 77.7557373046875, Fine-tuned weight: 77.76526641845703
Layer: model.layers.14.self_attn.v_proj, Distance: 2.292689085006714

Layer: model.layers.14.self_attn.o_proj, Llama weight: 77.85401916503906, Fine-tuned weight: 77.86835479736328
Layer: model.layers.14.self_attn.o_proj, Distance: 2.7456164360046387

Layer: model.layers.15.self_attn.k_proj, Llama weight: 110.20281982421875, Fine-tuned weight: 110.21894073486328
Layer: model.layers.15.self_attn.k_proj, Distance: 3.171581983566284

Layer: model.layers.15.self_attn.q_proj, Llama weight: 108.3883056640625, Fine-tuned weigh