In [1]:
import json
import os
from collections import defaultdict
from functools import partial

import matplotlib.pyplot as plt
import torch
from baukit import TraceDict
from datasets import Dataset
from einops import rearrange, einsum
from peft import PeftModel
# from plotly_utils import imshow, scatter
from torch.nn import CosineSimilarity
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer

import pysvelte
import analysis_utils
from counterfactual_datasets.entity_tracking import *

# Disabled alternative that falls back to the CPU when CUDA is unavailable:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The models loaded below are moved to this device, so a CUDA GPU is required as written.
device = "cuda"
# Seed PyTorch's global random number generator.
torch.manual_seed(10)

%load_ext autoreload
%autoreload 2

In [2]:
# path = "AlekseyKorshuk/vicuna-7b"
# path = "/data/nikhil_prakash/llama_weights/7B"
# vicuna_tokenizer = AutoTokenizer.from_pretrained(path)
# vicuna_model = AutoModelForCausalLM.from_pretrained(path).to(device)

In [2]:
print("Model Loading...")

# Local checkpoint of the base model
# (a disabled alternative was "AlekseyKorshuk/vicuna-7b").
path = "/data/nikhil_prakash/llama_weights/7B"
# Checkpoint directory of the fine-tuned model.
fine_tuned_path = "/data/nikhil_prakash/goat-finetuning/vibrant-glitter-10/"

# The tokenizer is taken from the hub repo below instead of from `path`
# (disabled alternative: AutoTokenizer.from_pretrained(path)).
llama_tokenizer = LlamaTokenizer.from_pretrained(
    "hf-internal-testing/llama-tokenizer",
    padding_side="right",
)
# Use the end-of-sequence token id as the padding token id.
llama_tokenizer.pad_token_id = llama_tokenizer.eos_token_id

llama_model = AutoModelForCausalLM.from_pretrained(path)
llama_model = llama_model.to(device)

# A disabled earlier variant built the fine-tuned model from
# "decapoda-research/llama-7b-hf" plus the "tiedong/goat-lora-7b" adapter
# through LlamaForCausalLM and PeftModel.
fine_tuned_model = AutoModelForCausalLM.from_pretrained(fine_tuned_path)
fine_tuned_model = fine_tuned_model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Model Loading...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# JSONL file handed to the example sampler in the next cell.
data_file = "./box_datasets/no_instructions/alternative/Random/7/train.jsonl"
# CSV of objects; in the cells shown it is only referenced by a commented-out sampler argument.
object_file = "./box_datasets/objects_with_bnc_frequency.csv"
# Number of examples per DataLoader batch.
batch_size = 8

In [4]:
# Sample 500 examples. The sampler result is indexed below as
# (input ids, last-token indices, labels).
# Sampler options that are currently disabled: object_file=object_file,
# num_ents_or_ops=3.
raw_data = entity_tracking_example_sampler(
    tokenizer=llama_tokenizer,
    num_samples=500,
    data_file=data_file,
    few_shot=False,
    alt_examples=True,
    architecture="LLaMAForCausalLM",
)

dataset_columns = {
    "input_ids": raw_data[0],
    "last_token_indices": raw_data[1],
    "labels": raw_data[2],
}
# Rows are returned as torch tensors.
dataset = Dataset.from_dict(dataset_columns).with_format("torch")

print(f"Length of dataset: {len(dataset)}")

dataloader = DataLoader(dataset, batch_size=batch_size)

Length of dataset: 500


In [5]:
# Show one example: the prompt up to and including its last token, then the
# expected answer.
idx = 0
example = dataset[idx]
prompt_end = example["last_token_indices"] + 1
prompt_text = llama_tokenizer.decode(example["input_ids"][:prompt_end])
print(f"Prompt: {prompt_text}")
answer_text = llama_tokenizer.decode(example["labels"])
print(f"Answer: {answer_text}")

Prompt: <s>The document is in Box X, the pot is in Box T, the magnet is in Box A, the game is in Box E, the bill is in Box M, the cross is in Box K, the map is in Box D. Box X contains the
Answer: document


In [6]:
# Baseline: next-token accuracy of the base model on the task.
total_count = 0
correct_count = 0
llama_model.eval()
with torch.no_grad():
    # tqdm wraps the DataLoader itself so the bar can show a total and an ETA;
    # the batch index is not needed here.
    for inputs in tqdm(dataloader):
        # Move every tensor of the batch to the model's device.
        inputs = {
            k: v.to(llama_model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        outputs = llama_model(input_ids=inputs["input_ids"])

        for bi in range(inputs["labels"].size(0)):
            label = inputs["labels"][bi]
            # Prediction = highest-scoring token at the last prompt position.
            pred = torch.argmax(outputs.logits[bi][inputs["last_token_indices"][bi]])

            if label == pred:
                correct_count += 1
            total_count += 1

        # Release the logits inside the loop; this also keeps the cell from
        # failing with a NameError when the loader yields no batches.
        del outputs

torch.cuda.empty_cache()

# Guard against an empty loader instead of raising ZeroDivisionError.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
print(f"Task accuracy: {current_acc}")

63it [00:23,  2.64it/s]

Task accuracy: 0.66





In [47]:
# Reference: next-token accuracy of the fine-tuned model on the same task.
total_count = 0
correct_count = 0
fine_tuned_model.eval()
with torch.no_grad():
    # tqdm wraps the DataLoader itself so the bar can show a total and an ETA;
    # the batch index is not needed here.
    for inputs in tqdm(dataloader):
        # Move every tensor of the batch to the model's device.
        inputs = {
            k: v.to(fine_tuned_model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        outputs = fine_tuned_model(input_ids=inputs["input_ids"])

        for bi in range(inputs["labels"].size(0)):
            label = inputs["labels"][bi]
            # Prediction = highest-scoring token at the last prompt position.
            pred = torch.argmax(outputs.logits[bi][inputs["last_token_indices"][bi]])

            if label == pred:
                correct_count += 1
            total_count += 1

        # Release the logits inside the loop; this also keeps the cell from
        # failing with a NameError when the loader yields no batches.
        del outputs

torch.cuda.empty_cache()

# Guard against an empty loader instead of raising ZeroDivisionError.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
print(f"Task accuracy: {current_acc}")

63it [00:23,  2.64it/s]

Task accuracy: 0.8





In [6]:
# Load the stored circuit head groups; later cells read the
# 'heads_at_prev_box_pos' entry of this mapping.
with open("circuit_heads.json", "r") as f:
    circuit_heads = json.load(f)

In [7]:
# Flat list of the attention projection modules to hook: for each of the 32
# layers, in the order k_proj, q_proj, v_proj, o_proj (128 names in total).
llama_modules = [
    f"model.layers.{layer}.self_attn.{proj}"
    for layer in range(32)
    for proj in ("k_proj", "q_proj", "v_proj", "o_proj")
]

# The same module names under the "base_model.model." prefix, in the same order.
goat_modules = [f"base_model.model.{name}" for name in llama_modules]

In [8]:
# Activations of the fine-tuned model, stored on the CPU as
# fine_tuned_cache[batch_index][module_name].
fine_tuned_cache = {}

with torch.no_grad():
    for bi, inputs in enumerate(tqdm(dataloader)):
        # Move every tensor of the batch to the model's device.
        inputs = {
            k: v.to(fine_tuned_model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        with TraceDict(fine_tuned_model, llama_modules, retain_input=True) as cache:
            fine_tuned_model(inputs["input_ids"])

        batch_cache = fine_tuned_cache.setdefault(bi, {})
        # The fine-tuned model is traced with the plain `llama_modules` names,
        # so only that list is iterated.
        for llama_layer in llama_modules:
            traced = cache[llama_layer]
            # o_proj: keep the module *input* (the patching code splits it into
            # heads); q/k/v projections: keep the module output.
            activation = traced.input if "o_proj" in llama_layer else traced.output
            batch_cache[llama_layer] = activation.cpu()

63it [00:45,  1.37it/s]


In [9]:
# Activations of the base model, stored on the CPU as
# llama_cache[batch_index][module_name].
llama_cache = {}

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader)):
        # Move every tensor of the batch to the model's device.
        for key in inputs:
            if isinstance(inputs[key], torch.Tensor):
                inputs[key] = inputs[key].to(llama_model.device)

        with TraceDict(llama_model, llama_modules, retain_input=True) as cache:
            _ = llama_model(inputs["input_ids"])

        per_batch = llama_cache.setdefault(bi, {})
        for llama_layer in llama_modules:
            record = cache[llama_layer]
            if "o_proj" in llama_layer:
                # o_proj: keep what goes into the projection.
                per_batch[llama_layer] = record.input.cpu()
            else:
                # q/k/v projections: keep what comes out.
                per_batch[llama_layer] = record.output.cpu()

63it [00:43,  1.46it/s]


In [12]:
def cross_model_patching(inputs, outputs, layer, bi, relative_pos, input_tokens):
    """Patch cached fine-tuned head activations into the base model's o_proj.

    Used as the ``edit_output`` callback of ``TraceDict``. For an ``o_proj``
    module, the module input is split into heads, the selected heads are
    overwritten at the selected positions with the activations stored in the
    module-level ``fine_tuned_cache``, and the module output is recomputed
    with the base model's own ``o_proj`` weight. Every other hooked module
    (q/k/v projections) is returned unchanged; earlier experiments that
    patched those outputs are no longer part of this function.

    Args:
        inputs: Input of the hooked module, either a tensor or a tuple whose
            first element is the tensor.
        outputs: Output of the hooked module, either a tensor or a tuple whose
            first element is the tensor.
        layer: Name of the hooked module, e.g.
            "model.layers.14.self_attn.o_proj". The layer index is parsed
            from the third dot-separated field.
        bi: Batch index; key into ``fine_tuned_cache``.
        relative_pos: Mapping from a position code to an iterable of
            ``(layer_index, head_index)`` pairs. Code ``-1`` patches, per
            sample, the position returned by
            ``analysis_utils.compute_prev_query_box_pos``; every other code
            patches the last sequence position.
        input_tokens: Batch mapping with "input_ids" and
            "last_token_indices", forwarded to
            ``analysis_utils.compute_prev_query_box_pos``.

    Returns:
        The unchanged output for non-``o_proj`` modules; otherwise the output
        recomputed from the patched input.

    Raises:
        KeyError: If ``fine_tuned_cache`` has no entry for ``bi``/``layer``
            when an ``o_proj`` module is patched.
    """
    if isinstance(inputs, tuple):
        inputs = inputs[0]

    if isinstance(outputs, tuple):
        outputs = outputs[0]

    # Only o_proj is patched. Returning early avoids touching the caches for
    # q/k/v modules and removes the former, unused lookup into `llama_cache`.
    if "o_proj" not in layer:
        return outputs

    n_heads = llama_model.config.num_attention_heads
    layer_idx = int(layer.split(".")[2])
    split_heads = "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head"

    g_cache = rearrange(fine_tuned_cache[bi][layer], split_heads, n_heads=n_heads)
    # rearrange may return a view, in which case the writes below also modify
    # the tensor the module originally received.
    inputs = rearrange(inputs, split_heads, n_heads=n_heads)

    for rel_pos, heads in relative_pos.items():
        curr_layer_heads = [h for l, h in heads if l == layer_idx]
        if not curr_layer_heads:
            # Nothing to patch in this layer for this group; this also skips
            # the per-sample position lookup below.
            continue

        if rel_pos == -1:
            for batch in range(inputs.size(0)):
                prev_query_box_pos = analysis_utils.compute_prev_query_box_pos(
                    input_tokens["input_ids"][batch],
                    input_tokens["last_token_indices"][batch],
                )
                for head in curr_layer_heads:
                    inputs[batch, prev_query_box_pos, head] = g_cache[batch, prev_query_box_pos, head]
        else:
            # NOTE(review): every code other than -1 (including 2) is patched
            # at the last sequence position; confirm this is intended and that
            # the last position is the last prompt token for padded batches.
            for head in curr_layer_heads:
                inputs[:, -1, head] = g_cache[:, -1, head]

    inputs = rearrange(
        inputs,
        "batch seq_len n_heads d_head -> batch seq_len (n_heads d_head)",
        n_heads=n_heads,
    )
    # Read the weight straight from the module instead of rebuilding the whole
    # state dict on every hook call; detach() matches what state_dict() returns.
    w_o = llama_model.get_submodule(layer).weight.detach()
    return einsum(
        inputs, w_o, "batch seq_len hidden_size, d_model hidden_size -> batch seq_len d_model"
    )

In [10]:
def _read_head_list(mask_file):
    """Return the head list stored on the first line of a mask file.

    The first line is expected to have the form ``<label>: <json>``;
    everything after the first ": " is decoded as JSON.
    """
    with open(mask_file, "r") as f:
        # Only the first line is needed, so the rest of the file is not read.
        first_line = f.readline()
    # maxsplit=1 keeps the JSON payload intact even if it contains ": ".
    return json.loads(first_line.split(": ", 1)[1])


value_fetcher_heads = _read_head_list(
    "./new_masks/llama-7b/direct_logit_heads/object_value/0.01.txt"
)
positional_info_fetcher_heads = _read_head_list(
    "./new_masks/llama-7b/heads_affect_direct_logit/positional/0.01.txt"
)
duplicate_token_heads = _read_head_list(
    "./new_masks/llama-7b/heads_at_query_box_pos/positional/0.01.txt"
)

print(f"Value Fetcher Heads: {len(value_fetcher_heads)}")
print(f"Heads affecting direct logit heads: {len(positional_info_fetcher_heads)}")
print(f"Heads at query box token: {len(duplicate_token_heads)}")
print(f"Heads at prev query box token: {len(circuit_heads['heads_at_prev_box_pos'])}")

Value Fetcher Heads: 39
Heads affecting direct logit heads: 7
Heads at query box token: 17
Heads at prev query box token: 4


In [11]:
# Head groups keyed by the position code used by `cross_model_patching`.
# The group for code 0 is built with `+`, which creates a new list. The
# previous `relative_pos[0] = value_fetcher_heads` followed by `+=` extended
# `value_fetcher_heads` itself in place, so the positional heads leaked into
# that list and were appended again on every re-run of the cell.
relative_pos = {
    0: value_fetcher_heads + positional_info_fetcher_heads,
    2: duplicate_token_heads,
    -1: circuit_heads["heads_at_prev_box_pos"],
}

In [16]:
# Accuracy of the base model while the selected heads are patched with the
# fine-tuned model's cached activations.
correct_count, total_count = 0, 0

with torch.no_grad():
    # enumerate wraps tqdm so the bar knows the number of batches.
    for bi, inputs in enumerate(tqdm(dataloader)):
        # Move every tensor of the batch to the model's device.
        inputs = {
            k: v.to(llama_model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        patch_fn = partial(
            cross_model_patching,
            bi=bi,
            relative_pos=relative_pos,
            input_tokens=inputs,
        )
        with TraceDict(llama_model, llama_modules, retain_input=True, edit_output=patch_fn):
            # Attention maps were requested here before but never read; the
            # call now matches the baseline evaluation and avoids returning
            # per-layer attention tensors.
            outputs = llama_model(inputs["input_ids"])

        for batch in range(inputs["labels"].size(0)):
            label = inputs["labels"][batch]
            # Prediction = highest-scoring token at the last prompt position.
            pred = torch.argmax(outputs.logits[batch][inputs["last_token_indices"][batch]])

            if label == pred:
                correct_count += 1
            total_count += 1

        del outputs
        torch.cuda.empty_cache()

# Guard against an empty loader instead of raising ZeroDivisionError.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
print(f"Task accuracy: {current_acc}")

63it [00:32,  1.93it/s]

Task accuracy: 0.73





Final fine-tuned model:

output patching
- full circuit: 0.73
- Value Fetcher: 0.74
- Position Transmitter: 0.68
- Position Detector: 0.66
- Structure Reader: 0.66

input patching:
- Value Fetcher
    - query: 0.72
    - key: 0.65
    - value: 0.67
    - keys + final query: 0.73
    - keys + final query + value: 0.74
- Position Transmitter:
    - query: 0.68
    - key: 0.66
    - value: 0.65
    - keys + final query: 0.68
    - keys + final query + value: 0.68

## Patching Weights

In [11]:
def patch_weights(inputs=None, output=None, layer=None, bi=None):
    """Patch fine-tuned q_proj weights and o_proj head inputs into the base model.

    Used as the ``edit_output`` callback of ``TraceDict``. Two independent
    patches are applied depending on the hooked module:

    * ``q_proj`` in a layer that has at least one entry in the module-level
      ``value_fetcher_heads``: the projection is recomputed with the
      fine-tuned model's complete ``q_proj`` weight and only the last
      sequence position of the output is replaced by the result.
    * ``o_proj`` in a layer that has entries in the module-level
      ``positional_info_fetcher_heads``: the last-position input of those
      heads is replaced by the activations in ``fine_tuned_cache`` and the
      output is recomputed with the base model's ``o_proj`` weight.

    Args:
        inputs: Input of the hooked module, either a tensor or a tuple/list
            whose first element is the tensor.
        output: Output tensor of the hooked module.
        layer: Name of the hooked module, e.g.
            "model.layers.14.self_attn.q_proj". The layer index is parsed
            from the third dot-separated field.
        bi: Batch index; key into ``fine_tuned_cache`` (o_proj patch only).

    Returns:
        The possibly patched module output.
    """
    # Accept both a bare tensor and the tuple form, like `cross_model_patching`;
    # indexing a bare tensor with [0] would silently select the first sample.
    if isinstance(inputs, (tuple, list)):
        inputs = inputs[0]

    layer_idx = int(layer.split(".")[2])
    has_value_fetcher_head = any(l == layer_idx for l, _ in value_fetcher_heads)
    position_trans_curr_layer = [h for l, h in positional_info_fetcher_heads if l == layer_idx]

    if ("q_proj" in layer) and has_value_fetcher_head:
        # The whole weight matrix is swapped, so it is used directly instead
        # of being cloned and reshaped per head first.
        # NOTE(review): if per-head swapping is reinstated, torch.nn.Linear
        # stores its weight as (out_features, in_features); presumably the
        # heads of q_proj lie along the first axis - confirm before splitting.
        fine_tuned_w = fine_tuned_model.get_submodule(layer).weight.detach()

        new_output = einsum(
            inputs, fine_tuned_w, "batch seq_len hidden_size, d_model hidden_size -> batch seq_len d_model"
        )
        # Keep the base output everywhere except at the last position.
        output = torch.cat((output[:, :-1], new_output[:, -1:]), dim=1)

    if ("o_proj" in layer) and position_trans_curr_layer:
        n_heads = llama_model.config.num_attention_heads
        split_heads = "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head"

        inputs = rearrange(inputs, split_heads, n_heads=n_heads)
        g_cache = rearrange(fine_tuned_cache[bi][layer], split_heads, n_heads=n_heads)

        for head in position_trans_curr_layer:
            inputs[:, -1, head] = g_cache[:, -1, head]

        inputs = rearrange(
            inputs,
            "batch seq_len n_heads d_head -> batch seq_len (n_heads d_head)",
            n_heads=n_heads,
        )
        # Read the weight straight from the module instead of rebuilding the
        # whole state dict on every hook call.
        w_o = llama_model.get_submodule(layer).weight.detach()
        output = einsum(
            inputs, w_o, "batch seq_len hidden_size, d_model hidden_size -> batch seq_len d_model"
        )

    return output

In [12]:
# Accuracy of the base model while `patch_weights` is applied to the hooked
# attention projections.
correct_count, total_count = 0, 0
llama_model.eval()
with torch.no_grad():
    # A single tqdm around the DataLoader; wrapping both the loader and the
    # enumerate object produced two interleaved progress bars.
    for bi, inputs in enumerate(tqdm(dataloader)):
        # Move every tensor of the batch to the model's device.
        inputs = {
            k: v.to(llama_model.device) if isinstance(v, torch.Tensor) else v
            for k, v in inputs.items()
        }

        with TraceDict(
            llama_model,
            llama_modules,
            retain_input=True,
            edit_output=partial(
                patch_weights,
                bi=bi,
            ),
        ):
            outputs = llama_model(inputs["input_ids"])

        # A separate name for the per-sample index so the batch index `bi`
        # is no longer shadowed.
        for sample in range(inputs["labels"].size(0)):
            label = inputs["labels"][sample]
            # Prediction = highest-scoring token at the last prompt position.
            pred = torch.argmax(outputs.logits[sample][inputs["last_token_indices"][sample]])

            if label == pred:
                correct_count += 1
            total_count += 1

        del outputs
        torch.cuda.empty_cache()

# Report the result like the other evaluation cells; guard against an empty
# loader instead of raising ZeroDivisionError.
current_acc = round(correct_count / total_count, 2) if total_count else float("nan")
print(f"Task accuracy: {current_acc}")

  0%|          | 0/63 [00:00<?, ?it/s]
0it [00:00, ?it/s][A

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  2%|▏         | 1/63 [00:00<00:26,  2.32it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  3%|▎         | 2/63 [00:00<00:25,  2.35it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  5%|▍         | 3/63 [00:01<00:25,  2.36it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  6%|▋         | 4/63 [00:01<00:24,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


  8%|▊         | 5/63 [00:02<00:24,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 10%|▉         | 6/63 [00:02<00:23,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 11%|█         | 7/63 [00:02<00:23,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 13%|█▎        | 8/63 [00:03<00:23,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 14%|█▍        | 9/63 [00:03<00:22,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 16%|█▌        | 10/63 [00:04<00:22,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 17%|█▋        | 11/63 [00:04<00:21,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 19%|█▉        | 12/63 [00:05<00:21,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 21%|██        | 13/63 [00:05<00:21,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 22%|██▏       | 14/63 [00:05<00:20,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 24%|██▍       | 15/63 [00:06<00:20,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 25%|██▌       | 16/63 [00:06<00:19,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 27%|██▋       | 17/63 [00:07<00:19,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 29%|██▊       | 18/63 [00:07<00:18,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 30%|███       | 19/63 [00:08<00:18,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 32%|███▏      | 20/63 [00:08<00:18,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 33%|███▎      | 21/63 [00:08<00:17,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 35%|███▍      | 22/63 [00:09<00:17,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 37%|███▋      | 23/63 [00:09<00:16,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 38%|███▊      | 24/63 [00:10<00:16,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 40%|███▉      | 25/63 [00:10<00:15,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 41%|████▏     | 26/63 [00:10<00:15,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 43%|████▎     | 27/63 [00:11<00:15,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 44%|████▍     | 28/63 [00:11<00:14,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 46%|████▌     | 29/63 [00:12<00:14,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 48%|████▊     | 30/63 [00:12<00:13,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 49%|████▉     | 31/63 [00:13<00:13,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 51%|█████     | 32/63 [00:13<00:13,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 52%|█████▏    | 33/63 [00:13<00:12,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 54%|█████▍    | 34/63 [00:14<00:12,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 56%|█████▌    | 35/63 [00:14<00:11,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 57%|█████▋    | 36/63 [00:15<00:11,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 59%|█████▊    | 37/63 [00:15<00:10,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 60%|██████    | 38/63 [00:16<00:10,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 62%|██████▏   | 39/63 [00:16<00:10,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 63%|██████▎   | 40/63 [00:16<00:09,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547



 65%|██████▌   | 41/63 [00:17<00:09,  2.21it/s]

model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086
model.layers.28.self_attn.q_proj: llama: 97.06059265136719, fine-tuned: 97.08554077148438
model.layers.29.self_attn.q_proj: llama: 96.15409851074219, fine-tuned: 96.20056915283203
model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
m


 67%|██████▋   | 42/63 [00:17<00:09,  2.26it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 68%|██████▊   | 43/63 [00:18<00:08,  2.29it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 70%|██████▉   | 44/63 [00:18<00:08,  2.32it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 71%|███████▏  | 45/63 [00:19<00:07,  2.33it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 73%|███████▎  | 46/63 [00:19<00:07,  2.35it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 75%|███████▍  | 47/63 [00:19<00:06,  2.36it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 76%|███████▌  | 48/63 [00:20<00:06,  2.36it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 78%|███████▊  | 49/63 [00:20<00:05,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 79%|███████▉  | 50/63 [00:21<00:05,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 81%|████████  | 51/63 [00:21<00:05,  2.37it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 83%|████████▎ | 52/63 [00:21<00:04,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 84%|████████▍ | 53/63 [00:22<00:04,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 86%|████████▌ | 54/63 [00:22<00:03,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 87%|████████▋ | 55/63 [00:23<00:03,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 89%|████████▉ | 56/63 [00:23<00:02,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 90%|█████████ | 57/63 [00:24<00:02,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 92%|█████████▏| 58/63 [00:24<00:02,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 94%|█████████▎| 59/63 [00:24<00:01,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 95%|█████████▌| 60/63 [00:25<00:01,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 97%|█████████▋| 61/63 [00:25<00:00,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086


 98%|█████████▊| 62/63 [00:26<00:00,  2.38it/s]
100%|██████████| 63/63 [00:26<00:00,  2.38it/s]
63it [00:26,  2.38it/s]

model.layers.14.self_attn.q_proj: llama: 108.36489868164062, fine-tuned: 108.38367462158203
model.layers.15.self_attn.q_proj: llama: 108.3883056640625, fine-tuned: 108.40525817871094
model.layers.16.self_attn.q_proj: llama: 106.8139419555664, fine-tuned: 106.8269271850586
model.layers.17.self_attn.q_proj: llama: 104.55338287353516, fine-tuned: 104.56761932373047
model.layers.18.self_attn.q_proj: llama: 103.43733215332031, fine-tuned: 103.45516967773438
model.layers.19.self_attn.q_proj: llama: 101.43477630615234, fine-tuned: 101.44762420654297
model.layers.20.self_attn.q_proj: llama: 102.77690887451172, fine-tuned: 102.79048156738281
model.layers.21.self_attn.q_proj: llama: 99.40530395507812, fine-tuned: 99.41356658935547
model.layers.22.self_attn.q_proj: llama: 101.00042724609375, fine-tuned: 101.01142883300781
model.layers.23.self_attn.q_proj: llama: 97.8895492553711, fine-tuned: 97.90031433105469
model.layers.24.self_attn.q_proj: llama: 97.97964477539062, fine-tuned: 98.0158920288086




In [13]:
# Display the ratio correct_count / total_count rounded to two decimals.
round(correct_count/total_count, 2)

0.66

## Patching inputs of Value Fetcher Heads

In [14]:
# Run the fine-tuned model over every batch and keep, on the CPU, the input
# that each traced q_proj module received.
# Layout: fine_tuned_input_cache[batch_index][module_name] -> retained input.
fine_tuned_input_cache = {}

with torch.no_grad():
    for bi, inputs in tqdm(enumerate(dataloader)):
        # Move every tensor field of the batch onto the model's device.
        for field, value in inputs.items():
            if isinstance(value, torch.Tensor):
                inputs[field] = value.to(fine_tuned_model.device)

        # retain_input=True makes each traced module keep the input it saw.
        with TraceDict(fine_tuned_model, llama_modules, retain_input=True) as cache:
            _ = fine_tuned_model(inputs["input_ids"])

        for _, module_name in zip(goat_modules, llama_modules):
            if "q_proj" not in module_name:
                continue
            # Create the per-batch dict on first use, then store the input.
            per_batch = fine_tuned_input_cache.setdefault(bi, {})
            per_batch[module_name] = cache[module_name].input.cpu()

63it [00:29,  2.11it/s]


In [29]:
def patch_inputs(inputs=None, output=None, layer=None, bi=None):
    """Output-editing hook that patches selected attention heads of LLaMA.

    Relies on notebook-level names defined outside this function:
    `value_fetcher_heads` and `positional_info_fetcher_heads` (iterables of
    `(layer_index, head_index)` pairs), `llama_model`, `fine_tuned_model`
    and `fine_tuned_cache`.

    Args:
        inputs: indexable whose first element is the hooked module's input
            (assumed to be a `(batch, seq_len, hidden)` tensor, as the einsum
            patterns below require).
        output: the hooked module's output tensor.
        layer: dotted module name such as
            `"model.layers.14.self_attn.q_proj"`; its third component is
            parsed as the layer index.
        bi: batch index used to look up `fine_tuned_cache[bi][layer]`.

    Returns:
        `output`, modified only at the last sequence position:
        * for a `q_proj` module with value-fetcher heads in this layer, those
          heads' queries are recomputed with the fine-tuned weights;
        * for an `o_proj` module with positional-info-fetcher heads in this
          layer, those heads' inputs are replaced by the cached fine-tuned
          ones and the output is recomputed with LLaMA's `o_proj` weight.
        Any other module's output is returned unchanged.
    """
    inp = inputs[0]

    # Parse the layer index once instead of once per (layer, head) pair.
    layer_idx = int(layer.split(".")[2])
    value_fetcher_curr_layer = [
        head for layer_no, head in value_fetcher_heads if layer_no == layer_idx
    ]
    position_trans_curr_layer = [
        head for layer_no, head in positional_info_fetcher_heads if layer_no == layer_idx
    ]

    if ("q_proj" in layer) and len(value_fetcher_curr_layer) > 0:
        # NOTE(review): the query is recomputed from LLaMA's own input `inp`,
        # so only the projection *weights* come from the fine-tuned model.
        # The inputs cached in `fine_tuned_input_cache` are not used here; if
        # patching the inputs was the intent, `inp` below must be replaced by
        # the cached tensor (moved to the right device) - confirm.
        #
        # The weight is only read, so no clone is needed. The unused reshaped
        # copy of the LLaMA weight built by the previous version is dropped.
        fine_tuned_w = rearrange(
            fine_tuned_model.state_dict()[f"{layer}.weight"],
            "(n_heads d_head) d_model -> n_heads d_head d_model",
            n_heads=fine_tuned_model.config.num_attention_heads,
        )

        # Split the flat projection output into per-head slices.
        output = rearrange(
            output,
            "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head",
            n_heads=fine_tuned_model.config.num_attention_heads,
        )

        for head_idx in value_fetcher_curr_layer:
            res = einsum(
                inp,
                fine_tuned_w[head_idx, :],
                "batch seq_len hidden_size, d_head hidden_size -> batch seq_len d_head",
            )
            # Only the last sequence position is overwritten.
            output[:, -1, head_idx] = res[:, -1]

        output = rearrange(
            output,
            "batch seq_len n_heads d_head -> batch seq_len (n_heads d_head)",
            n_heads=llama_model.config.num_attention_heads,
        )

    if ("o_proj" in layer) and len(position_trans_curr_layer) > 0:
        n_heads = llama_model.config.num_attention_heads

        inp = rearrange(
            inp,
            "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head",
            n_heads=n_heads,
        )

        # `fine_tuned_cache` is populated elsewhere in the notebook; presumably
        # it holds the fine-tuned model's o_proj inputs per batch - confirm.
        g_cache = rearrange(
            fine_tuned_cache[bi][layer],
            "batch seq_len (n_heads d_head) -> batch seq_len n_heads d_head",
            n_heads=n_heads,
        )

        # Swap in the fine-tuned head inputs at the last position only.
        for head in position_trans_curr_layer:
            inp[:, -1, head] = g_cache[:, -1, head]

        inp = rearrange(
            inp,
            "batch seq_len n_heads d_head -> batch seq_len (n_heads d_head)",
            n_heads=n_heads,
        )

        # Recompute the o_proj output from the patched input.
        w_o = llama_model.state_dict()[f"{layer}.weight"]
        output = einsum(
            inp, w_o, "batch seq_len hidden_size, d_model hidden_size -> batch seq_len d_model"
        )

    return output

In [30]:
# Evaluate LLaMA with `patch_inputs` installed as the output-editing hook on
# every module in `llama_modules`, counting how often the arg-max token at
# each example's last-token index equals its label.
correct_count, total_count = 0, 0
with torch.no_grad():
    # A single tqdm around the dataloader: wrapping enumerate() in a second
    # tqdm only produced an extra progress bar with no total.
    for bi, inputs in enumerate(tqdm(dataloader)):
        for k, v in inputs.items():
            if v is not None and isinstance(v, torch.Tensor):
                inputs[k] = v.to(llama_model.device)

        with TraceDict(
            llama_model,
            llama_modules,
            retain_input=True,
            # Bind the batch index so the hook can look up its cached tensors.
            edit_output=partial(
                patch_inputs,
                bi=bi,
            ),
        ) as _:
            outputs = llama_model(inputs["input_ids"])

        # Use a separate index name so the batch index `bi` is not shadowed.
        for example_idx in range(inputs["labels"].size(0)):
            label = inputs["labels"][example_idx]
            pred = torch.argmax(
                outputs.logits[example_idx][inputs["last_token_indices"][example_idx]]
            )

            if label == pred:
                correct_count += 1
            # else:
            #     print(f"Label: {tokenizer.decode(label)}, Prediction: {tokenizer.decode(pred)}")
            total_count += 1

        # Release the logits before the next batch to limit GPU memory use.
        del outputs
        torch.cuda.empty_cache()

  0%|          | 0/63 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
  2%|▏         | 1/63 [00:00<00:26,  2.34it/s]
  3%|▎         | 2/63 [00:00<00:25,  2.36it/s]
  5%|▍         | 3/63 [00:01<00:25,  2.36it/s]
  6%|▋         | 4/63 [00:01<00:24,  2.36it/s]
  8%|▊         | 5/63 [00:02<00:24,  2.37it/s]
 10%|▉         | 6/63 [00:02<00:24,  2.37it/s]
 11%|█         | 7/63 [00:03<00:28,  1.99it/s]
 13%|█▎        | 8/63 [00:03<00:26,  2.09it/s]
 14%|█▍        | 9/63 [00:04<00:24,  2.17it/s]
 16%|█▌        | 10/63 [00:04<00:23,  2.23it/s]
 17%|█▋        | 11/63 [00:04<00:22,  2.27it/s]
 19%|█▉        | 12/63 [00:05<00:22,  2.30it/s]
 21%|██        | 13/63 [00:05<00:21,  2.31it/s]
 22%|██▏       | 14/63 [00:06<00:21,  2.33it/s]
 24%|██▍       | 15/63 [00:06<00:20,  2.34it/s]
 25%|██▌       | 16/63 [00:07<00:19,  2.35it/s]
 27%|██▋       | 17/63 [00:07<00:19,  2.36it/s]
 29%|██▊       | 18/63 [00:07<00:19,  2.36it/s]
 30%|███       | 19/63 [00:08<00:18,  2.37it/s]
 32%|███▏      | 20/63 [00:08<00:18,

In [31]:
# Display the ratio correct_count / total_count rounded to two decimals.
round(correct_count/total_count, 2)

0.69