In [1]:
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, DataCollatorForLanguageModeling, GenerationConfig, pipeline, Seq2SeqTrainer, default_data_collator
from args import TrainingArguments, DataTrainingArguments, ArgumentParser

from peft import get_peft_model

from arithmetics import PromptArithmeticsConfig

from tasks import Preprocessor

from metrics import exact_match

# from safetensors import safe_open

In [2]:
parser = ArgumentParser(
    (TrainingArguments, DataTrainingArguments, PromptArithmeticsConfig)
)

training_args, data_args, pt_args = parser.parse_toml_file("./configs/prompt_tuning/single-task/llama3_8b.toml")



In [3]:
model = AutoModelForCausalLM.from_pretrained(training_args.model_name_or_path, torch_dtype=torch.bfloat16).to("cuda")
model.generation_config.max_new_tokens = 16
model.generation_config.max_length = 256

# model = get_peft_model(model, peft_config=pt_args)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model.active_adapters

<bound method PeftAdapterMixin.active_adapters of LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): Lla

In [10]:
tokenizer = AutoTokenizer.from_pretrained(data_args.data_tokenizer_name_or_path, trust_remote_code=True, padding_side="left")
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
preprocessor = Preprocessor(
            [data_args.dataset_names[0]], data_args, training_args, pt_args, tokenizer
        )

train_dataset, valid_dataset, test_dataset = preprocessor.get_data()

Max target lengths: [5]


Running qnli_text_preprocessor on dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Running preprocess_function on train_dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Running qnli_text_preprocessor on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running preprocess_function on valid_dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running qnli_text_preprocessor on dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Running preprocess_function on test_dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
print(train_dataset[0]["input_ids"].count(128001))
print(train_dataset[0]["input_ids"].count(128002))
print(train_dataset[0]["attention_mask"].count(0))
print(train_dataset[6]["labels"])
print(tokenizer.decode(train_dataset[6]["input_ids"]))
print(tokenizer.decode(train_dataset[10]["labels"][-4:]))
print(tokenizer.decode(train_dataset[10]["input_ids"][-4:]))
print(train_dataset[6]["labels"][-3:])
print(train_dataset[6]["input_ids"][-3:])
print(train_dataset[1]["labels"].count(-100))

1
199
199
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [38]:
train_dataset["labels"][1][-6:],  train_dataset["attention_mask"][1][-6:], train_dataset["input_ids"][0][-6:], tokenizer.decode(train_dataset["labels"][0][-4:]), tokenizer.decode(train_dataset["input_ids"][0][-4:])

([-100, -100, 306, 607, 479, 128001],
 [1, 1, 1, 1, 1, 1],
 [220, 1962, 28525, 607, 479, 128001],
 '_entailment<|end_of_text|>',
 '_entailment<|end_of_text|>')

In [7]:
print(tokenizer.decode(train_dataset[1]["input_ids"]))

print(train_dataset[0]["attention_mask"].count(0))
print(train_dataset[0]["attention_mask"][199])
print(train_dataset[0]["input_ids"][199])

print(len(train_dataset[0]["labels"]), len(train_dataset[0]["input_ids"]), len(train_dataset[0]["attention_mask"]))

<|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_t

In [8]:
print(len(list(valid_dataset.values())[0][0]["labels"]), len(list(valid_dataset.values())[0][0]["input_ids"]), len(list(valid_dataset.values())[0][0]["attention_mask"]))

261 261 261


In [19]:
origin_prompt = "origin_0_meta-llama-3-8b"
model.prompt_encoder.default.embedding.weight = torch.nn.Parameter(torch.load(f"saves/{origin_prompt}/{origin_prompt}.bin")["prompt_embeddings"].to("cuda"))

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    decoded_preds = [dpred.split("label: ")[1] for dpred in  tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    print("preds:", decoded_preds)
    print("labels:", decoded_labels)

    return exact_match(decoded_preds, decoded_labels)

trainer = Seq2SeqTrainer(
                    model=model,
                    tokenizer=tokenizer,
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=list(valid_dataset.values())[0],
                    data_collator=default_data_collator,
                    # compute_metrics=compute_metrics,
                )

trainer.train()



Step,Training Loss,Validation Loss
100,1.8985,0.28786
200,0.218,0.139522
300,0.0933,0.007515
400,0.0134,0.000864
500,0.0044,0.00286
600,0.0008,0.007813
700,0.0,0.002173
800,0.0,0.002128
900,0.0,0.001944
1000,0.0,0.002044


KeyboardInterrupt: 

In [24]:
example_input = test_dataset["qnli_text"][0]["input_ids"]
example_label =  test_dataset["qnli_text"][0]["labels"]
example_attn_mask = test_dataset["qnli_text"][0]["attention_mask"]

print("input:", tokenizer.decode(example_label, skip_special_tokens=True),tokenizer.decode(example_input, skip_special_tokens=True), )

outputs = model.generate(torch.tensor(example_input).unsqueeze(-1).reshape(1, -1).to("cuda"), attention_mask=torch.tensor(example_attn_mask).unsqueeze(-1).reshape(1, -1).to("cuda"))

print("output:", tokenizer.decode(outputs[0], skip_special_tokens=True))

input: entailment qnli question: When did Tesla make the induction motor? sentence: One of the things Tesla developed at that laboratory in 1887 was an induction motor that ran on alternating current, a power system format that was starting to be built in Europe and the United States because of its advantages in long-distance, high-voltage transmission. label: 
output: qnli question: When did Tesla make the induction motor? sentence: One of the things Tesla developed at that laboratory in 1887 was an induction motor that ran on alternating current, a power system format that was starting to be built in Europe and the United States because of its advantages in long-distance, high-voltage transmission. label: not_entailment


In [33]:
c_ent = 0
for label in train_dataset["labels"]:
    if tokenizer.decode([l if l != -100 else tokenizer.pad_token_id for l in label], skip_special_tokens=True) == "not":
        c_ent += 1

print(c_ent)

0


In [42]:
model.generation_config.pad_token_id = tokenizer.pad_token_id

correct = 0
total = len(test_dataset["qnli_text"])

for example in test_dataset["qnli_text"]:
    example_label = torch.tensor(example["labels"])
    example_input = torch.tensor(example["input_ids"])
    example_attn_mask = torch.tensor(example["attention_mask"])

    example_output = model.generate(torch.tensor(example_input).unsqueeze(-1).reshape(1, -1).to("cuda"), attention_mask=torch.tensor(example_attn_mask).unsqueeze(-1).reshape(1, -1).to("cuda"))[0]
    n_new_tokens = len(example_output) - len(example_input)

    decoded_output = tokenizer.decode(example_output[-n_new_tokens:], skip_special_tokens=True)
    decoded_label = tokenizer.decode(example_label, skip_special_tokens=True)

    if decoded_output == decoded_label:
        correct += 1

    print("output:", decoded_output , "label:", decoded_label)

  example_output = model.generate(torch.tensor(example_input).unsqueeze(-1).reshape(1, -1).to("cuda"), attention_mask=torch.tensor(example_attn_mask).unsqueeze(-1).reshape(1, -1).to("cuda"))[0]


output: not_entailment label: entailment
output: not_entailment label: entailment
output: not_entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: entailment
output: not_entailment label: entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: not_entailment
output: not_entailment label: entailment
output: not_entailment label: entailment
outpu

In [44]:
correct / total 

0.51