In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

from args import TrainingArguments, DataTrainingArguments, ArgumentParser

from arithmetics import PromptArithmeticsConfig

from peft import get_peft_model

from tasks import AutoTask

from trl import SFTTrainer, SFTConfig

from tqdm import tqdm

import numpy as np

from sklearn.metrics import accuracy_score

In [2]:
parser = ArgumentParser(
    (TrainingArguments, DataTrainingArguments, PromptArithmeticsConfig)
)

training_args, data_args, pt_args = parser.parse_toml_file("./configs/prompt_tuning/single-task/llama31_8b_instruct.toml")



In [3]:
model = AutoModelForCausalLM.from_pretrained(training_args.model_name_or_path, torch_dtype=torch.bfloat16).to("cuda")
model.active_adapters = ["default"] # fix because llama has some active adapters for some reason
model = get_peft_model(model, peft_config=pt_args)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(data_args.data_tokenizer_name_or_path, trust_remote_code=True, padding_side="left")
tokenizer.add_special_tokens({"pad_token": "<|reserved_special_token_0|>"})
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [5]:
model.print_trainable_parameters()

trainable params: 409,600 || all params: 8,030,670,848 || trainable%: 0.0051


In [6]:
prompt_before = model.prompt_encoder.default.embedding.weight.clone()
prompt_before

tensor([[ 1.2246,  0.0173, -0.0927,  ..., -0.5116,  0.9227, -0.2867],
        [-0.4179, -0.0058, -1.9487,  ...,  1.2422,  0.0240, -0.3710],
        [ 1.0343,  0.6044, -0.4714,  ...,  1.4029,  1.2548, -0.0041],
        ...,
        [ 1.5220,  1.5922,  2.2478,  ..., -0.7569, -0.2760,  0.0494],
        [ 0.1840, -1.7919,  0.4253,  ...,  2.0103,  0.8831,  0.7768],
        [-1.7413, -0.0052,  1.0741,  ..., -1.4942, -0.9852,  0.5540]],
       device='cuda:0', grad_fn=<CloneBackward0>)

In [7]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device="cuda"
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

In [8]:
messages = [{"role": "user", "content": "Say only hello"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])



<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Say only hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hello.


In [9]:
train_dataset = AutoTask.get("qnli_text_instruct").get(split="train", task_type="CAUSAL_LM", add_prefix=False, n_obs=3000, split_validation_test=True)
valid_dataset = AutoTask.get("qnli_text_instruct").get(split="validation", task_type="CAUSAL_LM", add_prefix=False, n_obs=500, split_validation_test=True)
test_dataset = AutoTask.get("qnli_text_instruct").get(split="test", task_type="CAUSAL_LM", add_prefix=False, n_obs=500, split_validation_test=True)

Running qnli_text_instruct_preprocessor on dataset:   0%|          | 0/3000 [00:00<?, ? examples/s]

Running qnli_text_instruct_preprocessor on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Running qnli_text_instruct_preprocessor on dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
print(train_dataset["content"][0])

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: What is the name of a former Asian Portuguese colony?
sentence: The country has a tiny Chinese population.
label: 


In [11]:
print(valid_dataset["content"][0])

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: How long was Sanskrit a language of culture?
sentence: Buddhist Hybrid Sanskrit is a literary language heavily influenced by the Middle Indo-Aryan languages, based on early Buddhist Prakrit texts which subsequently assimilated to the Classical Sanskrit standard in varying degrees.
label: 


In [12]:
print(test_dataset["content"][0])

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: How big are phycobilisomes?
sentence: Phycobilins come in all colors, though phycoerytherin is one of the pigments that makes many red algae red.
label: 


In [13]:
messages = [{"role": "user", "content": test_dataset["content"][0]}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=16, do_sample=False, top_p=None, temperature=None)
print(outputs[0]["generated_text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: How big are phycobilisomes?
sentence: Phycobilins come in all colors, though phycoerytherin is one of the pigments that makes many red algae red.
label:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

not entailment


In [14]:
def apply_test_template(examples):
    return {"text": tokenizer.apply_chat_template([examples], tokenize=False, add_generation_prompt=True)}

def apply_template(examples):
    return {"text": tokenizer.apply_chat_template([examples, {"role": "assistant", "content": examples["target"]}], tokenize=False, add_generation_prompt=False)}

chat_train_dataset = train_dataset.map(apply_template)
chat_valid_dataset = valid_dataset.map(apply_template)
chat_test_dataset = test_dataset.map(apply_test_template)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [19]:
print(chat_train_dataset["text"][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: What is the name of a former Asian Portuguese colony?
sentence: The country has a tiny Chinese population.
label:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

not entailment<|eot_id|>


In [20]:
print(chat_valid_dataset["text"][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: How long was Sanskrit a language of culture?
sentence: Buddhist Hybrid Sanskrit is a literary language heavily influenced by the Middle Indo-Aryan languages, based on early Buddhist Prakrit texts which subsequently assimilated to the Classical Sanskrit standard in varying degrees.
label:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

not entailment<|eot_id|>


In [21]:
print(chat_test_dataset["text"][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Classify the question and sentence pair into labels: entailment, not entailment. Reply only the corresponding label.
question: How big are phycobilisomes?
sentence: Phycobilins come in all colors, though phycoerytherin is one of the pigments that makes many red algae red.
label:<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [22]:
def predict(test_dataset, model, tokenizer, labels_list):
    y_pred = []
    pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=16, do_sample=False, top_p=None, temperature=None, device="cuda")

    for x_test in tqdm(test_dataset["text"]):

        result = pipe(x_test)
        answer = result[0]["generated_text"].split("label:<|eot_id|><|start_header_id|>assistant<|end_header_id|>")[-1].strip()

        for label in labels_list:
            if label.lower() == answer.lower():
                y_pred.append(label)
                break
        else:
            y_pred.append("none")
    
    return y_pred

In [23]:
y_pred = predict(chat_test_dataset, model, tokenizer, AutoTask.get("qnli_text_instruct").labels_list)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

In [24]:
def evaluate(y_pred, y_true, mapping):
    def map_func(x):
        return mapping.get(x, -1)

    print(mapping)

    y_pred_mapped = np.vectorize(map_func)(y_pred)
    y_true_mapped = np.vectorize(map_func)(y_true)
    
    # print(y_pred_mapped, y_true_mapped)

    accuracy = accuracy_score(y_pred=y_pred_mapped, y_true=y_true_mapped)
    print(accuracy)


In [25]:
id2label = AutoTask.get("qnli_text_instruct").id2label
evaluate(y_pred, test_dataset["target"], {label: id_ for id_, label in id2label.items()})

{'entailment': 0, 'not entailment': 1}
0.774


In [26]:
training_arguments = SFTConfig(
    output_dir="llama31_test",                    
    num_train_epochs=10,                       
    per_device_train_batch_size=4,                            
    optim="adamw_torch",
    logging_steps=1,                         
    learning_rate=2e-3,                       
    weight_decay = 1e-5,
    bf16=True,                     
    warmup_ratio=0.03,                        
    group_by_length=False,
    lr_scheduler_type="cosine",              
    report_to="wandb",                  
    eval_strategy="steps",              
    eval_steps = 0.2,
    save_steps = 0.2,
    dataset_text_field="text",
    load_best_model_at_end = True,
    max_seq_length=512,
)


trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=chat_train_dataset,
    eval_dataset=chat_valid_dataset,
    tokenizer=tokenizer,
    packing=False,

)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]



In [27]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrobert-belanec[0m ([33mrbelanec[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
1500,1.4937,1.148408
3000,1.1493,1.054091
4500,0.97,1.038474
6000,1.2403,1.031534
7500,1.0999,1.030433


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


TrainOutput(global_step=7500, training_loss=1.1435748054981232, metrics={'train_runtime': 3550.522, 'train_samples_per_second': 8.449, 'train_steps_per_second': 2.112, 'total_flos': 1.846011538118738e+17, 'train_loss': 1.1435748054981232, 'epoch': 10.0})

In [33]:
id2label = AutoTask.get("qnli_text_instruct").id2label
evaluate(predict(chat_test_dataset, model, tokenizer, AutoTask.get("qnli_text_instruct").labels_list), test_dataset["target"], {label: id_ for id_, label in id2label.items()})

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

{'entailment': 0, 'not entailment': 1}
0.774


In [35]:
model.print_trainable_parameters()

trainable params: 409,600 || all params: 8,030,670,848 || trainable%: 0.0051


In [31]:
prompt_after = model.prompt_encoder.default.embedding.weight

In [32]:
prompt_before - prompt_after

tensor([[ 0.4724,  0.5192, -0.3345,  ...,  0.0813,  0.0768, -0.1397],
        [ 0.0799,  0.2757, -0.2964,  ...,  0.3699, -0.0523, -0.2577],
        [ 0.0336,  0.2249, -0.0748,  ...,  0.3830, -0.4427, -0.4188],
        ...,
        [ 0.1433,  0.1526, -0.0582,  ...,  0.1194,  0.1675, -0.0221],
        [ 0.1644,  0.0326, -0.1464,  ...,  0.1095,  0.0710, -0.1145],
        [-0.2023, -0.0108,  0.1510,  ...,  0.2670, -0.1228, -0.2104]],
       device='cuda:0', grad_fn=<SubBackward0>)