In [1]:
from tqdm import tqdm
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
)
from transformers import default_data_collator, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForSeq2Seq
import torch.nn as nn
from short_hf import ShortHFModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import argparse
import shutil
import json
import os
from data import get_preprocessed_samsum, create_peft_config, get_alpaca_small, get_medical_dataset
from trl import SFTTrainer
from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training

from transformers import default_data_collator, Trainer, TrainingArguments

from short_hf import ShortHFModel
from transformers import AutoModelForCausalLM
import argparse
import shutil
import json
import os
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


[2024-10-04 01:53:45,724] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/rdutt/miniconda3/envs/anole/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




In [2]:
MODEL_PATH = "/nfs/ukrc_roma_ait/models/Pruned_Models/Pruned_llama3_dataset_sec-data-mini_layers_10"
num_layers_to_prune = 10

torch_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

short_model = ShortHFModel(
            model_name=MODEL_PATH,
            layers_path="model.layers",
            n_prune_layers=num_layers_to_prune,
            bnb_config=bnb_config,
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.10s/it]


In [3]:
peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=8 * 2,
        lora_dropout=0.05,
        # target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
        target_modules = ["q_proj", "v_proj"]
    )

peft_model = get_peft_model(short_model.model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 2,342,912 || all params: 5,851,484,160 || trainable%: 0.0400


In [4]:
tokenizer = short_model.tokenizer
peft_model = prepare_model_for_kbit_training(peft_model)

## Example Generation without Healing

In [5]:
eval_prompt = """
Summarize this dialog:
A: Hi Tom, are you busy tomorrow's afternoon?
B: I'm pretty sure I am. What's up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we've discussed it many times. I think he's ready now.
B: That's good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he'd name it after his dead hamster - Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

peft_model.eval()
with torch.no_grad():
    print(tokenizer.decode(peft_model.generate(**model_input, max_new_tokens=100, use_cache=True)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



Summarize this dialog:
A: Hi Tom, are you busy tomorrow's afternoon?
B: I'm pretty sure I am. What's up?
A: Can you go with me to the animal shelter?.
B: What do you want to do?
A: I want to get a puppy for my son.
B: That will make him so happy.
A: Yeah, we've discussed it many times. I think he's ready now.
B: That's good. Raising a dog is a tough issue. Like having a baby ;-) 
A: I'll get him one of those little dogs.
B: One that won't grow up too big;-)
A: And eat too much;-))
B: Do you know which one he would like?
A: Oh, yes, I took him there last Monday. He showed me one that he really liked.
B: I bet you had to drag him away.
A: He wanted to take it home right away ;-).
B: I wonder what he'll name it.
A: He said he'd name it after his dead hamster - Lemmy  - he's  a great Motorhead fan :-)))
---
Summary:
 a "miffiffy'''.



## Model Healing

In [6]:
def get_preprocessed_samsum():
    dataset = load_dataset("samsum", split="train")

    prompt = (
        f"Summarize this dialog:\n{{dialog}}\n---\nSummary:\n"
    )

    def apply_prompt_template(sample):
        return {
            "prompt": prompt.format(dialog=sample["dialogue"]),
            "summary": sample["summary"],
        }

    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))

    def tokenize_add_label(sample):
        prompt = tokenizer.encode(tokenizer.bos_token + sample["prompt"], add_special_tokens=False)
        summary = tokenizer.encode(sample["summary"] +  tokenizer.eos_token, add_special_tokens=False)
        sample = {
            "input_ids": prompt + summary,
            "attention_mask" : [1] * (len(prompt) + len(summary)),
            "labels": [-100] * len(prompt) + summary,
            }

        return sample

    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))

    return dataset

In [7]:
peft_model.train()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=Fa

In [8]:
output_dir = "tmp/"

config = {
    'lora_config': peft_config,
    'learning_rate': 1e-5,
    'num_train_epochs': 5,
    'gradient_checkpointing': False,
}

In [None]:
training_args = TrainingArguments(
    output_dir="tmp/",
    overwrite_output_dir=True,
    # logging strategies
    logging_strategy="steps",
    logging_steps=30,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    save_strategy="epoch",
    optim="adamw_torch_fused",
    **{k:v for k,v in config.items() if k != 'lora_config'}
)

# Create Trainer instance
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=get_preprocessed_samsum(),
    data_collator=default_data_collator,
    callbacks=[],
)

# Start training
trainer.train()



`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
30,7.0531
60,7.2424
90,6.5429
120,6.7869
150,6.7229
180,6.7177
210,6.8422
240,7.1089
270,6.8738
300,6.3758




In [None]:
peft_model.eval()
with torch.no_grad():
    print(tokenizer.decode(peft_model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))