In [None]:
!pip install -U bitsandbytes
!pip install transformers # ==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install dataset # s==2.16.0
!pip install sentencepiece

In [None]:
# Run the `nvidia-smi` command-line tool and show its output in the cell.
!nvidia-smi

In [6]:
from datasets import load_dataset

# Read the SumeCzech JSON-lines file (the "dev" file, judging by its name)
# with the generic "json" loader; the rows are requested as the "train" split.
dataset = load_dataset(
    "json",
    name="SumeCzech",
    data_files="data/sumeczech/sumeczech-1.0-dev.jsonl",
    split="train",
)

In [18]:
# Use only the first 10000 examples. The bound is capped at the dataset size
# so the cell also works on a file with fewer rows (an out-of-range index
# passed to `select` raises instead of being clipped).
dataset = dataset.select(range(min(10000, len(dataset))))
# Single example kept aside for manual inspection. Despite the name this is
# the fixed row 69, not a random draw.
random_sample = dataset[69]

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from accelerate import Accelerator
from trl import SFTTrainer
from datasets import Dataset

In [21]:
def formatting_prompts_func(example):
    """
    Prepare the input text for the model.

    For every row one of three tasks is drawn uniformly at random with
    `torch.randint` (so the draw follows the torch RNG state):

    * TEXT2ABSTRACT      -- 'text'     -> 'abstract'
    * TEXT2HEADLINE      -- 'text'     -> 'headline'
    * ABSTRACT2HEADLINE  -- 'abstract' -> 'headline'

    Args:
        example: mapping with the keys 'text', 'abstract' and 'headline'.
            Either a single row (values are strings) or a batch as produced
            by a batched `Dataset.map` (values are equally long lists).

    Returns:
        A single prompt string for a single row, or a list of prompt strings
        (one per row, in order) for a batch.
    """
    prompt_template = '<s>[INST]@SumeCzech {type}.\n{in_text}[/INST]{out}</s>'

    # (task name, input column, output column); the index matches the drawn number.
    tasks = (
        ('TEXT2ABSTRACT', 'text', 'abstract'),
        ('TEXT2HEADLINE', 'text', 'headline'),
        ('ABSTRACT2HEADLINE', 'abstract', 'headline'),
    )

    def build_prompt(row):
        # make random choice between the three options
        task_index = torch.randint(0, len(tasks), (1,)).item()
        type_task, in_key, out_key = tasks[task_index]
        return prompt_template.format(type=type_task, in_text=row[in_key], out=row[out_key])

    # A batched call hands over columns (lists) instead of scalar strings;
    # formatting the lists directly would embed their repr in the prompt.
    if isinstance(example['text'], (list, tuple)):
        columns = ('text', 'abstract', 'headline')
        rows = zip(*(example[column] for column in columns))
        return [build_prompt(dict(zip(columns, values))) for values in rows]

    return build_prompt(example)

def print_trainable_parameters(model):
    """
    Report how many of the model's parameters are trainable.

    Walks `model.named_parameters()` once, records the element count and the
    `requires_grad` flag of every parameter, and prints the trainable total,
    the overall total and the trainable share in percent.
    """
    sizes = [(tensor.numel(), tensor.requires_grad) for _, tensor in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    percentage = 100 * trainable_params / all_param
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

In [None]:
# Identifier of the pretrained checkpoint handed to `from_pretrained` below.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
# Target passed to `save_pretrained` for the fine-tuned model and tokenizer.
new_model = "Mistral-7B-Instruct-v0.2-ft-SumeCzech"

In [None]:
# Load the base model and its tokenizer. The device map sends the whole model
# (key "") to the device index of the current Accelerate process.
model = AutoModelForCausalLM.from_pretrained(base_model, device_map={"": Accelerator().local_process_index},)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Training-time model settings: no generation cache, and gradient
# checkpointing switched on.
# NOTE(review): `pretraining_tp` is presumably carried over from Llama recipes — confirm it matters for Mistral.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Tokenizer padding setup (originally noted as a workaround for an fp16
# issue): pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the prompt template in formatting_prompts_func already ends
# with "</s>"; with add_eos_token enabled EOS may be added twice — confirm.
tokenizer.add_eos_token = True
# Last expression of the cell: displays the BOS and EOS tokens.
tokenizer.bos_token, tokenizer.eos_token

In [None]:


# LoRA configuration for causal-LM fine-tuning: rank 64, alpha 64, dropout 0.1,
# no bias training, applied to the modules named q_proj, k_proj, v_proj,
# o_proj and gate_proj.
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
)
# Wrap the loaded model with the adapters described by `peft_config`.
# NOTE(review): `model` is rebound here, so re-running this cell passes an
# already wrapped model to get_peft_model again.
model = get_peft_model(model, peft_config)

In [None]:
# Hyperparameters
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    formatting_func=formatting_prompts_func,
    # neftune_noise_alpha=5, should improve the performance but needs to be tested
)

In [None]:
# Report trainable vs. total parameter counts of the model held by the trainer.
print_trainable_parameters(trainer.model)

print("Training...")
# Run training with the arguments configured on the trainer.
trainer.train()

In [None]:
# Save the trained model and the tokenizer under the `new_model` name.
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

In [None]:
from tensorboard import notebook

# Start TensorBoard from the notebook on port 4000, pointed at the run logs
# below the trainer's output directory ("./results").
log_dir = "results/runs"
notebook.start(f"--logdir {log_dir} --port 4000")

In [24]:
# Limit the transformers logger (imported above as `logging`) to CRITICAL messages.
logging.set_verbosity(logging.CRITICAL)
# TODO: eval and test inference