In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so files stored there are reachable
# from the Colab runtime (the recorded output confirms the mount point).
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the SumeCzech folder from Drive into /content; later cells read it via
# the relative path "sumeczech/..." (assumes the working directory is /content — confirm).
!cp -r "/content/drive/MyDrive/Colab Notebooks/sumeczech" /content

In [None]:
!pip install -U bitsandbytes
!pip install transformers # ==4.36.2
!pip install -U peft
!pip install -U accelerate
!pip install -U trl
!pip install dataset # s==2.16.0
!pip install sentencepiece

In [2]:
# Show the GPU assigned to this runtime together with its memory and utilisation.
!nvidia-smi

Sat Apr 20 08:41:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from datasets import load_dataset

# Read the SumeCzech dev JSONL file with the generic "json" loader. A single
# data file ends up in the "train" split, which is requested explicitly here.
dataset = load_dataset(
    "json",
    name="SumeCzech",
    split="train",
    data_files="sumeczech/sumeczech-1.0-dev.jsonl",
)

In [4]:
# Use only the first 10000 examples to keep the fine-tuning run short.
# The upper bound is clamped to the dataset size so that a smaller input file
# yields "all rows" instead of an out-of-range index error.
dataset = dataset.select(range(min(10000, len(dataset))))

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from accelerate import Accelerator
from trl import SFTTrainer
from datasets import Dataset

In [6]:
def formatting_prompts_func(example):
    """
    Turn one raw SumeCzech record into a single instruction-style training string.

    One of three tasks (text -> abstract, text -> headline, abstract -> headline)
    is drawn uniformly at random from torch's global RNG, and only the two
    fields needed for the drawn task are read from `example`.

    Returns a dict with the single key "text" holding the rendered prompt.
    """
    # (task label, field used as input, field used as expected output)
    task_specs = (
        ('TEXT2ABSTRACT', 'text', 'abstract'),
        ('TEXT2HEADLINE', 'text', 'headline'),
        ('ABSTRACT2HEADLINE', 'abstract', 'headline'),
    )

    # Index 0, 1 or 2 -- one draw per example.
    task_index = torch.randint(0, 3, (1,)).item()
    task_label, source_field, target_field = task_specs[task_index]

    rendered = '<s>[INST]@SumeCzech {type}.\n{in_text}[/INST]{out}</s>'.format(
        type=task_label,
        in_text=example[source_field],
        out=example[target_field],
    )
    return {"text": rendered}

def print_trainable_parameters(model):
    """
    Print how many of the model's parameters require gradients.

    Walks `model.named_parameters()` once and prints a single line with the
    trainable element count, the total element count and the trainable share
    in percent. Returns nothing.
    """
    # (element count, requires_grad) for every parameter tensor
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    share = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {share}")

In [7]:
# Replace every record with a single "text" field holding the formatted prompt;
# all original columns are dropped via remove_columns.
# NOTE(review): formatting_prompts_func draws the task from torch's global RNG and
# no seed is set in this notebook, so the task mix is presumably not reproducible
# across runs; with num_proc=4 the workers may also share the same RNG state — confirm.
dataset = dataset.map(
    formatting_prompts_func,
    remove_columns=dataset.column_names,
    num_proc=4,
    batched=False,  # the formatter handles one example at a time
)

In [8]:
# Spot-check a single formatted record to verify the prompt layout.
dataset[69]

{'text': '<s>[INST]@SumeCzech ABSTRACT2HEADLINE.\nČeský střelec Jan Sychra měl nadějně rozjetý závod, přesto skončil šestnáctý.[/INST]Konce kvalifikací stály skeetaře Sychru finále</s>'}

In [9]:
# Hub id of the checkpoint to fine-tune and the local name for the result.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
new_model = "Mistral-7B-Instruct-v0.2-ft-SumeCzech"

import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never embed the access token in the notebook. The token that used to
# be hardcoded here must be treated as leaked and revoked in the Hugging Face
# account settings. Supply a fresh one through the HF_TOKEN environment variable,
# or type it at the prompt (the input is not echoed and not stored in the file).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
# Alternative 4-bit (QLoRA-style) loading path for Colab testing, kept disabled:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit= True,
#     bnb_4bit_quant_type= "nf4",
#     bnb_4bit_compute_dtype= torch.float16,
#     bnb_4bit_use_double_quant= False,
# )
# # quantized model load for Colab testing
# model = AutoModelForCausalLM.from_pretrained(
#         base_model,
#         quantization_config=bnb_config,
#         torch_dtype=torch.bfloat16,
#         device_map="auto",
#         trust_remote_code=True,
# )

# Active path: load the base checkpoint unquantized in bfloat16 and let
# device_map="auto" decide the placement of the weights.
# NOTE(review): the nvidia-smi output recorded above shows a Tesla T4 — confirm that
# bfloat16 and an unquantized 7B model are appropriate for that GPU.
# NOTE(review): trust_remote_code=True permits running code shipped with the model
# repo; presumably unnecessary for this checkpoint — confirm and drop if so.
model = AutoModelForCausalLM.from_pretrained(
    base_model, 
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Training-time model settings: disable the generation cache and turn on
# gradient checkpointing.
model.config.use_cache = False
model.config.pretraining_tp = 1  # presumably only meaningful for Llama-style configs — confirm
model.gradient_checkpointing_enable()

# Extra preparation step needed only with the quantized (QLoRA) Colab path:
# model = prepare_model_for_kbit_training(model)

# Tokenizer setup (original note: works around an fp16 issue): pad on the right,
# reuse EOS as the padding token, and ask the tokenizer to append EOS.
# NOTE(review): the prompt template already contains literal "<s>" and "</s>", so
# together with add_eos_token (and a default BOS, if enabled) sequences may end up
# with duplicated special tokens — verify on a tokenized sample.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# Last expression: display the BOS/EOS strings as the cell output.
tokenizer.bos_token, tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

('<s>', '</s>')

In [11]:
# LoRA adapter settings; values marked TODO are candidates for later tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,  # TODO: may need tuning
    lora_alpha=64,  # TODO: may need tuning
    lora_dropout=0.1,
    bias="none",
    # Modules that receive adapters. TODO: consider adding further modules.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
)
# Attach the adapters to the base model loaded earlier.
model = get_peft_model(model, peft_config)

In [14]:
# Hyperparameters for the supervised fine-tuning run. Checkpoints go to
# ./results every 50 steps, the loss is logged every 5 steps to TensorBoard.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant";
# the plain constant schedule presumably ignores warmup ("constant_with_warmup"
# would apply it) — confirm against the transformers scheduler docs.
# NOTE(review): fp16 and bf16 are both False although the model is loaded in
# bfloat16 — confirm this is the intended precision setup.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1, # TODO: see how many epochs are actually needed
    per_device_train_batch_size=4, # TODO: may need tuning
    gradient_accumulation_steps=1, # TODO: may need tuning
    optim="paged_adamw_32bit",
    save_steps=50,
    logging_steps=5,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# Build the SFT trainer over the single "text" column produced by the formatting
# step, without packing several examples into one sequence.
# NOTE(review): `model` was already wrapped by get_peft_model in the previous cell
# and peft_config is passed again here — confirm the trainer does not wrap twice.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= None, # TODO: should probably fit a whole article, i.e. at least ~3k tokens; with None the library default applies (presumably much shorter, which would truncate the target at the end of long prompts — confirm)
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    dataset_text_field="text",
    # neftune_noise_alpha=5, should improve the performance but needs to be tested
)



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
# Sanity check before training: report the share of trainable parameters.
print_trainable_parameters(trainer.model)

print("Training...")
# Run the fine-tuning loop; the recorded run was stopped manually
# (KeyboardInterrupt in the output), so it did not complete an epoch.
trainer.train()

trainable params: 92274688 || all params: 3844345856 || trainable%: 2.4002702008713337
Training...




Step,Training Loss
10,1.9096


KeyboardInterrupt: 

In [16]:
# Write the trainer's model and tokenizer into the local `new_model` directory.
# NOTE(review): for a PEFT-wrapped model this presumably stores only the adapter
# weights rather than a merged full model — confirm before using it for inference.
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('Mistral-7B-Instruct-v0.2-ft-SumeCzech/tokenizer_config.json',
 'Mistral-7B-Instruct-v0.2-ft-SumeCzech/special_tokens_map.json',
 'Mistral-7B-Instruct-v0.2-ft-SumeCzech/tokenizer.model',
 'Mistral-7B-Instruct-v0.2-ft-SumeCzech/added_tokens.json',
 'Mistral-7B-Instruct-v0.2-ft-SumeCzech/tokenizer.json')

In [None]:
from tensorboard import notebook

# Start TensorBoard inside the notebook on port 4000, pointed at the run
# logs under results/runs.
log_dir = "results/runs"
notebook.start(f"--logdir {log_dir} --port 4000")

In [None]:
# `logging` here is the module imported from transformers: only CRITICAL
# messages are shown from now on, keeping later cells quiet.
logging.set_verbosity(logging.CRITICAL)
# TODO: eval and test inference