## Load from GDRIVE

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

In [1]:
!cp -r "/content/drive/MyDrive/Colab Notebooks/sumeczech" /content

cp: cannot stat '/content/drive/MyDrive/Colab Notebooks/sumeczech': No such file or directory


In [None]:
!pip uninstall einops -y
!pip uninstall triton -y
!pip uninstall flash-attn -y

In [None]:
!pip install --no-cache-dir --upgrade bitsandbytes
!pip install --no-cache-dir --upgrade peft
!pip install --no-cache-dir --upgrade accelerate
!pip install --no-cache-dir --upgrade trl
# !pip install --upgrade triton
# !pip install --upgrade flash-attn --no-build-isolation
!pip install --no-cache-dir dataset
!pip install --no-cache-dir sentencepiece

In [None]:
!pip install psutil

In [1]:
!nvidia-smi

Tue Apr 23 17:23:25 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:AF:00.0 Off |                    0 |
| N/A   34C    P0    69W / 400W |      0MiB / 40960MiB |      3%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Clean GPU

In [2]:
# # Run this to clean GPU memory
import torch
from numba import cuda
device = cuda.get_current_device()
device.reset()
torch.cuda.empty_cache()

## Load dataset & configure templates

In [3]:
from datasets import load_dataset

dataset = load_dataset("json",name="SumeCzech", data_files="./sumeczech/sumeczech-1.0-dev.jsonl", split="train")

In [4]:
# use only first 10000 examples
dataset = dataset.select(range(10000))

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, logging # 
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch
from accelerate import Accelerator
from trl import SFTTrainer
from datasets import Dataset

In [6]:
import random
prompts_task = {
    'TEXT2ABSTRACT': [
        'Shrňte následující text do abstraktu:',
        'Vytvořte stručný abstrakt pro daný text:',
        'Poskytněte abstrakt, který zachytí podstatu následujícího textu:',
        'Vygenerujte abstrakt, který shrnuje klíčové body textu:',
        'Složte abstrakt, který přesně reprezentuje hlavní myšlenky v textu:'
    ],
    'TEXT2HEADLINE': [
        'Vytvořte titulek, který přesně odráží hlavní myšlenku následujícího textu:',
        'Navrhněte stručný titulek, který zachytí podstatu textu:',
        'Vytvořte poutavý titulek, který shrnuje klíčové body textu:',
        'Navrhněte titulek, který efektivně reprezentuje ústřední téma textu:',
        'Vygenerujte titulek, který zahrne kritické informace v textu:'
    ],
    'ABSTRACT2HEADLINE': [
        'Na základě daného abstraktu poskytněte titulek, který zachytí podstatu:',
        'Vytvořte titulek, který přesně shrnuje klíčové body abstraktu:',
        'Vygenerujte stručný titulek, který efektivně reprezentuje ústřední téma abstraktu:',
        'Vyvinout titulek, který zahrne kritické informace v abstraktu:',
        'Navrhněte titulek, který přesně převádí hlavní předmět abstraktu:'
    ]
}


def formatting_prompts_func(example):
    """
    Prepare the input text for the model
    """
    # either Text to abstract, text to headline or abstract to headline
    prompt_template = '<s>[INST]@SumeCzech {type_task}.\n{in_text}[/INST]{out}</s>'

    # make random choice between the three options
    type = torch.randint(0, 3, (1,)).item()
    if type == 0:
        type_task = 'TEXT2ABSTRACT'
        in_text = example['text']
        out_text = example['abstract']
    elif type == 1:
        type_task = 'TEXT2HEADLINE'
        in_text = example['text']
        out_text = example['headline'] + '.' # hadam pochopi ze je koniec headline, cize 1 veta
    else:
        type_task = 'ABSTRACT2HEADLINE'
        in_text = example['abstract']
        out_text = example['headline'] + '.' # hadam pochopi ze je koniec headline, cize 1 veta
    prompt = random.choice(prompts_task[type_task])
    return {"text": prompt_template.format(type_task=prompt, in_text=in_text, out=out_text)}

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
# transform dataset so it has only field "text" with formatted prompts
dataset = dataset.map(
    formatting_prompts_func,
    remove_columns=dataset.column_names,
    num_proc=32,
    batched=False,
)

In [8]:
dataset[69]

{'text': '<s>[INST]@SumeCzech Vyvinout titulek, který zahrne kritické informace v abstraktu:.\nČeský střelec Jan Sychra měl nadějně rozjetý závod, přesto skončil šestnáctý.[/INST]Konce kvalifikací stály skeetaře Sychru finále.</s>'}

In [9]:
base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
new_model_path = "Mistral-7B-Instruct-v0.2-ft"

from huggingface_hub import login
login(token="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") # HF token TODO: zahodit do pice lebo public repo xd

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /storage/praha1/home/jurajdedic/.cache/huggingface/token
Login successful


## Load model & configure training 
- load model into the GPU
- confogire LORA

In [10]:
cuda.is_available()

True

In [11]:
# for collab testing
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit= True,
#     bnb_4bit_quant_type= "nf4",
#     bnb_4bit_compute_dtype= torch.float16,
#     bnb_4bit_use_double_quant= False,
# )
# # for collab testing
# model = AutoModelForCausalLM.from_pretrained(
#         base_model_name,
#         quantization_config=bnb_config,
#         torch_dtype=torch.bfloat16,
#         device_map="auto",
#         trust_remote_code=True,
# )

model = AutoModelForCausalLM.from_pretrained(
    base_model_name, 
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
print(tokenizer)

model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# qlora for collab test
# model = prepare_model_for_kbit_training(model)

# fix some fp16 issue
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
tokenizer.bos_token, tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.2', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


('<s>', '</s>')

In [12]:
# filter out all examples that are long af
dataset = dataset.filter(lambda x: len(tokenizer.encode(x["text"])) <= 6144, num_proc=32)
print(len(dataset))

9968


In [13]:
max_seq_len = max(len(tokenizer.encode(x["text"])) for x in dataset)
print(max_seq_len)

6104


In [14]:
# get avg len of the dataset for the model
avg_seq_len = sum(len(tokenizer.encode(x["text"])) for x in dataset) / len(dataset)
print(avg_seq_len)

867.68158105939


In [17]:
peft_config = LoraConfig(
    lora_alpha=16, # TODO: Mozno zmenit
    lora_dropout=0.01,
    r=16, # TODO: Mozno zmenit
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj", "embed_tokens"] #TODO: 
)
model = get_peft_model(model, peft_config)

In [18]:
# Hyperparameters
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1, # TODO: uvidime kolko bude stacit
    per_device_train_batch_size=4, # TODO: mozno zmenit
    gradient_accumulation_steps=1, # TODO: mozno zmenit
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.06,
    group_by_length=False,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length= 6144, # TODO: asi aby sa tam zmestil cely clanok cize imo aspon 3k ?
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
    dataset_text_field="text",
    # neftune_noise_alpha=5, should improve the performance but needs to be tested
)

## Train

In [19]:
print_trainable_parameters(trainer.model)

print("Training...")
trainer.train()

trainable params: 42520576 || all params: 7284252672 || trainable%: 0.583732853796316
Training...


Step,Training Loss
5,2.3906
10,2.1254
15,1.9044
20,1.9956
25,1.7764
30,1.9074
35,1.8306
40,1.7961
45,1.7177
50,1.7548




TrainOutput(global_step=2492, training_loss=1.6030917686214416, metrics={'train_runtime': 7133.6454, 'train_samples_per_second': 1.397, 'train_steps_per_second': 0.349, 'total_flos': 7.461882764826378e+17, 'train_loss': 1.6030917686214416, 'epoch': 1.0})

## Save the model

In [20]:
trainer.model.save_pretrained(new_model_path)
trainer.tokenizer.save_pretrained(new_model_path)



('Mistral-7B-Instruct-v0.2-ft/tokenizer_config.json',
 'Mistral-7B-Instruct-v0.2-ft/special_tokens_map.json',
 'Mistral-7B-Instruct-v0.2-ft/tokenizer.model',
 'Mistral-7B-Instruct-v0.2-ft/added_tokens.json',
 'Mistral-7B-Instruct-v0.2-ft/tokenizer.json')

In [None]:
from tensorboard import notebook
log_dir = "./results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

## Preparing for inference

- build prompt
- load the saved model

In [17]:
logging.set_verbosity(logging.CRITICAL)
def build_prompt(data, type_sum):
    if type_sum not in ['TEXT2ABSTRACT', 'TEXT2HEADLINE', 'ABSTRACT2HEADLINE']:
        raise ValueError('type_sum must be one of "TEXT2ABSTRACT", "TEXT2HEADLINE", "ABSTRACT2HEADLINE"')
    prompt_template = '<s>[INST]@SumeCzech {type}.\n{data}[/INST]'.format(type=type_sum, data=data)
    return prompt_template

In [11]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, 
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
lora_config = LoraConfig.from_pretrained(new_model_path)
new_model = get_peft_model(base_model_name, lora_config)

In [15]:
new_tokenizer = AutoTokenizer.from_pretrained(new_model_path)

In [None]:
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device_map="auto", max_length=256)
sum_data = "Kráter nad zavaleným tunelem v Mostech u Jablunkova začnou stavaři v pátek zavážet. Kvůli dvěma propadům se stavba tunelu na jedné z hlavních tratí spojujících Česko a Slovensko výrazně prodlouží. Evakuovaná rodina se do domu, který stojí jen pár desítek metrů od kráteru, vrátí, až firma provede průzkumné vrty."

device = "cuda"

messages = [
    {"role": "user", "content": "Create a headline out of news abstract:\n" + sum_data},
]

encodeds = new_tokenizer.apply_chat_template(messages, return_tensors="pt")

model_inputs = encodeds.to(device)

generated_ids = new_model.generate(model_inputs, max_new_tokens=100, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

In [37]:
# taken from test set
sum_data = "Kráter nad zavaleným tunelem v Mostech u Jablunkova začnou stavaři v pátek zavážet. Kvůli dvěma propadům se stavba tunelu na jedné z hlavních tratí spojujících Česko a Slovensko výrazně prodlouží. Evakuovaná rodina se do domu, který stojí jen pár desítek metrů od kráteru, vrátí, až firma provede průzkumné vrty."
sum_type = "ABSTRACT2HEADLINE"
prompt = build_prompt(sum_data, sum_type)
# print(prompt)

num_seq = 1
result = pipe(
    '<s>[INST]Ahoj, ako sa mas ?[/INST]',
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    top_k=40,
    num_return_sequences=num_seq, # kolko vys vrati
)


for i in range(num_seq):
    print(result[i]["generated_text"])

<s>[INST]Ahoj, ako sa mas ?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, ako sa mas?[/INST]Ahoj, a
