In [1]:
import pandas as pd
from unsloth import FastLanguageModel
import torch
from datasets import Dataset

from tqdm import tqdm
tqdm.pandas()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
torch.cuda.empty_cache()

In [3]:
max_seq_length = 2048
dtype = None # None for auto detection
load_in_4bit = True # 4bit quantization to reduce memory usage

# BASE_PATH = '/kaggle/input/gen-ai-ucu-2024-task-3'
BASE_PATH = '../data'

In [4]:
train_df = pd.read_json(f"{BASE_PATH}/zno.train.jsonl", lines=True)
test_df = pd.read_json(f"{BASE_PATH}/zno.test.jsonl", lines=True)

In [5]:
MODEL_NAME = "unsloth/gemma-2-9b-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
    # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
    model_name = MODEL_NAME,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.1.5 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [7]:
zno_prompt = """Below is a question about Ukrainian history, language and literature. Select the correct answer marker from the provided options. Example output: [A]

### Question:
{}

### Options:
{}

### Correct Answer:
{}"""

In [8]:
def solve_task(row):
    question = row['question']
    options = ','.join([f"[{option['marker']}] {option['text']}" for option in row['answers']])

    inputs = tokenizer([zno_prompt.format(question, options, "[",)], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=3, use_cache=True)
    outputs = tokenizer.batch_decode(outputs)

    result = outputs[0].split('Correct Answer:\n[')[1][0]
    return result

In [9]:
def formatting_prompts_func(row):
    question = row['question']
    options = ','.join([f"[{option['marker']}] {option['text']}" for option in row['answers']])
    correct_answer = f"({row['correct_answers'][0]})"

    text = zno_prompt.format(question, options, correct_answer) +  tokenizer.eos_token
    return text

train_df['text'] = train_df.apply(formatting_prompts_func, axis=1)
dataset = Dataset.from_pandas(train_df)

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3, # Set this for 1 full training run.
        # max_steps = None,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=4): 100%|██████████| 3063/3063 [00:01<00:00, 2072.39 examples/s]


In [12]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,063 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,149
 "-____-"     Number of trainable parameters = 40,370,176


Step,Training Loss
1,2.0955
2,2.2042
3,2.4335
4,2.5656
5,2.1839
6,2.1392
7,1.984
8,2.1861
9,1.9289
10,2.2752


In [13]:
model.save_pretrained("lora_model_tune") # Local saving
tokenizer.save_pretrained("lora_model_tune")

('lora_model_tune/tokenizer_config.json',
 'lora_model_tune/special_tokens_map.json',
 'lora_model_tune/vocab.json',
 'lora_model_tune/merges.txt',
 'lora_model_tune/added_tokens.json',
 'lora_model_tune/tokenizer.json')

In [10]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    zno_prompt.format(
        'Позначте рядок, у якому в усіх словах потрібно писати літеру *и*', # instruction
        '(А) бад..лина, благоч..стивий, кр..хкий, ж..виця;,(Б) вар..во, меж..річчя, вич..пурений, кр..шталь;,(В) п’ят..річка, заруч..ни, нев..димка, обітн..ця;,(Г) зач..нати, виконав..ця, знів..чити, вел..чина;,(Д) нож..чок, печ..во, викор..нити, оз..ратися.', # input
        "[",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 3, use_cache = True)
tokenizer.batch_decode(outputs)

AttributeError: 'GemmaFixedRotaryEmbedding' object has no attribute 'current_rope_size'

In [15]:
submission_df = test_df.copy()
submission_df['correct_answers'] = test_df.progress_apply(solve_task, axis=1)
submission_df

100%|██████████| 751/751 [02:18<00:00,  5.43it/s]


Unnamed: 0,question,answers,subject,id,correct_answers
0,«Сміхом крізь сльози» можна схарактеризувати з...,"[{'marker': 'А', 'text': '«Три зозулі з поклон...",ukrainian-language-and-literature,0,Д
1,"Удовин син, мати, сестра, кохана – ключові обр...","[{'marker': 'А', 'text': '«Засвіт встали козач...",ukrainian-language-and-literature,1,Д
2,В уривку з історичного джерела «*Створивши бан...,"[{'marker': 'А', 'text': 'Правобережної Україн...",history-of-ukraine,2,А
3,В уривку\n\n\n*Доки буде жити Україна\n\nВ теп...,"[{'marker': 'А', 'text': 'Василя Стефаника'}, ...",ukrainian-language-and-literature,3,Д
4,Букву ***и*** на місці пропуску треба писати в...,"[{'marker': 'А', 'text': 'пр….хований, пр…звис...",ukrainian-language-and-literature,4,Б
...,...,...,...,...,...
746,Укажіть правильний варіант послідовного заповн...,"[{'marker': 'А', 'text': 'дієвих прийомів, які...",ukrainian-language-and-literature,746,Г
747,**Проаналізуйте фрагмент історичного документа...,"[{'marker': 'А', 'text': 'Українська головна в...",history-of-ukraine,747,Г
748,Прочитайте речення *(цифра позначає наступне с...,"[{'marker': 'А', 'text': '3, 4, 5, 10'}, {'mar...",ukrainian-language-and-literature,748,Г
749,Граматично правильне продовження речення «*Пер...,"[{'marker': 'А', 'text': 'мені пригадалися дав...",ukrainian-language-and-literature,749,В


In [16]:
submission_df['correct_answers'] = submission_df['correct_answers'].apply(lambda x: x[0])
submission_df[["id", "correct_answers"]].to_csv("submission-tune.csv", index=False)