In [1]:
import pandas as pd
from unsloth import FastLanguageModel
import torch
from datasets import Dataset

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# BASE_PATH = '/kaggle/input/gen-ai-ucu-2024-task-3'
BASE_PATH = '../data'

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.1.5 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [8]:
train_df = pd.read_json(f"{BASE_PATH}/zno.train.jsonl", lines=True)
test_df = pd.read_json(f"{BASE_PATH}/zno.test.jsonl", lines=True)

In [9]:
zno_prompt = """Below is a task, paired with answer options. Solve the task and return the correct answer as an option letter.

### Task:
{}

### Options:
{}

### The correct answer is:
{}"""

In [11]:
def formatting_prompts_func(row):
    question = row['question']
    options = ','.join([f"[{option['marker']}] {option['text']}" for option in train_df.iloc[0]['answers']])
    correct_answer = f"({row['correct_answers'][0]})"

    text = zno_prompt.format(question, options, correct_answer) +  tokenizer.eos_token
    return text

train_df['text'] = train_df.apply(formatting_prompts_func, axis=1)
dataset = Dataset.from_pandas(train_df)

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2): 100%|██████████| 3063/3063 [00:01<00:00, 1676.96 examples/s]


In [13]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,063 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 68,812,800


Step,Training Loss
1,2.737
2,2.7376
3,2.852
4,2.7558
5,2.4948
6,2.5868
7,2.4927
8,2.4755
9,1.8989
10,2.3836


In [10]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    zno_prompt.format(
        'Позначте рядок, у якому в усіх словах потрібно писати літеру *и*', # instruction
        '(А) бад..лина, благоч..стивий, кр..хкий, ж..виця;,(Б) вар..во, меж..річчя, вич..пурений, кр..шталь;,(В) п’ят..річка, заруч..ни, нев..димка, обітн..ця;,(Г) зач..нати, виконав..ця, знів..чити, вел..чина;,(Д) нож..чок, печ..во, викор..нити, оз..ратися.', # input
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 3, use_cache = True)
tokenizer.batch_decode(outputs)

AttributeError: 'GemmaFixedRotaryEmbedding' object has no attribute 'current_rope_size'

In [13]:
train_df.iloc[0].text

'Below is a task, paired with answer options. Solve the task and return the correct answer as an option letter.\n\n### Task:\nПозначте рядок, у якому в усіх словах потрібно писати літеру *и*:\n\n### Options:\n[А] бад..лина, благоч..стивий, кр..хкий, ж..виця;,[Б] вар..во, меж..річчя, вич..пурений, кр..шталь;,[В] п’ят..річка, заруч..ни, нев..димка, обітн..ця;,[Г] зач..нати, виконав..ця, знів..чити, вел..чина;,[Д] нож..чок, печ..во, викор..нити, оз..ратися.\n\n### The correct answer is:\n(В)<|endoftext|>'

In [15]:
def solve_task(row):
    question = row['question']
    options = ','.join([f"[{option['marker']}] {option['text']}" for option in train_df.iloc[0]['answers']])

    inputs = tokenizer([zno_prompt.format(question, options, "",)], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 3, use_cache = True)
    outputs = tokenizer.batch_decode(outputs)
    result = outputs[0].split("The correct answer is:\n")[1]
    return [result[1]]

In [16]:
train2_df = train_df.iloc[:100].copy()
train2_df['solution'] = train2_df.progress_apply(solve_task, axis=1)

100%|██████████| 100/100 [00:29<00:00,  3.35it/s]


In [17]:
count = 0
for _, row in train2_df.iterrows():
    if row['correct_answers'][0] == row['solution'][0]:
        count+=1

print(count)

22


In [34]:
705/3063

0.23016650342801176

In [24]:
train_df[train_df["correct_answers"] != train_df["solution"]]

Unnamed: 0,question,answers,correct_answers,subject,solution
0,"Позначте рядок, у якому в усіх словах потрібно...","[{'marker': 'А', 'text': 'бад..лина, благоч..с...",[В],ukrainian-language-and-literature,[В]
1,"Позначте словосполучення, у якому порушено гра...","[{'marker': 'А', 'text': 'рівно о першій;'}, {...",[Г],ukrainian-language-and-literature,[Г]
2,"Позначте пару речень, у яких виділені слова є ...","[{'marker': 'А', 'text': '*Слово*, чому ти не ...",[Б],ukrainian-language-and-literature,[В]
3,"Позначте речення, у якому одиничний дієприслів...","[{'marker': 'А', 'text': 'Гуркіт канонади реві...",[Д],ukrainian-language-and-literature,[Г]
4,"Позначте рядок, у якому фразеологічні звороти ...","[{'marker': 'А', 'text': 'на живу нитку – біли...",[Б],ukrainian-language-and-literature,[В]
...,...,...,...,...,...
3058,"В уривку\n\n\n*Тріщить, лящить, мов щелепа, ко...","[{'marker': 'А', 'text': 'антитеза'}, {'marker...",[Д],ukrainian-language-and-literature,[В]
3059,"«*Як можна бути вільним, Евкріте, коли маєш ті...","[{'marker': 'А', 'text': 'Степан Радченко й На...",[А],ukrainian-language-and-literature,[Г]
3060,"«*А вчора, пишучи спогади про дитинство, про х...","[{'marker': 'А', 'text': 'Юрій Яновський'}, {'...",[Б],ukrainian-language-and-literature,[А]
3061,"Думка Г. Сковороди, що смерті не боїться «*той...","[{'marker': 'А', 'text': 'Івана Шрама та Івана...",[Д],ukrainian-language-and-literature,[Б]


In [17]:
test_df.iloc[:5].progress_apply(solve_task, axis=1)

100%|██████████| 5/5 [00:01<00:00,  3.08it/s]


0    [В]
1    [В]
2    [Б]
3    [В]
4    [А]
dtype: object

In [13]:
for row in train_df.iloc[:5].iterrows():
    print(solve_task(row))

TypeError: tuple indices must be integers or slices, not str

In [70]:
test_df["corrected_answers"] = test_df.apply(solve_task, axis=1)
test_df

KeyboardInterrupt: 

In [67]:
submission_df = test_df.copy()
submission_df['correct_answers'] = submission_df['corrected_answers'].apply(lambda x: x[1])
submission_df

Unnamed: 0,question,answers,subject,id,corrected_answers,correct_answers
0,«Сміхом крізь сльози» можна схарактеризувати з...,"[{'marker': 'А', 'text': '«Три зозулі з поклон...",ukrainian-language-and-literature,0,[А],А
1,"Удовин син, мати, сестра, кохана – ключові обр...","[{'marker': 'А', 'text': '«Засвіт встали козач...",ukrainian-language-and-literature,1,[А],А
2,В уривку з історичного джерела «*Створивши бан...,"[{'marker': 'А', 'text': 'Правобережної Україн...",history-of-ukraine,2,[Г],Г
3,В уривку\n\n\n*Доки буде жити Україна\n\nВ теп...,"[{'marker': 'А', 'text': 'Василя Стефаника'}, ...",ukrainian-language-and-literature,3,[А],А
4,Букву ***и*** на місці пропуску треба писати в...,"[{'marker': 'А', 'text': 'пр….хований, пр…звис...",ukrainian-language-and-literature,4,[А],А
...,...,...,...,...,...,...
746,Укажіть правильний варіант послідовного заповн...,"[{'marker': 'А', 'text': 'дієвих прийомів, які...",ukrainian-language-and-literature,746,*Пі,П
747,**Проаналізуйте фрагмент історичного документа...,"[{'marker': 'А', 'text': 'Українська головна в...",history-of-ukraine,747,[В],В
748,Прочитайте речення *(цифра позначає наступне с...,"[{'marker': 'А', 'text': '3, 4, 5, 10'}, {'mar...",ukrainian-language-and-literature,748,[А],А
749,Граматично правильне продовження речення «*Пер...,"[{'marker': 'А', 'text': 'мені пригадалися дав...",ukrainian-language-and-literature,749,[А],А


In [68]:
submission_df[["id", "correct_answers"]].to_csv("submission.csv", index=False)

In [24]:
','.join([f"[{option['marker']}] {option['text']}" for option in train_df.iloc[0]['answers']])

'(А) бад..лина, благоч..стивий, кр..хкий, ж..виця;,(Б) вар..во, меж..річчя, вич..пурений, кр..шталь;,(В) п’ят..річка, заруч..ни, нев..димка, обітн..ця;,(Г) зач..нати, виконав..ця, знів..чити, вел..чина;,(Д) нож..чок, печ..во, викор..нити, оз..ратися.'