In [1]:
import pandas as pd
from unsloth import FastLanguageModel
import torch
from datasets import Dataset

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# BASE_PATH = '/kaggle/input/gen-ai-ucu-2024-task-3'
BASE_PATH = '../data'

In [4]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-9b",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

Unsloth: If you want to finetune Gemma 2, install flash-attn to make it faster!
To install flash-attn, do the below:

pip install --no-deps --upgrade "flash-attn>=2.6.3"
==((====))==  Unsloth 2025.1.5: Fast Gemma2 patching. Transformers: 4.48.0.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.1.5 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


In [8]:
train_df = pd.read_json(f"{BASE_PATH}/zno.train.jsonl", lines=True)
test_df = pd.read_json(f"{BASE_PATH}/zno.test.jsonl", lines=True)

In [9]:
zno_prompt = """Below is a task, paired with answer options. Solve the task and return the correct answer as an option letter.

### Task:
{}

### Options:
{}

### The correct answer is:
{}"""

In [11]:
def formatting_prompts_func(row):
    question = row['question']
    options = ','.join([f"[{option['marker']}] {option['text']}" for option in train_df.iloc[0]['answers']])
    correct_answer = f"({row['correct_answers'][0]})"

    text = zno_prompt.format(question, options, correct_answer) +  tokenizer.eos_token
    return text

train_df['text'] = train_df.apply(formatting_prompts_func, axis=1)
dataset = Dataset.from_pandas(train_df)

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 3, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3063/3063 [00:01<00:00, 1676.96 examples/s]


In [13]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,063 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 68,812,800


Step,Training Loss
1,2.737
2,2.7376
3,2.852
4,2.7558
5,2.4948
6,2.5868
7,2.4927
8,2.4755
9,1.8989
10,2.3836


In [10]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    zno_prompt.format(
        '–ü–æ–∑–Ω–∞—á—Ç–µ —Ä—è–¥–æ–∫, —É —è–∫–æ–º—É –≤ —É—Å—ñ—Ö —Å–ª–æ–≤–∞—Ö –ø–æ—Ç—Ä—ñ–±–Ω–æ –ø–∏—Å–∞—Ç–∏ –ª—ñ—Ç–µ—Ä—É *–∏*', # instruction
        '(–ê) –±–∞–¥..–ª–∏–Ω–∞, –±–ª–∞–≥–æ—á..—Å—Ç–∏–≤–∏–π, –∫—Ä..—Ö–∫–∏–π, –∂..–≤–∏—Ü—è;,(–ë) –≤–∞—Ä..–≤–æ, –º–µ–∂..—Ä—ñ—á—á—è, –≤–∏—á..–ø—É—Ä–µ–Ω–∏–π, –∫—Ä..—à—Ç–∞–ª—å;,(–í) –ø‚Äô—è—Ç..—Ä—ñ—á–∫–∞, –∑–∞—Ä—É—á..–Ω–∏, –Ω–µ–≤..–¥–∏–º–∫–∞, –æ–±—ñ—Ç–Ω..—Ü—è;,(–ì) –∑–∞—á..–Ω–∞—Ç–∏, –≤–∏–∫–æ–Ω–∞–≤..—Ü—è, –∑–Ω—ñ–≤..—á–∏—Ç–∏, –≤–µ–ª..—á–∏–Ω–∞;,(–î) –Ω–æ–∂..—á–æ–∫, –ø–µ—á..–≤–æ, –≤–∏–∫–æ—Ä..–Ω–∏—Ç–∏, –æ–∑..—Ä–∞—Ç–∏—Å—è.', # input
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 3, use_cache = True)
tokenizer.batch_decode(outputs)

AttributeError: 'GemmaFixedRotaryEmbedding' object has no attribute 'current_rope_size'

In [13]:
train_df.iloc[0].text

'Below is a task, paired with answer options. Solve the task and return the correct answer as an option letter.\n\n### Task:\n–ü–æ–∑–Ω–∞—á—Ç–µ —Ä—è–¥–æ–∫, —É —è–∫–æ–º—É –≤ —É—Å—ñ—Ö —Å–ª–æ–≤–∞—Ö –ø–æ—Ç—Ä—ñ–±–Ω–æ –ø–∏—Å–∞—Ç–∏ –ª—ñ—Ç–µ—Ä—É *–∏*:\n\n### Options:\n[–ê] –±–∞–¥..–ª–∏–Ω–∞, –±–ª–∞–≥–æ—á..—Å—Ç–∏–≤–∏–π, –∫—Ä..—Ö–∫–∏–π, –∂..–≤–∏—Ü—è;,[–ë] –≤–∞—Ä..–≤–æ, –º–µ–∂..—Ä—ñ—á—á—è, –≤–∏—á..–ø—É—Ä–µ–Ω–∏–π, –∫—Ä..—à—Ç–∞–ª—å;,[–í] –ø‚Äô—è—Ç..—Ä—ñ—á–∫–∞, –∑–∞—Ä—É—á..–Ω–∏, –Ω–µ–≤..–¥–∏–º–∫–∞, –æ–±—ñ—Ç–Ω..—Ü—è;,[–ì] –∑–∞—á..–Ω–∞—Ç–∏, –≤–∏–∫–æ–Ω–∞–≤..—Ü—è, –∑–Ω—ñ–≤..—á–∏—Ç–∏, –≤–µ–ª..—á–∏–Ω–∞;,[–î] –Ω–æ–∂..—á–æ–∫, –ø–µ—á..–≤–æ, –≤–∏–∫–æ—Ä..–Ω–∏—Ç–∏, –æ–∑..—Ä–∞—Ç–∏—Å—è.\n\n### The correct answer is:\n(–í)<|endoftext|>'

In [15]:
def solve_task(row):
    question = row['question']
    options = ','.join([f"[{option['marker']}] {option['text']}" for option in train_df.iloc[0]['answers']])

    inputs = tokenizer([zno_prompt.format(question, options, "",)], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 3, use_cache = True)
    outputs = tokenizer.batch_decode(outputs)
    result = outputs[0].split("The correct answer is:\n")[1]
    return [result[1]]

In [16]:
train2_df = train_df.iloc[:100].copy()
train2_df['solution'] = train2_df.progress_apply(solve_task, axis=1)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [00:29<00:00,  3.35it/s]


In [17]:
count = 0
for _, row in train2_df.iterrows():
    if row['correct_answers'][0] == row['solution'][0]:
        count+=1

print(count)

22


In [34]:
705/3063

0.23016650342801176

In [24]:
train_df[train_df["correct_answers"] != train_df["solution"]]

Unnamed: 0,question,answers,correct_answers,subject,solution
0,"–ü–æ–∑–Ω–∞—á—Ç–µ —Ä—è–¥–æ–∫, —É —è–∫–æ–º—É –≤ —É—Å—ñ—Ö —Å–ª–æ–≤–∞—Ö –ø–æ—Ç—Ä—ñ–±–Ω–æ...","[{'marker': '–ê', 'text': '–±–∞–¥..–ª–∏–Ω–∞, –±–ª–∞–≥–æ—á..—Å...",[–í],ukrainian-language-and-literature,[–í]
1,"–ü–æ–∑–Ω–∞—á—Ç–µ —Å–ª–æ–≤–æ—Å–ø–æ–ª—É—á–µ–Ω–Ω—è, —É —è–∫–æ–º—É –ø–æ—Ä—É—à–µ–Ω–æ –≥—Ä–∞...","[{'marker': '–ê', 'text': '—Ä—ñ–≤–Ω–æ –æ –ø–µ—Ä—à—ñ–π;'}, {...",[–ì],ukrainian-language-and-literature,[–ì]
2,"–ü–æ–∑–Ω–∞—á—Ç–µ –ø–∞—Ä—É —Ä–µ—á–µ–Ω—å, —É —è–∫–∏—Ö –≤–∏–¥—ñ–ª–µ–Ω—ñ —Å–ª–æ–≤–∞ —î ...","[{'marker': '–ê', 'text': '*–°–ª–æ–≤–æ*, —á–æ–º—É —Ç–∏ –Ω–µ ...",[–ë],ukrainian-language-and-literature,[–í]
3,"–ü–æ–∑–Ω–∞—á—Ç–µ —Ä–µ—á–µ–Ω–Ω—è, —É —è–∫–æ–º—É –æ–¥–∏–Ω–∏—á–Ω–∏–π –¥—ñ—î–ø—Ä–∏—Å–ª—ñ–≤...","[{'marker': '–ê', 'text': '–ì—É—Ä–∫—ñ—Ç –∫–∞–Ω–æ–Ω–∞–¥–∏ —Ä–µ–≤—ñ...",[–î],ukrainian-language-and-literature,[–ì]
4,"–ü–æ–∑–Ω–∞—á—Ç–µ —Ä—è–¥–æ–∫, —É —è–∫–æ–º—É —Ñ—Ä–∞–∑–µ–æ–ª–æ–≥—ñ—á–Ω—ñ –∑–≤–æ—Ä–æ—Ç–∏ ...","[{'marker': '–ê', 'text': '–Ω–∞ –∂–∏–≤—É –Ω–∏—Ç–∫—É ‚Äì –±—ñ–ª–∏...",[–ë],ukrainian-language-and-literature,[–í]
...,...,...,...,...,...
3058,"–í —É—Ä–∏–≤–∫—É\n\n\n*–¢—Ä—ñ—â–∏—Ç—å, –ª—è—â–∏—Ç—å, –º–æ–≤ —â–µ–ª–µ–ø–∞, –∫–æ...","[{'marker': '–ê', 'text': '–∞–Ω—Ç–∏—Ç–µ–∑–∞'}, {'marker...",[–î],ukrainian-language-and-literature,[–í]
3059,"¬´*–Ø–∫ –º–æ–∂–Ω–∞ –±—É—Ç–∏ –≤—ñ–ª—å–Ω–∏–º, –ï–≤–∫—Ä—ñ—Ç–µ, –∫–æ–ª–∏ –º–∞—î—à —Ç—ñ...","[{'marker': '–ê', 'text': '–°—Ç–µ–ø–∞–Ω –†–∞–¥—á–µ–Ω–∫–æ –π –ù–∞...",[–ê],ukrainian-language-and-literature,[–ì]
3060,"¬´*–ê –≤—á–æ—Ä–∞, –ø–∏—à—É—á–∏ —Å–ø–æ–≥–∞–¥–∏ –ø—Ä–æ –¥–∏—Ç–∏–Ω—Å—Ç–≤–æ, –ø—Ä–æ —Ö...","[{'marker': '–ê', 'text': '–Æ—Ä—ñ–π –Ø–Ω–æ–≤—Å—å–∫–∏–π'}, {'...",[–ë],ukrainian-language-and-literature,[–ê]
3061,"–î—É–º–∫–∞ –ì. –°–∫–æ–≤–æ—Ä–æ–¥–∏, —â–æ —Å–º–µ—Ä—Ç—ñ –Ω–µ –±–æ—ó—Ç—å—Å—è ¬´*—Ç–æ–π...","[{'marker': '–ê', 'text': '–Ü–≤–∞–Ω–∞ –®—Ä–∞–º–∞ —Ç–∞ –Ü–≤–∞–Ω–∞...",[–î],ukrainian-language-and-literature,[–ë]


In [17]:
test_df.iloc[:5].progress_apply(solve_task, axis=1)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:01<00:00,  3.08it/s]


0    [–í]
1    [–í]
2    [–ë]
3    [–í]
4    [–ê]
dtype: object

In [13]:
for row in train_df.iloc[:5].iterrows():
    print(solve_task(row))

TypeError: tuple indices must be integers or slices, not str

In [70]:
test_df["corrected_answers"] = test_df.apply(solve_task, axis=1)
test_df

KeyboardInterrupt: 

In [67]:
submission_df = test_df.copy()
submission_df['correct_answers'] = submission_df['corrected_answers'].apply(lambda x: x[1])
submission_df

Unnamed: 0,question,answers,subject,id,corrected_answers,correct_answers
0,¬´–°–º—ñ—Ö–æ–º –∫—Ä—ñ–∑—å —Å–ª—å–æ–∑–∏¬ª –º–æ–∂–Ω–∞ —Å—Ö–∞—Ä–∞–∫—Ç–µ—Ä–∏–∑—É–≤–∞—Ç–∏ –∑...,"[{'marker': '–ê', 'text': '¬´–¢—Ä–∏ –∑–æ–∑—É–ª—ñ –∑ –ø–æ–∫–ª–æ–Ω...",ukrainian-language-and-literature,0,[–ê],–ê
1,"–£–¥–æ–≤–∏–Ω —Å–∏–Ω, –º–∞—Ç–∏, —Å–µ—Å—Ç—Ä–∞, –∫–æ—Ö–∞–Ω–∞ ‚Äì –∫–ª—é—á–æ–≤—ñ –æ–±—Ä...","[{'marker': '–ê', 'text': '¬´–ó–∞—Å–≤—ñ—Ç –≤—Å—Ç–∞–ª–∏ –∫–æ–∑–∞—á...",ukrainian-language-and-literature,1,[–ê],–ê
2,–í —É—Ä–∏–≤–∫—É –∑ —ñ—Å—Ç–æ—Ä–∏—á–Ω–æ–≥–æ –¥–∂–µ—Ä–µ–ª–∞ ¬´*–°—Ç–≤–æ—Ä–∏–≤—à–∏ –±–∞–Ω...,"[{'marker': '–ê', 'text': '–ü—Ä–∞–≤–æ–±–µ—Ä–µ–∂–Ω–æ—ó –£–∫—Ä–∞—ó–Ω...",history-of-ukraine,2,[–ì],–ì
3,–í —É—Ä–∏–≤–∫—É\n\n\n*–î–æ–∫–∏ –±—É–¥–µ –∂–∏—Ç–∏ –£–∫—Ä–∞—ó–Ω–∞\n\n–í —Ç–µ–ø...,"[{'marker': '–ê', 'text': '–í–∞—Å–∏–ª—è –°—Ç–µ—Ñ–∞–Ω–∏–∫–∞'}, ...",ukrainian-language-and-literature,3,[–ê],–ê
4,–ë—É–∫–≤—É ***–∏*** –Ω–∞ –º—ñ—Å—Ü—ñ –ø—Ä–æ–ø—É—Å–∫—É —Ç—Ä–µ–±–∞ –ø–∏—Å–∞—Ç–∏ –≤...,"[{'marker': '–ê', 'text': '–ø—Ä‚Ä¶.—Ö–æ–≤–∞–Ω–∏–π, –ø—Ä‚Ä¶–∑–≤–∏—Å...",ukrainian-language-and-literature,4,[–ê],–ê
...,...,...,...,...,...,...
746,–£–∫–∞–∂—ñ—Ç—å –ø—Ä–∞–≤–∏–ª—å–Ω–∏–π –≤–∞—Ä—ñ–∞–Ω—Ç¬†–ø–æ—Å–ª—ñ–¥–æ–≤–Ω–æ–≥–æ –∑–∞–ø–æ–≤–Ω...,"[{'marker': '–ê', 'text': '–¥—ñ—î–≤–∏—Ö –ø—Ä–∏–π–æ–º—ñ–≤, —è–∫—ñ...",ukrainian-language-and-literature,746,*–ü—ñ,–ü
747,**–ü—Ä–æ–∞–Ω–∞–ª—ñ–∑—É–π—Ç–µ —Ñ—Ä–∞–≥–º–µ–Ω—Ç —ñ—Å—Ç–æ—Ä–∏—á–Ω–æ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–∞...,"[{'marker': '–ê', 'text': '–£–∫—Ä–∞—ó–Ω—Å—å–∫–∞ –≥–æ–ª–æ–≤–Ω–∞ –≤...",history-of-ukraine,747,[–í],–í
748,–ü—Ä–æ—á–∏—Ç–∞–π—Ç–µ —Ä–µ—á–µ–Ω–Ω—è *(—Ü–∏—Ñ—Ä–∞ –ø–æ–∑–Ω–∞—á–∞—î –Ω–∞—Å—Ç—É–ø–Ω–µ —Å...,"[{'marker': '–ê', 'text': '3, 4, 5, 10'}, {'mar...",ukrainian-language-and-literature,748,[–ê],–ê
749,–ì—Ä–∞–º–∞—Ç–∏—á–Ω–æ –ø—Ä–∞–≤–∏–ª—å–Ω–µ –ø—Ä–æ–¥–æ–≤–∂–µ–Ω–Ω—è —Ä–µ—á–µ–Ω–Ω—è ¬´*–ü–µ—Ä...,"[{'marker': '–ê', 'text': '–º–µ–Ω—ñ –ø—Ä–∏–≥–∞–¥–∞–ª–∏—Å—è –¥–∞–≤...",ukrainian-language-and-literature,749,[–ê],–ê


In [68]:
submission_df[["id", "correct_answers"]].to_csv("submission.csv", index=False)

In [24]:
','.join([f"[{option['marker']}] {option['text']}" for option in train_df.iloc[0]['answers']])

'(–ê) –±–∞–¥..–ª–∏–Ω–∞, –±–ª–∞–≥–æ—á..—Å—Ç–∏–≤–∏–π, –∫—Ä..—Ö–∫–∏–π, –∂..–≤–∏—Ü—è;,(–ë) –≤–∞—Ä..–≤–æ, –º–µ–∂..—Ä—ñ—á—á—è, –≤–∏—á..–ø—É—Ä–µ–Ω–∏–π, –∫—Ä..—à—Ç–∞–ª—å;,(–í) –ø‚Äô—è—Ç..—Ä—ñ—á–∫–∞, –∑–∞—Ä—É—á..–Ω–∏, –Ω–µ–≤..–¥–∏–º–∫–∞, –æ–±—ñ—Ç–Ω..—Ü—è;,(–ì) –∑–∞—á..–Ω–∞—Ç–∏, –≤–∏–∫–æ–Ω–∞–≤..—Ü—è, –∑–Ω—ñ–≤..—á–∏—Ç–∏, –≤–µ–ª..—á–∏–Ω–∞;,(–î) –Ω–æ–∂..—á–æ–∫, –ø–µ—á..–≤–æ, –≤–∏–∫–æ—Ä..–Ω–∏—Ç–∏, –æ–∑..—Ä–∞—Ç–∏—Å—è.'