In [1]:
import torch, gc, os, math, random
import tqdm

  import pynvml  # type: ignore[import]


In [2]:
from pynvml import *

nvmlInit()

In [3]:
from datasets import Dataset, load_dataset
from dataclasses import dataclass
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from unsloth import FastLanguageModel

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


  import pynvml  # type: ignore[import]


INFO 09-09 17:10:30 [__init__.py:241] Automatically detected platform cuda.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [4]:
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTConfig, SFTTrainer

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
def flush():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def gpu_mem(note=""):
    if not torch.cuda.is_available():
        print(f"[{note}] No CUDA available.")
        return
    torch.cuda.synchronize()
    alloc = torch.cuda.memory_allocated() / (1024**3)
    resrv = torch.cuda.memory_reserved() / (1024**3)
    peak = torch.cuda.max_memory_allocated() / (1024**3)
    print(f"[{note}] allocated={alloc:.2f}GB, reserved={resrv:.2f}GB, peak={peak:.2f}GB")

def nvidia_mem():
    if not torch.cuda.is_available():
        return

    nvmlInit()
    h = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(h)
    print(f"NVML used={info.used/(1024**3):.2f}GB / total={info.total/(1024**3):.2f}GB")

In [7]:
flush()
gpu_mem("fresh"); nvidia_mem()

[fresh] allocated=0.00GB, reserved=0.00GB, peak=0.00GB
NVML used=1.28GB / total=23.99GB


## –ú–æ–¥–µ–ª—å Meta-Llama-3.1-8B-Instruct-bnb-4bit –æ—Ç Unsloth

In [14]:
!git config --global credential.helper store

In [15]:
# !git clone https://viv232:hf_xxxxx@huggingface.co/meta-llama/Llama-3.1-8B-Instruct

–ö–ª–æ–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ ¬´Llama-3.1-8B-Instruct¬ª...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (106/106), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 109 (delta 53), reused 0 (delta 0), pack-reused 3 (from 1)[K
–ü–æ–ª—É—á–µ–Ω–∏–µ –æ–±—ä–µ–∫—Ç–æ–≤: 100% (109/109), 2.28 –ú–∏–ë | 4.57 –ú–∏–ë/—Å, –≥–æ—Ç–æ–≤–æ.
–û–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∏–∑–º–µ–Ω–µ–Ω–∏–π: 100% (53/53), –≥–æ—Ç–æ–≤–æ.


In [17]:
# model_name = "meta-llama/Llama-3.1-8B-Instruct"

In [9]:
!git clone https://huggingface.co/unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit

–ö–ª–æ–Ω–∏—Ä–æ–≤–∞–Ω–∏–µ –≤ ¬´Meta-Llama-3.1-8B-Instruct-bnb-4bit¬ª...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (128/128), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 131 (delta 44), reused 0 (delta 0), pack-reused 3 (from 1)[K
–ü–æ–ª—É—á–µ–Ω–∏–µ –æ–±—ä–µ–∫—Ç–æ–≤: 100% (131/131), 2.30 –ú–∏–ë | 3.76 –ú–∏–ë/—Å, –≥–æ—Ç–æ–≤–æ.
–û–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∏–∑–º–µ–Ω–µ–Ω–∏–π: 100% (44/44), –≥–æ—Ç–æ–≤–æ.


In [9]:
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

In [10]:
max_seq_length = 1024

flush()
gpu_mem("before load QLoRA")

[before load QLoRA] allocated=0.00GB, reserved=0.00GB, peak=0.00GB


In [11]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,    # QLoRA
)

==((====))==  Unsloth 2025.9.1: Fast Llama patching. Transformers: 4.56.1. vLLM: 0.10.1.1.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 2. Max memory: 23.536 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.9.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [13]:
gpu_mem("after load QLoRA")
nvidia_mem()

[after load QLoRA] allocated=5.50GB, reserved=5.52GB, peak=7.02GB
NVML used=6.64GB / total=23.99GB


In [14]:
tokenizer.eos_token

'<|eot_id|>'

In [15]:
print(f"–ß–∞—Ç-—à–∞–±–ª–æ–Ω: {tokenizer.chat_template}")

–ß–∞—Ç-—à–∞–±–ª–æ–Ω: {{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message + builtin tools #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if builtin_tools is defined or tools is not none %}
    {{- "Environment: ipython\n" }}
{%- endif %}
{%- if builtin_tools is defined %}
    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ")

### –û—Ü–µ–Ω–∫–∞ –¥–æ LoRA

In [16]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [14]:
prompts_for_test = [
    '–ö–∞–∫ –≤–∫—É—Å–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ?',
    '–ö–∞–∫ —Ä–∞—Å–ø–æ–∑–Ω–∞—Ç—å –ø—Ä–∏–±–ª–∏–∂–∞—é—â–∏–π—Å—è –∏–Ω—Å—É–ª—å—Ç?',
    '–°—Ñ–æ—Ä–º—É–ª–∏—Ä—É–π –æ—Å–Ω–æ–≤–Ω—ã–µ –∫–∞–Ω–æ–Ω—ã –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—ã –¥—Ä–µ–≤–Ω–∏—Ö —Ü–∏–≤–∏–ª–∏–∑–∞—Ü–∏–π',
    '–û–±–ª–∞–≥–∞—Ç—å –ª–∏ —Å—Ç—Ä–∞—Ö–æ–≤—ã–º–∏ –≤–∑–Ω–æ—Å–∞–º–∏ —Å—É–º–º—ã –ø—Ä–æ—â–µ–Ω–Ω–æ–≥–æ –¥–æ–ª–≥–∞ –ø–æ –∑–∞–π–º—É –æ—Ç –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏ –≥–¥–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –∑–∞—Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–Ω—ã–π?',
    '–†–∞—Å—Å–∫–∞–∂–∏ –º–Ω–µ –ø—Ä–æ –ö—É—Ä—á–∞—Ç–æ–≤–∞'
]

In [15]:
def generate_answer(prompt):
    dialog = tokenizer.apply_chat_template([{"role": "user", "content": prompt}], 
                                           tokenize=False, 
                                           add_generation_prompt=True)
    inputs = tokenizer(dialog, return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=300, use_cache=True)
    return tokenizer.batch_decode(outputs)[0].split("assistant")[-1]

In [19]:
for text in prompts_for_test:
    print(generate_answer(text))
    print('-' * 50)

<|end_header_id|>

–ß—Ç–æ–±—ã –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –≤–∫—É—Å–Ω—É—é –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ, —Å–ª–µ–¥—É–π—Ç–µ —ç—Ç–∏–º —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è–º:

**–ü—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏–µ –∏–Ω–¥–µ–π–∫–∏ –Ω–∞ –≥—Ä–∏–ª–µ:**

–ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã:

*   1 –∏–Ω–¥–µ–π–∫–∞ (–≤–µ—Å–æ–º 1,5-2 –∫–≥)
*   2 —Å—Ç–æ–ª–æ–≤—ã–µ –ª–æ–∂–∫–∏ –æ–ª–∏–≤–∫–æ–≤–æ–≥–æ –º–∞—Å–ª–∞
*   1 —á–∞–π–Ω–∞—è –ª–æ–∂–∫–∞ —Å–æ–ª–∏
*   1 —á–∞–π–Ω–∞—è –ª–æ–∂–∫–∞ —á–µ—Ä–Ω–æ–≥–æ –ø–µ—Ä—Ü–∞
*   1 —á–∞–π–Ω–∞—è –ª–æ–∂–∫–∞ –ø–∞–ø—Ä–∏–∫–∏
*   1 —á–∞–π–Ω–∞—è –ª–æ–∂–∫–∞ —á–µ—Å–Ω–æ–∫–∞, –∏–∑–º–µ–ª—å—á–µ–Ω–Ω–æ–≥–æ
*   1 –ª—É–∫–æ–≤–∏—Ü–∞, –∏–∑–º–µ–ª—å—á–µ–Ω–Ω–∞—è
*   2 –≤–µ—Ç–æ—á–∫–∏ —Ä–æ–∑–º–∞—Ä–∏–Ω–∞ (–ø–æ –∂–µ–ª–∞–Ω–∏—é)
*   1 –ª–∏–º–æ–Ω, –Ω–∞—Ä–µ–∑–∞–Ω–Ω—ã–π (–ø–æ –∂–µ–ª–∞–Ω–∏—é)

**–ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∏–Ω–¥–µ–π–∫–∏:**

1.  –ù–∞–ª–µ–π—Ç–µ –∏–Ω–¥–µ–π–∫—É –≤ —Ñ–æ—Ä–º—É –¥–ª—è –≥—Ä–∏–ª—è –∏–ª–∏ –Ω–∞ –ø—Ä–æ—Ç–∏–≤–µ–Ω—å.
2.  –í –º–∏—Å–∫–µ —Å–º–µ—à–∞–π—Ç–µ –æ–ª–∏–≤–∫–æ–≤–æ–µ –º–∞—Å–ª–æ, —Å–æ–ª—å, —á–µ—Ä–Ω—ã–π –ø–µ—Ä–µ—Ü, –ø–∞–ø—Ä–∏–∫—É, —á

lm_eval --model hf \
    --model_args pretrained=unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,dtype="float" \
    --tasks truthfulqa_ru_mc1 \
    --device cuda:0 \
    --batch_size auto:4

In [52]:
!bash ./lm-evaluation-harness/run_lmesh.sh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  import pynvml  # type: ignore[import]
INFO 09-09 14:33:34 [__init__.py:241] Automatically detected platform cuda.
2025-09-09:14:33:36 INFO     [__main__:446] Selected Tasks: ['truthfulqa_ru_mc1']
        not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).
2025-09-09:14:33:36 INFO     [evaluator:202] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-09-09:14:33:36 INFO     [evaluator:240] Initializing hf model, with arguments: {'pretrained': 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit', 'dtype': 'float'}
2025-09-09:14:33:36 INFO     [models.huggingface:147] Using device 'cuda:0'
2025-09-09:14:33:37 INFO     [models.huggingface:414] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
`torch_dtype` is deprecated! Use `dtype` instead!
2025-09-09:14:33:42 INFO     [api.task:434] Building contexts for truthfulqa_ru_mc1 on ra

### –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [19]:
vikhr_dataset = load_dataset("Vikhrmodels/GrandMaster-PRO-MAX", split="train")

Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 151822/151822 [00:00<00:00, 171669.37 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3291/3291 [00:00<00:00, 157742.95 examples/s]


In [20]:
vikhr_dataset

Dataset({
    features: ['source', 'conversation', 'prompt_tokens', 'answer_tokens', 'cluster', 'prompt_lang', 'answer_lang'],
    num_rows: 151822
})

In [21]:
def filter_russian(example):
    return example['prompt_lang'] == 'ru' and example['answer_lang'] == 'ru'

In [23]:
vikhr_dataset = vikhr_dataset.filter(filter_russian)

Filter: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 151822/151822 [00:01<00:00, 101434.43 examples/s]


In [24]:
len(vikhr_dataset)

86295

In [25]:
vikhr_dataset[1]["conversation"]

[{'content': '—Å–ª—É—à–∞–π, —É –º–µ–Ω—è —Ç—É—Ç –≤–æ–∑–Ω–∏–∫–ª–∞ –∑–∞–¥–∞—á–∫–∞ –ø–æ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–µ –∫–æ–º–ø—å—é—Ç–µ—Ä–∞, –∏ —è –Ω–µ –º–æ–≥—É –≤ –Ω–µ–π —Ä–∞–∑–æ–±—Ä–∞—Ç—å—Å—è. –º–Ω–µ –Ω—É–∂–Ω–æ —Ä–∞–∑—Ä–∞–±–æ—Ç–∞—Ç—å –∞–ª–≥–æ—Ä–∏—Ç–º, –∫–æ—Ç–æ—Ä—ã–π –æ–ø—Ç–∏–º–∏–∑–∏—Ä—É–µ—Ç –¥–æ—Å—Ç—É–ø –∫ –∫—ç—à-–ø–∞–º—è—Ç–∏ –≤ –º–Ω–æ–≥–æ—è–¥–µ—Ä–Ω–æ–º –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–µ –¥–ª—è –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω—ã—Ö –≤—ã—á–∏—Å–ª–µ–Ω–∏–π. —Ç—ã –Ω–µ –º–æ–≥ –±—ã –ø–æ–º–æ—á—å —Å —ç—Ç–∏–º? –≤–æ—Ç –∫–∞–∫ —è –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è—é –∑–∞–¥–∞—á—É:\n\n1. –∏–º–µ–µ—Ç—Å—è –º–Ω–æ–≥–æ—è–¥–µ—Ä–Ω—ã–π –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä —Å –æ–±—â–∏–º –∫—ç—à–µ–º –≤—Ç–æ—Ä–æ–≥–æ —É—Ä–æ–≤–Ω—è.\n2. –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –º–∏–Ω–∏–º–∏–∑–∏—Ä–æ–≤–∞—Ç—å –ø—Ä–æ–º–∞—Ö–∏ –∫—ç—à–∞ –ø—Ä–∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–º –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–∏ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –∏–Ω—Ç–µ–Ω—Å–∏–≤–Ω—ã—Ö –∑–∞–¥–∞—á –ø–æ –æ–±—Ä–∞–±–æ—Ç–∫–µ –¥–∞–Ω–Ω—ã—Ö.\n3. –∞–ª–≥–æ—Ä–∏—Ç–º –¥–æ–ª–∂–µ–Ω —Ä–∞—Å–ø—Ä–µ–¥–µ–ª—è—Ç—å –¥–∞–Ω–Ω—ã–µ —Ç–∞–∫–∏–º –æ–±—Ä–∞–∑–æ–º, —á—Ç–æ–±—ã –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ –

In [26]:
tokenizer.apply_chat_template(vikhr_dataset[1]["conversation"], tokenize=False)

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n—Å–ª—É—à–∞–π, —É –º–µ–Ω—è —Ç—É—Ç –≤–æ–∑–Ω–∏–∫–ª–∞ –∑–∞–¥–∞—á–∫–∞ –ø–æ –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–µ –∫–æ–º–ø—å—é—Ç–µ—Ä–∞, –∏ —è –Ω–µ –º–æ–≥—É –≤ –Ω–µ–π —Ä–∞–∑–æ–±—Ä–∞—Ç—å—Å—è. –º–Ω–µ –Ω—É–∂–Ω–æ —Ä–∞–∑—Ä–∞–±–æ—Ç–∞—Ç—å –∞–ª–≥–æ—Ä–∏—Ç–º, –∫–æ—Ç–æ—Ä—ã–π –æ–ø—Ç–∏–º–∏–∑–∏—Ä—É–µ—Ç –¥–æ—Å—Ç—É–ø –∫ –∫—ç—à-–ø–∞–º—è—Ç–∏ –≤ –º–Ω–æ–≥–æ—è–¥–µ—Ä–Ω–æ–º –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä–µ –¥–ª—è –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω—ã—Ö –≤—ã—á–∏—Å–ª–µ–Ω–∏–π. —Ç—ã –Ω–µ –º–æ–≥ –±—ã –ø–æ–º–æ—á—å —Å —ç—Ç–∏–º? –≤–æ—Ç –∫–∞–∫ —è –ø—Ä–µ–¥—Å—Ç–∞–≤–ª—è—é –∑–∞–¥–∞—á—É:\n\n1. –∏–º–µ–µ—Ç—Å—è –º–Ω–æ–≥–æ—è–¥–µ—Ä–Ω—ã–π –ø—Ä–æ—Ü–µ—Å—Å–æ—Ä —Å –æ–±—â–∏–º –∫—ç—à–µ–º –≤—Ç–æ—Ä–æ–≥–æ —É—Ä–æ–≤–Ω—è.\n2. –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ –º–∏–Ω–∏–º–∏–∑–∏—Ä–æ–≤–∞—Ç—å –ø—Ä–æ–º–∞—Ö–∏ –∫—ç—à–∞ –ø—Ä–∏ –ø–∞—Ä–∞–ª–ª–µ–ª—å–Ω–æ–º –≤—ã–ø–æ–ª–Ω–µ–Ω–∏–∏ –Ω–µ—Å–∫–æ–ª—å–∫–∏—Ö –∏–Ω—Ç–µ–Ω—Å–∏–≤–Ω—ã—Ö –∑–∞–¥–∞—á –ø

In [27]:
def formatting_func(example):
    return {"text": tokenizer.apply_chat_template(example["conversation"], tokenize=False)}

In [28]:
check_data_prep = vikhr_dataset.select(range(5))

In [29]:
len(check_data_prep)

5

In [30]:
check_data = check_data_prep.map(formatting_func)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 1229.86 examples/s]


In [31]:
check_data

Dataset({
    features: ['source', 'conversation', 'prompt_tokens', 'answer_tokens', 'cluster', 'prompt_lang', 'answer_lang', 'text'],
    num_rows: 5
})

In [32]:
check_data[0]

{'source': 'generated/saiga/tagengo/lmsys_pref',
 'conversation': [{'content': '–º–Ω–µ –æ—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã —Å—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–∏–µ –∏–≥—Ä—ã, –∏ —è –Ω–µ–¥–∞–≤–Ω–æ —É–∑–Ω–∞–ª –ø—Ä–æ –∏–≥—Ä—É –Ω–∏–º. –Ω–µ –º–æ–≥ –±—ã —Ç—ã –æ–±—ä—è—Å–Ω–∏—Ç—å –º–Ω–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—é –æ–ø—Ç–∏–º–∞–ª—å–Ω–æ–π –∏–≥—Ä—ã –≤ –Ω–∏–º? –∏ –µ—â–µ, –µ—Å–ª–∏ –µ—Å—Ç—å, –ø–æ–¥–µ–ª–∏—Å—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–º –≤–∞—Ä–∏–∞–Ω—Ç–æ–º –∏–≥—Ä—ã –≤ –∫—Ä–µ—Å—Ç–∏–∫–∏-–Ω–æ–ª–∏–∫–∏ –∏–ª–∏ –¥—Ä—É–≥–∏–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–∏–µ –≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∏, –≤ –∫–æ—Ç–æ—Ä—ã–µ –º—ã –º–æ–≥–ª–∏ –±—ã —Å—ã–≥—Ä–∞—Ç—å –≤–º–µ—Å—Ç–µ. –∫–∞–∫ –Ω–∞—Å—á–µ—Ç —Ç–æ–≥–æ, —á—Ç–æ–±—ã —Ä–∞—Å—Å–º–æ—Ç—Ä–µ—Ç—å 15 puzzle? –º–Ω–µ –±—ã —Ö–æ—Ç–µ–ª–æ—Å—å —É–∑–Ω–∞—Ç—å, –µ—Å—Ç—å –ª–∏ –¥–ª—è –Ω–µ—ë –∫–∞–∫–∞—è-—Ç–æ –≤—ã–∏–≥—Ä—ã—à–Ω–∞—è —Å—Ç—Ä–∞—Ç–µ–≥–∏—è –∏–ª–∏ –ø–æ–¥—Ö–æ–¥, –∫–æ—Ç–æ—Ä—ã–π –≥–∞—Ä–∞–Ω—Ç–∏—Ä—É–µ—Ç –ø–æ–±–µ–¥—É.',
   'role': 'user'},
  {'content': '–†–∞—Å—Å–∫–∞–∂—É —Ç–µ–±–µ –æ —Å—Ç—Ä–∞—Ç–µ–≥–∏—è—Ö –∏–≥—Ä—ã –≤ –ù–∏–º –∏ –∑–∞—Ç—Ä–æ–Ω—É —Ç–µ–º—É 

In [33]:
check_data[0]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n–º–Ω–µ –æ—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã —Å—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–∏–µ –∏–≥—Ä—ã, –∏ —è –Ω–µ–¥–∞–≤–Ω–æ —É–∑–Ω–∞–ª –ø—Ä–æ –∏–≥—Ä—É –Ω–∏–º. –Ω–µ –º–æ–≥ –±—ã —Ç—ã –æ–±—ä—è—Å–Ω–∏—Ç—å –º–Ω–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—é –æ–ø—Ç–∏–º–∞–ª—å–Ω–æ–π –∏–≥—Ä—ã –≤ –Ω–∏–º? –∏ –µ—â–µ, –µ—Å–ª–∏ –µ—Å—Ç—å, –ø–æ–¥–µ–ª–∏—Å—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–º –≤–∞—Ä–∏–∞–Ω—Ç–æ–º –∏–≥—Ä—ã –≤ –∫—Ä–µ—Å—Ç–∏–∫–∏-–Ω–æ–ª–∏–∫–∏ –∏–ª–∏ –¥—Ä—É–≥–∏–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–∏–µ –≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∏, –≤ –∫–æ—Ç–æ—Ä—ã–µ –º—ã –º–æ–≥–ª–∏ –±—ã —Å—ã–≥—Ä–∞—Ç—å –≤–º–µ—Å—Ç–µ. –∫–∞–∫ –Ω–∞—Å—á–µ—Ç —Ç–æ–≥–æ, —á—Ç–æ–±—ã —Ä–∞—Å—Å–º–æ—Ç—Ä–µ—Ç—å 15 puzzle? –º–Ω–µ –±—ã —Ö–æ—Ç–µ–ª–æ—Å—å —É–∑–Ω–∞—Ç—å, –µ—Å—Ç—å –ª–∏ –¥–ª—è –Ω–µ—ë –∫–∞–∫–∞—è-—Ç–æ –≤—ã–∏–≥—Ä—ã—à–Ω–∞—è —Å—Ç—Ä–∞—Ç–µ–≥–∏—è –∏–ª–∏ –ø–æ–¥—Ö–æ–¥, –∫–æ—Ç–æ—Ä—ã–π –≥–∞—Ä–∞–Ω—Ç–∏—Ä—É–µ—Ç –ø–æ–±–µ–¥—É.<|eot_id|><|start_header_id|>a

In [34]:
train_data = vikhr_dataset.map(formatting_func)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 86295/86295 [00:10<00:00, 8503.30 examples/s] 


### train

In [35]:
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    dataset_text_field="text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=30,
        num_train_epochs=1,
        max_steps=100,
        learning_rate=2e-3,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=24): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 86295/86295 [00:07<00:00, 11717.37 examples/s]


In [36]:
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=5.50GB, reserved=5.52GB, peak=7.02GB


In [37]:
!python -c "import torch; print(torch.__version__); print(torch.cuda.is_available())"

  import pynvml  # type: ignore[import]
2.7.1+cu126
True


In [38]:
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [39]:
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device count: 2
Current device: 0
Device name: NVIDIA GeForce RTX 3090 Ti


In [40]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 86,295 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.319
2,1.505
3,1.4398
4,1.3942
5,1.2066
6,1.2157
7,1.0303
8,1.0842
9,1.0802
10,1.1213


In [41]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=5.77GB, reserved=10.22GB, peak=11.74GB
NVML used=11.35GB / total=23.99GB


In [42]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/chat_template.jinja',
 'lora_model/tokenizer.json')

### –û—Ü–µ–Ω–∫–∞

In [11]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,    # QLoRA
)

==((====))==  Unsloth 2025.9.1: Fast Llama patching. Transformers: 4.56.1. vLLM: 0.10.1.1.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 2. Max memory: 23.536 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [13]:
model.save_pretrained_merged("llama-3.1-8B-instruct_lora_ru", tokenizer, save_method="merged_16bit")

Found HuggingFace hub cache directory: /home/viv232/.cache/huggingface/hub


Fetching 1 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.43it/s]


Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [36:15<00:00, 543.96s/it]


In [16]:
for text in prompts_for_test:
    print(generate_answer(text))
    print('-' * 50)

<|end_header_id|>

–ö–æ–Ω–µ—á–Ω–æ, –¥–∞–≤–∞–π—Ç–µ —Ä–∞–∑–±–µ—Ä–µ–º—Å—è, –∫–∞–∫ –º–æ–∂–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –≤–∫—É—Å–Ω—É—é –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ.

### –®–∞–≥–∏ –ø—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏—è –∏–Ω–¥–µ–π–∫–∏ –Ω–∞ –≥—Ä–∏–ª–µ:

#### –®–∞–≥ 1: –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∏–Ω–¥–µ–π–∫–∏
–ü—Ä–µ–∂–¥–µ –≤—Å–µ–≥–æ, —É–±–µ–¥–∏—Ç–µ—Å—å, —á—Ç–æ —É –≤–∞—Å –µ—Å—Ç—å —Å–≤–µ–∂–∞—è –∏–Ω–¥–µ–π–∫–∞. –ï—Å–ª–∏ –∏–Ω–¥–µ–π–∫–∞ –∑–∞–º–æ—Ä–æ–∂–µ–Ω–∞, –µ—ë –Ω—É–∂–Ω–æ —Ä–∞–∑–º–æ—Ä–æ–∑–∏—Ç—å.

#### –®–∞–≥ 2: –ú–∞—Ä–∏–Ω–∞–¥
–ú–∞—Ä–∏–Ω–∞–¥ –ø–æ–º–æ–≥–∞–µ—Ç –∏–Ω–¥–µ–π–∫–µ —Å—Ç–∞—Ç—å –±–æ–ª–µ–µ —Å–æ—á–Ω–æ–π –∏ –∞—Ä–æ–º–∞—Ç–Ω–æ–π. –í –±–ª–µ–Ω–¥–µ—Ä–µ —Å–º–µ—à–∞–π—Ç–µ:
- 1/2 —Å—Ç–∞–∫–∞–Ω–∞ –æ–ª–∏–≤–∫–æ–≤–æ–≥–æ –º–∞—Å–ª–∞
- 2 —Å—Ç–æ–ª–æ–≤—ã–µ –ª–æ–∂–∫–∏ –ª–∏–º–æ–Ω–Ω–æ–≥–æ —Å–æ–∫–∞
- 1 —Å—Ç–æ–ª–æ–≤–∞—è –ª–æ–∂–∫–∞ —Å–æ–ª–∏
- 1 —Å—Ç–æ–ª–æ–≤–∞—è –ª–æ–∂–∫–∞ —Å–∞—Ö–∞—Ä–∞
- 2 –∑—É–±—á–∏–∫–∞ —á–µ—Å–Ω–æ–∫–∞, –∏–∑–º–µ–ª—å—á–µ–Ω–Ω–æ–≥–æ
- 1 —á–∞–π–Ω—É—é –ª–æ–∂–∫—É –º–æ–ª–æ—Ç–æ–≥–æ —á–µ—Ä–Ω–æ–≥–æ –ø–µ—Ä—Ü–∞
- 1 —á–∞–π–Ω—É—é –ª–æ–∂

In [18]:
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=5.69GB, reserved=5.84GB, peak=7.08GB


In [20]:
nvidia_mem()

NVML used=7.19GB / total=23.99GB


In [23]:
# del model
# del tokenizer
# del trainer

In [26]:
gc.collect()

44949

In [27]:
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=0.12GB, reserved=5.84GB, peak=7.08GB


In [28]:
nvidia_mem()

NVML used=7.28GB / total=23.99GB


In [29]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

In [30]:
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=0.12GB, reserved=0.16GB, peak=7.08GB


In [31]:
nvidia_mem()

NVML used=1.58GB / total=23.99GB


In [33]:
nvidia_mem()

NVML used=1.57GB / total=23.99GB


In [32]:
!bash ./lm-evaluation-harness/run_lmesh_lora.sh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  import pynvml  # type: ignore[import]
INFO 09-09 18:16:51 [__init__.py:241] Automatically detected platform cuda.
2025-09-09:18:16:53 INFO     [__main__:446] Selected Tasks: ['truthfulqa_ru_mc1']
        Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).
2025-09-09:18:16:53 INFO     [evaluator:202] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-09-09:18:16:53 INFO     [evaluator:240] Initializing hf model, with arguments: {'pretrained': 'llama-3.1-8B-instruct_lora_ru', 'dtype': 'float'}
2025-09-09:18:16:53 INFO     [models.huggingface:147] Using device 'cuda:0'
2025-09-09:18:16:53 INFO     [models.huggingface:414] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà         | 2/4 [00:01<00:01,  1.72it/s]
Traceback (most r

In [34]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,    # QLoRA
)

==((====))==  Unsloth 2025.9.1: Fast Llama patching. Transformers: 4.56.1. vLLM: 0.10.1.1.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 2. Max memory: 23.536 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [36]:
model.save_pretrained_merged("llama-3.1-8B-instruct_lora_ru-4bit", tokenizer, save_method="merged_4bit_forced")

Unsloth: Merging LoRA weights into 4bit model...




Unsloth: Merging finished.
Unsloth: Found skipped modules: ['lm_head']. Updating config.
Unsloth: Saving merged 4bit model to llama-3.1-8B-instruct_lora_ru-4bit...
Unsloth: Merged 4bit model saved.
Unsloth: Merged 4bit model process completed.


In [37]:
!bash ./lm-evaluation-harness/run_lmesh_lora.sh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  import pynvml  # type: ignore[import]
INFO 09-09 18:28:38 [__init__.py:241] Automatically detected platform cuda.
2025-09-09:18:28:40 INFO     [__main__:446] Selected Tasks: ['truthfulqa_ru_mc1']
        applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).
2025-09-09:18:28:40 INFO     [evaluator:202] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-09-09:18:28:40 INFO     [evaluator:240] Initializing hf model, with arguments: {'pretrained': 'llama-3.1-8B-instruct_lora_ru-4bit', 'dtype': 'float'}
2025-09-09:18:28:40 INFO     [models.huggingface:147] Using device 'cuda:0'
2025-09-09:18:28:40 INFO     [models.huggingface:414] Model parallel was set to False, max memory was not set, and device map was set to {'': 'cuda:0'}
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:00<00:

## –ú–æ–¥–µ–ª—å YandexGPT-5-Lite-8B-instruct LoRA PEFT

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
from transformers import BitsAndBytesConfig

In [9]:
import os
os.environ['UNSLOTH_DISABLE'] = '1'

In [10]:
model_name = "yandex/YandexGPT-5-Lite-8B-instruct"

In [11]:
# –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è 4-–±–∏—Ç–Ω–æ–π –∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏–∏ –¥–ª—è QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [12]:
# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)

In [13]:
tokenizer.eos_token

'</s>'

In [14]:
tokenizer.pad_token

In [15]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [16]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –º–æ–¥–µ–ª–∏
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch.float16,  # –Ø–≤–Ω–æ —É–∫–∞–∑—ã–≤–∞–µ–º —Ç–∏–ø –¥–∞–Ω–Ω—ã—Ö
    trust_remote_code=True,
    use_cache=False,  # –î–æ–ª–∂–Ω–æ –±—ã—Ç—å False –ø—Ä–∏ gradient checkpointing
    low_cpu_mem_usage=False
)

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:50<00:00, 12.74s/it]


In [17]:
# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –º–æ–¥–µ–ª–∏ –¥–ª—è k-bit –æ–±—É—á–µ–Ω–∏—è
model = prepare_model_for_kbit_training(model)

In [18]:
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=2.81GB, reserved=4.11GB, peak=3.79GB


In [19]:
nvidia_mem()

NVML used=5.38GB / total=23.99GB


In [20]:
from peft import LoraConfig, get_peft_model

In [21]:
for name, module in model.named_modules():
    if "proj" in name:
        print(name)

model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.self_attn.o_proj
model.layers.2.mlp.gate_proj
model.layers.2.mlp.up_proj
model.layers.2.mlp.down_proj
model.layers.3.self_attn.q_proj
model.layers.3.self_attn.k_proj
model.layers.3.self_attn.v_proj
model.layers.3.self_attn.o_proj
model.layers.3.mlp.gate_proj
model.layers.3.mlp.up_proj
model.layers.3.mlp.down_proj
model.layers.4.self_attn.q_proj
model.layers.4.self_attn.k_proj
model.layers.4.self_attn.v_proj
model.layers.4.self_attn.o_proj
model.layers.4.mlp.g

In [22]:
# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ LoRA
lora_config = LoraConfig(
    r=16,  # –†–∞–Ω–≥
    lora_alpha=16,  # –ö–æ—ç—Ñ—Ñ–∏—Ü–∏–µ–Ω—Ç –º–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏—è
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [23]:
# –ü—Ä–∏–º–µ–Ω–µ–Ω–∏–µ LoRA –∫ –º–æ–¥–µ–ª–∏
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 41,943,040 || all params: 8,078,495,744 || trainable%: 0.5192


### –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [24]:
from datasets import Dataset

In [25]:
vikhr_dataset = load_dataset("Vikhrmodels/GrandMaster-PRO-MAX", split="train")

In [26]:
if len(vikhr_dataset) > 10000:
    vikhr_dataset = vikhr_dataset.select(range(10000))

In [27]:
len(vikhr_dataset)

10000

In [28]:
def filter_russian(example):
    return example['prompt_lang'] == 'ru' and example['answer_lang'] == 'ru'

In [29]:
vikhr_dataset = vikhr_dataset.filter(filter_russian)

In [30]:
dataset = Dataset.from_list(vikhr_dataset)

In [31]:
def formatting_func(example):

    conversation = example["conversation"]
    
    # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º —Å –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–µ–º —á–∞—Ç-—à–∞–±–ª–æ–Ω–∞
    # –°–Ω–∞—á–∞–ª–∞ –ø–æ–ª—É—á–∞–µ–º —Ç–µ–∫—Å—Ç –∏–∑ —á–∞—Ç-—à–∞–±–ª–æ–Ω–∞
    text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,  # –ù–µ —Ç–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º, –ø–æ–ª—É—á–∞–µ–º —Ç–µ–∫—Å—Ç
        truncation=True,
        max_length=1024,
        padding=False,  # –ù–µ –¥–æ–±–∞–≤–ª—è–µ–º –ø–∞–¥–¥–∏–Ω–≥ –∑–¥–µ—Å—å
    )
    
    # –¢–µ–ø–µ—Ä—å —Ç–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º —Ç–µ–∫—Å—Ç –æ–±—ã—á–Ω—ã–º —Å–ø–æ—Å–æ–±–æ–º
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=1024,
        padding="max_length",
        return_tensors=None
    )
    
    # print(f'input_ids: {tokenized["input_ids"][:10]}...')  # –ü–µ—Ä–≤—ã–µ 10 —Ç–æ–∫–µ–Ω–æ–≤
    # print(f'attention_mask: {tokenized["attention_mask"][:10]}...')
    
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"].copy()  # –ö–æ–ø–∏—Ä—É–µ–º –¥–ª—è labels
    }
    
    # # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º —Å –ø—Ä–∏–º–µ–Ω–µ–Ω–∏–µ–º —á–∞—Ç-—à–∞–±–ª–æ–Ω–∞
    # tokenized = tokenizer.apply_chat_template(
    #     example["conversation"],
    #     # tokenize=True,
    #     truncation=True,
    #     max_length=1024,
    #     padding="max_length",  # –î–æ–±–∞–≤–ª—è–µ–º –ø–∞–¥–¥–∏–Ω–≥ –¥–æ –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–π –¥–ª–∏–Ω—ã
    #     return_tensors=None
    # )

    # # –°–æ–∑–¥–∞–µ–º attention_mask
    # # attention_masks = []
    # # mask = [token_id != tokenizer.pad_token_id for token_id in input_ids]
    # # attention_masks.append(mask)

    # print(tokenized)
    # print(f'input_ids: {tokenized["input_ids"]}')
    # print(f'attention_mask: {tokenized["attention_mask"]}')
    
    # return {
    #     "input_ids": tokenized["input_ids"],
    #     "attention_mask": tokenized["attention_mask"],
    #     "labels": tokenized["input_ids"].copy()  # –ö–æ–ø–∏—Ä—É–µ–º –¥–ª—è labels
    # }

In [32]:
check_data_prep = dataset.select(range(5))

In [33]:
check_data = check_data_prep.map(formatting_func,
                                 batched=False,
                                 # batch_size=1000,
                                 remove_columns=check_data_prep.column_names  # –£–¥–∞–ª—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏
                                )

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 323.23 examples/s]


In [34]:
check_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})

In [35]:
tokenized_dataset = dataset.map(
    formatting_func,
    batched=False,
    remove_columns=dataset.column_names
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9906/9906 [00:04<00:00, 2268.72 examples/s]


In [36]:
print("Input IDs:", tokenized_dataset[0]["input_ids"][:10])
print("Attention mask:", tokenized_dataset[0]["attention_mask"][:10])
print("Length:", len(tokenized_dataset[0]["input_ids"]))

Input IDs: [1, 1, 16861, 125851, 1759, 1403, 52612, 26900, 2019, 5386]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Length: 1024


In [37]:
prompts_for_test = [
    '–ö–∞–∫ –≤–∫—É—Å–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ?',
    '–ö–∞–∫ —Ä–∞—Å–ø–æ–∑–Ω–∞—Ç—å –ø—Ä–∏–±–ª–∏–∂–∞—é—â–∏–π—Å—è –∏–Ω—Å—É–ª—å—Ç?',
    '–°—Ñ–æ—Ä–º—É–ª–∏—Ä—É–π –æ—Å–Ω–æ–≤–Ω—ã–µ –∫–∞–Ω–æ–Ω—ã –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—ã –¥—Ä–µ–≤–Ω–∏—Ö —Ü–∏–≤–∏–ª–∏–∑–∞—Ü–∏–π',
    '–û–±–ª–∞–≥–∞—Ç—å –ª–∏ —Å—Ç—Ä–∞—Ö–æ–≤—ã–º–∏ –≤–∑–Ω–æ—Å–∞–º–∏ —Å—É–º–º—ã –ø—Ä–æ—â–µ–Ω–Ω–æ–≥–æ –¥–æ–ª–≥–∞ –ø–æ –∑–∞–π–º—É –æ—Ç –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏ –≥–¥–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –∑–∞—Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–Ω—ã–π?',
    '–†–∞—Å—Å–∫–∞–∂–∏ –º–Ω–µ –ø—Ä–æ –ö—É—Ä—á–∞—Ç–æ–≤–∞'
]

In [38]:
def generate_answer(prompt):
    dialog = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}], 
        tokenize=False, 
        add_generation_prompt=True
    )
    inputs = tokenizer(dialog, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id
        )
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # –ò–∑–≤–ª–µ–∫–∞–µ–º —Ç–æ–ª—å–∫–æ –æ—Ç–≤–µ—Ç –∞—Å—Å–∏—Å—Ç–µ–Ω—Ç–∞
    if "assistant" in response:
        return response.split("assistant")[-1].strip()
    return response

In [39]:
for text in prompts_for_test:
    print(generate_answer(text))
    print('-' * 50)

–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: –ö–∞–∫ –≤–∫—É—Å–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ?

 –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ß—Ç–æ–±—ã –≤–∫—É—Å–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ, –º–æ–∂–Ω–æ –≤–æ—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å—Å—è —Å–ª–µ–¥—É—é—â–∏–º —Ä–µ—Ü–µ–ø—Ç–æ–º:

**–ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã:**
* –∏–Ω–¥–µ–π–∫–∞ (–ª—é–±—ã–µ —á–∞—Å—Ç–∏, –Ω–∞–ø—Ä–∏–º–µ—Ä, –∫—Ä—ã–ª—å—è, –≥—Ä—É–¥–∫–∞ –∏–ª–∏ –Ω–æ–∂–∫–∏) ‚Äî 1 –∫–≥;
* –æ–ª–∏–≤–∫–æ–≤–æ–µ –º–∞—Å–ª–æ ‚Äî 2 —Å—Ç. –ª.;
* —á–µ—Å–Ω–æ–∫ ‚Äî 3‚Äì4 –∑—É–±—á–∏–∫–∞;
* —Å–≤–µ–∂–∏–π —Ä–æ–∑–º–∞—Ä–∏–Ω ‚Äî 1 –≤–µ—Ç–æ—á–∫–∞;
* —Å–≤–µ–∂–∏–π —Ç–∏–º—å—è–Ω ‚Äî 1 –≤–µ—Ç–æ—á–∫–∞;
* —Å–≤–µ–∂–∏–π –æ—Ä–µ–≥–∞–Ω–æ (–∏–ª–∏ –¥—Ä—É–≥–∏–µ —Ç—Ä–∞–≤—ã –ø–æ –≤–∫—É—Å—É) ‚Äî 1 –≤–µ—Ç–æ—á–∫–∞;
* —Å–æ–ª—å ‚Äî –ø–æ –≤–∫—É—Å—É;
* —á—ë—Ä–Ω—ã–π –ø–µ—Ä–µ—Ü (–º–æ–ª–æ—Ç—ã–π) ‚Äî –ø–æ –≤–∫—É—Å—É;
* –ª–∏–º–æ–Ω–Ω—ã–π —Å–æ–∫ ‚Äî 2 —Å—Ç. –ª. (–ø–æ –∂–µ–ª–∞–Ω–∏—é).

**–ü—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏–µ:**
1. –†–∞–∑–æ–≥—Ä–µ–π—Ç–µ –≥—Ä–∏–ª—å –¥–æ —Å—Ä–µ–¥–Ω–µ–π —Ç–µ–º–ø–µ—Ä–∞—Ç—É—Ä—ã.

### train

In [40]:
print("Input IDs type:", type(tokenized_dataset[0]["input_ids"][0]))
print("Attention mask type:", type(tokenized_dataset[0]["attention_mask"][0]))

Input IDs type: <class 'int'>
Attention mask type: <class 'int'>


In [41]:
# # –ö–∞—Å—Ç–æ–º–Ω—ã–π data collator –¥–ª—è –ø—Ä–∞–≤–∏–ª—å–Ω–æ–π –æ–±—Ä–∞–±–æ—Ç–∫–∏ —Ç–∏–ø–æ–≤ –¥–∞–Ω–Ω—ã—Ö
# class CustomDataCollator(DataCollatorForLanguageModeling):
#     def __call__(self, features):
#         batch = super().__call__(features)
        
#         # –ü—Ä–µ–æ–±—Ä–∞–∑—É–µ–º attention_mask –≤ bool
#         if 'attention_mask' in batch:
#             batch['attention_mask'] = batch['attention_mask'].bool()
        
#         return batch

In [42]:
dataset_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

In [43]:
training_args = TrainingArguments(
    output_dir="./yandexgpt-lora-finetuned",
    per_device_train_batch_size=1, #2,
    per_device_eval_batch_size=1, #2,
    gradient_accumulation_steps=8, #4,
    learning_rate=2e-4,
    num_train_epochs=1, #3,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",  # –û—Ü–µ–Ω–∫–∞ –ø–æ—Å–ª–µ –∫–∞–∂–¥–æ–π —ç–ø–æ—Ö–∏
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    report_to="none",
    optim="paged_adamw_8bit",       # –í–∞–∂–Ω–æ –¥–ª—è QLoRA
    gradient_checkpointing=True,
    dataloader_pin_memory=False,
)

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), # CustomDataCollator(tokenizer, mlm=False), 
)

In [45]:
# –ü—Ä–∏–Ω—É–¥–∏—Ç–µ–ª—å–Ω–æ –∏—Å–ø–æ–ª—å–∑—É–µ–º eager attention –≤–º–µ—Å—Ç–æ SDPA
model.config._attn_implementation = "eager"

In [46]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=2.86GB, reserved=4.18GB, peak=3.79GB
NVML used=5.29GB / total=23.99GB


In [47]:
gc.collect()
torch.cuda.empty_cache()

In [48]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=2.86GB, reserved=4.15GB, peak=3.79GB
NVML used=5.27GB / total=23.99GB


In [49]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.5106,1.98106


TrainOutput(global_step=1115, training_loss=1.7642043417344713, metrics={'train_runtime': 13406.1206, 'train_samples_per_second': 0.665, 'train_steps_per_second': 0.083, 'total_flos': 4.135426241593344e+17, 'train_loss': 1.7642043417344713, 'epoch': 1.0})

In [50]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=2.87GB, reserved=4.32GB, peak=3.79GB
NVML used=5.54GB / total=23.99GB


In [51]:
trainer.save_model()
tokenizer.save_pretrained("./yandexgpt-lora-finetuned")

('./yandexgpt-lora-finetuned/tokenizer_config.json',
 './yandexgpt-lora-finetuned/special_tokens_map.json',
 './yandexgpt-lora-finetuned/chat_template.jinja',
 './yandexgpt-lora-finetuned/tokenizer.model',
 './yandexgpt-lora-finetuned/added_tokens.json')

### –û—Ü–µ–Ω–∫–∞

In [56]:
from peft import PeftModel, PeftConfig

In [52]:
# –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è –¥–ª—è 4-–±–∏—Ç–Ω–æ–π –∑–∞–≥—Ä—É–∑–∫–∏ (—Ç–∞–∫–∞—è –∂–µ –∫–∞–∫ –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [53]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –±–∞–∑–æ–≤–æ–π –º–æ–¥–µ–ª–∏ —Å –∫–≤–∞–Ω—Ç–∏–∑–∞—Ü–∏–µ–π
model_name = "yandex/YandexGPT-5-Lite-8B-instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:01<00:00,  2.41it/s]


In [54]:
# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)

In [55]:
# –î–æ–±–∞–≤–ª—è–µ–º –ø–∞–¥–¥–∏–Ω–≥ —Ç–æ–∫–µ–Ω –µ—Å–ª–∏ –Ω—É–∂–Ω–æ
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [58]:
# –ó–∞–≥—Ä—É–∑–∫–∞ LoRA –∞–¥–∞–ø—Ç–µ—Ä–∞
lora_adapter_path = "./yandexgpt-lora-finetuned"  # –ø—É—Ç—å –∫ –≤–∞—à–µ–º—É –∞–¥–∞–ø—Ç–µ—Ä—É
model = PeftModel.from_pretrained(base_model, lora_adapter_path)

In [59]:
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(129024, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [60]:
for text in prompts_for_test:
    print(generate_answer(text))
    print('-' * 50)

–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: –ö–∞–∫ –≤–∫—É—Å–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ?

 –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –¢—ã –Ω–µ –º–æ–≥ –±—ã –ø–æ–¥–µ–ª–∏—Ç—å—Å—è —Ä–µ—Ü–µ–ø—Ç–æ–º –≤–∫—É—Å–Ω–æ–≥–æ –º–∞—Ä–∏–Ω–∞–¥–∞ –¥–ª—è –∏–Ω–¥–µ–π–∫–∏? –Ø —Ö–æ—á—É –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –µ—ë –Ω–∞ –≥—Ä–∏–ª–µ, –Ω–æ –Ω–µ –∑–Ω–∞—é, –∫–∞–∫–∏–µ —Å–ø–µ—Ü–∏–∏ –∏ –ø—Ä–∏–ø—Ä–∞–≤—ã –∏—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å.
--------------------------------------------------
–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: –ö–∞–∫ —Ä–∞—Å–ø–æ–∑–Ω–∞—Ç—å –ø—Ä–∏–±–ª–∏–∂–∞—é—â–∏–π—Å—è –∏–Ω—Å—É–ª—å—Ç?

 –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: [SEP] –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: –ê—Å—

In [61]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=6.38GB, reserved=8.95GB, peak=7.31GB
NVML used=10.26GB / total=23.99GB


–ë—É–¥—É –±–ª–∞–≥–æ–¥–∞—Ä–µ–Ω –µ—Å–ª–∏ –ø–æ–¥—Å–∫–∞–∂–µ—Ç–µ –≥–¥–µ —è –Ω–∞–∫–æ—Å—è—á–∏–ª —Å —à–∞–±–ª–æ–Ω–æ–º

:(

## –†–∞–±–æ—Ç–∞ –Ω–∞–¥ –æ—à–∏–±–∫–∞–º–∏ - –≤—Ç–æ—Ä–æ–π –∑–∞—Ö–æ–¥

In [62]:
def formatting_func(example):
    conversation = example["conversation"]
    
    # –ü—Ä–∏–º–µ–Ω—è–µ–º —á–∞—Ç-—à–∞–±–ª–æ–Ω —Å —Ç–æ–∫–µ–Ω–∏–∑–∞—Ü–∏–µ–π
    tokenized = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,  # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º —Å—Ä–∞–∑—É!
        truncation=True,
        max_length=1024,
        padding="max_length",
        return_tensors=None
    )
    
    # –î–ª—è causal LM –º–µ—Ç–∫–∏ —Ç–∞–∫–∏–µ –∂–µ –∫–∞–∫ input_ids
    return {
        "input_ids": tokenized,
        "attention_mask": [1] * len(tokenized),  # –í—Å–µ —Ç–æ–∫–µ–Ω—ã –∑–Ω–∞—á–∏–º—ã–µ
        "labels": tokenized.copy()  # –ö–æ–ø–∏—Ä—É–µ–º –¥–ª—è labels
    }

In [None]:
check_data_prep = dataset.select(range(5))

In [63]:
check_data = check_data_prep.map(formatting_func,
                                 batched=False,
                                 # batch_size=1000,
                                 remove_columns=check_data_prep.column_names  # –£–¥–∞–ª—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏
                                )

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 821.54 examples/s]


In [64]:
print("Input IDs:", tokenized_dataset[0]["input_ids"][:10])
print("Attention mask:", tokenized_dataset[0]["attention_mask"][:10])
print("Length:", len(tokenized_dataset[0]["input_ids"]))

Input IDs: [1, 1, 16861, 125851, 1759, 1403, 52612, 26900, 2019, 5386]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Length: 1024


In [72]:
def formatting_func(example):
    conversation = example["conversation"]
    
    # –ü—Ä–∏–º–µ–Ω—è–µ–º —á–∞—Ç-—à–∞–±–ª–æ–Ω –ë–ï–ó –æ–±—Ä–µ–∑–∫–∏
    tokenized = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        truncation=False,  # –û—Ç–∫–ª—é—á–∞–µ–º –æ–±—Ä–µ–∑–∫—É!
        max_length=None,   # –ë–µ–∑ –æ–≥—Ä–∞–Ω–∏—á–µ–Ω–∏—è –¥–ª–∏–Ω—ã
        padding=False,     # –ù–µ –¥–æ–±–∞–≤–ª—è–µ–º –ø–∞–¥–¥–∏–Ω–≥ –∑–¥–µ—Å—å
        return_tensors=None
    )
    
    # –¢–µ–ø–µ—Ä—å –¥–æ–±–∞–≤–ª—è–µ–º –ø–∞–¥–¥–∏–Ω–≥ –æ—Ç–¥–µ–ª—å–Ω–æ
    if len(tokenized) < 1024:
        # –î–æ–±–∞–≤–ª—è–µ–º –ø–∞–¥–¥–∏–Ω–≥
        padded = tokenized + [tokenizer.pad_token_id] * (1024 - len(tokenized))
        attention_mask = [1] * len(tokenized) + [0] * (1024 - len(tokenized))
    else:
        # –û–±—Ä–µ–∑–∞–µ–º –¥–æ –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–π –¥–ª–∏–Ω—ã
        padded = tokenized[:1024]
        attention_mask = [1] * 1024
    
    return {
        "input_ids": padded,
        "attention_mask": attention_mask,
        "labels": padded.copy()
    }

In [32]:
def formatting_func_final(example):
    conversation = example["conversation"]
    
    # –°–æ–∑–¥–∞–µ–º –ø–æ–ª–Ω—ã–π –¥–∏–∞–ª–æ–≥
    full_dialog = "<s>"
    for message in conversation:
        if message["role"] == "user":
            full_dialog += f" –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: {message['content']}"
        elif message["role"] == "assistant":
            full_dialog += f" –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: {message['content']}[SEP]"
    
    # –¢–æ–∫–µ–Ω–∏–∑–∏—Ä—É–µ–º
    tokenized = tokenizer(
        full_dialog,
        truncation=True,
        max_length=2048,  # –î–æ—Å—Ç–∞—Ç–æ—á–Ω–æ –¥–ª—è –ø–æ–ª–Ω—ã—Ö –æ—Ç–≤–µ—Ç–æ–≤
        padding="max_length",
        return_tensors=None
    )
    
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"].copy()  # –î–ª—è simple causal LM
    }

In [81]:
def verify_training_data():
    print("–ü—Ä–æ–≤–µ—Ä–∫–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∏ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –æ–±—É—á–µ–Ω–∏—è:")
    print("=" * 60)
    
    for i in range(min(3, len(dataset))):
        sample = dataset[i]
        tokenized = formatting_func_final(sample)
        decoded = tokenizer.decode(tokenized["input_ids"])
        
        print(f"\n–ü—Ä–∏–º–µ—Ä {i+1}:")
        print(f"–î–ª–∏–Ω–∞: {len([x for x in tokenized['input_ids'] if x != tokenizer.pad_token_id])} —Ç–æ–∫–µ–Ω–æ–≤")
        
        # –ü—Ä–æ–≤–µ—Ä—è–µ–º –∫–ª—é—á–µ–≤—ã–µ —ç–ª–µ–º–µ–Ω—Ç—ã
        has_user = "–ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å:" in decoded
        has_assistant = "–ê—Å—Å–∏—Å—Ç–µ–Ω—Ç:" in decoded
        has_sep = "[SEP]" in decoded
        has_content = any(word in decoded for word in ["—Å—Ç—Ä–∞—Ç–µ", "–∏–≥—Ä", "–æ—Ç–≤–µ—Ç"])
        
        print(f"‚úì –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: {has_user}")
        print(f"‚úì –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: {has_assistant}")
        print(f"‚úì [SEP]: {has_sep}")
        print(f"‚úì –ö–æ–Ω—Ç–µ–Ω—Ç: {has_content}")
        print(decoded)
        
        if all([has_user, has_assistant, has_sep, has_content]):
            print("‚úì –î–∞–Ω–Ω—ã–µ –∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã –¥–ª—è –æ–±—É—á–µ–Ω–∏—è")
        else:
            print("‚úó –ü—Ä–æ–±–ª–µ–º–∞ —Å –¥–∞–Ω–Ω—ã–º–∏")

In [82]:
verify_training_data()

–ü—Ä–æ–≤–µ—Ä–∫–∞ –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∏ –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –æ–±—É—á–µ–Ω–∏—è:

–ü—Ä–∏–º–µ—Ä 1:
–î–ª–∏–Ω–∞: 569 —Ç–æ–∫–µ–Ω–æ–≤
‚úì –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: True
‚úì –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç: True
‚úì [SEP]: True
‚úì –ö–æ–Ω—Ç–µ–Ω—Ç: True
<s><s> –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: –º–Ω–µ –æ—á–µ–Ω—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã —Å—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–∏–µ –∏–≥—Ä—ã, –∏ —è –Ω–µ–¥–∞–≤–Ω–æ —É–∑–Ω–∞–ª –ø—Ä–æ –∏–≥—Ä—É –Ω–∏–º. –Ω–µ –º–æ–≥ –±—ã —Ç—ã –æ–±—ä—è—Å–Ω–∏—Ç—å –º–Ω–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—é –æ–ø—Ç–∏–º–∞–ª—å–Ω–æ–π –∏–≥—Ä—ã –≤ –Ω–∏–º? –∏ –µ—â–µ, –µ—Å–ª–∏ –µ—Å—Ç—å, –ø–æ–¥–µ–ª–∏—Å—å –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–º –≤–∞—Ä–∏–∞–Ω—Ç–æ–º –∏–≥—Ä—ã –≤ –∫—Ä–µ—Å—Ç–∏–∫–∏-–Ω–æ–ª–∏–∫–∏ –∏–ª–∏ –¥—Ä—É–≥–∏–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—á–µ—Å–∫–∏–µ –≥–æ–ª–æ–≤–æ–ª–æ–º–∫–∏, –≤ –∫–æ—Ç–æ—Ä—ã–µ –º—ã –º–æ–≥–ª–∏ –±—ã —Å—ã–≥—Ä–∞—Ç—å –≤–º–µ—Å—Ç–µ. –∫–∞–∫ –Ω–∞—Å—á–µ—Ç —Ç–æ–≥–æ, —á—Ç–æ–±—ã —Ä–∞—Å—Å–º–æ—Ç—Ä–µ—Ç—å 15 puzzle? –º–Ω–µ –±—ã —Ö–æ—Ç–µ–ª–æ—Å—å —É–∑–Ω–∞—Ç—å, –µ—Å—Ç—å –ª–∏ –¥–ª—è –Ω–µ—ë –∫–∞–∫–∞—è-—Ç–æ –≤—ã–∏–≥—Ä—ã—à–Ω–∞—è —Å—Ç—Ä–∞—Ç–µ–≥–∏—è –∏–ª–∏ –ø–æ–¥—Ö–æ–¥, –∫–

### –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –º–æ–¥–µ–ª–∏

In [31]:
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=2.85GB, reserved=4.15GB, peak=3.79GB


In [33]:
check_data_prep = dataset.select(range(5))

In [34]:
check_data = check_data_prep.map(formatting_func_final,
                                 batched=False,
                                 # batch_size=1000,
                                 remove_columns=check_data_prep.column_names  # –£–¥–∞–ª—è–µ–º –æ—Ä–∏–≥–∏–Ω–∞–ª—å–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏
                                )

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00, 275.48 examples/s]


In [35]:
check_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})

In [38]:
tokenized_dataset = dataset.map(
    formatting_func_final,
    batched=False,
    remove_columns=dataset.column_names
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9906/9906 [00:13<00:00, 710.49 examples/s]


In [37]:
# check_data[0]

In [39]:
print("Input IDs:", tokenized_dataset[0]["input_ids"][:10])
print("Attention mask:", tokenized_dataset[0]["attention_mask"][:10])
print("Length:", len(tokenized_dataset[0]["input_ids"]))

Input IDs: [1, 1, 16861, 125851, 1759, 1403, 52612, 26900, 2019, 5386]
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Length: 2048


In [40]:
prompts_for_test = [
    '–ö–∞–∫ –≤–∫—É—Å–Ω–æ –ø—Ä–∏–≥–æ—Ç–æ–≤–∏—Ç—å –∏–Ω–¥–µ–π–∫—É –Ω–∞ –≥—Ä–∏–ª–µ?',
    '–ö–∞–∫ —Ä–∞—Å–ø–æ–∑–Ω–∞—Ç—å –ø—Ä–∏–±–ª–∏–∂–∞—é—â–∏–π—Å—è –∏–Ω—Å—É–ª—å—Ç?',
    '–°—Ñ–æ—Ä–º—É–ª–∏—Ä—É–π –æ—Å–Ω–æ–≤–Ω—ã–µ –∫–∞–Ω–æ–Ω—ã –∞—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä—ã –¥—Ä–µ–≤–Ω–∏—Ö —Ü–∏–≤–∏–ª–∏–∑–∞—Ü–∏–π',
    '–û–±–ª–∞–≥–∞—Ç—å –ª–∏ —Å—Ç—Ä–∞—Ö–æ–≤—ã–º–∏ –≤–∑–Ω–æ—Å–∞–º–∏ —Å—É–º–º—ã –ø—Ä–æ—â–µ–Ω–Ω–æ–≥–æ –¥–æ–ª–≥–∞ –ø–æ –∑–∞–π–º—É –æ—Ç –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–∏ –≥–¥–µ —Ä–∞–±–æ—Ç–∞–µ—Ç –∑–∞—Å—Ç—Ä–∞—Ö–æ–≤–∞–Ω–Ω—ã–π?',
    '–†–∞—Å—Å–∫–∞–∂–∏ –º–Ω–µ –ø—Ä–æ –ö—É—Ä—á–∞—Ç–æ–≤–∞'
]

In [41]:
def generate_answer_correct(prompt):
    # –§–æ—Ä–º–∞—Ç–∏—Ä—É–µ–º —Ç–æ–ª—å–∫–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π –ø—Ä–æ–º–ø—Ç
    user_prompt = f"<s> –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: {prompt} –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç:"
    
    inputs = tokenizer(
        user_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # –ò–∑–≤–ª–µ–∫–∞–µ–º —Ç–æ–ª—å–∫–æ —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–Ω—É—é —á–∞—Å—Ç—å
    generated = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)
    
    # –£–±–∏—Ä–∞–µ–º –≤–æ–∑–º–æ–∂–Ω—ã–π [SEP] –≤ –∫–æ–Ω—Ü–µ
    response = response.replace("[SEP]", "").strip()
    
    return response

In [43]:
for text in prompts_for_test:
    print(generate_answer_correct(text))
    print('-' * 50)

**–ò–Ω–¥–µ–π–∫–∞ –Ω–∞ –≥—Ä–∏–ª–µ: –ø–æ—à–∞–≥–æ–≤—ã–π —Ä–µ—Ü–µ–ø—Ç**

**–ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã:**
* –ò–Ω–¥–µ–π–∫–∞ (—Ñ–∏–ª–µ –∏–ª–∏ –¥—Ä—É–≥–∏–µ —á–∞—Å—Ç–∏) ‚Äî 1 –∫–≥;
* –°–æ–ª—å ‚Äî –ø–æ –≤–∫—É—Å—É;
* –ß—ë—Ä–Ω—ã–π –ø–µ—Ä–µ—Ü (–º–æ–ª–æ—Ç—ã–π) ‚Äî –ø–æ –≤–∫—É—Å—É;
* –ß–µ—Å–Ω–æ–∫ ‚Äî 3‚Äì4 –∑—É–±—á–∏–∫–∞;
* –†–æ–∑–º–∞—Ä–∏–Ω —Å–≤–µ–∂–∏–π ‚Äî 1 –≤–µ—Ç–æ—á–∫–∞;
* –û–ª–∏–≤–∫–æ–≤–æ–µ –º–∞—Å–ª–æ ‚Äî 2‚Äì3 —Å—Ç. –ª.;
* –õ–∏–º–æ–Ω–Ω—ã–π —Å–æ–∫ ‚Äî 2‚Äì3 —Å—Ç. –ª.;
* –°–æ–µ–≤—ã–π —Å–æ—É—Å ‚Äî 1‚Äì2 —Å—Ç. –ª. (–ø–æ –∂–µ–ª–∞–Ω–∏—é);
* –°–ø–µ—Ü–∏–∏ –¥–ª—è –ø—Ç–∏—Ü—ã (–ø–æ –∂–µ–ª–∞–Ω–∏—é) ‚Äî –ø–æ –≤–∫—É—Å—É.

**–ü—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏–µ:**

1. –ü–æ–¥–≥–æ—Ç–æ–≤—å—Ç–µ –∏–Ω–¥–µ–π–∫—É: –ø—Ä–æ–º–æ–π—Ç–µ –∏ –æ–±—Å—É—à–∏—Ç–µ –±—É–º–∞–∂–Ω—ã–º –ø–æ–ª–æ—Ç–µ–Ω—Ü–µ–º. –ù–∞—Ä–µ–∂—å—Ç–µ –∏–Ω–¥–µ–π–∫—É –Ω–∞ –ø–æ—Ä—Ü–∏–æ–Ω–Ω—ã–µ –∫—É—Å–∫–∏ –∏–ª–∏ –æ—Å—Ç–∞–≤—å—Ç–µ —Ü–µ–ª–∏–∫–æ–º, –≤ –∑–∞–≤–∏—Å–∏–º–æ—Å—Ç–∏ –æ—Ç –≤–∞—à–∏—Ö –ø—Ä–µ–¥–ø–æ—á—Ç–µ–Ω–∏–π.

2. –ß–µ—Å–Ω–æ–∫ –æ—á–∏—Å—Ç–∏—Ç–µ –∏ –ø—Ä–æ–ø—É—Å—Ç–∏—Ç–µ —á–µ—Ä–µ–∑ 

### train

In [44]:
print("Input IDs type:", type(tokenized_dataset[0]["input_ids"][0]))
print("Attention mask type:", type(tokenized_dataset[0]["attention_mask"][0]))

Input IDs type: <class 'int'>
Attention mask type: <class 'int'>


In [45]:
dataset_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

In [48]:
training_args = TrainingArguments(
    output_dir="./yandexgpt-lora-finetuned",
    per_device_train_batch_size=1, #2,
    per_device_eval_batch_size=1, #2,
    gradient_accumulation_steps=8, #4,
    learning_rate=2e-4,
    num_train_epochs=1, #3,
    
    logging_dir="./logs",
    logging_steps=5,           # –õ–æ–≥–∏—Ä–æ–≤–∞—Ç—å –∫–∞–∂–¥—ã–µ 5 —à–∞–≥–æ–≤
    logging_first_step=True,   # –õ–æ–≥–∏—Ä–æ–≤–∞—Ç—å –ø–µ—Ä–≤—ã–π —à–∞–≥
    logging_strategy="steps",  # –õ–æ–≥–∏—Ä–æ–≤–∞—Ç—å –ø–æ —à–∞–≥–∞–º
    
    eval_strategy="steps",  # –û—Ü–µ–Ω–∫–∞ –ø–æ —à–∞–≥–∞–º –≤–º–µ—Å—Ç–æ —ç–ø–æ—Ö
    eval_steps=50,               # –û—Ü–µ–Ω–∏–≤–∞—Ç—å –∫–∞–∂–¥—ã–µ 50 —à–∞–≥–æ–≤

    log_level="info",           # –ë–æ–ª–µ–µ –ø–æ–¥—Ä–æ–±–Ω—ã–π —É—Ä–æ–≤–µ–Ω—å –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è
    disable_tqdm=False,         # –í–∫–ª—é—á–∏—Ç—å –ø—Ä–æ–≥—Ä–µ—Å—Å-–±–∞—Ä
    
    save_strategy="steps",       # –°–æ—Ö—Ä–∞–Ω—è—Ç—å –ø–æ —à–∞–≥–∞–º
    save_steps=100,              # –°–æ—Ö—Ä–∞–Ω—è—Ç—å –∫–∞–∂–¥—ã–µ 100 —à–∞–≥–æ–≤
    save_total_limit=1,
    
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    report_to="none",
    optim="paged_adamw_8bit",       # –í–∞–∂–Ω–æ –¥–ª—è QLoRA
    gradient_checkpointing=True,
    dataloader_pin_memory=False,
)

In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False), # CustomDataCollator(tokenizer, mlm=False), 
)

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend


In [50]:
model.config._attn_implementation = "eager"

In [51]:
gc.collect()
torch.cuda.empty_cache()

In [52]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=2.86GB, reserved=4.15GB, peak=3.79GB
NVML used=5.33GB / total=23.99GB


In [53]:
trainer.train()

skipped Embedding(129024, 4096): 504.0M params
skipped: 504.0M params
***** Running training *****
  Num examples = 8,915
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 1,115
  Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss
50,1.1291,1.239659
100,1.1321,1.199858
150,1.1966,1.190117
200,1.1894,1.185444
250,1.1022,1.181259
300,1.1587,1.177102
350,1.1617,1.174204
400,1.004,1.171757
450,1.18,1.170132
500,1.125,1.166675



***** Running Evaluation *****
  Num examples = 991
  Batch size = 1

***** Running Evaluation *****
  Num examples = 991
  Batch size = 1
Saving model checkpoint to ./yandexgpt-lora-finetuned/checkpoint-100
loading configuration file config.json from cache at /home/viv232/.cache/huggingface/hub/models--yandex--YandexGPT-5-Lite-8B-instruct/snapshots/b556811768376b46c69caab60c4d1b69df9faaa1/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "float16",
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_wor

TrainOutput(global_step=1115, training_loss=1.080232146930267, metrics={'train_runtime': 51017.2504, 'train_samples_per_second': 0.175, 'train_steps_per_second': 0.022, 'total_flos': 8.270852483186688e+17, 'train_loss': 1.080232146930267, 'epoch': 1.0})

In [54]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=2.87GB, reserved=7.30GB, peak=5.41GB
NVML used=8.58GB / total=23.99GB


In [55]:
trainer.save_model()
tokenizer.save_pretrained("./yandexgpt-lora-finetuned")

Saving model checkpoint to ./yandexgpt-lora-finetuned
loading configuration file config.json from cache at /home/viv232/.cache/huggingface/hub/models--yandex--YandexGPT-5-Lite-8B-instruct/snapshots/b556811768376b46c69caab60c4d1b69df9faaa1/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "float16",
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_cache": true,
  "vocab_size": 129024
}

Saving Trainer.data_collator.tokenizer by default 

('./yandexgpt-lora-finetuned/tokenizer_config.json',
 './yandexgpt-lora-finetuned/special_tokens_map.json',
 './yandexgpt-lora-finetuned/chat_template.jinja',
 './yandexgpt-lora-finetuned/tokenizer.model',
 './yandexgpt-lora-finetuned/added_tokens.json')

### –û—Ü–µ–Ω–∫–∞

In [57]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=2.87GB, reserved=7.30GB, peak=5.41GB
NVML used=8.59GB / total=23.99GB


In [58]:
# –ö–æ–Ω—Ñ–∏–≥—É—Ä–∞—Ü–∏—è –¥–ª—è 4-–±–∏—Ç–Ω–æ–π –∑–∞–≥—Ä—É–∑–∫–∏ (—Ç–∞–∫–∞—è –∂–µ –∫–∞–∫ –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [59]:
model_name = "yandex/YandexGPT-5-Lite-8B-instruct"
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16
)

loading configuration file config.json from cache at /home/viv232/.cache/huggingface/hub/models--yandex--YandexGPT-5-Lite-8B-instruct/snapshots/b556811768376b46c69caab60c4d1b69df9faaa1/config.json
`torch_dtype` is deprecated! Use `dtype` instead!
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "float16",
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.1",
  "use_cache": true,
  "vocab_size": 129024
}

loading weights file model.safetensors from cache at /

In [60]:
tokenizer_lr = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)

loading file tokenizer.model from cache at /home/viv232/.cache/huggingface/hub/models--yandex--YandexGPT-5-Lite-8B-instruct/snapshots/b556811768376b46c69caab60c4d1b69df9faaa1/tokenizer.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/viv232/.cache/huggingface/hub/models--yandex--YandexGPT-5-Lite-8B-instruct/snapshots/b556811768376b46c69caab60c4d1b69df9faaa1/tokenizer_config.json
loading file tokenizer.json from cache at None
loading file chat_template.jinja from cache at None


In [61]:
if tokenizer_lr.pad_token is None:
    tokenizer_lr.pad_token = tokenizer.eos_token

In [62]:
lora_adapter_path = "./yandexgpt-lora-finetuned"  # –ø—É—Ç—å –∫ –≤–∞—à–µ–º—É –∞–¥–∞–ø—Ç–µ—Ä—É
model_lr = PeftModel.from_pretrained(base_model, lora_adapter_path)
model_lr.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(129024, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [63]:
def generate_answer_correct_lr(prompt):
    # –§–æ—Ä–º–∞—Ç–∏—Ä—É–µ–º —Ç–æ–ª—å–∫–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å—Å–∫–∏–π –ø—Ä–æ–º–ø—Ç
    user_prompt = f"<s> –ü–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—å: {prompt} –ê—Å—Å–∏—Å—Ç–µ–Ω—Ç:"
    
    inputs = tokenizer_lr(
        user_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(model_lr.device)
    
    with torch.no_grad():
        outputs = model_lr.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer_lr.pad_token_id,
            eos_token_id=tokenizer_lr.eos_token_id
        )
    
    # –ò–∑–≤–ª–µ–∫–∞–µ–º —Ç–æ–ª—å–∫–æ —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–Ω—É—é —á–∞—Å—Ç—å
    generated = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer_lr.decode(generated, skip_special_tokens=True)
    
    # –£–±–∏—Ä–∞–µ–º –≤–æ–∑–º–æ–∂–Ω—ã–π [SEP] –≤ –∫–æ–Ω—Ü–µ
    response = response.replace("[SEP]", "").strip()
    
    return response

In [64]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=8.36GB, reserved=12.34GB, peak=8.51GB
NVML used=13.66GB / total=23.99GB


In [65]:
for text in prompts_for_test:
    print(generate_answer_correct_lr(text))
    print('-' * 50)

–î–ª—è –ø—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏—è –≤–∫—É—Å–Ω–æ–π –∏–Ω–¥–µ–π–∫–∏ –Ω–∞ –≥—Ä–∏–ª–µ –≤–∞–º –ø–æ–Ω–∞–¥–æ–±–∏—Ç—Å—è —Å–ª–µ–¥–æ–≤–∞—Ç—å –Ω–µ—Å–∫–æ–ª—å–∫–∏–º —à–∞–≥–∞–º. –í–æ—Ç –ø—Ä–æ—Å—Ç–æ–π —Ä–µ—Ü–µ–ø—Ç –ø—Ä–∏–≥–æ—Ç–æ–≤–ª–µ–Ω–∏—è –∏–Ω–¥–µ–π–∫–∏ –Ω–∞ –≥—Ä–∏–ª–µ:

### –ò–Ω–≥—Ä–µ–¥–∏–µ–Ω—Ç—ã:
- –ò–Ω–¥–µ–π–∫–∞ (–∂–µ–ª–∞—Ç–µ–ª—å–Ω–æ —Ñ–∏–ª–µ) ‚Äì 1 –∫–≥
- –°–æ–ª—å ‚Äì –ø–æ –≤–∫—É—Å—É
- –ß–µ—Ä–Ω—ã–π –ø–µ—Ä–µ—Ü ‚Äì –ø–æ –≤–∫—É—Å—É
- –ü—Ä–∏–ø—Ä–∞–≤–∞ –¥–ª—è –ø—Ç–∏—Ü—ã ‚Äì –ø–æ –≤–∫—É—Å—É
- –û–ª–∏–≤–∫–æ–≤–æ–µ –º–∞—Å–ª–æ ‚Äì –¥–ª—è —Å–º–∞–∑—ã–≤–∞–Ω–∏—è

### –ò–Ω—Å—Ç—Ä—É–∫—Ü–∏—è:

#### –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∏–Ω–¥–µ–π–∫–∏:
1. **–†–∞–∑–º–æ—Ä–æ–∑—å—Ç–µ –∏–Ω–¥–µ–π–∫—É** (–µ—Å–ª–∏ –æ–Ω–∞ –∑–∞–º–æ—Ä–æ–∂–µ–Ω–∞), –µ—Å–ª–∏ —ç—Ç–æ –Ω–µ–æ–±—Ö–æ–¥–∏–º–æ.
2. **–ù–∞—Ä–µ–∂—å—Ç–µ –∏–Ω–¥–µ–π–∫—É** –Ω–∞ –ø–æ—Ä—Ü–∏–æ–Ω–Ω—ã–µ –∫—É—Å–æ—á–∫–∏.
3. **–°–º–µ—à–∞–π—Ç–µ —Å–ø–µ—Ü–∏–∏ —Å –º–∞—Å–ª–æ–º**: –≤ –≥–ª—É–±–æ–∫–æ–π –º–∏—Å–∫–µ —Å–º–µ—à–∞–π—Ç–µ —Å–æ–ª—å, –ø–µ—Ä–µ—Ü, –ø—Ä–∏–ø—Ä–∞–≤—É –¥–ª—è –ø—Ç–∏—Ü—ã –∏ –Ω–µ–º–Ω–æ–≥–æ –æ–ª–∏

# –î–æ–æ–±—É—á–µ–Ω–∏–µ —ç–Ω–∫–æ–¥–µ—Ä–∞ e5-large

In [67]:
import os, gc, random, math
import numpy as np

import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses, models
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

from peft import LoraConfig, get_peft_model, TaskType
from bitsandbytes.optim import AdamW8bit
from datasets import Dataset, load_dataset

In [68]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [69]:
def flush():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

### –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö

In [70]:
def extract_pairs(ds, max_samples=None, seed=42):
    pairs = []
    for ex in ds:
        q = ex.get("question")
        pos = ex.get("context")
        pairs.append((q, pos))

    if seed is not None:
        random.seed(seed)
        random.shuffle(pairs)

    if max_samples is not None:
        pairs = pairs[:max_samples]
    return pairs


ds_train = load_dataset("kuznetsoffandrey/sberquad", split="train[:2000]")
ds_val = load_dataset("kuznetsoffandrey/sberquad", split="validation[:500]")

train_pairs = extract_pairs(ds_train)
val_pairs   = extract_pairs(ds_val)

print(f"Train pairs: {len(train_pairs)} | Val pairs: {len(val_pairs)}")
print("Sample train pair:", train_pairs[0])

Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 45328/45328 [00:00<00:00, 614055.00 examples/s]
Generating validation split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5036/5036 [00:00<00:00, 397121.87 examples/s]
Generating test split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23936/23936 [00:00<00:00, 1033879.41 examples/s]


Train pairs: 2000 | Val pairs: 500
Sample train pair: ('–≥–¥–µ –≤ –æ—Å–Ω–æ–≤–Ω–æ–º —Ä–æ—Å—Å–∏–π—Å–∫–∏–µ –º–µ—Ç—Ä–æ–ø–æ–ª–∏—Ç–µ–Ω—ã —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω—ã', '–ö—Ä–æ–º–µ —Ç–æ–≥–æ, –ú–∞–∫—Å–∏–º–æ–º –ì–æ—Ä—å–∫–∏–º –≤ –ì–æ—Ä–æ–¥–µ –ñ—ë–ª—Ç–æ–≥–æ –î—å—è–≤–æ–ª–∞ –±—ã–ª–æ –≤–≤–µ–¥–µ–Ω–æ –≤ —Ä—É—Å—Å–∫–∏–π —è–∑—ã–∫ —Å–ª–æ–≤–æ-–∫–∞–ª—å–∫–∞ –ø–æ–¥–∑–µ–º–∫–∞ . –û–Ω–æ –ø—Ä–∏–∂–∏–ª–æ—Å—å, –Ω–æ –ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–µ–Ω–Ω–æ –≤ –∫–∞—á–µ—Å—Ç–≤–µ –æ–±–æ–∑–Ω–∞—á–µ–Ω–∏—è –∑–∞—Ä—É–±–µ–∂–Ω—ã—Ö –º–µ—Ç—Ä–æ–ø–æ–ª–∏—Ç–µ–Ω–æ–≤ (–ª–æ–Ω–¥–æ–Ω—Å–∫–∞—è –ø–æ–¥–∑–µ–º–∫–∞, –Ω—å—é-–π–æ—Ä–∫—Å–∫–∞—è –ø–æ–¥–∑–µ–º–∫–∞ –∏ —Ç. –¥.), —Ö–æ—Ç—è –≤ –ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è –≤—Å—Ç—Ä–µ—á–∞–µ—Ç—Å—è –≤ —Ä–æ—Å—Å–∏–π—Å–∫–æ–π –ø—Ä–µ—Å—Å–µ –∏ –ø—Ä–∏–º–µ–Ω–∏—Ç–µ–ª—å–Ω–æ –∫ —Ä–æ—Å—Å–∏–π—Å–∫–∏–º –º–µ—Ç—Ä–æ–ø–æ–ª–∏—Ç–µ–Ω–∞–º, –ø—Ä–æ–ª–æ–∂–µ–Ω–Ω—ã–º –≤ –æ—Å–Ω–æ–≤–Ω–æ–º –ø–æ–¥ –∑–µ–º–ª—ë–π. –°–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–µ–Ω–Ω–æ, –ø—Ä–µ–∏–º—É—â–µ—Å—Ç–≤–µ–Ω–Ω–æ —ç—Å—Ç–∞–∫–∞–¥–Ω—ã–µ –º–µ—Ç—Ä–æ–ø–æ–ª–∏—Ç–µ–Ω—ã –Ω–∞–∑—ã–≤–∞—é—Ç –Ω–∞–¥–∑–µ–º–∫–∞–º–∏ , –

### train

In [71]:
flush()

base_name = "intfloat/multilingual-e5-large"
st_model = SentenceTransformer(base_name, device=device)

# –ò–∑–≤–ª–µ–∫–∞–µ–º –±–∞–∑–æ–≤—ã–π AutoModel
backbone = st_model[0].auto_model

# –í–∫–ª—é—á–∞–µ–º gradient checkpointing
if hasattr(backbone, "gradient_checkpointing_enable"):
    backbone.gradient_checkpointing_enable()

lora_cfg = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["query", "key", "value", "dense"],
    task_type=TaskType.FEATURE_EXTRACTION,
)
peft_backbone = get_peft_model(backbone, lora_cfg)
peft_backbone.print_trainable_parameters()

st_model[0].auto_model = peft_backbone

loss_fn = losses.MultipleNegativesRankingLoss(st_model)

loading configuration file config.json from cache at /home/viv232/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/0dc5580a448e4284468b8909bae50fa925907bc5/config.json
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file model.safetensors from cache at /home/viv232/.cache/huggingface/hub/models--intfloat--multilingual-

trainable params: 7,110,656 || all params: 567,001,088 || trainable%: 1.2541


In [72]:
def embed(texts, model, batch_size=128, normalize=True):
    vectors = model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        normalize_embeddings=normalize,
        device=device,
        show_progress_bar=False,
    )
    return vectors

queries = [q for q,_ in val_pairs]
docs    = [d for _,d in val_pairs]

q_vecs = embed(queries, st_model)
d_vecs = embed(docs, st_model)

sims = np.matmul(q_vecs, d_vecs.T)
k = min(5, sims.shape[1])
topk_idx = np.argpartition(-sims, kth=k-1, axis=1)[:, :k]

true_idx = np.arange(len(val_pairs))
hits = (topk_idx == true_idx[:, None]).any(axis=1)
hit5 = hits.mean()
print(f"Hit@5: {hit5:.3f}")

Hit@5: 0.984


In [73]:
train_data = [
    {
        "anchor": q,
        "positive": d
    }
    for q, d in train_pairs
]
train_ds = Dataset.from_list(train_data)

val_data = [
    {
        "anchor": q,
        "positive": d
    }
    for q, d in val_pairs
]
val_ds = Dataset.from_list(val_data)

In [74]:
epochs = 5
batch_size = 32
gradient_accumulation_steps = 4
max_steps_cap = 120
warmup_ratio = 0.05

steps_per_epoch = min(math.ceil(len(train_ds) / batch_size), max_steps_cap)
total_steps = steps_per_epoch * epochs

loss_fn = losses.MultipleNegativesRankingLoss(st_model)

training_args = SentenceTransformerTrainingArguments(
    output_dir="st-encoder-qlora-out",
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=2e-4,
    warmup_ratio=warmup_ratio,
    num_train_epochs=epochs,
    max_steps=total_steps,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no",
    eval_strategy="steps",
    eval_steps=50,
    report_to="none",
    optim="paged_adamw_8bit",
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    dataloader_drop_last=True,
    dataloader_num_workers=0,
    seed=42,
)

trainer = SentenceTransformerTrainer(
    model=st_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=loss_fn,
)

PyTorch: setting up devices
average_tokens_across_devices is True but world size is 1. Setting it to False automatically.
Currently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.
PyTorch: setting up devices
average_tokens_across_devices is True but world size is 1. Setting it to False automatically.
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend
                                                                     

In [75]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=10.47GB, reserved=14.91GB, peak=13.94GB
NVML used=16.27GB / total=23.99GB


In [76]:
trainer.train()

skipped Embedding(250002, 1024, padding_idx=1): 244.142578125M params
skipped Embedding(514, 1024, padding_idx=1): 244.64453125M params
skipped Embedding(1, 1024): 244.6455078125M params
skipped: 244.6455078125M params
***** Running training *****
  Num examples = 2,000
  Num Epochs = 40
  Instantaneous batch size per device = 32
  Training with DataParallel so batch size has been adjusted to: 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 4
  Total optimization steps = 315
  Number of trainable parameters = 7,110,656


Step,Training Loss,Validation Loss
50,0.1989,0.055596
100,0.152,0.057347
150,0.1302,0.057545
200,0.1272,0.059949
250,0.1372,0.059919
300,0.1177,0.059835



***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16

***** Running Evaluation *****
  Num examples = 500
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=315, training_loss=0.24426108958229187, metrics={'train_runtime': 26072.3939, 'train_samples_per_second': 3.093, 'train_steps_per_second': 0.012, 'total_flos': 0.0, 'train_loss': 0.24426108958229187, 'epoch': 39.38709677419355})

In [77]:
gpu_mem("QLoRA after train"); nvidia_mem()

[QLoRA after train] allocated=10.48GB, reserved=19.19GB, peak=18.06GB
NVML used=20.56GB / total=23.99GB


In [78]:
st_model.save("st-encoder-qlora-out/final_model")

loading configuration file config.json from cache at /home/viv232/.cache/huggingface/hub/models--intfloat--multilingual-e5-large/snapshots/0dc5580a448e4284468b8909bae50fa925907bc5/config.json
Model config XLMRobertaConfig {
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "dtype": "float32",
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.56.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

tokenizer config file saved in st-encoder-qlora-out/final_model/tokenizer_config.json
Special tokens file saved in st-en

In [79]:
q_vecs_after = embed(queries, st_model)
d_vecs_after = embed(docs, st_model)

sims_after = np.matmul(q_vecs_after, d_vecs_after.T)

k = min(5, sims_after.shape[1])
topk_idx = np.argpartition(-sims_after, kth=k-1, axis=1)[:, :k]

true_idx = np.arange(len(val_pairs))
hits = (topk_idx == true_idx[:, None]).any(axis=1)
hit5 = hits.mean()

print(f"Hit@5: {hit5:.3f}")

Hit@5: 0.972
