In [1]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-05-08 12:13:37.017258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746695617.114885    3072 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746695617.143547    3072 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746695617.341352    3072 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746695617.341377    3072 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746695617.341380    3072 computation_placer.cc:177] computation placer alr

🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Base checkpoint to fine-tune (Unsloth mirror: "unsloth/Llama-3.2-1B-Instruct")
# and the sequence length the model is configured for.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
max_seq_length = 2048

# Load model and tokenizer with both 8-bit and 4-bit loading switched off.
# Optional flags that are currently left disabled:
#   attn_implementation="flash_attention_2"
#   use_flash_attention_2=True   # main flag
#   fused_mlp=True               # MLP optimisation
#   fused_dense=True             # dense-layer optimisation
model, tokenizer = FastLanguageModel.from_pretrained(
    device_map="auto",
    load_in_4bit=False,
    load_in_8bit=False,
    max_seq_length=max_seq_length,
    model_name=model_name,
)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla V100-SXM2-32GB. Num GPUs = 1. Max memory: 31.733 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
# Attach LoRA adapters (rank 16, alpha 16, no dropout, plain LoRA scaling) to
# the attention projections (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=False,
    # use_gradient_checkpointing="unsloth",
    random_state=42,
    loftq_config=None,
)

# print_trainable_parameters() writes its summary itself and returns None, so
# wrapping it in print() only added a stray "None" line to the cell output.
model.print_trainable_parameters()

Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039
None


In [4]:
import json
from datasets import Dataset

def read_jsonl(file_name):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_name: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list holding one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_name, encoding="utf-8") as reader:
        # Blank or whitespace-only lines (for example an extra newline at the
        # end of the file) are not JSON documents; skip them instead of
        # letting json.loads fail on them.
        return [json.loads(line) for line in reader if line.strip()]
    
# Training split: parse the JSONL records, then wrap them in a `Dataset`.
data = read_jsonl("../Semyon/data/train/sft_d1_train.jsonl")
dataset = Dataset.from_list(data)

# Validation split, built the same way.
val_data = read_jsonl("../Semyon/data/val/sft_d1_val.jsonl")
val_dataset = Dataset.from_list(val_data)

In [5]:
# Custom system prompt that is placed in front of every conversation.
custom_system_message = dict(
    role="system",
    content="Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы.",
)


def generate_conversation(examples):
    """Prefix each conversation in a batch with the custom system prompt.

    Args:
        examples: Mapping with a "messages" entry holding one list of message
            dicts per example; every message must provide "role" and "content".

    Returns:
        A dict with a single "conversations" key. Each conversation starts with
        `custom_system_message` and is followed by the example's messages
        reduced to their "role" and "content" fields.
    """

    def _role_and_content(message):
        # Drop any extra fields so every turn has the same two keys.
        return {"role": message["role"], "content": message["content"]}

    return {
        "conversations": [
            [custom_system_message, *map(_role_and_content, dialogue)]
            for dialogue in examples["messages"]
        ]
    }

def _render_chats(raw_split):
    """Render every conversation of `raw_split` to a chat-template string."""
    with_system_prompt = raw_split.map(generate_conversation, batched=True)
    return tokenizer.apply_chat_template(
        with_system_prompt["conversations"],
        tokenize=False,
    )


# NOTE(review): the rendered validation sample printed in this notebook
# contains a "Today Date: ..." line, so the template output appears to depend
# on the day this cell is run; consider pinning the date for reproducibility
# (TODO confirm the template accepts an override).
train = _render_chats(dataset)
val = _render_chats(val_dataset)


Map:   0%|          | 0/34640 [00:00<?, ? examples/s]

Map:   0%|          | 0/1011 [00:00<?, ? examples/s]

In [6]:
# Print the first rendered validation example to inspect the chat-template output.
print(val[0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 07 May 2025

Ты — экспертная система Compressa RAG. Предоставляющая точные и релевантные ответы на вопросы.<|eot_id|><|start_header_id|>user<|end_header_id|>

Read the following text about wearable fitness technology and answer the subsequent questions:      "Wearable fitness technology includes devices such as fitness trackers, smartwatches, and heart rate monitors. These devices track various health metrics like steps taken, calories burned, and heart rate. Fitness trackers are usually worn on the wrist and provide users with real-time data on their physical activities. Smartwatches not only track fitness metrics but also offer additional functionalities like GPS navigation, notifications from your phone, and sometimes even the capability to make phone calls. Heart rate monitors, often used by athletes, provide accurate readings of the user's heart rate during different type

In [6]:
import pandas as pd
from datasets import Dataset

# Put the rendered chat strings into a single column named "text", which is
# the column name the trainer configuration refers to.
train_tmp = pd.Series(train, name="text")
val_tmp = pd.Series(val, name="text")

# Only the training split is shuffled (fixed seed).
train_dataset = Dataset.from_pandas(train_tmp.to_frame()).shuffle(seed=3407)
# NOTE: this rebinds `val_dataset`, which earlier held the raw validation
# records; from here on it holds the rendered text instead.
val_dataset = Dataset.from_pandas(val_tmp.to_frame())

In [8]:
# NOTE(review): scratch arithmetic with no stated purpose; explain it or drop the cell.
34/1.7

20.0

In [7]:
from trl import SFTTrainer, SFTConfig
# Hyper-parameters for the supervised fine-tuning run.
sft_args = SFTConfig(
    # Data
    dataset_text_field="text",
    dataloader_num_workers=8,
    # Batching: 12 examples per device, accumulated over 2 steps
    per_device_train_batch_size=12,
    gradient_accumulation_steps=2,
    # Schedule
    num_train_epochs=1,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=50,
    # Optimiser
    optim="adamw_8bit",
    weight_decay=0.01,
    seed=3407,
    # Logging and evaluation (evaluate every 50 steps, log every 10)
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    report_to="wandb",
    output_dir="llama-v100-bs_12_2",
)

# Trainer over the rendered "text" datasets, with packing turned off.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    packing=False,
    args=sft_args,
)

Unsloth: Tokenizing ["text"] (num_proc=20):   0%|          | 0/34640 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=20):   0%|          | 0/1011 [00:00<?, ? examples/s]

In [8]:
# Run the fine-tuning loop and keep whatever train() returns in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 34,640 | Num Epochs = 1 | Total steps = 1,443
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 2 x 1) = 24
 "-____-"     Trainable parameters = 11,272,192/1,247,086,592 (0.90% trained)
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mseba-vicin[0m ([33mseba-vicin-oxford[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,1.9378,1.902725
100,1.6763,1.631617
150,1.655,1.58684
200,1.5973,1.563779
250,1.5945,1.546911
300,1.5102,1.533495
350,1.5379,1.522028
400,1.5821,1.512389
450,1.5587,1.503936
500,1.5122,1.496387


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [9]:
import os

# torch.save does not create missing parent directories, so make sure the run
# folder exists before writing into it.
os.makedirs("llama-v100-bs_12_2", exist_ok=True)

# Dump the model's complete state dict into the run folder.
torch.save(model.state_dict(), "llama-v100-bs_12_2/model.pth")

In [10]:
# Save the model with `save_pretrained` into a sub-folder of the run directory.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this
# presumably writes only the LoRA adapter; no tokenizer save is visible in
# this part of the notebook. Confirm both before relying on this folder.
model.save_pretrained("llama-v100-bs_12_2/pretrain_save")