In [4]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import Trainer, TrainingArguments
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM


data = None
with open("dataset.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

dataset = Dataset.from_list(data)


In [5]:
model_name = "Vikhrmodels/Vikhr-Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize_function(example):
    inputs = tokenizer(
        example["system"], max_length=100, truncation=True, padding="max_length"
    )
    inputs["labels"] = tokenizer(
        example["output"], max_length=100, truncation=True, padding="max_length"
    )["input_ids"]
    return inputs


train_dataset = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/926 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

In [6]:
import os

# Отключаем W&B
os.environ["WANDB_DISABLED"] = "true"

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, config)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=1e-4,
)

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,9.4231
20,6.4265
30,5.0449
40,4.6526


TrainOutput(global_step=45, training_loss=6.1744567023383246, metrics={'train_runtime': 31.6414, 'train_samples_per_second': 5.689, 'train_steps_per_second': 1.422, 'total_flos': 105467830272000.0, 'train_loss': 6.1744567023383246, 'epoch': 3.0})

In [7]:
model.eval()

system = "Ты чат-бот, который генерирует анекдоты. Продолжи анекдот."
prompt = system + "Что общего у кота и программиста?"

inputs = tokenizer(
    prompt, return_tensors="pt", padding=True, truncation=True, max_length=100
)

model.to("cuda")
generated = model.generate(
    input_ids=inputs["input_ids"].to("cuda"),
    attention_mask=inputs["attention_mask"].to("cuda"),
    max_length=100,
    temperature=0.5,
    top_p=0.9,
)

print(tokenizer.decode(generated[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Ты чат-бот, который генерирует анекдоты. Продолжи анекдот.Что общего у кота и программиста?


In [8]:
# Обучение с torch

In [9]:
from torch.utils.data import DataLoader

train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [11]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=1e-4)
criterion = CrossEntropyLoss()
num_epochs = 3


model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        input_ids, labels, attention_mask = (
            batch["input_ids"].to(device),
            batch["labels"].to(device),
            batch["attention_mask"].to(device),
        )

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss  # Лосс возвращается напрямую в режиме Causal LM
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix({"Loss": loss.item()})

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_epoch_loss:.4f}")


Epoch 1/3: 100%|██████████| 15/15 [00:09<00:00,  1.64it/s, Loss=5.44]


Epoch 1/3, Average Loss: 8.3292


Epoch 2/3: 100%|██████████| 15/15 [00:09<00:00,  1.64it/s, Loss=4.59]


Epoch 2/3, Average Loss: 4.8495


Epoch 3/3: 100%|██████████| 15/15 [00:09<00:00,  1.62it/s, Loss=2.43]

Epoch 3/3, Average Loss: 3.4661





In [12]:
model.eval()

system = "Ты чат-бот, который генерирует анекдоты. Продолжи анекдот."
prompt = system + "Что общего у кота и программиста?"

inputs = tokenizer(
    prompt, return_tensors="pt", padding=True, truncation=True, max_length=100
)

# Генерация текста с исправлениями
generated = model.generate(
    input_ids=inputs["input_ids"].to("cuda"),
    attention_mask=inputs["attention_mask"].to("cuda"),
    max_length=100,
    temperature=0.5,
)

print(tokenizer.decode(generated[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Ты чат-бот, который генерирует анекдоты. Продолжи анекдот.Что общего у кота и программиста?
