In [1]:
!nvidia-smi

Thu Oct 31 12:59:01 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8              12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install accelerate
!pip install transformers==4.45.2
!pip install bitsandbytes
!pip install datasets
!pip install rouge-score
!pip install pymorphy2
!pip install peft
!pip install git+https://github.com/unslothai/unsloth@38663b01f5dd0e610b12475bd95b144303cff539
#!pip install flash_attn

Collecting git+https://github.com/unslothai/unsloth@38663b01f5dd0e610b12475bd95b144303cff539
  Cloning https://github.com/unslothai/unsloth (to revision 38663b01f5dd0e610b12475bd95b144303cff539) to /tmp/pip-req-build-_klzjfxo
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth /tmp/pip-req-build-_klzjfxo
  Running command git rev-parse -q --verify 'sha^38663b01f5dd0e610b12475bd95b144303cff539'
  Running command git fetch -q https://github.com/unslothai/unsloth 38663b01f5dd0e610b12475bd95b144303cff539
  Running command git checkout -q 38663b01f5dd0e610b12475bd95b144303cff539
  Resolved https://github.com/unslothai/unsloth to commit 38663b01f5dd0e610b12475bd95b144303cff539
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install git+https://github.com/unslothai/unsloth-zoo@9c463920d49d4d438b681e27295e5bb4cfd3a351

Collecting git+https://github.com/unslothai/unsloth-zoo@9c463920d49d4d438b681e27295e5bb4cfd3a351
  Cloning https://github.com/unslothai/unsloth-zoo (to revision 9c463920d49d4d438b681e27295e5bb4cfd3a351) to /tmp/pip-req-build-btwubq1r
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth-zoo /tmp/pip-req-build-btwubq1r
  Running command git rev-parse -q --verify 'sha^9c463920d49d4d438b681e27295e5bb4cfd3a351'
  Running command git fetch -q https://github.com/unslothai/unsloth-zoo 9c463920d49d4d438b681e27295e5bb4cfd3a351
  Running command git checkout -q 9c463920d49d4d438b681e27295e5bb4cfd3a351
  Resolved https://github.com/unslothai/unsloth-zoo to commit 9c463920d49d4d438b681e27295e5bb4cfd3a351
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
!pip install xformers==0.0.27.post2



In [2]:
import random
from typing import List, Dict

import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from tqdm import tqdm


class ChatDataset(Dataset):
    def __init__(
        self,
        original_records: List[Dict],
        tokenizer: AutoTokenizer,
        max_tokens_count: int,
        sample_rate: float = 1.0,
        only_target_loss: bool = True,
        add_global_bos: bool = True,
        add_global_eos: bool = True,
        labels_pad_token_id: int = -100,
    ):
        self.original_records = original_records
        self.sample_rate = sample_rate
        self.tokenizer = tokenizer
        self.max_tokens_count = max_tokens_count
        self.only_target_loss = only_target_loss
        self.labels_pad_token_id = labels_pad_token_id
        self.add_global_bos = add_global_bos
        self.add_global_eos = add_global_eos
        self.is_printed = False

        self.records = []
        for record in tqdm(original_records):
            if random.random() > self.sample_rate:
                continue
            tensors = self.convert_record(record)
            if tensors is None:
                continue
            self.records.append(tensors)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, index):
        return self.records[index]

    def get_tokens(self, messages):
        #print(messages)
        tokens = self.tokenizer.apply_chat_template(
            messages,
            add_special_tokens=False,
            tokenize=True,
            add_generation_prompt=False,
        )
        if tokens[0] == self.tokenizer.bos_token_id:
            tokens = tokens[1:]
        return tokens

    def convert_record(self, record):
        input_ids, labels = [], []

        for i, message in enumerate(record["messages"]):
            if message['role'] == 'bot':
                message['role'] = 'assistant'
                record["messages"][i]['role'] = 'assistant'

            message_input_ids = self.get_tokens([message])
            message_labels = message_input_ids
            if len(input_ids) + len(message_input_ids) > self.max_tokens_count - 2:
                break

            labels_mask = [
                self.labels_pad_token_id for _ in range(len(message_input_ids))
            ]
            if (
                message["role"] not in ("assistant", "bot", "gpt")
                and self.only_target_loss
            ):
                message_labels = labels_mask

            input_ids.extend(message_input_ids)
            labels.extend(message_labels)

        if not input_ids:
            return None

        original_input_ids = self.get_tokens(record["messages"])
        if input_ids != original_input_ids[: len(input_ids)]:
            print(input_ids)
            print(original_input_ids[: len(input_ids)])
        assert input_ids == original_input_ids[: len(input_ids)]

        if self.add_global_bos and input_ids[0] != self.tokenizer.bos_token_id:
            input_ids.insert(0, self.tokenizer.bos_token_id)
            labels.insert(0, self.labels_pad_token_id)

        if input_ids[-2] == self.tokenizer.eos_token_id:
            input_ids = input_ids[:-1]
            labels = labels[:-1]

        if self.add_global_eos and input_ids[-1] != self.tokenizer.eos_token_id:
            input_ids.append(self.tokenizer.eos_token_id)
            labels.append(self.tokenizer.eos_token_id)

        if not self.is_printed:
            print(input_ids)
            print(labels)
            print(
                "Full prompt:" +
                self.tokenizer.decode(input_ids, skip_special_tokens=False)
            )
            assert '\n' in self.tokenizer.decode(input_ids, skip_special_tokens=False)
            self.is_printed = True

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.new_ones(input_ids.size())
        assert (
            input_ids.size(0)
            == labels.size(0)
            == attention_mask.size(0)
            <= self.max_tokens_count
        )
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

In [3]:
from datasets import load_dataset
dataset = load_dataset('IlyaGusev/saiga_scored')
dataset = dataset['train'].select(range(1000))
dataset = dataset.train_test_split(test_size=0.1)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['messages', 'source', 'opus_score', 'language', 'turns', 'sonnet_topic', 'sonnet_topic_explanation', 'sonnet_complexity', 'sonnet_complexity_explanation', 'is_bad_by_regex', 'score_explanation'],
        num_rows: 900
    })
    test: Dataset({
        features: ['messages', 'source', 'opus_score', 'language', 'turns', 'sonnet_topic', 'sonnet_topic_explanation', 'sonnet_complexity', 'sonnet_complexity_explanation', 'is_bad_by_regex', 'score_explanation'],
        num_rows: 100
    })
})

In [4]:
import random
import json
import os

import torch
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForTokenClassification,
    AutoConfig,
)
from transformers import (
    Trainer,
    TrainingArguments,
    logging,
    TrainerCallback,
    TrainerState,
    TrainerControl,
    BitsAndBytesConfig,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from unsloth import FastLanguageModel, UnslothTrainingArguments, UnslothTrainer

from peft import get_peft_model, LoraConfig
import re
from peft import prepare_model_for_kbit_training
import codecs

os.environ["WANDB_DISABLED"] = "true"

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [5]:
instruct_model_name = 'RefalMachine/ruadapt_qwen2.5_3B_ext_u48_instruct_v4'

tokenizer = AutoTokenizer.from_pretrained(instruct_model_name)
chat_template = tokenizer.chat_template
bos_token = tokenizer.bos_token
eos_token = tokenizer.eos_token
pad_token = tokenizer.pad_token
chat_template

'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- endif %}\n{%- en

In [6]:
model_name = 'RefalMachine/ruadapt_qwen2.5_3B_u48_full_lr3e4_bs256'
max_tokens_count = 512
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_tokens_count,
    dtype=torch.float16,
    load_in_4bit=True,
    attn_implementation="sdpa",
)

==((====))==  Unsloth 2024.10.0: Fast Qwen2 patching. Transformers = 4.45.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Unsloth: Will load RefalMachine/ruadapt_qwen2.5_3B_u48_full_lr3e4_bs256 as a legacy tokenizer.
RefalMachine/ruadapt_qwen2.5_3B_u48_full_lr3e4_bs256 does not have a padding token! Will use pad_token = <unk>.


In [7]:
model_name = 'RefalMachine/ruadapt_qwen2.5_3B_u48_full_lr3e4_bs256'

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.bos_token = bos_token
tokenizer.eos_token = eos_token
tokenizer.pad_token = pad_token
tokenizer.chat_template = chat_template
tokenizer.padding_side = 'left'

In [8]:
only_target_loss = True

datasets = []
for records in (dataset['train'], dataset['test']):
    datasets.append(
        ChatDataset(
            records,
            tokenizer,
            max_tokens_count=max_tokens_count,
            sample_rate=1.0,
            only_target_loss=only_target_loss,
            add_global_eos=False,
            add_global_bos=False
        )
    )
train_dataset, val_dataset = datasets

  1%|          | 8/900 [00:00<00:11, 76.70it/s]

[48001, 14598, 13, 1410, 37762, 35387, 8063, 266, 20089, 12404, 313, 12769, 1787, 3281, 362, 2414, 308, 30431, 633, 4443, 580, 259, 9770, 2211, 695, 262, 1246, 259, 327, 408, 716, 36149, 298, 266, 442, 17364, 265, 756, 298, 485, 260, 48002, 261, 13]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
Full prompt:<|im_start|> user
Напиши оригинальный мини-сценарий для короткометражного фильма о дружбе двух молодых людей, выросших вместе в городе, но которые должны расстаться из-за переезда одного из них.<|im_end|> 



100%|██████████| 900/900 [00:05<00:00, 160.64it/s]
 21%|██        | 21/100 [00:00<00:00, 205.27it/s]

[48001, 14598, 13, 9240, 766, 5310, 295, 594, 530, 9568, 7264, 11817, 445, 594, 261, 289, 261, 260, 4397, 11945, 5443, 20777, 294, 15136, 7434, 2472, 14284, 282, 4003, 3257, 358, 927, 365, 303, 16343, 1157, 33712, 19487, 269, 14250, 260, 48002, 261, 13]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]
Full prompt:<|im_start|> user
add more points to this or paraphrase this : . My Team got caught in academic misconduct (cheating). This is a reflective essay regarding the incident.<|im_end|> 



100%|██████████| 100/100 [00:00<00:00, 187.66it/s]


In [9]:
messages = [{'role': 'user', 'content': 'Напиши что такое LLM.'}]
tokenizer.apply_chat_template(messages, return_tensors='pt', add_special_tokens=True, add_generation_prompt=True)

tensor([[48001, 14598,    13,  1410, 37762,   279,  1001,   981, 35642,   260,
         48002,   261,    13, 48001, 17130,    13]])

In [10]:
from transformers import GenerationConfig

def generate(messages, model, tokenizer, generation_config):
    input_ids = tokenizer.apply_chat_template(messages, return_tensors='pt', add_special_tokens=True, add_generation_prompt=True)
    input_ids = input_ids.to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            generation_config=generation_config
        )
    outputs = []
    for sample_output_ids, sample_input_ids in zip(output_ids, input_ids):
        sample_output_ids = sample_output_ids[len(sample_input_ids):]
        sample_output = tokenizer.decode(sample_output_ids, skip_special_tokens=True)
        outputs.append(sample_output)

    if len(outputs) == 1:
        outputs = outputs[0]
    return outputs



generation_config = GenerationConfig.from_dict(
    {
        'top_k': 40,
        'top_p': 0.9,
        'temperature': 0.2,
        'repetition_penalty': 1.0,
        'max_new_tokens': 64,
        'do_sample': True,
        'pad_token_id': tokenizer.pad_token_id,
        'bos_token_id': tokenizer.bos_token_id,
        'eos_token_id': tokenizer.eos_token_id
    }
)
generation_config

GenerationConfig {
  "bos_token_id": 48000,
  "do_sample": true,
  "eos_token_id": 48002,
  "max_new_tokens": 64,
  "pad_token_id": 48000,
  "temperature": 0.2,
  "top_k": 40,
  "top_p": 0.9
}

In [11]:
#generate(messages, model, tokenizer, generation_config)

In [12]:
lora_config = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0.0,
    "bias": "none",
    "target_modules": ["q_proj", "v_proj", "k_proj", "o_proj"],
    "use_gradient_checkpointing": "unsloth"
}

In [13]:
model = FastLanguageModel.get_peft_model(
    model, **lora_config, max_seq_length=max_tokens_count
)

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.10.0 patched 36 layers with 0 QKV layers, 36 O layers and 0 MLP layers.


In [15]:
training_args = {
    "evaluation_strategy": "steps",
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "eval_steps": 16,
    "save_steps": 128,
    "logging_steps": 1,
    "learning_rate": 0.00005,
    "num_train_epochs": 1,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 16,
    "bf16": False,
    "fp16": True,
    "optim": "paged_adamw_8bit",
    "save_total_limit": 1,
    "seed": 1337,
    "max_grad_norm": 1.0,
    "weight_decay": 0.05
}
training_args = UnslothTrainingArguments(output_dir='./instruct_unsloth', **training_args)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
from unsloth.trainer import _create_unsloth_optimizer
class CustomTrainer(Trainer):
    def create_optimizer(self):
        embedding_learning_rate = getattr(self.args, "embedding_learning_rate", None)
        if embedding_learning_rate is None:
            return super().create_optimizer()
        if self.optimizer is None:
            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
            self.optimizer = _create_unsloth_optimizer(
                self.model,
                optimizer_cls,
                optimizer_kwargs,
                embedding_learning_rate,
            )
        return self.optimizer

In [18]:
trainer = data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 877 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 109
 "-____-"     Number of trainable parameters = 7,372,800


Step,Training Loss,Validation Loss
16,3.3601,
32,2.5939,
48,4.191,
64,6.2552,
80,7.4402,
96,5.5921,


TrainOutput(global_step=109, training_loss=5.140211858333798, metrics={'train_runtime': 685.7124, 'train_samples_per_second': 1.279, 'train_steps_per_second': 0.159, 'total_flos': 3544807036944384.0, 'train_loss': 5.140211858333798, 'epoch': 0.9942987457240593})

In [19]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(48022, 2048)
        (layers): ModuleList(
          (0-35): 36 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
        

In [20]:
generate(messages, model, tokenizer, generation_config)

'LLM (large language model) — это искусственный интеллект, который способен генерировать текст на основе больших объемов данных и ранее обученных моделей. LLMы могут выполнять различные задачи, такие как перевод, ответ на вопросы, создание контента и т.д. Они обычно основаны на нейросетях и обучены на больших данных'