In [1]:
%pip install --upgrade pip
%pip install transformers==4.41.2
%pip install torch --index-url https://download.pytorch.org/whl/cu121
%pip install huggingface_hub
%pip install datasets==3.6.0
%pip install accelerate==1.7.0

Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
[0mSuccessfully installed pip-25.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.2)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers

In [None]:
%pip install datasets==3.6.0

In [1]:
import re
import logging
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
import accelerate

2025-07-23 07:50:28.557666: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
ANS_RE = re.compile(r'####\s(.*)$', re.DOTALL)
REMOVE_ANNOTATION = re.compile(r'<<.*?>>')
MODEL_NAME = "Qwen/Qwen2.5-1.5B"
OUTPUT_DIR = '/home/jupyter/datasphere/project/check_sft'

In [3]:
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.StreamHandler()]
)

In [4]:
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
model.resize_token_embeddings(len(tokenizer))

Embedding(151665, 1536)

In [6]:
def get_hendrycks_math(split="train") -> Dataset:
    ds = load_dataset("nlile/hendrycks-MATH-benchmark", split=split)
    def preprocess(x):
        # Составляем единый текстовый prompt
        prompt = SYSTEM_PROMPT + " Problem: " + x["problem"]
        return {
            "prompt": prompt,
            "solution": x["solution"],
            "answer": x["answer"],
        }
    return ds.map(preprocess, remove_columns=ds.column_names)

In [7]:
dataset = get_hendrycks_math()

In [8]:
dataset[0]

{'solution': 'The denominator of the rational function factors into $x^2+x-6=(x-2)(x+3)$. Since the numerator is always nonzero, there is a vertical asymptote whenever the denominator is $0$, which occurs for $x = 2$ and $x = -3$.  Therefore, the graph has $\\boxed{2}$ vertical asymptotes.',
 'answer': '2',
 'prompt': '\nRespond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n Problem: How many vertical asymptotes does the graph of $y=\\frac{2}{x^2+x-6}$ have?'}

In [9]:
def preprocess_function(examples):
    processed_inputs = []
    prompts   = examples["prompt"]
    solutions = examples["solution"]
    
    for prompt, solution in zip(prompts, solutions):
        # просто полный текст = prompt + решение
        full_text = prompt + solution

        # токенизируем без паддинга
        full_toks = tokenizer(
            full_text,
            max_length=512,
            truncation=True,
            padding=False,
            return_tensors=None,
            add_special_tokens=False
        )
        prompt_toks = tokenizer(
            prompt,
            max_length=512,
            truncation=True,
            padding=False,
            return_tensors=None,
            add_special_tokens=False
        )

        input_ids      = full_toks["input_ids"]
        attention_mask = full_toks["attention_mask"]
        labels         = input_ids.copy()

        # маскируем всё, что относится к prompt
        prompt_len = len(prompt_toks["input_ids"])
        if prompt_len < len(labels):
            labels[:prompt_len] = [-100] * prompt_len
        else:
            labels = [-100] * len(labels)

        processed_inputs.append({
            "input_ids":      input_ids,
            "attention_mask": attention_mask,
            "labels":         labels,
        })

    return {
        "input_ids":      [x["input_ids"]      for x in processed_inputs],
        "attention_mask":[x["attention_mask"] for x in processed_inputs],
        "labels":         [x["labels"]         for x in processed_inputs],
    }


In [10]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names  # удаляем старые поля
)


In [11]:
class CustomDataCollator:
    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, features):
        input_ids = [f["input_ids"] for f in features]
        attention_mask = [f["attention_mask"] for f in features]
        labels = [f["labels"] for f in features]

        max_len = min(max(len(ids) for ids in input_ids), self.max_length)

        padded_input_ids = []
        padded_attention_mask = []
        padded_labels = []

        for ids, mask, label in zip(input_ids, attention_mask, labels):
            ids = ids[:max_len]
            mask = mask[:max_len]
            label = label[:max_len]

            pad_length = max_len - len(ids)
            if pad_length > 0:
                ids += [self.tokenizer.pad_token_id] * pad_length
                mask += [0] * pad_length
                label += [-100] * pad_length

            padded_input_ids.append(ids)
            padded_attention_mask.append(mask)
            padded_labels.append(label)

        return {
            "input_ids": torch.tensor(padded_input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(padded_attention_mask, dtype=torch.long),
            "labels": torch.tensor(padded_labels, dtype=torch.long)
        }

In [12]:
data_collator = CustomDataCollator(tokenizer)

In [13]:
import os
import matplotlib.pyplot as plt
from transformers import TrainerCallback, TrainingArguments, Trainer

In [14]:
class TrainingMetricsCallback(TrainerCallback):
    def __init__(self):
        self.train_loss = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # будем ловить при каждом логировании 'loss'
        if logs is None:
            return
        if "loss" in logs:
            self.train_loss.append(logs["loss"])

    def plot(self, output_dir):
        os.makedirs(output_dir, exist_ok=True)
        plt.figure(figsize=(6,4))
        plt.plot(self.train_loss, label="train_loss")
        plt.xlabel("logging steps")
        plt.ylabel("loss")
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, "train_loss.png"))
        plt.close()

In [15]:
metrics_cb = TrainingMetricsCallback()

In [18]:
logging_dir = '/home/jupyter/datasphere/project/log_dir'

In [22]:
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=1e-3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    label_names=["labels"],
    
    report_to='none',
    
    logging_steps=35,
    logging_strategy='steps',
    logging_first_step=True,
    logging_dir=logging_dir,
    
    save_strategy="no",
    output_dir=OUTPUT_DIR
    
    # output_dir=OUTPUT_DIR,
    # save_strategy="steps",
    # save_steps=200,
    # save_total_limit=2    
    
    #per_device_eval_batch_size=8
    # do_eval=False,
    # evaluation_strategy="steps",
    # eval_steps=100,
    # load_best_model_at_end=True,   
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[metrics_cb]
)

In [24]:
trainer.train()

  0%|          | 1/750 [00:04<56:31,  4.53s/it]

{'loss': 0.8374, 'grad_norm': 6.9375, 'learning_rate': 2.666666666666667e-07, 'epoch': 0.0}


  5%|▍         | 35/750 [01:13<24:17,  2.04s/it]

{'loss': 0.7248, 'grad_norm': 3.96875, 'learning_rate': 9.333333333333334e-06, 'epoch': 0.09}


  9%|▉         | 70/750 [02:24<22:25,  1.98s/it]

{'loss': 0.6482, 'grad_norm': 2.78125, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.19}


 14%|█▍        | 105/750 [03:35<21:43,  2.02s/it]

{'loss': 0.6306, 'grad_norm': 2.15625, 'learning_rate': 1.9902680687415704e-05, 'epoch': 0.28}


 19%|█▊        | 140/750 [04:46<20:49,  2.05s/it]

{'loss': 0.6277, 'grad_norm': 2.671875, 'learning_rate': 1.954587632247732e-05, 'epoch': 0.37}


 23%|██▎       | 175/750 [05:58<19:33,  2.04s/it]

{'loss': 0.6329, 'grad_norm': 2.484375, 'learning_rate': 1.8936326403234125e-05, 'epoch': 0.47}


 28%|██▊       | 210/750 [07:09<18:16,  2.03s/it]

{'loss': 0.6523, 'grad_norm': 2.171875, 'learning_rate': 1.8090169943749477e-05, 'epoch': 0.56}


 33%|███▎      | 245/750 [08:19<16:50,  2.00s/it]

{'loss': 0.6212, 'grad_norm': 2.109375, 'learning_rate': 1.702981057425662e-05, 'epoch': 0.65}


 37%|███▋      | 280/750 [09:29<15:29,  1.98s/it]

{'loss': 0.6278, 'grad_norm': 2.734375, 'learning_rate': 1.5783323361679865e-05, 'epoch': 0.75}


 42%|████▏     | 315/750 [10:39<14:20,  1.98s/it]

{'loss': 0.6166, 'grad_norm': 2.25, 'learning_rate': 1.4383711467890776e-05, 'epoch': 0.84}


 47%|████▋     | 350/750 [11:49<13:37,  2.04s/it]

{'loss': 0.6061, 'grad_norm': 2.09375, 'learning_rate': 1.2868032327110904e-05, 'epoch': 0.93}


 51%|█████▏    | 385/750 [12:59<12:22,  2.03s/it]

{'loss': 0.5991, 'grad_norm': 1.921875, 'learning_rate': 1.127641647860595e-05, 'epoch': 1.03}


 56%|█████▌    | 420/750 [14:08<11:08,  2.03s/it]

{'loss': 0.5821, 'grad_norm': 2.359375, 'learning_rate': 9.651005032974994e-06, 'epoch': 1.12}


 61%|██████    | 455/750 [15:19<09:50,  2.00s/it]

{'loss': 0.5693, 'grad_norm': 2.109375, 'learning_rate': 8.034833904671698e-06, 'epoch': 1.21}


 65%|██████▌   | 490/750 [16:30<09:02,  2.09s/it]

{'loss': 0.573, 'grad_norm': 2.78125, 'learning_rate': 6.4706943528613135e-06, 'epoch': 1.31}


 70%|███████   | 525/750 [17:42<07:48,  2.08s/it]

{'loss': 0.5876, 'grad_norm': 2.3125, 'learning_rate': 5.000000000000003e-06, 'epoch': 1.4}


 75%|███████▍  | 560/750 [18:52<06:15,  1.98s/it]

{'loss': 0.568, 'grad_norm': 2.21875, 'learning_rate': 3.661690326012897e-06, 'epoch': 1.49}


 79%|███████▉  | 595/750 [20:04<05:14,  2.03s/it]

{'loss': 0.5594, 'grad_norm': 2.234375, 'learning_rate': 2.4911996701850083e-06, 'epoch': 1.59}


 84%|████████▍ | 630/750 [21:15<04:04,  2.04s/it]

{'loss': 0.5782, 'grad_norm': 2.5, 'learning_rate': 1.5195190384357405e-06, 'epoch': 1.68}


 89%|████████▊ | 665/750 [22:27<02:52,  2.03s/it]

{'loss': 0.5759, 'grad_norm': 2.328125, 'learning_rate': 7.723755564455771e-07, 'epoch': 1.77}


 93%|█████████▎| 700/750 [23:39<01:42,  2.05s/it]

{'loss': 0.5549, 'grad_norm': 2.3125, 'learning_rate': 2.6955129420176193e-07, 'epoch': 1.87}


 98%|█████████▊| 735/750 [24:52<00:30,  2.07s/it]

{'loss': 0.5706, 'grad_norm': 2.0, 'learning_rate': 2.4359497401758026e-08, 'epoch': 1.96}


100%|██████████| 750/750 [25:22<00:00,  2.03s/it]

{'train_runtime': 1522.6553, 'train_samples_per_second': 15.762, 'train_steps_per_second': 0.493, 'train_loss': 0.6046233045260111, 'epoch': 2.0}





TrainOutput(global_step=750, training_loss=0.6046233045260111, metrics={'train_runtime': 1522.6553, 'train_samples_per_second': 15.762, 'train_steps_per_second': 0.493, 'total_flos': 6.793175227923456e+16, 'train_loss': 0.6046233045260111, 'epoch': 2.0})

In [26]:
trainer.save_model("/home/jupyter/datasphere/project/check_sft")