In [None]:
# Environment setup: upgrade pip first, then install the notebook's dependencies.
# transformers, datasets and accelerate are pinned to exact versions;
# torch and huggingface_hub are installed unpinned.
%pip install --upgrade pip
%pip install transformers==4.41.2
# torch comes from the PyTorch wheel index for CUDA 12.1 (cu121).
%pip install torch --index-url https://download.pytorch.org/whl/cu121
%pip install huggingface_hub
%pip install datasets==3.6.0
%pip install accelerate==1.7.0

In [None]:
# NOTE(review): this repeats the `datasets==3.6.0` pin already installed by the setup cell above.
%pip install datasets==3.6.0

In [None]:
import re
import logging
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
import accelerate

In [None]:
# Hugging Face Hub id of the base checkpoint to fine-tune.
MODEL_NAME = "Qwen/Qwen2.5-1.5B"
# Directory handed to the Trainer as its output location.
OUTPUT_DIR = "/home/jupyter/datasphere/project/check_sft"

# Captures everything after a "####" marker plus one whitespace character;
# DOTALL lets "." span newlines, so the capture runs to the end of the string.
ANS_RE = re.compile(r"####\s(.*)$", flags=re.DOTALL)
# Matches "<<...>>" spans (shortest match, single line).
REMOVE_ANNOTATION = re.compile(r"<<.*?>>")

In [None]:
# Root logger: emit INFO and above through a stream handler, printing only the message text.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format="%(message)s",
    level=logging.INFO,
)

In [None]:
# Instruction text prepended to every problem when prompts are built.
# It starts and ends with a newline because of the triple-quoted layout.
# NOTE(review): the prompt asks for <reasoning>/<answer> tags, but the training
# targets in this notebook are the dataset solutions appended verbatim; nothing
# here wraps them in these tags. Confirm that mismatch is intended.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""
# Load the tokenizer and the base model (weights in bfloat16).
# `trust_remote_code` is left at its default (False): enabling it lets code
# shipped in the Hub repository run locally, and MODEL_NAME points at a Qwen2.5
# checkpoint, whose architecture is expected to be built into the pinned
# transformers release, so no remote code should be needed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    # Padding needs some token id; fall back to EOS when none is defined.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)

In [None]:
# Only grow the embedding matrix, never shrink it. No tokens are added in this
# notebook (the pad token reuses EOS), and a checkpoint's embedding matrix can be
# padded beyond len(tokenizer); an unconditional resize would then cut rows off
# and make the saved model's vocab size differ from the base model's.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))

In [None]:
def get_hendrycks_math(split="train") -> Dataset:
    """Load the Hendrycks MATH benchmark and reshape it into SFT records.

    Args:
        split: Split name forwarded to ``load_dataset``.

    Returns:
        A ``Dataset`` with exactly three columns: ``prompt`` (``SYSTEM_PROMPT``
        followed by ``" Problem: "`` and the problem text), ``solution`` and
        ``answer``. All original columns are removed.
    """
    def _to_record(example):
        # Assemble a single flat text prompt: instructions, then the problem.
        pieces = (SYSTEM_PROMPT, " Problem: ", example["problem"])
        return dict(
            prompt="".join(pieces),
            solution=example["solution"],
            answer=example["answer"],
        )

    raw = load_dataset("nlile/hendrycks-MATH-benchmark", split=split)
    return raw.map(_to_record, remove_columns=raw.column_names)

In [None]:
# Build the prompt/solution/answer records; with no argument the "train" split is used.
dataset = get_hendrycks_math()

In [None]:
# Show the first record as a sanity check of the prompt layout.
dataset[0]

In [None]:
def preprocess_function(examples, max_length=512, tok=None):
    """Tokenize prompt/solution pairs into causal-LM fine-tuning features.

    For every pair the prompt and the solution are tokenized separately, the
    two id lists are concatenated, an EOS id is appended, and the result is
    clipped to ``max_length``. Labels equal the input ids, except that every
    prompt position is set to -100 so the loss only covers the solution.

    Compared with tokenizing ``prompt + solution`` as one string:
      * the masked prefix is exactly the prompt's own tokenization, so the
        prompt/solution boundary cannot shift because of subword merges
        across the join;
      * the target ends with EOS (when it fits in the window), so the model
        is trained to stop after the solution.

    NOTE(review): the solution still follows the prompt with no separator,
    as before; confirm the generation-time prompt is built the same way.

    Args:
        examples: Batched mapping with ``"prompt"`` and ``"solution"`` lists
            of strings (the format ``Dataset.map(..., batched=True)`` passes).
        max_length: Maximum number of tokens kept per example.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.
            It must be callable on a string and return a mapping with
            ``"input_ids"``, and expose ``eos_token_id``.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list of unpadded per-example lists, one entry per input row.
        A row whose prompt alone fills the window has all labels at -100.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer
    eos_id = tok.eos_token_id

    batch = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, solution in zip(examples["prompt"], examples["solution"]):
        prompt_ids = list(tok(prompt, add_special_tokens=False)["input_ids"])
        target_ids = list(tok(solution, add_special_tokens=False)["input_ids"])
        if eos_id is not None:
            target_ids.append(eos_id)

        # Clip the joined sequence; the tail of the solution (and the EOS) goes first.
        input_ids = (prompt_ids + target_ids)[:max_length]

        # Mask the prompt part, or the whole row if the prompt fills the window.
        n_masked = min(len(prompt_ids), len(input_ids))
        labels = [-100] * n_masked + input_ids[n_masked:]

        batch["input_ids"].append(input_ids)
        batch["attention_mask"].append([1] * len(input_ids))  # no padding at this stage
        batch["labels"].append(labels)

    return batch


In [None]:
# Tokenize every prompt/solution pair, then drop rows that carry no supervised
# token. A row whose labels are all -100 contributes nothing to the loss, and a
# batch made only of such rows has no target at all, which yields a NaN loss.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,  # drop the original text columns
).filter(
    lambda row: any(label != -100 for label in row["labels"])
)


In [None]:
class CustomDataCollator:
    """Pad or clip tokenized features to one length and stack them into tensors.

    The batch length is the longest ``input_ids`` in the batch, capped at
    ``max_length``. Shorter rows are right-padded: ``input_ids`` with the
    tokenizer's pad id, ``attention_mask`` with 0 and ``labels`` with -100.
    """

    def __init__(self, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, features):
        """Collate a list of feature dicts into ``torch.long`` tensors.

        Args:
            features: Sequence of dicts holding list-valued ``input_ids``,
                ``attention_mask`` and ``labels``.

        Returns:
            Dict with the same three keys, each a 2-D ``torch.long`` tensor.
        """
        keys = ("input_ids", "attention_mask", "labels")
        longest = max(len(feature["input_ids"]) for feature in features)
        target_len = min(longest, self.max_length)

        batch = {key: [] for key in keys}
        for feature in features:
            # Slicing copies, so the caller's lists are never modified.
            ids, mask, label = (feature[key][:target_len] for key in keys)

            missing = target_len - len(ids)
            if missing > 0:
                ids = ids + [self.tokenizer.pad_token_id] * missing
                mask = mask + [0] * missing
                label = label + [-100] * missing

            batch["input_ids"].append(ids)
            batch["attention_mask"].append(mask)
            batch["labels"].append(label)

        return {key: torch.tensor(rows, dtype=torch.long) for key, rows in batch.items()}

In [None]:
# Collator built on the notebook tokenizer, with the default max_length of 512.
data_collator = CustomDataCollator(tokenizer)

In [None]:
import os
import matplotlib.pyplot as plt
from transformers import TrainerCallback, TrainingArguments, Trainer

In [None]:
class TrainingMetricsCallback(TrainerCallback):
    """Trainer callback that collects logged training-loss values and can plot them."""

    def __init__(self):
        # One entry per log event that carried a "loss" value, in arrival order.
        self.train_loss = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record ``logs["loss"]`` whenever a log event contains that key."""
        if logs is not None and "loss" in logs:
            self.train_loss.append(logs["loss"])

    def plot(self, output_dir):
        """Save the collected losses as ``train_loss.png`` inside ``output_dir``.

        The directory is created when it does not exist yet.
        """
        os.makedirs(output_dir, exist_ok=True)
        target_path = os.path.join(output_dir, "train_loss.png")

        fig, ax = plt.subplots(figsize=(6, 4))
        ax.plot(self.train_loss, label="train_loss")
        ax.set_xlabel("logging steps")
        ax.set_ylabel("loss")
        ax.legend()
        fig.tight_layout()
        fig.savefig(target_path)
        plt.close(fig)

In [None]:
# Callback instance that accumulates the logged training loss.
metrics_cb = TrainingMetricsCallback()

In [None]:
# Directory passed to TrainingArguments as `logging_dir`.
logging_dir = '/home/jupyter/datasphere/project/log_dir'

In [None]:
training_args = TrainingArguments(
    # Output and logging locations; no experiment tracker is used.
    output_dir=OUTPUT_DIR,
    logging_dir=logging_dir,
    report_to='none',

    # Batching: 2 samples per device, accumulated over 16 steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    num_train_epochs=2,

    # Optimisation: cosine schedule with 10% warmup, bf16 mixed precision.
    learning_rate=2e-5,
    weight_decay=1e-3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    label_names=["labels"],

    # Log the first step, then every 35 steps.
    logging_strategy='steps',
    logging_steps=35,
    logging_first_step=True,

    # Intermediate checkpoints are off and no evaluation options are set.
    save_strategy="no",
)

In [None]:
# Wire the model, tokenized data, collator and loss-tracking callback into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
    callbacks=[metrics_cb],
)

In [None]:
# Run fine-tuning; `metrics_cb` records the loss at every logging event.
trainer.train()

In [None]:
# Save the final model to the same directory configured as the Trainer's
# output_dir, using the shared constant so the two paths cannot drift apart.
trainer.save_model(OUTPUT_DIR)