In [1]:
import os
import sys
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from the platform-specific .env file located in the
# current working directory; any other platform is rejected.
_ENV_FILE_BY_PLATFORM = {'linux': '.env.linux', 'win32': '.env.win'}
if sys.platform not in _ENV_FILE_BY_PLATFORM:
    raise ValueError('Ваша операционная система не поддерживается')
load_dotenv(dotenv_path=Path('.') / _ENV_FILE_BY_PLATFORM[sys.platform])

# os.environ only accepts strings, so assigning a missing (None) value would raise
# TypeError. HF_HOME is therefore only overridden when HUGGING_FACE_CACHE_DIR is set.
_hf_cache_dir = os.getenv('HUGGING_FACE_CACHE_DIR')
if _hf_cache_dir is not None:
    os.environ['HF_HOME'] = _hf_cache_dir

# Notebook-wide settings read from the environment (None when a variable is unset).
DATASET_PATH = os.getenv('DATASET_PATH', None)
print(DATASET_PATH)
DATASET_FOLDER = os.getenv('DATASET_FOLDER_PATH', None)
MODEL_NAME = os.getenv('MODEL_NAME', None)
MODEL_SIZE = os.getenv('MODEL_SIZE', None)
MODEL_SAVE_DIR = os.getenv('MODEL_SAVE_DIR', None)
MODEL_SAVE_NAME = os.getenv('MODEL_SAVE_NAME', None)
print(MODEL_NAME)

# Fail fast: these four values are passed to Path(), from_pretrained() and
# os.path.join() in later cells, where a None would surface as an unrelated error.
_required_settings = {
    'DATASET_FOLDER_PATH': DATASET_FOLDER,
    'MODEL_NAME': MODEL_NAME,
    'MODEL_SAVE_DIR': MODEL_SAVE_DIR,
    'MODEL_SAVE_NAME': MODEL_SAVE_NAME,
}
_missing_settings = [name for name, value in _required_settings.items() if value is None]
if _missing_settings:
    raise ValueError(
        'Не заданы обязательные переменные окружения: ' + ', '.join(_missing_settings)
    )

/mnt/12A4CA9DA4CA8329/Files/Datasets/recipes_generation/fine_tuning_preprocessed.csv
ai-forever/rugpt3large_based_on_gpt2


In [2]:
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer, DataCollatorForLanguageModeling, AutoModel
from transformers import Trainer, TrainingArguments

from datasets import load_from_disk, DatasetDict

# Literal padding-token string kept as a notebook-level name.
pad_token = '<PAD>'

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'DEVICE: {DEVICE}')

DEVICE: cuda


In [3]:
# Load the pretrained causal language model and its tokenizer by name.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# GPT-2 style tokenizers may come without a padding token; only in that case reuse EOS.
# Overwriting an existing pad token with EOS makes pad and EOS share one id, and
# DataCollatorForLanguageModeling(mlm=False) ignores pad-id positions in the labels,
# so the end-of-sequence token would be excluded from the training loss.
# NOTE(review): assumes the stored dataset was not pre-padded with the EOS id — confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Total number of model parameters, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"model size: {model_size/1000**2:.1f}M parameters")

model size: 760.3M parameters


In [4]:
# Load the dataset previously saved to disk under DATASET_FOLDER.
prepared_dataset = load_from_disk(Path(DATASET_FOLDER) / 'prepared_dataset_hg_format')

# Drop the columns that are not fed to the model. remove_columns() is the dedicated
# API for this and avoids running an identity map() over every row just to drop columns.
# NOTE(review): presumably only the tokenized model inputs remain afterwards — verify.
prepared_dataset = prepared_dataset.remove_columns(
    ['name', 'instructions', 'ingredients', 'composition_list', 'prompt', 'length']
)

In [5]:
# Disabled: builds a reduced DatasetDict from shuffled slices of prepared_dataset.
# before_main_train_dataset = DatasetDict({
#     # Shuffle the train split and take the first 5000 rows
#     'train': prepared_dataset['train'].shuffle(seed=42).take(5000),

#     'test': prepared_dataset['test'].shuffle(seed=42).take(500)
# })

## Обучение при помощи Trainer API

In [6]:
# The data collator builds the batches, pads them to a common length and creates the
# labels for the inputs. Shifting inputs against labels happens inside the model, so
# the collator simply copies the inputs to produce the labels.
# mlm=False switches off masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Disabled sanity check of the collated batch shapes:
# out = data_collator([before_main_train_dataset["train"][i] for i in range(5)])
# for key in out:
#     print(f"{key} shape: {out[key].shape}")

In [7]:
# Training configuration. Evaluation, checkpointing and logging all happen every
# 800 optimizer steps, so each saved checkpoint has a matching eval_loss.
args = TrainingArguments(
    output_dir=os.path.join(MODEL_SAVE_DIR, 'training_checkpoints'),  # custom checkpoint directory
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # 2 samples x 4 accumulation steps per device per update
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=300,
    lr_scheduler_type="cosine",
    learning_rate=1e-5,
    save_steps=800,
    eval_steps=800,
    logging_steps=800,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    gradient_checkpointing=True,  # trade extra compute for lower memory use
    save_total_limit=3,  # keep at most 3 checkpoints on disk; older ones are deleted
    push_to_hub=False,
    report_to="none",  # do not report to external logging services
)

# `tokenizer=` is deprecated in recent transformers releases and triggers a warning on
# this call; `processing_class=` is the replacement keyword for passing the tokenizer.
trainer = Trainer(
    model=model,
    processing_class=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=prepared_dataset["train"],
    eval_dataset=prepared_dataset["test"],
)

  trainer = Trainer(


In [8]:
# Training: announce the start and run the Trainer loop.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt. If training
# is stopped manually, the in-memory weights are presumably those of the last executed
# step rather than the best checkpoint — verify before saving the model.
print("Начало обучения...")
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Начало обучения...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
800,2.3293,0.554228
1600,0.5689,0.53057
2400,0.5451,0.520364
3200,0.5326,0.514102
4000,0.522,0.510065
4800,0.5184,0.505659
5600,0.5117,0.503057
6400,0.5096,0.502294
7200,0.5026,0.501103


KeyboardInterrupt: 

In [9]:
# Save the final model and the tokenizer into the same target directory,
# then report where they were written.
final_model_dir = os.path.join(MODEL_SAVE_DIR, MODEL_SAVE_NAME)
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print(f'Модель сохранена в {final_model_dir}')

Модель сохранена в /mnt/2FADF63B267AA05B/AI_models/recepies_generation/final_gpt2_large
