In [1]:
import os
import sys
from dotenv import load_dotenv
from pathlib import Path

if sys.platform == 'linux':
    load_dotenv(dotenv_path=Path('.') / '.env.linux')
elif sys.platform == 'win32':
    load_dotenv(dotenv_path=Path('.') / '.env.win')
else:
    raise ValueError('Ваша операционная система не поддерживается')

os.environ['HF_HOME'] = os.getenv('HUGGING_FACE_CACHE_DIR')
DATASET_PATH = os.getenv('DATASET_PATH', None)
print(DATASET_PATH)
DATASET_FOLDER = os.getenv('DATASET_FOLDER_PATH', None)
# MODEL_NAME = os.getenv('MODEL_NAME', None)
MODEL_NAME = 'gpt2-large'
MODEL_SIZE = os.getenv('MODEL_SIZE', None)
MODEL_SAVE_DIR=os.getenv('MODEL_SAVE_DIR', None)
MODEL_SAVE_NAME = os.getenv('MODEL_SAVE_NAME', None)
print(MODEL_NAME)

/mnt/12A4CA9DA4CA8329/Files/Datasets/recipes_generation/fine_tuning_preprocessed.csv
gpt2-large


In [2]:
from peft import LoraConfig, TaskType, get_peft_model
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer, DataCollatorForLanguageModeling, AutoModel
from transformers import Trainer, TrainingArguments
from datasets import load_from_disk

In [3]:
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32, lora_dropout=0.1,target_modules=["c_attn", "c_proj", "c_fc"],\
                         inference_mode=False, fan_in_fan_out=True, bias='none')

In [4]:
base_model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
# Количество параметров модели
model_size = sum(t.numel() for t in base_model.parameters())
print(f"model size: {model_size/1e-6:.2f}M parameters")

`torch_dtype` is deprecated! Use `dtype` instead!


model size: 774030080000000.00M parameters


In [5]:
model = get_peft_model(base_model, peft_config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# У GPT2 не было паддинг токена по умолчанию
tokenizer.pad_token = tokenizer.eos_token

# Количество обучаемых параметров
model.print_trainable_parameters()

trainable params: 11,796,480 || all params: 785,826,560 || trainable%: 1.5012


In [6]:
prepared_dataset = load_from_disk(Path(DATASET_FOLDER) / 'prepared_dataset_hg_format')
prepared_dataset = prepared_dataset.map(remove_columns=['name', 'instructions', 'ingredients', 'composition_list', 'prompt', 'length'])

In [7]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) # Отключаем masked language modelling (mlm)

In [8]:
args = TrainingArguments(
    output_dir=os.path.join(MODEL_SAVE_DIR, 'training_checkpoints'),  # Кастомная директория
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    weight_decay=0.01,
    warmup_steps=200,
    lr_scheduler_type="linear",
    learning_rate=1e-5,
    save_steps=200,
    eval_steps=200,
    logging_steps=200,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,  # Загружать лучшую модель
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    bf16=torch.cuda.is_available(),  # Только если GPU поддерживает
    remove_unused_columns=False,
    gradient_checkpointing=False,  # Экономия памяти
    save_total_limit=2,  # Хранить только 2 лучших чекпоинта
    push_to_hub=False,
    report_to="none",  # Отключить логирование в сторонние сервисы
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=prepared_dataset["train"],
    eval_dataset=prepared_dataset["test"],
)

  trainer = Trainer(


In [9]:
# Обучение
try:
    print("Начало обучения...")
    trainer.train()
except KeyboardInterrupt:
    print('Обучение остановлено')

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Начало обучения...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Обучение остановлено


In [None]:
# Сохранение финальной модели
trainer.save_model(os.path.join(MODEL_SAVE_DIR, MODEL_SAVE_NAME))
tokenizer.save_pretrained(os.path.join(MODEL_SAVE_DIR, MODEL_SAVE_NAME))

print(f'Модель сохранена в {os.path.join(MODEL_SAVE_DIR, MODEL_SAVE_NAME)}')

In [None]:
dfdf

In [None]:
from transformers import pipeline
import torch

pad_token = '<PAD>'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'DEVICE: {DEVICE}')

In [None]:
pipe = pipeline("text-generation", model=os.path.join(MODEL_SAVE_DIR, MODEL_SAVE_NAME), device=DEVICE)

dish = 'Блины'

prompt = f'Я хотел бы приготовить {dish}. Дай мне рецепт со всеми поодробностями приготовления.'

In [None]:
output = pipe(dish, max_length=1024, num_return_sequences=1)
print(output[0]['generated_text'])