In [None]:
%pip install trl lm_eval -q

# Семинар "Natural Language Processing. Часть 2"

Попробуем воспользоваться SFT, чтобы подогнать gpt2 к нужному формату ответа. Обучать будем только lora адаптер

In [2]:
import os
import random
import warnings

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")

In [3]:
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [4]:
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "c_proj"],
    fan_in_fan_out=True
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


In [6]:
def format_instruction(sample):
    return f"Question: {sample['question']}\n\n Answer: #### {sample['answer']}"

train_dataset = load_dataset("gsm8k", "main", split="train[:1000]")
train_dataset = train_dataset.map(lambda x: {"text": format_instruction(x)})
eval_dataset = load_dataset("gsm8k", "main", split="test")

README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
print(train_dataset)

Dataset({
    features: ['question', 'answer', 'text'],
    num_rows: 1000
})


In [8]:
training_args = SFTConfig(
    output_dir="./gpt2-lora-gsm8k",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=10,
    logging_steps=10,
    save_steps=500,
    fp16=True,
    dataset_text_field="text",
    optim="adamw_torch",
    report_to="none"
)

In [9]:
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
)

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [10]:
trainer.train()
trainer.save_model()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.4858
20,3.3597
30,3.216
40,3.1218
50,2.927
60,2.8921
70,2.7825
80,2.7377
90,2.6983
100,2.6693


In [12]:
from transformers import pipeline

def evaluate_model(model, tokenizer, model_name):
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
        torch_dtype=torch.float16
    )

    gsm8k_test = load_dataset("gsm8k", "main", split="test")

    correct = 0
    total = 0

    for example in tqdm(gsm8k_test):
        prompt = f"Question: {example['question']}\n\nAnswer"

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0,
            pad_token_id=tokenizer.eos_token_id
        )

        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        if example['answer'].split("#### ")[-1] in response:
            correct += 1
        total += 1

    return correct / total

base_accuracy = evaluate_model(model, tokenizer, "base-gpt2")
print(f"Base model accuracy: {base_accuracy:.2%}")

lora_accuracy = evaluate_model(peft_model, tokenizer, "lora-gpt2")
print(f"LoRA model accuracy: {lora_accuracy:.2%}")

print(f"Improvement: {lora_accuracy - base_accuracy:.2%}")

Device set to use cuda:0


  0%|          | 0/1319 [00:00<?, ?it/s]

Device set to use cuda:0


Base model accuracy: 11.22%


  0%|          | 0/1319 [00:00<?, ?it/s]

LoRA model accuracy: 11.75%
Improvement: 0.53%
