### Установка библиотек, импорты

In [None]:
#!g1.1
%pip uninstall keras tensorflow transformers
%pip install --upgrade keras tensorflow transformers

In [249]:
#!g1.1
import torch
from tqdm import tqdm
import pandas as pd

from dataclasses import dataclass, field
from typing import Dict, Optional

tqdm.pandas()

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader, RandomSampler, random_split

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, DPOTrainer
from trl.core import LengthSampler

2023-11-30 19:33:31.316255: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-30 19:33:31.316470: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-30 19:33:32.800936: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [250]:
#!g1.1
# Use the GPU when torch reports CUDA as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Настраиваем конфиг

In [251]:
#!g1.1

# Shared run settings; later cells look these up by key.
kwargs = dict(
    model_name="lvwerra/gpt2-imdb",
    report_to="wandb",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    max_length=512,
)

### Инициализация wandb

In [232]:
#!g1.1
import wandb

# Initialise Weights & Biases with its default settings before any training cell runs.
wandb.init()

### Создаём модель для обучения, референсную модель и токенизатор

In [253]:
#!g1.1
# Load the same checkpoint twice: `model` is the copy passed to the trainer as
# the policy, `ref_model` is a separate instance passed as the reference.
checkpoint = kwargs["model_name"]
model = AutoModelForCausalLM.from_pretrained(checkpoint)
ref_model = AutoModelForCausalLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [312]:
#!g1.1
# Tokenizer for the generator checkpoint, configured to pad on the left.
# NOTE(review): `return_tensors` is normally a call-time tokenizer argument;
# confirm it has any effect when passed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(kwargs["model_name"], padding_side='left', return_tensors="pt")

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

### Генерация и оценка текста

In [331]:
#!g1.1
# Options for the sentiment pipeline; the same dict also supplies the DataLoader batch size.
sent_kwargs = dict(top_k=None, function_to_apply="none", batch_size=16)

In [332]:
#!g1.1
# Sentiment classifier used to score generated texts.
sentiment_pipe = pipeline(
    model="lvwerra/distilbert-imdb",
    device=device,
    **sent_kwargs,
)

In [333]:
#!g1.1
# Sanity check: score an obviously negative review.
text = "this movie was really bad!!"
sentiment_pipe(text)

[[{'label': 'NEGATIVE', 'score': 2.335048198699951},
  {'label': 'POSITIVE', 'score': -2.7265758514404297}]]

In [338]:
#!g1.1
# Sanity check on an obviously positive review, passing the scoring options at call time.
text = "this movie was really astonishing amazing beautiful!!"
sentiment_pipe(text, **sent_kwargs)

[{'label': 'POSITIVE', 'score': 2.8005118370056152},
 {'label': 'NEGATIVE', 'score': -2.5074386596679688}]

In [399]:
#!g1.1
# Sampling settings handed to the text-generation pipeline.
gen_kwargs = dict(
    min_length=-1,
    max_length=64,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    num_return_sequences=2,
    pad_token_id=tokenizer.eos_token_id,
)

In [400]:
#!g1.1
# Text-generation pipeline over the same checkpoint, sharing the left-padded tokenizer.
generator = pipeline(
    'text-generation',
    model=kwargs["model_name"],
    tokenizer=tokenizer,
    device=device,
    **gen_kwargs,
)

Мы научились генерировать тексты и оценивать их тональность (positive/negative)! Настала пора создавать датасет.

In [183]:
#!g1.1
# Accumulator for preference pairs generated without a prompt.
new_prompt_chosen_rejected_list = dict(chosen=[], rejected=[])

In [None]:
#!g1.1
# Generate preference pairs from an empty prompt: sample 5 candidates per step,
# score them, and keep the most and least positive ones.
query = ''
# Override on a copy so this cell's settings (5 candidates, sampled length) do
# not leak into the shared `gen_kwargs`, which other cells also pass to `generator`.
unprompted_gen_kwargs = {**gen_kwargs, "num_return_sequences": 5}
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

for i in tqdm(range(10000)):
    # Draw a fresh completion length for every group of candidates.
    gen_len = output_length_sampler()
    texts_gen = generator(query, max_new_tokens=gen_len, **unprompted_gen_kwargs)
    texts = [elem['generated_text'] for elem in texts_gen]
    sentiment_samples = sentiment_pipe(texts, **sent_kwargs)
    # Each entry of `sentiment_samples` is a list of {label, score} dicts;
    # keep only the POSITIVE score of every text.
    positive_scores = [
        label_score['score']
        for sent in sentiment_samples
        for label_score in sent
        if label_score['label'] == 'POSITIVE'
    ]
    # Highest-scoring candidate becomes "chosen", lowest-scoring becomes "rejected".
    max_index = positive_scores.index(max(positive_scores))
    min_index = positive_scores.index(min(positive_scores))
    new_prompt_chosen_rejected_list["chosen"].append(texts[max_index])
    new_prompt_chosen_rejected_list["rejected"].append(texts[min_index])

In [201]:
#!g1.1
# Every pair was generated from an empty query, so the pad token stands in as
# the prompt for each row.
n_pairs = len(new_prompt_chosen_rejected_list["chosen"])
new_prompt_chosen_rejected_list["prompt"] = [tokenizer.pad_token] * n_pairs

In [240]:
#!g1.1
# Run settings extended with the values the training cells read
# (`max_steps`, `gradient_accumulation_steps`, `beta`).
kwargs = dict(
    model_name="lvwerra/gpt2-imdb",
    report_to="wandb",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    max_length=512,
    max_steps=15000,
    gradient_accumulation_steps=1,
    beta=0.1,
)

In [203]:
#!g1.1
from transformers import TrainingArguments

In [217]:
#!g1.1
# The first 7000 pairs go to training; everything after that index is held out
# for evaluation.
split_at = 7000
pair_columns = ("prompt", "chosen", "rejected")
train_split = {col: new_prompt_chosen_rejected_list[col][:split_at] for col in pair_columns}
eval_split = {col: new_prompt_chosen_rejected_list[col][split_at:] for col in pair_columns}


In [219]:
#!g1.1
import datasets
# Wrap both column dicts as `datasets.Dataset` objects for the trainer.
train_dataset, eval_dataset = (
    datasets.Dataset.from_dict(split) for split in (train_split, eval_split)
)

In [244]:
#!g1.1
# Trainer settings. Values are read from the `kwargs` config dict; the name
# `args` used previously is not defined anywhere in the notebook.
training_args = TrainingArguments(
        per_device_train_batch_size=kwargs["per_device_train_batch_size"],
        max_steps=kwargs["max_steps"],
        # Keep the raw prompt/chosen/rejected columns available to the trainer.
        remove_unused_columns=False,
        gradient_accumulation_steps=kwargs["gradient_accumulation_steps"],
        learning_rate=kwargs["learning_rate"],
        evaluation_strategy="steps",
        logging_first_step=True,
        logging_steps=10,
        eval_steps=4000,
        output_dir="./test",
        optim="rmsprop",
        warmup_steps=100,
        report_to=kwargs["report_to"],
        gradient_checkpointing=False,
    )

In [245]:
#!g1.1
# DPO trainer over the policy (`model`) and the reference copy (`ref_model`),
# fed with the prompt/chosen/rejected datasets built earlier.
dpo_trainer = DPOTrainer(
        model,
        ref_model,
        args=training_args,
        beta=kwargs["beta"],
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        max_length=kwargs["max_length"],
        # NOTE(review): max_target_length / max_prompt_length are left unset;
        # the values once sketched here came from an undefined `script_args`.
        generate_during_eval=True,
    )


In [None]:
#!g1.1
# Run DPO fine-tuning with the configured trainer.
dpo_trainer.train()

Как видно на графиках, результаты неутешительные. Попробуем генерировать тексты, отталкиваясь от промптов из датасета IMDb, и сделаем генерации чуть длиннее. Посмотрим на результат.

### Вспомогательная функция для генерации датасета

In [353]:
#!g1.1
def collator(data):
    """Turn a list of same-keyed dicts into one dict of lists.

    The key set and key order are taken from the first element of `data`.
    """
    first_record = data[0]
    return {key: [record[key] for record in data] for key in first_record}

In [354]:
#!g1.1
def build_dataset(kwargs, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build the prompt dataset: short review prefixes that serve as generation queries.

    Loads the train split of `dataset_name`, keeps reviews whose text is longer
    than 200 and shorter than 512 characters, and stores for every review the
    first `input_max_text_length` tokens (`input_ids`) together with their
    decoded text (`query`).

    Args:
        kwargs (`dict`):
            Run configuration; only `kwargs["model_name"]` is read, to load
            the tokenizer.
        dataset_name (`str`):
            The name of the dataset to be loaded.
        input_min_text_length (`int`):
            Currently unused: random prompt lengths are disabled, so every
            prompt has a fixed length. Kept for interface compatibility.
        input_max_text_length (`int`):
            Number of leading tokens of each review kept as the prompt.

    Returns:
        The filtered and mapped dataset in torch format, with the original
        columns plus `input_ids` and `query`. Note that this is the dataset
        itself, not a DataLoader.
    """
    tokenizer = AutoTokenizer.from_pretrained(kwargs["model_name"])
    tokenizer.pad_token = tokenizer.eos_token
    # Load the train split and rename the text column to `review`.
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    # Lengths here are measured in characters, not tokens.
    ds = ds.filter(lambda x: 200 < len(x["review"]) < 512, batched=False)

    # Fixed prompt length; the default of 8 matches the previously hard-coded value.
    input_size = input_max_text_length

    def tokenize(sample):
        # Keep only the leading tokens and store their decoded text as the query.
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [355]:
#!g1.1
# Build the prompt dataset using the model name from the current config.
dataset = build_dataset(kwargs)

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]



Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


In [356]:
#!g1.1
# Drop the `label` column from the prompt dataset.
dataset = dataset.remove_columns("label")

In [357]:
#!g1.1
# Drop the full `review` text; `input_ids` and `query` remain.
dataset = dataset.remove_columns("review")

In [358]:
#!g1.1
# Iterate the prompts in their original order; the batch size is taken from
# the sentiment pipeline options.
prompt_batch_size = sent_kwargs["batch_size"]
loader = DataLoader(dataset, batch_size=prompt_batch_size, shuffle=False, num_workers=8)

### Посмотрим, как выглядят данные

In [359]:
#!g1.1
# Inspect the token ids stored for the first example.
dataset[0]["input_ids"]

tensor([   40, 26399,   314,  3001,   327, 47269, 20958,    12])

In [360]:
#!g1.1
# Decode those ids back to text to see the prompt.
tokenizer.decode(dataset[0]["input_ids"])

'I rented I AM CURIOUS-'

Мы видим, что в `input_ids` лежит обрезанное ревью. Пусть это будет нашим промптом. По этому промпту мы будем генерировать N выходов и среди них сделаем N-1 пар вида `winner-loser`, где `winner sentiment score` > `loser sentiment score`. В коде ниже N = 2, то есть на каждый промпт приходится ровно одна пара.

In [408]:
#!g1.1
# Accumulator for prompted preference pairs; the three lists grow in lockstep.
prompt_chosen_rejected_list = dict(prompt=[], chosen=[], rejected=[])

In [366]:
#!g1.1
import warnings
# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")

In [None]:
#!g1.1
# Two completions per prompt form exactly one (chosen, rejected) pair.
samples_per_prompt = 2
# Pin the sampling settings locally instead of trusting the shared `gen_kwargs`,
# which may carry a different `num_return_sequences` or a stale `max_new_tokens`
# depending on which cells ran before this one; either would break the pairing below.
pair_gen_kwargs = {key: value for key, value in gen_kwargs.items() if key != "max_new_tokens"}
pair_gen_kwargs["num_return_sequences"] = samples_per_prompt

for d in tqdm(loader):
    query = d["query"]
    generated_samples = generator(query, **pair_gen_kwargs)
    # Flatten into one list; completions of the same prompt stay adjacent.
    texts = [x['generated_text'] for batch_elem in generated_samples for x in batch_elem]

    sentiment_samples = sentiment_pipe(texts, **sent_kwargs)
    # Keep only the POSITIVE score of every text.
    positive_scores = [
        label_score['score']
        for sent in sentiment_samples
        for label_score in sent
        if label_score['label'] == 'POSITIVE'
    ]
    prompt_chosen_rejected_list["prompt"].extend(query)
    # Walk the flat list one prompt at a time: indices i and i+1 are the two
    # completions of the same prompt; the higher POSITIVE score wins.
    for i in range(0, samples_per_prompt * len(query), samples_per_prompt):
        if positive_scores[i] > positive_scores[i+1]:
            prompt_chosen_rejected_list["chosen"].append(texts[i])
            prompt_chosen_rejected_list["rejected"].append(texts[i+1])
        else:
            prompt_chosen_rejected_list["chosen"].append(texts[i+1])
            prompt_chosen_rejected_list["rejected"].append(texts[i])

  1%|          | 8/1556 [02:06<6:42:58, 15.62s/it]

In [None]:
#!g1.1
import pickle


# Persist the generated pairs so the long generation loop need not be repeated.
# NOTE(review): pickle files should only be loaded back from trusted sources.
with open("prompt_chosen_rejected_list.pkl", "wb") as file:
    pickle.dump(prompt_chosen_rejected_list, file)

In [None]:
#!g1.1
