In [1]:
pip install transformers trl datasets accelerate wandb

Collecting trl
  Downloading trl-0.12.2-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.12.2-py3-none-any.whl (365 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.12.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import torch
import torch.nn as nn

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, OPTForCausalLM, DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig, RewardTrainer, RewardConfig, DataCollatorForCompletionOnlyLM
from datasets import load_dataset, Dataset

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
dataset = load_dataset("nvidia/HelpSteer2")

train_dataset = dataset['train']
validation_dataset = dataset['validation']

README.md:   0%|          | 0.00/25.0k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

validation.jsonl.gz:   0%|          | 0.00/582k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1038 [00:00<?, ? examples/s]

В датасете представлены колонки с оценками пользователей в 5 категориях: 'helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity. Для каждого prompt имеется 2 responses модели. Для выбора chosen и rejected авторы предлагают следующую формулу (Appendix H [HelpSteer2: Open-source dataset for training top-performing reward models](https://arxiv.org/pdf/2406.08673)):

0.65 $\cdot$ Helpfulness + 0.8 $\cdot$ Correctness + 0.45 $\cdot$ Coherence + 0.55 $\cdot$ Complexity - 0.4 $\cdot$ Verbosity

In [5]:
df_train = pd.DataFrame(train_dataset)
df_val = pd.DataFrame(validation_dataset)

df_train['score_1'] = df_train.apply(lambda row: 0.65 * row['helpfulness'] + 0.8 * row['correctness'] + 0.45 * row['coherence'] + 0.55 * row['complexity'] - 0.4 * row['verbosity'], axis=1)
df_val['score_1'] = df_val.apply(lambda row: 0.65 * row['helpfulness'] + 0.8 * row['correctness'] + 0.45 * row['coherence'] + 0.55 * row['complexity'] - 0.4 * row['verbosity'], axis=1)

df_train = df_train.drop(columns=['helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity'])
df_val = df_val.drop(columns=['helpfulness', 'correctness', 'coherence', 'complexity', 'verbosity'])

In [5]:
model_name = 'facebook/opt-350m'
model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

In [7]:
def preprocessing_for_sft(examples):

    return tokenizer(examples['prompt'], examples['response'], padding="max_length", truncation=True, max_length=512)

In [8]:
def best_responses(df):
    responses_dict = {col: [] for col in df.columns}

    for i in range(0, len(df), 2):
        if i + 1 < len(df):
            row1 = df.iloc[i]
            row2 = df.iloc[i + 1]
            score1 = row1['score_1']
            score2 = row2['score_1']

            if score1 >= score2:
                chosen_row = row1
            else:
                chosen_row = row2

            for col in df.columns:
                responses_dict[col].append(chosen_row[col])

    data_best =  Dataset.from_dict(responses_dict)

    return data_best

Отбираем лучшие ответы:

In [9]:
train_data = best_responses(df_train)
val_data = best_responses(df_val)

train_data = train_data.remove_columns(['score_1'])
val_data = val_data.remove_columns(['score_1'])

In [10]:
train_data_sft = train_data.map(preprocessing_for_sft, batched=True)
val_data_sft = val_data.map(preprocessing_for_sft, batched=True)

Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

**Рассмотрим 2 варианта.**

**1)Первый вариант**: SFTTrainer по дефолту использует DataCollatorForLanguageModeling ([here](https://github.com/huggingface/trl/blob/974b0d380f12c357b70265c5f2dd2c8cb39a6a3e/trl/trainer/sft_trainer.py#L337-L338)), который не отделяет prompt от response при вычислении CrossEntropyLoss. Обучим модель с помощью него.

In [11]:
training_args = TrainingArguments(
    output_dir='./results/sft',
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch'
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data_sft,
    eval_dataset=val_data_sft,
    tokenizer=tokenizer,
    data_collator=None,
    max_seq_length=512
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
trainer.train()

Скачаем уже обученную: [here](https://drive.google.com/drive/folders/1D2CxM0_1_GXgg7LXO4JRKuS7d_5q5hFY?usp=drive_link).

In [15]:
model_sft_name = 'SFT_model'
model_sft = AutoModelForCausalLM.from_pretrained(model_sft_name)
model_sft.to(device)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

Обучим для нее Reward Model. Подготовим датасет.

In [16]:
def preprocessing_for_chosen(examples):

    return tokenizer(examples['chosen'], padding='max_length', truncation=True, max_length=512)

def preprocessing_for_rejected(examples):

    return tokenizer(examples['rejected'], padding='max_length', truncation=True, max_length=512)

def pairs_for_rm(df):
    responses_dict = {col: [] for col in ['chosen', 'rejected']}

    for i in range(0, len(df), 2):
        if i + 1 < len(df):
            row1 = df.iloc[i]
            row2 = df.iloc[i + 1]
            score1 = row1['score_1']
            score2 = row2['score_1']

            if score1 >= score2:
                responses_dict['chosen'].append(row1['response'])
                responses_dict['rejected'].append(row2['response'])
            else:
                responses_dict['chosen'].append(row2['response'])
                responses_dict['rejected'].append(row1['response'])

    data_pairs =  Dataset.from_dict(responses_dict)

    return data_pairs

In [17]:
train_data_rm = pairs_for_rm(df_train)
val_data_rm = pairs_for_rm(df_val)

In [18]:
train_data_rm = train_data_rm.map(preprocessing_for_chosen, batched=True)

train_data_rm = train_data_rm.remove_columns(['chosen'])
train_data_rm = train_data_rm.rename_columns({'input_ids': 'input_ids_chosen'})
train_data_rm = train_data_rm.rename_columns({'attention_mask': 'attention_mask_chosen'})

train_data_rm = train_data_rm.map(preprocessing_for_rejected, batched=True)

train_data_rm = train_data_rm.remove_columns(['rejected'])
train_data_rm = train_data_rm.rename_columns({'input_ids': 'input_ids_rejected'})
train_data_rm = train_data_rm.rename_columns({'attention_mask': 'attention_mask_rejected'})

Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

In [19]:
val_data_rm = val_data_rm.map(preprocessing_for_chosen, batched=True)

val_data_rm = val_data_rm.remove_columns(['chosen'])
val_data_rm = val_data_rm.rename_columns({'input_ids': 'input_ids_chosen'})
val_data_rm = val_data_rm.rename_columns({'attention_mask': 'attention_mask_chosen'})

val_data_rm = val_data_rm.map(preprocessing_for_rejected, batched=True)

val_data_rm = val_data_rm.remove_columns(['rejected'])
val_data_rm = val_data_rm.rename_columns({'input_ids': 'input_ids_rejected'})
val_data_rm = val_data_rm.rename_columns({'attention_mask': 'attention_mask_rejected'})

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Reward Model из библиотеки trl в качестве reward при обучении использует среднее значение logits, полученных для всех токенов [here](https://github.com/huggingface/trl/blob/974b0d380f12c357b70265c5f2dd2c8cb39a6a3e/trl/trainer/reward_trainer.py#L243-L245). При оценке модели она усредняет logits для каждого токена в отдельности [here](https://github.com/huggingface/trl/blob/974b0d380f12c357b70265c5f2dd2c8cb39a6a3e/trl/trainer/reward_trainer.py#L279), что вызывает ошибку с размерностью:

In [20]:
training_args = RewardConfig(
    output_dir='./results/reward',
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
)

trainer = RewardTrainer(
    model=model_sft,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args,
    eval_dataset=val_data_rm,
)

In [21]:
trainer.evaluate()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  logits = torch.stack(logits).mean(dim=2).softmax(dim=0).T


TypeError: type list doesn't define __round__ method

**1.1** Рассмотрим Reward Model, которая будет усреднять logits только для последнего слоя. Обучим ее в течении 2-х эпох:

In [31]:
import reward_model_last_logits
from reward_model_last_logits import Reward_Model_ll

In [32]:
training_args_ll = RewardConfig(
    output_dir='./results/reward_ll',
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=2,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
)

trainer_ll = Reward_Model_ll(
    model=model_sft,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_ll,
    eval_dataset=val_data_rm,
)

In [None]:
trainer_ll.train()

Скачаем уже обученную модель: [here](https://drive.google.com/drive/folders/1EGf7hu0FKySXjJVbRUp2vTxxrK5_Ud89?usp=drive_link).

In [34]:
reward_model_ll_name = 'Reward_Model_LL'
reward_model_ll = AutoModelForCausalLM.from_pretrained(reward_model_ll_name)
reward_model_ll.to(device)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [35]:
trainer_ll = Reward_Model_ll(
    model=reward_model_ll,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_ll,
    eval_dataset=val_data_rm,
)

In [36]:
trainer_ll.evaluate()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


{'eval_loss': 0.7035278677940369,
 'eval_accuracy': 0.48747591522157996,
 'eval_runtime': 97.1148,
 'eval_samples_per_second': 5.344,
 'eval_steps_per_second': 0.34}

Accuracy модели на отложенной выборке оказывается случайным угадыванием. Рассмотрим другой вариант.



**1.2** Заменим последний линейный слой модели lm_head на nn.Linear(512, 1), который будет генерировать 1 значение - reward для данного response. Обучим модель в течение 2-х эпох:

In [44]:
import reward_model_one_reward
from reward_model_one_reward import Reward_Model_OR

In [12]:
class ModelWithReward(OPTForCausalLM):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reward = nn.Linear(512, 1)

    def score(self, x):

        return self.reward(x)

In [39]:
del (model, model_sft, reward_model_ll)
torch.cuda.empty_cache()

In [40]:
model_sft_name = 'SFT_model'
model_one_reward = ModelWithReward.from_pretrained(model_sft_name)
model_one_reward.to(device)

Some weights of ModelWithReward were not initialized from the model checkpoint at SFT_model and are newly initialized: ['reward.bias', 'reward.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModelWithReward(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_feature

In [41]:
del model_one_reward.lm_head

In [42]:
model_one_reward

ModelWithReward(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_feature

In [45]:
training_args_or = RewardConfig(
    output_dir='./results/reward_one',
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
)

trainer_or = Reward_Model_OR(
    model=model_one_reward,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_or,
    eval_dataset=val_data_rm,
)

In [None]:
trainer_or.train()

Скачаем уже обученную модель на двух чекпоинтах:



Чекпоинт после 1 эпохи: [here](https://drive.google.com/drive/folders/1F2Dji35Eubnse4UYWUF89XBEqSkPFSLh?usp=sharing).



Чекпоинт после 2 эпохи: [here]().

In [48]:
model_or_1_name = 'Reward_Model_OR_1'
model_or_1 = ModelWithReward.from_pretrained(model_or_1_name)
model_or_1.to(device)
del model_or_1.lm_head

In [49]:
training_args_or = RewardConfig(
    output_dir='./results/reward_ll',
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
)

trainer_or_1 = Reward_Model_OR(
    model=model_or_1,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_or,
    eval_dataset=val_data_rm,
)

In [50]:
trainer_or_1.evaluate()

{'eval_loss': 0.6538317799568176,
 'eval_accuracy': 0.5992292870905588,
 'eval_runtime': 100.4462,
 'eval_samples_per_second': 5.167,
 'eval_steps_per_second': 0.329}

In [51]:
model_or_2_name = 'Reward_Model_OR_2'
model_or_2 = ModelWithReward.from_pretrained(model_or_2_name)
model_or_2.to(device)

del model_or_2.lm_head

In [52]:
trainer_or_2 = Reward_Model_OR(
    model=model_or_2,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_or,
    eval_dataset=val_data_rm,
)

In [53]:
trainer_or_2.evaluate()

{'eval_loss': 0.9171338677406311,
 'eval_accuracy': 0.5645472061657033,
 'eval_runtime': 101.1294,
 'eval_samples_per_second': 5.132,
 'eval_steps_per_second': 0.326}

Модель начинает переобучаться. Возникло предположение, что, возможно, невысокое значение accuracy связано со слишком сложной формулой для вычисления score, разделяющего chosen и rejected.

**2) Второй вариант:** для более тонкого обучения используем DataCollatorForCompletionOnlyLM. Он вычисляет CrossEntropyLoss только на основе response. Разделять chosen и rejected будем просто на основе 'helpfulness'.

In [54]:
del (model_one_reward, model_or_1, model_or_2)
torch.cuda.empty_cache()

In [55]:
df_train = pd.DataFrame(train_dataset)
df_val = pd.DataFrame(validation_dataset)

df_train = df_train.drop(columns=['correctness', 'coherence', 'complexity', 'verbosity'])
df_val = df_val.drop(columns=['correctness', 'coherence', 'complexity', 'verbosity'])

In [56]:
def best_helpfulness(df):
    responses_dict = {col: [] for col in df.columns}

    for i in range(0, len(df), 2):
        if i + 1 < len(df):
            row1 = df.iloc[i]
            row2 = df.iloc[i + 1]
            score1 = row1['helpfulness']
            score2 = row2['helpfulness']

            if score1 >= score2:
                chosen_row = row1
            else:
                chosen_row = row2

            for col in df.columns:
                responses_dict[col].append(chosen_row[col])

    data_best =  Dataset.from_dict(responses_dict)

    return data_best

In [57]:
train_data = best_helpfulness(df_train)
val_data = best_helpfulness(df_val)

train_data = train_data.remove_columns(['helpfulness'])
val_data = val_data.remove_columns(['helpfulness'])

In [58]:
model_name = "facebook/opt-350m"

model_sft_help = AutoModelForCausalLM.from_pretrained(model_name)
model_sft_help.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [59]:
def formatting_prompts_func(example):
    output_texts = []

    for i in range(len(example['prompt'])):
        text = f"### Question: {example['prompt'][i]}\n ### Answer: {example['response'][i]}"
        output_texts.append(text)

    return output_texts

response_template = ' ### Answer:'
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir='./sft_help',
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

trainer = SFTTrainer(
    model=model_sft_help,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=512
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [None]:
trainer.train()

Скачаем уже обученную: [here](https://drive.google.com/drive/folders/1HcrMbd0U_ZEBPWZgUnuxOhxuBYyzNM8X?usp=sharing).

In [69]:
model_name_help = 'SFT_help'
model_sft_help = ModelWithReward.from_pretrained(model_name_help)
model_sft_help.to(device)
del model_sft_help.lm_head

Some weights of ModelWithReward were not initialized from the model checkpoint at SFT_help and are newly initialized: ['reward.bias', 'reward.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Обучим для нее в течение 1 эпохи Reward Model, которая выдает 1 reward на последнем слое.

In [73]:
def preprocessing_for_chosen(examples):

    return tokenizer(examples['chosen'], padding='max_length', truncation=True, max_length=512)

def preprocessing_for_rejected(examples):

    return tokenizer(examples['rejected'], padding='max_length', truncation=True, max_length=512)

def pairs_for_rm_help(df):
    responses_dict = {col: [] for col in ['chosen', 'rejected']}

    for i in range(0, len(df), 2):
        if i + 1 < len(df):
            row1 = df.iloc[i]
            row2 = df.iloc[i + 1]
            score1 = row1['helpfulness']
            score2 = row2['helpfulness']

            if score1 >= score2:
                responses_dict['chosen'].append(row1['response'])
                responses_dict['rejected'].append(row2['response'])

            else:
                responses_dict['chosen'].append(row2['response'])
                responses_dict['rejected'].append(row1['response'])

    data_pairs =  Dataset.from_dict(responses_dict)

    return data_pairs

In [74]:
df_train = pd.DataFrame(train_dataset)
df_val = pd.DataFrame(validation_dataset)

train_data_rm = pairs_for_rm_help(df_train)
val_data_rm = pairs_for_rm_help(df_val)

In [75]:
train_data_rm = train_data_rm.map(preprocessing_for_chosen, batched=True)

train_data_rm = train_data_rm.remove_columns(['chosen'])
train_data_rm = train_data_rm.rename_columns({'input_ids': 'input_ids_chosen'})
train_data_rm = train_data_rm.rename_columns({'attention_mask': 'attention_mask_chosen'})

train_data_rm = train_data_rm.map(preprocessing_for_rejected, batched=True)

train_data_rm = train_data_rm.remove_columns(['rejected'])
train_data_rm = train_data_rm.rename_columns({'input_ids': 'input_ids_rejected'})
train_data_rm = train_data_rm.rename_columns({'attention_mask': 'attention_mask_rejected'})

Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

In [76]:
val_data_rm = val_data_rm.map(preprocessing_for_chosen, batched=True)

val_data_rm = val_data_rm.remove_columns(['chosen'])
val_data_rm = val_data_rm.rename_columns({'input_ids': 'input_ids_chosen'})
val_data_rm = val_data_rm.rename_columns({'attention_mask': 'attention_mask_chosen'})

val_data_rm = val_data_rm.map(preprocessing_for_rejected, batched=True)

val_data_rm = val_data_rm.remove_columns(['rejected'])
val_data_rm = val_data_rm.rename_columns({'input_ids': 'input_ids_rejected'})
val_data_rm = val_data_rm.rename_columns({'attention_mask': 'attention_mask_rejected'})

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [77]:
training_args_or = RewardConfig(
    output_dir='./results/reward_help',
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    fp16=True,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
)

trainer_or = Reward_Model_OR(
    model=model_sft_help,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_or,
    eval_dataset=val_data_rm,
)

In [None]:
trainer_or.train()

Скачаем уже обученную модель: [here](https://drive.google.com/drive/folders/1IRUcY9swWPu6MCmvrxd8uTKZQyXTL5Bi?usp=sharing).

In [14]:
model_or_name = 'SFT_help_RM'
model_or = ModelWithReward.from_pretrained(model_or_name)
model_or.to(device)

del model_or.lm_head

In [79]:
trainer_or = Reward_Model_OR(
    model=model_or,
    train_dataset=train_data_rm,
    processing_class=tokenizer,
    args=training_args_or,
    eval_dataset=val_data_rm,
)

In [80]:
trainer_or.evaluate()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.6543089747428894,
 'eval_accuracy': 0.6165703275529865,
 'eval_runtime': 100.8425,
 'eval_samples_per_second': 5.147,
 'eval_steps_per_second': 0.327}

Accuracy немного возросла! Посмотрим на средний reward на генерациях SFT модели.

In [82]:
from transformers import GenerationConfig
from trl.trainer.utils import batch_generation
import numpy as np

del (trainer_or, trainer, model_sft_help)
torch.cuda.empty_cache()

In [13]:
model_name_help = 'SFT_help'
model_sft_help = AutoModelForCausalLM.from_pretrained(model_name_help)
model_sft_help.to(device)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [87]:
def get_score(model, inputs):
    logits = model.model(input_ids=inputs)['last_hidden_state']
    rewards = model.reward(logits[:, -1, :])

    return rewards

score_dict = {'score': [] }

In [15]:
def preprocessing_for_val_sft(examples):

    return tokenizer(examples['prompt'], padding='max_length', truncation=True, max_length=512)

In [16]:
def pairs_for_val_sft(df):
    responses_dict = {'prompt': []}

    for i in range(0, len(df), 2):
        if i + 1 < len(df):
            row = df.iloc[i + 1]
            responses_dict['prompt'].append(row['prompt'])

    data =  Dataset.from_dict(responses_dict)

    return data

In [89]:
df_val = pd.DataFrame(validation_dataset)

val_data = pairs_for_val_sft(df_val)
val_data = val_data.map(preprocessing_for_val_sft, batched=True)

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [97]:
args = GenerationConfig(max_new_tokens=100)

for i in range(len(val_data['input_ids'])):
    if i % 100 == 0:
        print(f'Сгенерированно {i} ответов')

    with torch.no_grad():
        input_tensor = torch.LongTensor([val_data['input_ids'][i]]).to(device)
        new_response = batch_generation(
                                         model_sft_help,
                                         input_tensor,
                                         local_rollout_forward_batch_size=1,
                                         pad_token_id=tokenizer.pad_token_id,
                                         generation_config=args
        )[0][:, 512:]

        torch.cuda.empty_cache()

        score = get_score(model_or, new_response).cpu()
        score_dict['score'].append(score)

        del input_tensor, new_response, score
        torch.cuda.empty_cache()

Сгенерированно 0 ответов
Сгенерированно 100 ответов
Сгенерированно 200 ответов
Сгенерированно 300 ответов
Сгенерированно 400 ответов
Сгенерированно 500 ответов


Оценим средний reward генераций SFT модели до RLHF.

In [98]:
np.mean(score_dict['score'])

-0.9487341

**Теперь обучим алгоритм Reinforce.**

In [17]:
import reinforce
from reinforce import Reinforce_Model

In [18]:
df_train = pd.DataFrame(train_dataset)
df_val = pd.DataFrame(validation_dataset)

In [19]:
train_data_rein = pairs_for_val_sft(df_train)
val_data_rein = pairs_for_val_sft(df_val)

train_data_rein = train_data_rein.map(preprocessing_for_val_sft, batched=True)
val_data_rein = val_data_rein.map(preprocessing_for_val_sft, batched=True)

train_data_rein = train_data_rein.remove_columns(['prompt'])
val_data_rein = val_data_rein.remove_columns(['prompt'])

Map:   0%|          | 0/10162 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [20]:
import copy

model_sft_help.to('cpu')
model_copy = copy.deepcopy(model_sft_help)
model_sft_help.to(device)
model_copy.to(device)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features

In [21]:
from trl.trainer import RLOOConfig

training_args = RLOOConfig(
    output_dir='./results/reinforce',
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    num_train_epochs=2,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
)

trainer = Reinforce_Model(
    policy=model_sft_help,
    ref_policy=model_copy,
    reward_model=model_or,
    train_dataset=train_data_rein,
    processing_class=tokenizer,
    config=training_args,
    eval_dataset=val_data_rein,
)

world_size 1
local_batch_size 4
args.gradient_accumulation_steps 1
num_total_batches 5081
local_dataloader_batch_size 4


In [22]:
trainer.train()

===training policy===


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


KeyboardInterrupt: 

Обученная модель не сохранилась. Впрочем, она и не обучилась с такой Reward Model:(

In [23]:
trainer.generate_completions()