In [None]:
import json
import copy
import logging
from dataclasses import dataclass, field

import torch
from torch.utils.data import Dataset
import transformers
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModel
from datasets import load_dataset

In [None]:
from transformers import AutoModel

In [None]:
# --- Model preparation ---

# Checkpoint identifier shared by the backbone and its tokenizer.
base_model_path = 'skt/kogpt2-base-v2'

# Backbone loaded through AutoModel; a value head is attached further down by RewardModel.
base_model = AutoModel.from_pretrained(base_model_path)

# Tokenizer configured for right-side padding with model_max_length set to 512.
tokenizer = AutoTokenizer.from_pretrained(base_model_path, padding_side="right", model_max_length=512)

In [None]:
# data config
# Presumably the label id to exclude from a loss (the usual -100 convention);
# it is not referenced anywhere in the visible cells.
IGNORE_INDEX = -100
# Special-token strings that the next cell registers on the tokenizer.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "<UNK>"

In [None]:
# Register the special tokens on the tokenizer. Any string that is not already in the
# vocabulary is appended to it and receives a brand-new token id.
tokenizer.add_special_tokens(
    {
        "pad_token": DEFAULT_PAD_TOKEN,
        "bos_token": DEFAULT_BOS_TOKEN,
        "eos_token": DEFAULT_EOS_TOKEN,
        "unk_token": DEFAULT_UNK_TOKEN,
    }
)
# Pad with the EOS token rather than the "[PAD]" entry registered above.
tokenizer.pad_token = tokenizer.eos_token

# Freshly added tokens have ids past the end of the pretrained embedding matrix.
# Grow the matrix so that looking one of them up cannot index out of range.
# The guard avoids shrinking a matrix that is already large enough.
if len(tokenizer) > base_model.get_input_embeddings().num_embeddings:
    base_model.resize_token_embeddings(len(tokenizer))

In [None]:
# Output directory handed to TrainingArguments and to the final model save.
save_dir = './output_2_RM'
# Verbosity switch; not read by any of the visible cells.
verbose = False

In [None]:
# Preference-pair dataset; each row is read below through its
# 'system', 'question', 'chosen' and 'rejected' fields.
# Alternative source kept for reference:
# data_path = 'Ja-ck/Orca-DPO-Pairs-KO'
data_path = 'AIdenU/orca_dpo_data_ko'
dataset = load_dataset(data_path)

In [None]:
# Show the train split as the cell output for a quick look at what was loaded.
dataset['train']

In [None]:
# Handle on the train split; the conversion cell below iterates over it row by row.
list_data_dict = dataset['train']

In [None]:
# Reduce every raw row to the four fields used downstream, renaming 'question' to
# 'prompt'. A comprehension keeps loop temporaries out of the notebook namespace.
total_data_ranking2chosen = [
    {
        'system': row['system'],
        'prompt': row['question'],
        'chosen': row['chosen'],
        'rejected': row['rejected'],
    }
    for row in list_data_dict
]

In [None]:
# Sanity check: the conversion above is one-to-one, so both counts should match.
print('before data num: %d'%(len(list_data_dict)))
print('after  data num: %d'%(len(total_data_ranking2chosen)))
# Print one converted record; index 45 is a fixed sample and needs at least 46 rows.
print('data example: \n%s'%total_data_ranking2chosen[45])

In [None]:
# Display the first converted record as the cell output.
total_data_ranking2chosen[0]

In [None]:
# Prompt templates filled via str.format_map. Each one is spelled out line by line
# so that the exact newlines are visible.
PROMPT_DICT = {
    # Template for records with a system message: expects 'system', 'user_input'
    # and 'model_answer'.
    "prompt_input": (
        "<start_of_turn>user\n"
        "{system}\n"
        "\n"
        "### Input:\n"
        "{user_input}\n"
        "\n"
        "<start_of_turn>model\n"
        "{model_answer}\n"
    ),
    # Template for records without a system message: expects 'user_input' and
    # 'model_answer'.
    "prompt_no_input": (
        "<start_of_turn>user\n"
        "### Input:\n"
        "{user_input}\n"
        "\n"
        "<start_of_turn>model\n"
        "{model_answer}\n"
    ),
}

In [None]:
# Fetch both templates from PROMPT_DICT under short names.
prompt_input = PROMPT_DICT["prompt_input"]
prompt_no_input = PROMPT_DICT["prompt_no_input"]

In [None]:
# Keep the first converted record around as a reference example.
example = total_data_ranking2chosen[0]

In [None]:
# Display the reference example as the cell output.
example

In [None]:
# Maps each role RewardDataset looks up (keyword) to the matching field name in the
# converted records (value).
data_col_dict = dict(
    system='system',
    user_input='prompt',
    chosen='chosen',
    rejected='rejected',
)

In [None]:
from typing import Callable

from torch.utils.data import Dataset
from tqdm import tqdm

class RewardDataset(Dataset):
    """
    Pairwise preference dataset for reward-model training.

    Every input record is rendered twice through the templates in ``PROMPT_DICT``:
    once with the chosen answer and once with the rejected answer. Each rendering
    gets ``tokenizer.eos_token`` appended and is tokenised up front in ``__init__``.

    Args:
        dataset: iterable of dict-like records.
        data_col_dict: maps the roles ``'system'``, ``'user_input'``, ``'chosen'``
            and ``'rejected'`` to the corresponding keys of each record.
        tokenizer: callable tokenizer that exposes ``eos_token`` and returns
            ``input_ids`` / ``attention_mask`` when called with ``return_tensors="pt"``.
        max_length: truncation length passed to the tokenizer.

    Indexing returns the tuple ``(chosen_input_ids, rejected_input_ids)``. The
    attention masks are stored on ``self.chosen`` / ``self.reject`` but are not
    returned.
    """

    def __init__(self, dataset, data_col_dict, tokenizer: Callable, max_length: int) -> None:
        super().__init__()
        self.chosen = []
        self.reject = []

        for data in tqdm(dataset):
            # Same rendering and encoding for both sides of the pair, chosen first.
            for answer_role, bucket in (('chosen', self.chosen), ('rejected', self.reject)):
                text = self._render(data, data_col_dict, answer_role) + tokenizer.eos_token
                bucket.append(self._encode(text, tokenizer, max_length))

    @staticmethod
    def _render(data, data_col_dict, answer_role):
        """Fill the matching prompt template with one record and one of its answers."""
        fields = {
            'user_input': data[data_col_dict['user_input']],
            'model_answer': data[data_col_dict[answer_role]],
        }
        # A missing, empty or None system message selects the template without a
        # system slot, so None is never rendered as the literal text "None".
        system = data.get(data_col_dict['system'])
        if system:
            fields['system'] = system
            return PROMPT_DICT["prompt_input"].format_map(fields)
        return PROMPT_DICT["prompt_no_input"].format_map(fields)

    @staticmethod
    def _encode(text, tokenizer, max_length):
        """Tokenise one string and return its first (only) row of ids and mask."""
        # No padding argument: a single sequence is already the longest in its batch,
        # so padding to "longest" would not add anything.
        encoded = tokenizer(text,
                            max_length=max_length,
                            truncation=True,
                            return_tensors="pt")
        return {
            "input_ids": encoded['input_ids'][0],
            "attention_mask": encoded['attention_mask'][0],
        }

    def __len__(self):
        return len(self.chosen)

    def __getitem__(self, idx):
        return self.chosen[idx]["input_ids"], self.reject[idx]["input_ids"]

In [None]:
# Truncation length handed to RewardDataset when tokenising each rendered pair.
max_len = 512

In [None]:
# prepare for data and dataset
import random
random.seed(230319)
# Shuffle a copy instead of the source list. Re-running this cell then always gives
# the same split; an in-place shuffle would reshuffle the already-shuffled list.
shuffled_pairs = list(total_data_ranking2chosen)
random.shuffle(shuffled_pairs)
print(shuffled_pairs[45])

# Small smoke-test split. For a full run, hold out the last 1000 pairs instead:
# train_data = shuffled_pairs[:-1000]
# eval_data = shuffled_pairs[-1000:]

train_data = shuffled_pairs[:100]     # first 100 shuffled pairs for training
eval_data = shuffled_pairs[100:130]   # next 30 shuffled pairs for evaluation


train_dataset = RewardDataset(train_data, data_col_dict, tokenizer, max_len)
eval_dataset = RewardDataset(eval_data, data_col_dict, tokenizer, max_len)

# check: print the raw text of one training pair
idx = 10
print('#'*70)
print('## prompt ##')
print(train_data[idx]['prompt'])
print('#'*70)
print('## chosen ##')
print(train_data[idx]['chosen'])
print('#'*70)
print('## rejected ##')
print(train_data[idx]['rejected'])

In [None]:
# Decode the chosen-side token ids of the first training pair to eyeball the rendered prompt.

print(tokenizer.decode(train_dataset[0][0]))

In [None]:
from typing import Optional

import torch
import torch.nn as nn

class RewardModel(nn.Module):
    """
    Scalar reward model: a backbone followed by a linear value head.

    Args:
        base_model (nn.Module): backbone whose output exposes ``'last_hidden_state'``.
        value_head (nn.Module): optional head mapping hidden states to one score per
            token. It must have ``out_features == 1``. When omitted, an ``nn.Linear``
            sized from the backbone config (``n_embd``, falling back to
            ``hidden_size``) is created.

    Raises:
        ValueError: if a supplied ``value_head`` does not have ``out_features == 1``.
    """

    def __init__(self,
                 base_model: nn.Module,
                 value_head: Optional[nn.Module] = None,
                 ) -> None:
        super().__init__()
        self.base_model = base_model

        if value_head is not None:
            if value_head.out_features != 1:
                raise ValueError("The value head of reward model's output dim should be 1!")
            self.value_head = value_head
        else:
            config = base_model.config
            # GPT-2 style configs expose the width as n_embd; other configs are
            # assumed to expose hidden_size.
            hidden_size = getattr(config, "n_embd", None)
            if hidden_size is None:
                hidden_size = config.hidden_size
            self.value_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Return one reward per sequence, shape ``(B,)``.

        The reward is the mean of the per-token values with the final token left out.
        Without ``attention_mask`` the final position of the batch is dropped and all
        remaining positions are averaged. With ``attention_mask`` only unmasked
        tokens count, and each sequence's own last unmasked token is dropped. Padding
        therefore does not leak into the score, and a padded sequence is averaged
        over the same positions as when it is scored unpadded.
        """
        outputs = self.base_model(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs['last_hidden_state']
        # Assumes hidden states are (B, T, H), so the per-token values become (B, T).
        token_values = self.value_head(last_hidden_states).squeeze(-1)[:, :-1]

        if attention_mask is None:
            return token_values.mean(dim=1)

        mask = attention_mask.to(token_values.dtype)
        # Keep position t only if t and t+1 are both unmasked. For a contiguous mask
        # that is every real token except the last one.
        keep = mask[:, :-1] * mask[:, 1:]
        total = (token_values * keep).sum(dim=1)
        # Clamp so a sequence with fewer than two real tokens yields 0, not NaN.
        count = keep.sum(dim=1).clamp(min=1)
        return total / count

In [None]:
# Wrap the loaded backbone; with no value_head argument RewardModel creates its own linear head.
model = RewardModel(base_model=base_model)

In [None]:
from typing import Optional, Dict, Sequence

@dataclass
class DataCollatorForRewardDataset(object):
    """
    Collate (chosen, rejected) token-id pairs into padded reward-model batches.

    Each instance is a tuple ``(chosen_input_ids, rejected_input_ids)`` of 1-D
    tensors. Both sides are right-padded with ``tokenizer.pad_token_id``.

    Attention masks are built from the true sequence lengths, not from
    ``ids != pad_token_id``. In this notebook the pad token is the EOS token, so an
    id comparison would also mask out every genuine EOS token.
    """

    tokenizer: "transformers.PreTrainedTokenizer"

    def _pad_with_mask(self, sequences):
        """Right-pad a group of 1-D id tensors and return ``(padded_ids, bool_mask)``."""
        sequences = list(sequences)
        padded = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        lengths = torch.tensor([seq.size(0) for seq in sequences], device=padded.device)
        positions = torch.arange(padded.size(1), device=padded.device)
        # True for the first len(seq) positions of each row, False for padding.
        mask = positions.unsqueeze(0) < lengths.unsqueeze(1)
        return padded, mask

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids_chosen, input_ids_reject = zip(*instances)

        padded_chosen, mask_chosen = self._pad_with_mask(input_ids_chosen)
        padded_reject, mask_reject = self._pad_with_mask(input_ids_reject)

        # "_j" is the chosen side and "_k" the rejected side of each pair.
        return {
            "input_ids_j": padded_chosen,
            "attention_mask_j": mask_chosen,
            "input_ids_k": padded_reject,
            "attention_mask_k": mask_reject,
        }

In [None]:
# Collator instance used by the trainer to pad each batch of (chosen, rejected) pairs.
data_collator = DataCollatorForRewardDataset(tokenizer=tokenizer)

In [None]:
class RewardTrainer(Trainer):
    """Trainer that optimises a pairwise ranking loss over (chosen, rejected) batches."""

    @staticmethod
    def _scores(model_output):
        """Return the per-sample reward tensor from a model call."""
        # A model that returns a plain tensor already yields one reward per sample.
        # Indexing it with [0] would keep only the first sample of the batch.
        if torch.is_tensor(model_output):
            return model_output
        # Tuple / ModelOutput style returns carry the scores in their first slot.
        return model_output[0]

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute ``-log(sigmoid(r_chosen - r_rejected))`` averaged over the batch.

        Args:
            model: callable taking ``input_ids`` and ``attention_mask``.
            inputs: batch with ``input_ids_j`` / ``attention_mask_j`` (chosen) and
                ``input_ids_k`` / ``attention_mask_k`` (rejected).
            return_outputs: when true, also return both reward tensors.
            num_items_in_batch: accepted because newer ``Trainer`` versions pass it
                to ``compute_loss``; unused here.

        Returns:
            The loss, or ``(loss, {"rewards_j": ..., "rewards_k": ...})``.
        """
        rewards_j = self._scores(
            model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])
        )
        rewards_k = self._scores(
            model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])
        )
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss

In [None]:
def safe_save_model_for_hf_trainer(trainer, output_dir):
    """Copy the trained model's state dict to CPU and write it to ``output_dir``.

    Defined here because the visible notebook calls this helper but never defines
    or imports it. Nothing is written unless ``trainer.args.should_save`` is true.
    """
    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa


training_args = TrainingArguments(
    output_dir=save_dir, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=4, # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    # NOTE(review): eval_steps only takes effect when a step-based evaluation strategy
    # is configured; none is set here. Confirm whether periodic evaluation is intended.
    eval_steps = 3, # Number of update steps between two evaluations.
    save_steps=500, # after # steps model is saved
    warmup_steps=5,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )
trainer = RewardTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
trainer.save_state()
safe_save_model_for_hf_trainer(trainer=trainer, output_dir=save_dir)