https://github.com/OctopusMind/RLHF_PPO/tree/main

In [1]:
import random, os, tqdm, time, json
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties
import matplotlib.pyplot as plt

import sys
sys.path.append("../../../../")

# Fix the RNG seeds so that runs of this notebook are repeatable.
random.seed(618)
np.random.seed(907)

# Mirror the tail of the current working directory under the Downloads
# folder. The number of trailing path components kept is derived from the
# number of "/"-separated segments in sys.path[-1].
# NOTE(review): this assumes sys.path[-1] is still the relative entry
# appended just above — confirm nothing else appends to sys.path first.
new_base_path = os.path.join(
    "/Users/minkexiu/Downloads/",
    "/".join(
        os.getcwd().split("/")[-1*(len(sys.path[-1].split("/")) - 1):]
    ),
)
print("storage dir:", new_base_path)
print("code dir:", os.getcwd())

# Create the storage tree. exist_ok=True makes this idempotent and avoids the
# check-then-create race of "if not exists: makedirs".
os.makedirs(new_base_path, exist_ok=True)
for _subdir in ("preprocessedData", "originalData", "trained_models"):
    os.makedirs(os.path.join(new_base_path, _subdir), exist_ok=True)

def create_originalData_path(filename_or_path):
    """Return ``filename_or_path`` placed under ``<new_base_path>/originalData``."""
    original_root = os.path.join(new_base_path, "originalData")
    return os.path.join(original_root, filename_or_path)
def create_preprocessedData_path(filename_or_path):
    """Return ``filename_or_path`` placed under ``<new_base_path>/preprocessedData``."""
    preprocessed_root = os.path.join(new_base_path, "preprocessedData")
    return os.path.join(preprocessed_root, filename_or_path)
def create_trained_models_path(filename_or_path):
    """Return ``filename_or_path`` placed under ``<new_base_path>/trained_models``."""
    models_root = os.path.join(new_base_path, "trained_models")
    return os.path.join(models_root, filename_or_path)

def millisec2datetime(timestamp):
    """Format a millisecond epoch timestamp as local "YYYY-MM-DD HH:MM:SS"."""
    seconds = timestamp/1000
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
    
def run_finish():
    """Show a picture overlaid with the current time to signal the run is done.

    Reads a font file and a background image from hard-coded absolute paths,
    draws the finish timestamp in red at the centre of the image and calls
    ``plt.show()``.
    """
    # Font used for the overlay text (loaded from a fixed location on disk).
    overlay_font = FontProperties(
        fname="/Users/minkexiu/Documents/GitHub/ML_Tryout/SimHei.ttf", size=24
    )
    _figure, axes = plt.subplots()

    # Background picture; drawn first so the axis limits match the image.
    background = plt.imread("/Users/minkexiu/Downloads/wallhaven-dgxpyg.jpg")
    axes.imshow(background)

    # Place the message halfway along both axes.
    centre_x = axes.get_xlim()[1] * 0.5
    centre_y = axes.get_ylim()[0] * 0.5
    axes.text(
        centre_x,
        centre_y,
        f"程序于这个点跑完：\n{millisec2datetime(time.time()*1000)}",
        fontproperties=overlay_font,
        ha="center",
        va="center",
        color="red",
    )

    # Hide the ticks and colour the axes patch.
    axes.set_xticks([])
    axes.set_yticks([])
    axes.patch.set_color("blue")

    plt.show()
        
tqdm.tqdm.pandas() ## Register tqdm with pandas so that progress_apply can be used in place of apply.

import IPython
def kill_current_kernel():
    """Shut down the current kernel to release its memory.

    Calls ``do_shutdown(True)`` on the running IPython application's kernel.
    NOTE(review): the ``True`` argument is presumably the restart flag —
    confirm against the ipykernel documentation.
    """
    running_app = IPython.Application.instance()
    running_app.kernel.do_shutdown(True)
    
def simply_show_data(df1):
    """Print the shape of ``df1`` and display the result of ``df1.head()``."""
    shape = df1.shape
    print(shape)
    preview = df1.head()
    display(preview)
    
def wait_flag(saved_flag_path, time_interval_sec=10):
    """Block until ``saved_flag_path`` exists, polling at a fixed interval.

    :param saved_flag_path: path whose existence signals completion
    :param time_interval_sec: seconds to sleep between existence checks
    """
    print("waiting for", saved_flag_path)
    elapsed = 0
    # Poll until the flag shows up, echoing the accumulated wait each round.
    while not os.path.exists(saved_flag_path):
        time.sleep(time_interval_sec)
        elapsed = elapsed + time_interval_sec
        print(elapsed, end=" ")
    print("finish!!")

storage dir: /Users/minkexiu/Downloads/GitHub/ML_runCodeFromBook/大规模语言模型：从理论到实践/RLHF_PPO
code dir: /Users/minkexiu/Documents/GitHub/ML_runCodeFromBook/大规模语言模型：从理论到实践/RLHF_PPO


In [2]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam

In [3]:
from typing import List
from dataclasses import dataclass, field

# config

In [4]:
class Config:
    """Static configuration for the PPO run: model paths, device and hyper-parameters."""
    # Model parameters ###########################
    # Sentiment-analysis model used for the reward; download from
    # https://huggingface.co/IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment
    Sentiment_model = create_trained_models_path("Erlangshen-Roberta-330M-Sentiment")
    # Text-generation model; download from https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat
    gpt_model = create_trained_models_path("Qwen1.5-0.5B-Chat")

    # JSON training data (relative path) and output locations for the LoRA
    # adapter and the value-head weights.
    data_path = "data/train_data.json"
    save_lora_path = create_trained_models_path("ppo/save_lora") 
    save_v_head_path = create_trained_models_path("ppo/v_head/pytorch_model.bin") 
    # Use the first CUDA device when available, otherwise the CPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    batch_size = 2  # DataLoader batch size
    epochs = 10  # number of passes over the data loader
    lr = 0.001  # learning rate passed to Adam
    # PPO parameters ############################
    ppo_epochs = 3  # optimisation steps performed on each generated batch
    kl_ctl_value = 0.2  # coefficient of the per-token KL penalty in the reward
    gamma = 1.0  # Discount factor used in the advantage computation; controls how much future rewards matter.
    lam = 0.95  # Lambda of the advantage computation; weights future rewards in combination with the temporal-difference method.
    cliprange_value = 0.2  # Clipping range of the value function in the loss; clipping limits the impact of extreme values on training.
    cliprange = 0.2  # Clipping range of the PPO policy-gradient loss; bounds the policy update step to keep training stable.
    vf_coef = 0.1  # weight of the value loss in the total loss

@dataclass
class LoraArguments:
    """Hyper-parameters and switches used to build the LoRA adapter."""

    lora_r: int = 2
    lora_alpha: int = 8
    lora_dropout: float = 0.0
    # A dataclass rejects a bare list as a default (mutable defaults raise
    # ValueError), so default_factory is required to give every instance its
    # own fresh list.
    lora_target_modules: List[str] = field(default_factory=lambda: ['k_proj', 'v_proj'])
    lora_weight_path: str = ""
    q_lora: bool = False
    load_in_4bit: bool = False
    load_in_8bit: bool = False
    # Whether to resume from previously saved LoRA / value-head parameters.
    # Must stay False on the very first run, when nothing has been saved yet.
    # Annotated so that it is a real dataclass field and can be passed to the
    # constructor like every other option.
    is_reload_trained_params: bool = False

# inference

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel

In [6]:
class LoraPPOModel(PeftModel):
    """Inference helper that answers a query with both the PPO-tuned and the raw model.

    The constructor loads the base causal LM, wraps it with a LoRA config,
    loads the saved adapter from ``config.save_lora_path`` and merges it via
    ``merge_and_unload()``. A second, untouched copy of the base model is
    loaded for side-by-side comparison.
    """

    def __init__(self, config: Config):
        # NOTE(review): attributes are assigned before super().__init__() runs;
        # this relies on plain (non-Module) values being accepted at that
        # point — confirm when upgrading torch/peft.
        self.config = config
        model = AutoModelForCausalLM.from_pretrained(config.gpt_model).to(config.device)
        self.tokenizer = AutoTokenizer.from_pretrained(config.gpt_model)
        lora_args = LoraArguments()
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            task_type="CAUSAL_LM",
        )
        super().__init__(model, lora_config)
        # Load the trained adapter weights, then fold them into the base model.
        model = super().from_pretrained(model, config.save_lora_path)
        self.lora_ppo_model = model.merge_and_unload()
        # Fresh copy of the base model, used as the "before PPO" baseline.
        self.raw_model = AutoModelForCausalLM.from_pretrained(config.gpt_model).to(config.device)
        print()

    def forward(self, query, system_content):
        """Generate a reply to ``query`` with both models.

        :param query: user message text
        :param system_content: system prompt text
        :return: tuple ``(lora_ppo_response, raw_response)`` of decoded strings
        """
        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": query}
        ]
        # Render the chat template to text (not token ids) with the
        # generation prompt appended.
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.config.device)
        lora_ppo_response = self.predict(model_inputs, self.lora_ppo_model, self.tokenizer)
        raw_response = self.predict(model_inputs, self.raw_model, self.tokenizer)
        return lora_ppo_response, raw_response

    @staticmethod
    def predict(model_inputs, model, tokenizer):
        """Generate without sampling (up to 512 new tokens) and decode the first reply.

        :param model_inputs: tokenizer output providing ``input_ids``
        :param model: model whose ``generate`` method is called
        :param tokenizer: tokenizer used to decode the generated ids
        :return: decoded text of the first sequence, prompt tokens removed
        """
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=512,
            num_beams=1,
            do_sample=False
        )
        # Drop the prompt prefix from every output so only new tokens remain.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return response

In [7]:
# Build the comparison model and print the PPO-tuned and raw answers to the
# same prompt side by side.
lora_ppo_model = LoraPPOModel(Config())
lora_ppo_response, raw_response = lora_ppo_model("饭店服务员的态度太差，使用委婉积极的态度投诉", "你是一个有文化的文明人")
print(f"lora_ppo_response:{lora_ppo_response}")
print(f"raw_response:{raw_response}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.







lora_ppo_response:尊敬的经理：

您好！我最近在您的饭店用餐时遇到了一些问题。我想通过这封信向您反映一下。

首先，我对餐厅的服务态度感到非常不满。当我点菜时，服务员的态度并不友好，总是显得有些冷漠和不耐烦。他们似乎对我的需求没有足够的关注，甚至有时会对我提出一些不合理的建议或要求。

其次，我在用餐过程中也遇到了一些困扰。我发现有些菜品的味道并不符合我的口味，而且有些服务人员的服务态度也不够热情。这些都让我感到很失望。

最后，我还注意到有一些环境问题。例如，餐厅的卫生状况不佳，餐具和杯子经常被污染，这让我感到非常不舒服。

我希望您能理解并采取措施来改善我们的用餐体验。我相信，只要我们共同努力，我们的服务质量将会得到显著提高。

再次感谢您抽出宝贵的时间阅读这封信，并期待您的回复。

顺祝商祺，
[你的名字]
raw_response:尊敬的经理：

您好！我最近在您的饭店用餐时遇到了一些问题。我想通过这封信向您反映一下。

首先，我对餐厅的服务态度感到非常不满。当我点菜时，服务员的态度并不友好，总是显得有些冷漠和不耐烦。他们似乎对我的需求没有足够的关注，甚至有时会对我提出一些不合理的建议或要求。

其次，我在用餐过程中也遇到了一些困扰。我发现有些菜品的味道并不符合我的口味，而且有些服务人员的服务态度也不够热情。这些都让我感到很失望。

最后，我还注意到有一些环境问题。例如，餐厅的卫生状况不佳，餐具和杯子经常被污染，这让我感到非常不舒服。

我希望您能理解并采取措施来改善我们的用餐体验。我相信，只要我们共同努力，我们的服务质量将会得到显著提高。

再次感谢您抽出宝贵的时间阅读这封信，并期待您的回复。

顺祝商祺，
[你的名字]


# ppo

In [8]:
from utils.tools import Tools

class PPO:
    """PPO update step for a combined actor/critic model.

    The actor/critic model is called as ``model(ids, attention_mask, tools)``
    and is expected to return ``(probs, values)``: two per-sample sequences of
    1-D tensors (one entry per kept response token).
    """

    def __init__(self, actor_critic_model, config: Config, actor_critic_opt):
        self.actor_critic_model = actor_critic_model
        self.config = config
        self.actor_critic_opt = actor_critic_opt

    def train(self, prompt_generate_ids, attention_mask, prob_refs, reward, tools: Tools):
        """Run ``config.ppo_epochs`` optimisation steps on one generated batch.

        :param prompt_generate_ids: token ids of prompt + generated response
        :param attention_mask: attention mask for ``prompt_generate_ids``
        :param prob_refs: per-sample log-probs from the reference model
        :param reward: one scalar score per sample from the reward model
        :param tools: helper used by the model to gather and mask per-token values
        """
        with torch.no_grad():
            # Per-token values before any update; kept fixed for the whole batch.
            _, old_values = self.actor_critic_model(prompt_generate_ids, attention_mask, tools)
        for _ in range(self.config.ppo_epochs):
            # Fresh log-probs and per-token values from the current parameters.
            new_probs, new_values = self.actor_critic_model(prompt_generate_ids, attention_mask, tools)
            # Per-token rewards: KL penalty everywhere, score on the last token.
            rewards, _non_score_rewards = self.compute_rewards(reward, new_probs, prob_refs)
            # NOTE(review): the reference log-probs are used as the "old"
            # policy in the ratio, which ties the clipping to the reference
            # model rather than to the policy that sampled the batch —
            # confirm this is intended.
            loss = self.loss(new_probs=new_probs, old_values=old_values, new_values=new_values,
                             rewards=rewards, old_probs=prob_refs)

            self.actor_critic_opt.zero_grad()
            loss.backward()
            self.actor_critic_opt.step()
            print(loss)

    def loss(self, new_probs, old_values, new_values, rewards, old_probs):
        """Compute the summed policy + value loss over all samples of a batch.

        :param new_probs: log-probs produced by the actor (one tensor per sample)
        :param old_values: values computed before the PPO update
        :param new_values: values computed during the PPO update
        :param rewards: per-token rewards
        :param old_probs: log-probs produced by the reference model
        :return: 0-dim tensor, sum over samples of ``pg_loss + vf_coef * vf_loss``
        """
        sample_losses = [
            self._sample_loss(new_prob, old_value, new_value, reward, old_prob)
            for new_prob, old_value, new_value, reward, old_prob
            in zip(new_probs, old_values, new_values, rewards, old_probs)
        ]
        if not sample_losses:
            return torch.tensor(0.0)
        # Stacking keeps the result on the device of the per-sample losses;
        # accumulating in place into a CPU scalar fails on accelerators.
        return torch.stack(sample_losses).sum()

    def _sample_loss(self, new_prob, old_value, new_value, reward, old_prob):
        """Clipped PPO policy loss plus weighted clipped value loss for one sample."""
        advantages, returns = self._gae(reward, old_value, new_prob.shape[0])
        # Targets and advantages are constants for the optimiser. Without the
        # detach on `returns`, the value loss would back-propagate into the
        # policy through the KL part of the reward.
        returns = returns.detach()
        advantages = self.whiten(advantages).detach()

        # Value loss, clipped around the pre-update value estimate.
        value_clipped = torch.clamp(new_value,
                                    old_value - self.config.cliprange_value,
                                    old_value + self.config.cliprange_value)
        vf_loss1 = (new_value - returns) ** 2
        vf_loss2 = (value_clipped - returns) ** 2
        vf_loss = torch.mean(torch.max(vf_loss2, vf_loss1))

        # Policy loss with the clipped probability ratio.
        ratio = torch.exp(new_prob - old_prob)
        pg_losses = -advantages * ratio
        pg_losses2 = -advantages * torch.clamp(ratio,
                                               1.0 - self.config.cliprange,
                                               1.0 + self.config.cliprange)
        pg_loss = torch.mean(torch.max(pg_losses, pg_losses2))
        return pg_loss + self.config.vf_coef * vf_loss

    def _gae(self, reward, old_value, gen_len):
        """Generalised advantage estimation over one response.

        Walks the tokens backwards: ``delta_t = r_t + gamma * V_{t+1} - V_t``
        and ``A_t = delta_t + gamma * lam * A_{t+1}``, with ``V`` after the
        last token taken as 0.

        :return: ``(advantages, returns)`` where ``returns = advantages + old_value``
        """
        last_gae_lam = 0.0
        advantages_reversed = []
        for t in reversed(range(gen_len)):
            next_value = old_value[t + 1] if t < gen_len - 1 else 0.0
            delta = reward[t] + self.config.gamma * next_value - old_value[t]
            last_gae_lam = delta + self.config.gamma * self.config.lam * last_gae_lam
            advantages_reversed.append(last_gae_lam)
        advantages = torch.stack(advantages_reversed[::-1])
        returns = advantages + old_value
        return advantages, returns

    def compute_rewards(self, scores, probs, ref_probs):
        """Build per-token rewards from a sequence-level score and a KL penalty.

        The reward model only scores whole sentences, so every token receives
        ``-kl_ctl_value * (prob - ref_prob)`` and the sentence score is added
        to the last token.

        :param scores: reward-model scores, one value per sentence
        :param probs: actor log-probs, one 1-D tensor per sentence
        :param ref_probs: reference-model log-probs, same layout as ``probs``
        :return: ``(rewards, non_score_rewards)``, two lists of 1-D tensors
        """
        rewards, non_score_rewards = [], []
        for score, prob, ref_prob in zip(scores, probs, ref_probs):
            kl = prob - ref_prob  # (seq_len, )
            non_score_reward = -self.config.kl_ctl_value * kl  # (seq_len, )
            non_score_rewards.append(non_score_reward)
            reward = non_score_reward.clone()  # every token starts with the KL penalty
            reward[-1] += score  # the sentence-level score goes on the last token
            rewards.append(reward)
        return rewards, non_score_rewards

    @staticmethod
    def whiten(values, shift_mean=True):
        """Normalise ``values`` to unit variance (and zero mean by default).

        :param values: tensor to normalise
        :param shift_mean: when False the mean is added back after scaling
        :return: the normalised tensor
        """
        mean = torch.mean(values)
        if values.numel() > 1:
            var = torch.var(values)
        else:
            # The unbiased variance of a single element is NaN; treat it as 0
            # so a one-token response yields a zero advantage instead of NaN.
            var = torch.zeros_like(mean)
        whitened = (values - mean) * torch.rsqrt(var + 1e-8)
        if not shift_mean:
            whitened = whitened + mean
        return whitened

# main prepare

In [9]:
from torch.utils.data import DataLoader
from torch.optim import Adam

## ActorCriticLoraModel

In [10]:
class LoraModel(PeftModel):
    """LoRA-wrapped causal LM with an extra linear value head (actor + critic)."""

    def __init__(self, config: Config, model):
        """Wrap ``model`` with a LoRA adapter and attach a scalar value head.

        :param config: run configuration (device, save paths)
        :param model: base causal language model to adapt
        """
        lora_args = LoraArguments()
        lora_config = LoraConfig(
            r=lora_args.lora_r,
            lora_alpha=lora_args.lora_alpha,
            target_modules=lora_args.lora_target_modules,
            lora_dropout=lora_args.lora_dropout,
            task_type="CAUSAL_LM",
        )
        # Size the value head from the model instead of hard-coding it;
        # 1024 (the previous constant) remains the fallback.
        hidden_size = getattr(model.config, "hidden_size", 1024)
        super().__init__(model, lora_config)
        self.v_head = torch.nn.Linear(hidden_size, 1, bias=False).to(config.device)
        if lora_args.is_reload_trained_params:
            # Resume from a previous run: adapter weights plus value head.
            super().from_pretrained(model, config.save_lora_path)
            # map_location lets weights saved on one device load on another.
            self.v_head.load_state_dict(
                torch.load(config.save_v_head_path, map_location=config.device)
            )
        # Make sure every LoRA parameter is trainable.
        for name, module in self.named_modules():
            if 'lora_' in name:
                for param in module.parameters():
                    param.requires_grad = True

    def forward(self, input_ids, attention_mask, tools: Tools):
        """Return per-token log-probs and values for the response tokens.

        :param input_ids: token ids of prompt + response
        :param attention_mask: attention mask for ``input_ids``
        :param tools: helper that gathers log-probs and filters out padding
        :return: ``(probs, values)`` as produced by ``tools.filter_mask``
        """
        res = super().forward(input_ids, attention_mask, output_hidden_states=True)
        # hidden_states[0] is the embedding output, which carries no context;
        # the value head has to read the final layer's representation.
        # NOTE: value-head checkpoints trained against hidden_states[0] are
        # not meaningful with this input and should be retrained.
        last_hidden = res.hidden_states[-1]
        values = self.v_head(last_hidden).squeeze(-1)[:, :-1]
        values = tools.filter_mask(values)
        # Logits at position t predict token t + 1, hence the one-step shift.
        probs = tools.probs_from_logits(res.logits[:, :-1, :], input_ids[:, 1:])
        probs = tools.filter_mask(probs)
        return probs, values


class ActorCriticLoraModel(torch.nn.Module):
    """Actor/critic wrapper: a LoRA-adapted language model plus its tokenizer."""

    def __init__(self, config: Config):
        super().__init__()
        model = AutoModelForCausalLM.from_pretrained(config.gpt_model).to(config.device).eval()
        self.model = LoraModel(config, model)
        self.tokenizer = AutoTokenizer.from_pretrained(config.gpt_model)

    def forward(self, input_ids, attention_mask, tools: Tools):
        """Return ``(probs, values)`` from the wrapped model."""
        probs, values = self.model(input_ids, attention_mask, tools)
        return probs, values

    @torch.no_grad()
    def actor_generate(self, input_ids, attention_mask=None):
        """Generate responses for a batch of prompts without sampling.

        :param input_ids: prompt token ids (a batch may contain padding)
        :param attention_mask: optional mask for ``input_ids``; when omitted it
            is derived from the tokenizer's pad token so that padded positions
            are not attended to during generation
        :return: ``(response, generated_ids, response_id)`` — decoded full
            texts, full token ids, and the ids of the newly generated part
        """
        if attention_mask is None:
            pad_token_id = self.tokenizer.pad_token_id
            if pad_token_id is not None:
                # Without an explicit mask, generate() may treat padding as
                # real tokens when it cannot infer the mask itself.
                attention_mask = input_ids.ne(pad_token_id).long()
        generated_ids = self.model.generate(input_ids,
                                            attention_mask=attention_mask,
                                            max_new_tokens=512, top_p=1.0,
                                            num_beams=1,
                                            do_sample=False)
        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        # Everything after the prompt length is newly generated.
        response_id = generated_ids[:, input_ids.shape[1]:]
        return response, generated_ids, response_id

## ReferenceModel

In [11]:
import torch.nn as nn

In [30]:
# Load the base model and move it to the "mps" device; being the last
# expression of the cell, its module structure is echoed as the cell output.
AutoModelForCausalLM.from_pretrained(Config.gpt_model).to(torch.device("mps"))

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Line

In [31]:
class ReferenceModel(nn.Module):
    """Reference policy: the base language model, evaluated without gradients."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        base_model = AutoModelForCausalLM.from_pretrained(self.config.gpt_model)
        self.reference_model = base_model.to(self.config.device)

    @torch.no_grad()
    def forward(self, input_ids, attention_mask, tools: Tools):
        """Return the reference log-probs after gathering and padding removal.

        :param input_ids: token ids of prompt + response
        :param attention_mask: attention mask for ``input_ids``
        :param tools: helper providing ``probs_from_logits`` and ``filter_mask``
        :return: the result of ``tools.filter_mask`` on the gathered log-probs
        """
        outputs = self.reference_model(input_ids=input_ids,
                                       attention_mask=attention_mask)
        # Logits at position t score token t + 1: drop the last logit and the
        # first label so both line up.
        shifted_logits = outputs.logits[:, :-1, :]
        shifted_labels = input_ids[:, 1:]
        gathered = tools.probs_from_logits(shifted_logits, shifted_labels)
        return tools.filter_mask(gathered)

## RewardModel

In [20]:
from transformers import BertForSequenceClassification, BertTokenizer


# 奖励模型
class RewardModel(nn.Module):
    """Reward model: scores texts with a BERT sequence classifier."""

    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.reward_tokenizer = BertTokenizer.from_pretrained(self.config.Sentiment_model)
        self.reward_model = BertForSequenceClassification.from_pretrained(self.config.Sentiment_model).to(
            self.config.device)

    @torch.no_grad()
    def forward(self, text):
        """Score a batch of texts.

        :param text: list of strings to score
        :return: list with one float per text — the softmax probability of
            class index 0; an empty input yields an empty list
        """
        if len(text) == 0:
            # Nothing to score; avoids max() over an empty batch below.
            return []
        input_ids, attention_mask = self.data_process(text)
        device = self.config.device
        output = self.reward_model(torch.tensor(input_ids).to(device),
                                   torch.tensor(attention_mask).to(device))
        # output.logits is already a tensor; re-wrapping it in torch.tensor()
        # only triggers a copy-construct warning.
        probs = torch.softmax(output.logits, dim=1).tolist()
        # NOTE(review): index 0 is used as the reward — confirm that class 0
        # is the label the policy should be pushed towards.
        return [prob[0] for prob in probs]

    def data_process(self, texts):
        """Tokenise ``texts`` and right-pad them to a common length.

        :param texts: list of strings
        :return: ``(input_ids, attention_mask)`` as equally long lists of
            lists; each text is truncated to at most 512 token ids
        """
        input_ids = [self.reward_tokenizer.encode(text)[:512] for text in texts]
        max_length = max(len(ids) for ids in input_ids)
        pad_id = self.reward_tokenizer.pad_token_id
        padded, attention_mask = [], []
        for ids in input_ids:
            padding_num = max_length - len(ids)
            padded.append(ids + [pad_id] * padding_num)
            attention_mask.append([1] * len(ids) + [0] * padding_num)
        return padded, attention_mask

## CustomDataset

In [21]:
from torch.utils.data import Dataset, DataLoader

In [22]:
class CustomDataset(Dataset):
    """Prompt dataset: JSON records rendered with the actor's chat template."""

    def __init__(self, data_file, actor_tokenizer):
        """Load all records from the JSON file ``data_file``.

        :param data_file: path of a JSON file holding the records
        :param actor_tokenizer: tokenizer of the actor model
        """
        self.actor_tokenizer = actor_tokenizer
        with open(data_file, 'r', encoding="utf-8") as handle:
            self.data = json.load(handle)
        self.total_samples = len(self.data)
        self.actor_padding_id = actor_tokenizer.pad_token_id

    def __len__(self):
        return self.total_samples

    def __getitem__(self, idx):
        """Return ``[input_ids, attention_mask]`` (plain lists) for record ``idx``."""
        record = self.data[idx]
        query = record["query"]
        system_content = record["system_content"]
        # Render system + user turns and append the generation prompt.
        chat_text = self.actor_tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_content},
                {"role": "user", "content": query},
            ],
            tokenize=False,
            add_generation_prompt=True
        )
        encoded = self.actor_tokenizer([chat_text], return_tensors="pt")
        ids = encoded.input_ids.tolist()[0]
        mask = encoded.attention_mask.tolist()[0]
        return [ids, mask]

    def collate_fn(self, batch):
        """Left-pad every sample to the longest one and stack into tensors.

        :param batch: list of ``[input_ids, attention_mask]`` pairs
        :return: ``(input_ids, attention_mask)`` tensors
        """
        longest = max(len(sample[0]) for sample in batch)
        padded_ids = [
            [self.actor_padding_id] * (longest - len(sample[0])) + sample[0]
            for sample in batch
        ]
        padded_mask = [
            [0] * (longest - len(sample[0])) + sample[1]
            for sample in batch
        ]
        return torch.tensor(padded_ids), torch.tensor(padded_mask)

## Tools

In [23]:
import torch.nn.functional as F


class Tools:
    """Helpers for selecting response tokens and gathering label log-probs."""

    def __init__(self, response_shape, response_ids_mask):
        """
        :param response_shape: length of the generated part of each sequence
        :param response_ids_mask: per-sample mask over the generated tokens;
            batched generations contain padding, which this mask removes
        """
        self.response_shape = response_shape
        self.response_ids_mask = response_ids_mask

    def filter_mask(self, values):
        """
        :param values: typically prob_old, prob_ref or the value estimates
        :return: list with one tensor per sample, padding removed
        """
        kept = []
        for row, row_mask in zip(values, self.response_ids_mask):
            # Keep only the trailing response window, then drop masked slots.
            response_part = row[-self.response_shape:]
            kept.append(response_part[row_mask])
        return kept

    @staticmethod
    def probs_from_logits(logits, labels):
        """Return the log-probability assigned to each label token.

        :param logits: scores with the vocabulary on dim 2
        :param labels: token ids to look up, one per position
        :return: gathered log-probs with the vocabulary dimension removed
        """
        index = labels.unsqueeze(2)
        return F.log_softmax(logits, dim=2).gather(2, index).squeeze(-1)

# actual train

In [34]:
class TrainPpo:
    """Wires up the models, data and optimiser and runs the PPO training loop."""

    def __init__(self):
        self.config = Config()
        # Actor and critic share one LoRA-adapted model.
        self.actor_critic_model = ActorCriticLoraModel(self.config)
        self.tokenizer = self.actor_critic_model.tokenizer
        # Optimiser for the actor/critic model (LoRA training, not a full fine-tune).
        self.actor_critic_opt = Adam(self.actor_critic_model.parameters(), lr=self.config.lr)
        # Reference model.
        self.reference_model = ReferenceModel(self.config)
        # Reward model.
        self.reward_model = RewardModel(self.config)
        # Training data.
        dataset = CustomDataset(self.config.data_path, self.tokenizer)
        self.data_loader = DataLoader(dataset, batch_size=self.config.batch_size, shuffle=True,
                                      collate_fn=dataset.collate_fn)
        self.ppo = PPO(self.actor_critic_model, self.config, self.actor_critic_opt)

    def train_ppo(self):
        """Run ``config.epochs`` passes over the data, saving before and after."""
        self.save_model()
        for epoch in range(self.config.epochs):
            print(epoch)
            for batch_data in self.data_loader:
                # Generate with the actor: full ids (prompt + response) and
                # the ids of the response alone.
                _, prompt_generate_ids, generate_ids = self.actor_critic_model.actor_generate(
                    batch_data[0].to(self.config.device)
                )
                attention_mask = (prompt_generate_ids != self.tokenizer.pad_token_id)
                generate_ids_mask = (generate_ids[:, :-1] != self.tokenizer.pad_token_id)
                # One less than the generated length because the last token is
                # the end-of-sequence marker.
                # NOTE(review): this mask is built from generate_ids[:, :-1]
                # but is applied to log-probs whose labels are shifted by one
                # position — confirm the alignment is intended.
                response_shape = generate_ids.shape[1] - 1
                tools = Tools(response_shape, generate_ids_mask)
                # Text of the response only, used to compute the reward.
                # Decoding the response ids directly is robust to prompts or
                # answers that themselves contain the "assistant\n" marker,
                # which splitting the full decoded text was not.
                pure_generate = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=True)
                reward = self.reward_model(pure_generate)
                # Reference-model log-probs.
                prob_refs = self.reference_model(prompt_generate_ids, attention_mask, tools)
                # PPO update of actor and critic on this batch.
                self.ppo.train(prompt_generate_ids, attention_mask, prob_refs, reward, tools)
        self.save_model()

    def save_model(self):
        """Persist the LoRA adapter and the value-head weights."""
        # Save the LoRA parameters.
        self.actor_critic_model.model.save_pretrained(self.config.save_lora_path, safe_serialization=False)
        # torch.save does not create missing directories, so make sure the
        # value-head folder exists first.
        v_head_dir = os.path.dirname(self.config.save_v_head_path)
        if v_head_dir:
            os.makedirs(v_head_dir, exist_ok=True)
        # Save the value-head parameters.
        torch.save(self.actor_critic_model.model.v_head.state_dict(), self.config.save_v_head_path)



# Build the trainer (loads the actor/critic, reference and reward models plus
# the dataset) and start PPO training.
train_ppo = TrainPpo()
train_ppo.train_ppo()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0


NotImplementedError: The operator 'aten::isin.Tensor_Tensor_out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.