In [1]:
import os
import torch
import sys, pathlib
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification,DebertaV2ForSequenceClassification, GPTNeoXForCausalLM
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM
from transformers import DataCollatorWithPadding
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from torch.utils.data import DataLoader
from datasets import load_dataset, concatenate_datasets, DatasetDict
import torch.nn.functional as F
from tqdm import tqdm
import re
import yaml

# Parent directory of the local `trl` checkout.
LOCAL_TRL_PARENT = "/workspace/Self_play_DRPO"
# Prepend it to the module search path only when it is not already listed,
# so re-running this cell does not add a duplicate entry.
if sys.path.count(LOCAL_TRL_PARENT) == 0:
    sys.path[:0] = [LOCAL_TRL_PARENT]

# With that entry first on sys.path, the `trl` import below uses the local copy.
from trl import (
    DPOTrainer,
    DPOConfig,
    ModelConfig,
    DRPOTrainer,
    DRPOConfig,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
from trl.data_utils import apply_chat_template
from trl.trainer.drpo_utils import PairRMPipeline

def process_split(original, seed = 42):
    """Augment a split with an answer-swapped copy of every row, then shuffle.

    Each row is mirrored: the 'a1' and 'a2' values trade places and 'rank'
    becomes ``1 - rank``. The original rows and the mirrored rows are
    concatenated and the result is shuffled.

    Args:
        original: dataset whose rows expose 'a1', 'a2' and 'rank'
            (presumably a ``datasets.Dataset`` with a 0/1 rank -- TODO confirm).
        seed: seed forwarded to ``shuffle``.

    Returns:
        The shuffled concatenation of ``original`` and its mirrored copy.
    """
    def _mirror(row):
        # Exchange the two answers and complement the rank accordingly.
        return {
            'a1': row['a2'],
            'a2': row['a1'],
            'rank': 1 - row['rank'],
        }

    mirrored = original.map(_mirror)
    combined = concatenate_datasets([original, mirrored])
    return combined.shuffle(seed=seed)

def load_model(model_path, task = 'generation', model_type = 'decoder', model_cache_path =  '/workspace/model_cache'):
    """Load a pretrained model and its tokenizer, with pad/eos tokens filled in.

    Args:
        model_path: model name or path; wrapped in ``ModelConfig`` and passed
            on to the ``from_pretrained`` calls.
        task: 'generation' loads the model with ``AutoModelForCausalLM``;
            'reward' loads it with ``AutoModelForSequenceClassification``.
        model_type: 'decoder' configures the tokenizer with left padding and
            left truncation; any other value selects right padding/truncation.
        model_cache_path: ``cache_dir`` used for both the model and the tokenizer.

    Returns:
        Tuple ``(model_instance, model_tokenizer)``.

    Raises:
        ValueError: if ``task`` is neither 'generation' nor 'reward'.
    """
    # Reject unsupported tasks before anything is built or loaded. Without this
    # check such a call skipped both loading branches and only failed further
    # down with an UnboundLocalError on `model_instance`.
    if task not in ('generation', 'reward'):
        raise ValueError(
            f"Unsupported task {task!r}; expected 'generation' or 'reward'."
        )

    model_args = ModelConfig(model_path)
    # "auto" and None are passed through unchanged; any other value is treated
    # as the name of a dtype attribute on the torch module (e.g. "bfloat16").
    model_torch_dtype = (model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype))
    model_kwargs = dict(
        revision = model_args.model_revision,
        torch_dtype = model_torch_dtype,
        trust_remote_code = model_args.trust_remote_code,
    )

    padding_side = 'left' if model_type == 'decoder' else 'right'
    truncation_side = 'left' if model_type == 'decoder' else 'right'

    if task == 'generation':
        model_instance = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )
    else:  # task == 'reward' (validated above)
        model_instance = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            **model_kwargs,
            cache_dir = model_cache_path,
        )

    model_tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        padding_side = padding_side,
        truncation_side = truncation_side,
        use_fast = True,
        trust_remote_code = model_args.trust_remote_code,
        cache_dir = model_cache_path
    )

    # Fill in missing special tokens so tokenizer and model config agree:
    # the pad token falls back to eos, and the eos token falls back to pad.
    if model_tokenizer.pad_token is None:
        model_tokenizer.pad_token = model_tokenizer.eos_token

    if getattr(model_instance.config, "pad_token_id", None) is None:
        model_instance.config.pad_token_id = model_tokenizer.pad_token_id

    if model_tokenizer.eos_token is None:
        model_tokenizer.eos_token = model_tokenizer.pad_token

    if getattr(model_instance.config, "eos_token_id", None) is None:
        model_instance.config.eos_token_id = model_tokenizer.eos_token_id

    return model_instance, model_tokenizer



# Cache directories plus the dataset / model identifiers used by the cells below.
data_cache_path = "/workspace/dataset"
model_cache_path = '/workspace/model_cache'
ds_path = 'august66/drpo_ultrafeedback_qwen2.5-1.5b_first_iter_20k'
ref_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 
target_policy_path = "Qwen/Qwen2.5-1.5B-Instruct" 

# Load the DRPO training arguments from YAML. The encoding is given explicitly
# so the result does not depend on the machine's locale.
with open("/workspace/Self_play_DRPO/self_play_drpo_code/training_config/config_normal_dist.yaml", "r", encoding="utf-8") as f:
    training_args_config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document (and may return a list or
# scalar for other content). The config is later unpacked with ** into
# DRPOConfig, so fail here with a clear message instead of an opaque TypeError.
if not isinstance(training_args_config, dict):
    raise ValueError(
        "DRPO training config must be a YAML mapping, got "
        f"{type(training_args_config).__name__}"
    )


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
seed = 1234

# Load the reference policy and the trainable target policy with their tokenizers.
ref_policy_model, ref_policy_tokenizer = load_model(ref_policy_path)
target_policy_model, target_policy_tokenizer = load_model(target_policy_path)

# Load the train split, add the answer-swapped copies via process_split
# (which shuffles with its own default seed), then shuffle again with `seed`.
drpo_train = process_split(
    load_dataset(ds_path, cache_dir=data_cache_path, split='train')
).shuffle(seed=seed)

# Build the trainer configuration from the YAML-derived keyword arguments.
training_args = DRPOConfig(**training_args_config)

# Preference model pipeline; its model id comes from the training configuration.
preference_pipeline = PairRMPipeline(model_name_or_path=training_args.preference_model_id)

# Note: the reference policy's tokenizer is the one handed to the trainer.
trainer = DRPOTrainer(
    args=training_args,
    model=target_policy_model,
    ref_model=ref_policy_model,
    preference_model=preference_pipeline,
    processing_class=ref_policy_tokenizer,
    train_dataset=drpo_train,
)
trainer.train()

after chat template dataset sample: {'prompt': '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nUtilizing the cognitive abilities of an artificial intelligence assistant has been found to provide substantial benefits in enhancing efficiency and productivity across various industries. In light of this, craft a compelling and original slogan that effectively captures the manifold advantages of leveraging AI assistance in the workplace, taking into account factors such as its ability to learn and adapt, its capacity for processing and analyzing vast amounts of data, and its potential to streamline and optimize complex workflows. Furthermore, the slogan should highlight the potential impact that AI assistants can have on achieving business objectives and improving overall performance, while also acknowledging the ethical considerations and potential challenges associated with their implementation.<|im_end|>\n<|im_start|>

Tokenizing train dataset: 100%|██████████| 40000/40000 [00:52<00:00, 767.54 examples/s]
[34m[1mwandb[0m: Currently logged in as: [33mooooaugust[0m ([33mooooaugust-london-school-of-economics-and-political-science[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.128204345703125, 'dpo_reward_mean_target': -0.03230857849121094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1282], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9994894862174988, 'numerator': 0.13291285932064056, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1494426727294922, 'dpo_reward_mean_target': -0.1494426727294922, 'standard deviation': 3.0, 'reward_a1': tensor([0.0945], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13254190981388092, 'denominator': 0.13254190981388092}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.663238525390625, 'dpo_reward_mean_target': -1.035125732421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7086], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9942060708999634, 'numerator': 0.13219520449638367, 'denominator': 0.1329655945301056}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.040798187255859375, 'dpo_reward_mean_target': 0.040798187255859375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0408], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293161988258362, 'denominator': 0.13293161988258362}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06933403015136719, 'dpo_reward_mean_target': -0.06933403015136719, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2861], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13263413310050964, 'denominator': 0.13263413310050964}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6514053344726562, 'dpo_reward_mean_target': 0.1228485107421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4154], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0482617616653442, 'numerator': 0.13085727393627167, 'denominator': 0.1248326301574707}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3423480987548828, 'dpo_reward_mean_target': 0.487335205078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2220], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9897924661636353, 'numerator': 0.12931467592716217, 'denominator': 0.13064827024936676}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.47718048095703125, 'dpo_reward_mean_target': -0.46730804443359375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5447], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999203085899353, 'numerator': 0.132936492562294, 'denominator': 0.13294708728790283}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.480133056640625, 'dpo_reward_mean_target': 0.105438232421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1808], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0004267692565918, 'numerator': 0.13237686455249786, 'denominator': 0.13232038915157318}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3533306121826172, 'dpo_reward_mean_target': 0.3533306121826172, 'standard deviation': 3.0, 'reward_a1': tensor([0.6384], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13238196074962616, 'denominator': 0.13238196074962616}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.032440185546875, 'dpo_reward_mean_target': -0.194915771484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0001], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9979462623596191, 'numerator': 0.13269998133182526, 'denominator': 0.1329730749130249}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15950775146484375, 'dpo_reward_mean_target': 0.20731353759765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3318], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9856062531471252, 'numerator': 0.1308506578207016, 'denominator': 0.13276159763336182}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05503082275390625, 'dpo_reward_mean_target': -0.013225555419921875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1189], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.999606728553772, 'numerator': 0.13289834558963776, 'denominator': 0.13295063376426697}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.014986038208007812, 'dpo_reward_mean_target': -0.014986038208007812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0150], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26949310302734375, 'dpo_reward_mean_target': 0.26949310302734375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5005], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12867209315299988, 'denominator': 0.12867209315299988}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3012809753417969, 'dpo_reward_mean_target': -0.3012809753417969, 'standard deviation': 3.0, 'reward_a1': tensor([0.3248], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13011592626571655, 'denominator': 0.13011592626571655}


Step,Training Loss
5,74.0513
10,82.7029
15,60.9826
20,67.608
25,39.3113
30,104.7298
35,64.8366
40,84.1463
45,67.6434
50,92.4337


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.113677978515625, 'dpo_reward_mean_target': 0.113677978515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1437], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13249218463897705, 'denominator': 0.13249218463897705}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05506134033203125, 'dpo_reward_mean_target': -0.05506134033203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0406], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329132318496704, 'denominator': 0.1329132318496704}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05153465270996094, 'dpo_reward_mean_target': -0.05153465270996094, 'standard deviation': 3.0, 'reward_a1': tensor([0.1960], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325288861989975, 'denominator': 0.1325288861989975}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3780975341796875, 'dpo_reward_mean_target': -0.2574958801269531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0679], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.003353476524353, 'numerator': 0.13271558284759521, 'denominator': 0.13227200508117676}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14617919921875, 'dpo_reward_mean_target': -0.06109619140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0795], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0002281665802002, 'numerator': 0.13297827541828156, 'denominator': 0.13294793665409088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.31597900390625, 'dpo_reward_mean_target': 0.22379684448242188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0298], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0030748844146729, 'numerator': 0.1325063407421112, 'denominator': 0.1321001499891281}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4941444396972656, 'dpo_reward_mean_target': 0.10033416748046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0023], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.013195514678955, 'numerator': 0.13290290534496307, 'denominator': 0.13117201626300812}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5198287963867188, 'dpo_reward_mean_target': 0.7083206176757812, 'standard deviation': 3.0, 'reward_a1': tensor([0.0268], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.987775444984436, 'numerator': 0.12959301471710205, 'denominator': 0.13119684159755707}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3040771484375, 'dpo_reward_mean_target': 0.3040771484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1041], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1317557543516159, 'denominator': 0.1317557543516159}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05301856994628906, 'dpo_reward_mean_target': -0.05301856994628906, 'standard deviation': 3.0, 'reward_a1': tensor([0.6854], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1290125846862793, 'denominator': 0.1290125846862793}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23795318603515625, 'dpo_reward_mean_target': -0.13013458251953125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1202], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.007145643234253, 'numerator': 0.13298004865646362, 'denominator': 0.13203656673431396}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05745506286621094, 'dpo_reward_mean_target': 0.1635894775390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0311], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983317852020264, 'numerator': 0.132701113820076, 'denominator': 0.13292285799980164}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11561393737792969, 'dpo_reward_mean_target': 0.11561393737792969, 'standard deviation': 3.0, 'reward_a1': tensor([0.1426], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297538459300995, 'denominator': 0.13297538459300995}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.24059295654296875, 'dpo_reward_mean_target': -0.19585418701171875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6754], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9977297782897949, 'numerator': 0.13129232823848724, 'denominator': 0.1315910667181015}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02475738525390625, 'dpo_reward_mean_target': -0.02475738525390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0112], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297942280769348, 'denominator': 0.13297942280769348}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.029979705810546875, 'dpo_reward_mean_target': 0.029979705810546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0337], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298067450523376, 'denominator': 0.13298067450523376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10760307312011719, 'dpo_reward_mean_target': 0.10760307312011719, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13123683631420135, 'denominator': 0.13123683631420135}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0020198822021484375, 'dpo_reward_mean_target': 0.0020198822021484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0020], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7212677001953125, 'dpo_reward_mean_target': -0.11895751953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.5989], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.972582221031189, 'numerator': 0.12922726571559906, 'denominator': 0.13287027180194855}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.002285003662109375, 'dpo_reward_mean_target': 0.4047393798828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7005], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0224727392196655, 'numerator': 0.1323360949754715, 'denominator': 0.12942750751972198}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.01622772216796875, 'dpo_reward_mean_target': -0.0875244140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3267], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9970057606697083, 'numerator': 0.13171939551830292, 'denominator': 0.132114976644516}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.014204025268554688, 'dpo_reward_mean_target': 0.08259201049804688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1632], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983934760093689, 'numerator': 0.1325351595878601, 'denominator': 0.13274842500686646}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.119659423828125, 'dpo_reward_mean_target': -0.006504058837890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2828], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9968342781066895, 'numerator': 0.132363960146904, 'denominator': 0.13278432190418243}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05248069763183594, 'dpo_reward_mean_target': -0.05248069763183594, 'standard deviation': 3.0, 'reward_a1': tensor([0.0365], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292232155799866, 'denominator': 0.13292232155799866}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.070770263671875, 'dpo_reward_mean_target': -0.87646484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5748], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0086482763290405, 'numerator': 0.13231001794338226, 'denominator': 0.13117557764053345}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18528366088867188, 'dpo_reward_mean_target': 0.3732337951660156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1737], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9905850887298584, 'numerator': 0.13078901171684265, 'denominator': 0.13203208148479462}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.307525634765625, 'dpo_reward_mean_target': -0.13146591186523438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1967], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0139843225479126, 'numerator': 0.1329493671655655, 'denominator': 0.13111579418182373}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3429374694824219, 'dpo_reward_mean_target': -0.3429374694824219, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4094], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132948100566864, 'denominator': 0.132948100566864}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23548126220703125, 'dpo_reward_mean_target': -0.43122100830078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0972], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9948763251304626, 'numerator': 0.13215889036655426, 'denominator': 0.13283951580524445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09062767028808594, 'dpo_reward_mean_target': 0.06205940246582031, 'standard deviation': 3.0, 'reward_a1': tensor([0.0577], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000591278076172, 'numerator': 0.13298062980175018, 'denominator': 0.13297276198863983}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11227035522460938, 'dpo_reward_mean_target': -0.11991691589355469, 'standard deviation': 3.0, 'reward_a1': tensor([0.1711], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.999755859375, 'numerator': 0.13235647976398468, 'denominator': 0.13238880038261414}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09593963623046875, 'dpo_reward_mean_target': -0.26917266845703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0461], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9946287870407104, 'numerator': 0.13224825263023376, 'denominator': 0.1329624205827713}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2186737060546875, 'dpo_reward_mean_target': 0.2186737060546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0841], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132305309176445, 'denominator': 0.132305309176445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.33232879638671875, 'dpo_reward_mean_target': 0.4922027587890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1371], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0052491426467896, 'numerator': 0.13205234706401825, 'denominator': 0.13136281073093414}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.031707763671875, 'dpo_reward_mean_target': 0.35784912109375, 'standard deviation': 3.0, 'reward_a1': tensor([-1.1718], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9438582062721252, 'numerator': 0.1167706623673439, 'denominator': 0.1237163171172142}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.012411117553710938, 'dpo_reward_mean_target': 0.012411117553710938, 'standard deviation': 3.0, 'reward_a1': tensor([0.1217], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13289247453212738, 'denominator': 0.13289247453212738}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07077217102050781, 'dpo_reward_mean_target': 0.07077217102050781, 'standard deviation': 3.0, 'reward_a1': tensor([0.1458], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293921947479248, 'denominator': 0.13293921947479248}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0488739013671875, 'dpo_reward_mean_target': 0.20225143432617188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4973], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.984111487865448, 'numerator': 0.12941399216651917, 'denominator': 0.1315033882856369}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04888725280761719, 'dpo_reward_mean_target': -0.04888725280761719, 'standard deviation': 3.0, 'reward_a1': tensor([0.7319], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1285518854856491, 'denominator': 0.1285518854856491}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14812469482421875, 'dpo_reward_mean_target': 0.0504150390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1421], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9979456067085266, 'numerator': 0.13270729780197144, 'denominator': 0.13298049569129944}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12071418762207031, 'dpo_reward_mean_target': -0.12071418762207031, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2098], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292211294174194, 'denominator': 0.13292211294174194}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20970916748046875, 'dpo_reward_mean_target': 0.4208221435546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983272552490234, 'numerator': 0.13274969160556793, 'denominator': 0.1329721212387085}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1999664306640625, 'dpo_reward_mean_target': -0.17513275146484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4845], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0209249258041382, 'numerator': 0.13227571547031403, 'denominator': 0.1295645833015442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.025020599365234375, 'dpo_reward_mean_target': 0.025020599365234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0691], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296641409397125, 'denominator': 0.13296641409397125}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5328292846679688, 'dpo_reward_mean_target': 0.5328292846679688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5444], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12467890232801437, 'denominator': 0.12467890232801437}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03247261047363281, 'dpo_reward_mean_target': 0.03247261047363281, 'standard deviation': 3.0, 'reward_a1': tensor([0.0319], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10162353515625, 'dpo_reward_mean_target': -0.7493209838867188, 'standard deviation': 3.0, 'reward_a1': tensor([0.3763], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9439334869384766, 'numerator': 0.12394233793020248, 'denominator': 0.13130410015583038}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1241455078125, 'dpo_reward_mean_target': 0.0986175537109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4324], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9990900158882141, 'numerator': 0.1321602165699005, 'denominator': 0.13228058815002441}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.57916259765625, 'dpo_reward_mean_target': 0.21269989013671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4217], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9792369604110718, 'numerator': 0.13004043698310852, 'denominator': 0.1327977180480957}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03153800964355469, 'dpo_reward_mean_target': -0.040981292724609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3701], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9969847202301025, 'numerator': 0.13173823058605194, 'denominator': 0.13213665783405304}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00615692138671875, 'dpo_reward_mean_target': -0.00615692138671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0062], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00379180908203125, 'dpo_reward_mean_target': -0.219024658203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.8225], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017150640487671, 'numerator': 0.13031747937202454, 'denominator': 0.12812013924121857}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2307415008544922, 'dpo_reward_mean_target': -0.2307415008544922, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1312], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13290752470493317, 'denominator': 0.13290752470493317}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23967552185058594, 'dpo_reward_mean_target': -0.2669639587402344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1090], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0053819417953491, 'numerator': 0.1327965408563614, 'denominator': 0.1320856660604477}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22283172607421875, 'dpo_reward_mean_target': 0.2553997039794922, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5129], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9722718596458435, 'numerator': 0.12869048118591309, 'denominator': 0.13236059248447418}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.19418907165527344, 'dpo_reward_mean_target': -0.2158527374267578, 'standard deviation': 3.0, 'reward_a1': tensor([0.1648], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9991098642349243, 'numerator': 0.13191430270671844, 'denominator': 0.13203182816505432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21303176879882812, 'dpo_reward_mean_target': 0.21303176879882812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2034], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13170599937438965, 'denominator': 0.13170599937438965}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.923736572265625, 'dpo_reward_mean_target': 0.45343017578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3469], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0555992126464844, 'numerator': 0.128332257270813, 'denominator': 0.12157290428876877}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0052547454833984375, 'dpo_reward_mean_target': -0.0052547454833984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1591], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13278134167194366, 'denominator': 0.13278134167194366}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1332855224609375, 'dpo_reward_mean_target': 0.1332855224609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3350], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1313704401254654, 'denominator': 0.1313704401254654}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04615974426269531, 'dpo_reward_mean_target': 0.04615974426269531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3623], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13175374269485474, 'denominator': 0.13175374269485474}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2951622009277344, 'dpo_reward_mean_target': 0.19266510009765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4993], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9970960021018982, 'numerator': 0.13228796422481537, 'denominator': 0.1326732486486435}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03014373779296875, 'dpo_reward_mean_target': 0.22776031494140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3737], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.986550509929657, 'numerator': 0.13033469021320343, 'denominator': 0.13211151957511902}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22622299194335938, 'dpo_reward_mean_target': 0.1509571075439453, 'standard deviation': 3.0, 'reward_a1': tensor([0.5401], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9970640540122986, 'numerator': 0.13186649978160858, 'denominator': 0.13225479423999786}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.18328094482421875, 'dpo_reward_mean_target': -0.18328094482421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0607], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13286976516246796, 'denominator': 0.13286976516246796}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15280723571777344, 'dpo_reward_mean_target': -0.15280723571777344, 'standard deviation': 3.0, 'reward_a1': tensor([0.1139], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13245616853237152, 'denominator': 0.13245616853237152}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.039398193359375, 'dpo_reward_mean_target': 0.519500732421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9753539562225342, 'numerator': 0.1293300837278366, 'denominator': 0.13259810209274292}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0049762725830078125, 'dpo_reward_mean_target': -0.0049762725830078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.4286], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1315990388393402, 'denominator': 0.1315990388393402}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.018951416015625, 'dpo_reward_mean_target': 0.5101394653320312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4064], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9623957872390747, 'numerator': 0.12691721320152283, 'denominator': 0.13187631964683533}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15271759033203125, 'dpo_reward_mean_target': -0.39679718017578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3889], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.003098964691162, 'numerator': 0.13298030197620392, 'denominator': 0.13256947696208954}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14867401123046875, 'dpo_reward_mean_target': 0.2923431396484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6345], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9864441752433777, 'numerator': 0.12678350508213043, 'denominator': 0.1285257786512375}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05552482604980469, 'dpo_reward_mean_target': 0.05552482604980469, 'standard deviation': 3.0, 'reward_a1': tensor([0.0555], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.165802001953125, 'dpo_reward_mean_target': 0.013090133666992188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2250], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0053505897521973, 'numerator': 0.13256245851516724, 'denominator': 0.13185694813728333}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0044040679931640625, 'dpo_reward_mean_target': 0.0044040679931640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0044], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.026580810546875, 'dpo_reward_mean_target': 0.290618896484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.9223], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0282444953918457, 'numerator': 0.13006526231765747, 'denominator': 0.12649254500865936}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.430267333984375, 'dpo_reward_mean_target': 0.0007476806640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.2644], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9511741399765015, 'numerator': 0.12169180065393448, 'denominator': 0.12793850898742676}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0360260009765625, 'dpo_reward_mean_target': 0.022485733032226562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4716], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9969826340675354, 'numerator': 0.13118934631347656, 'denominator': 0.13158638775348663}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01976776123046875, 'dpo_reward_mean_target': 0.43642425537109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1666], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9818945527076721, 'numerator': 0.1303214579820633, 'denominator': 0.13272449374198914}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.216766357421875, 'dpo_reward_mean_target': 0.151702880859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2768], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.01274573802948, 'numerator': 0.13286517560482025, 'denominator': 0.13119302690029144}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.008419036865234375, 'dpo_reward_mean_target': -0.008419036865234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0084], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0539398193359375, 'dpo_reward_mean_target': 0.6643295288085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.2348], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9943940043449402, 'numerator': 0.13162441551685333, 'denominator': 0.13236646354198456}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18602371215820312, 'dpo_reward_mean_target': 0.4554786682128906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2955], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.981718897819519, 'numerator': 0.12887884676456451, 'denominator': 0.13127876818180084}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3211936950683594, 'dpo_reward_mean_target': -0.17934799194335938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1134], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002159595489502, 'numerator': 0.1329486072063446, 'denominator': 0.1326621174812317}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.038791656494140625, 'dpo_reward_mean_target': 0.038791656494140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0410], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293376564979553, 'denominator': 0.13293376564979553}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06615447998046875, 'dpo_reward_mean_target': 0.06615447998046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0457], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329776644706726, 'denominator': 0.1329776644706726}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6102828979492188, 'dpo_reward_mean_target': -0.5232696533203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8921], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0142046213150024, 'numerator': 0.11897497624158859, 'denominator': 0.11730865389108658}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5446548461914062, 'dpo_reward_mean_target': 0.2615203857421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1670], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.01809561252594, 'numerator': 0.13163122534751892, 'denominator': 0.1292916089296341}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09327316284179688, 'dpo_reward_mean_target': 0.11170196533203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1974], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.999386191368103, 'numerator': 0.13227681815624237, 'denominator': 0.13235805928707123}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.018474578857421875, 'dpo_reward_mean_target': -0.018474578857421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3264], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13210470974445343, 'denominator': 0.13210470974445343}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.002780914306640625, 'dpo_reward_mean_target': -0.002780914306640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0028], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.013172149658203125, 'dpo_reward_mean_target': -0.013172149658203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0132], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.074005126953125, 'dpo_reward_mean_target': 0.103302001953125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4444], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.998266339302063, 'numerator': 0.13078273832798004, 'denominator': 0.13100986182689667}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19183349609375, 'dpo_reward_mean_target': 0.3423919677734375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4866], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9874711632728577, 'numerator': 0.1279996931552887, 'denominator': 0.12962372601032257}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04146003723144531, 'dpo_reward_mean_target': -0.04146003723144531, 'standard deviation': 3.0, 'reward_a1': tensor([0.1512], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327069103717804, 'denominator': 0.1327069103717804}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.024999618530273438, 'dpo_reward_mean_target': -0.024999618530273438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0403], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294930756092072, 'denominator': 0.13294930756092072}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.59735107421875, 'dpo_reward_mean_target': 0.139617919921875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0597050189971924, 'numerator': 0.12698420882225037, 'denominator': 0.11982976645231247}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01483154296875, 'dpo_reward_mean_target': 0.21570587158203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3872], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9888473153114319, 'numerator': 0.1303219497203827, 'denominator': 0.13179178535938263}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2937889099121094, 'dpo_reward_mean_target': 0.24423980712890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1294], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9937666654586792, 'numerator': 0.13195352256298065, 'denominator': 0.13278119266033173}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.080810546875, 'dpo_reward_mean_target': -0.1990966796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2211], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0050510168075562, 'numerator': 0.1329772025346756, 'denominator': 0.13230890035629272}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2655467987060547, 'dpo_reward_mean_target': 0.2655467987060547, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2741], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13084624707698822, 'denominator': 0.13084624707698822}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0842132568359375, 'dpo_reward_mean_target': -0.0842132568359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3418], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13164673745632172, 'denominator': 0.13164673745632172}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0208740234375, 'dpo_reward_mean_target': 0.0208740234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0209], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08304023742675781, 'dpo_reward_mean_target': -0.06591987609863281, 'standard deviation': 3.0, 'reward_a1': tensor([0.0251], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0001896619796753, 'numerator': 0.1329195648431778, 'denominator': 0.1328943520784378}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05902099609375, 'dpo_reward_mean_target': 0.05902099609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0939], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13280819356441498, 'denominator': 0.13280819356441498}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1188812255859375, 'dpo_reward_mean_target': 0.06463623046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9160], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9820385575294495, 'numerator': 0.12606292963027954, 'denominator': 0.12836861610412598}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2362957000732422, 'dpo_reward_mean_target': 0.2362957000732422, 'standard deviation': 3.0, 'reward_a1': tensor([0.2363], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.000102996826171875, 'dpo_reward_mean_target': -0.000102996826171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1325], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13285095989704132, 'denominator': 0.13285095989704132}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04184722900390625, 'dpo_reward_mean_target': 0.04184722900390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2106], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13251082599163055, 'denominator': 0.13251082599163055}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07305717468261719, 'dpo_reward_mean_target': 0.06947898864746094, 'standard deviation': 3.0, 'reward_a1': tensor([7.0572e-05], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000295639038086, 'numerator': 0.13294517993927002, 'denominator': 0.13294124603271484}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22571182250976562, 'dpo_reward_mean_target': -0.4821891784667969, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1606], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9945053458213806, 'numerator': 0.13221891224384308, 'denominator': 0.13294942677021027}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2510528564453125, 'dpo_reward_mean_target': 0.484344482421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4673], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9785884022712708, 'numerator': 0.12645584344863892, 'denominator': 0.12922270596027374}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00582122802734375, 'dpo_reward_mean_target': -0.00582122802734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0265], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329730749130249, 'denominator': 0.1329730749130249}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.28494834899902344, 'dpo_reward_mean_target': 0.14309310913085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.3884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9972551465034485, 'numerator': 0.13253697752952576, 'denominator': 0.13290177285671234}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.024774551391601562, 'dpo_reward_mean_target': -0.024774551391601562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0248], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.263427734375, 'dpo_reward_mean_target': 0.263427734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.5743], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1322685331106186, 'denominator': 0.1322685331106186}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07271957397460938, 'dpo_reward_mean_target': 0.1764678955078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3938], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0031076669692993, 'numerator': 0.13263241946697235, 'denominator': 0.132221519947052}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.025083541870117188, 'dpo_reward_mean_target': 0.025083541870117188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0227], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298071920871735, 'denominator': 0.13298071920871735}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21193695068359375, 'dpo_reward_mean_target': -0.19884872436523438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3293], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.999819815158844, 'numerator': 0.13285507261753082, 'denominator': 0.13287901878356934}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3169097900390625, 'dpo_reward_mean_target': 0.2786865234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.8182], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9484917521476746, 'numerator': 0.12438247352838516, 'denominator': 0.13113711774349213}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.034114837646484375, 'dpo_reward_mean_target': -0.034114837646484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2914], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1322002410888672, 'denominator': 0.1322002410888672}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1279296875, 'dpo_reward_mean_target': 0.17498779296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9910394549369812, 'numerator': 0.13169075548648834, 'denominator': 0.13288144767284393}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5564727783203125, 'dpo_reward_mean_target': 0.334747314453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0301], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0102896690368652, 'numerator': 0.1322968304157257, 'denominator': 0.13094940781593323}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05667304992675781, 'dpo_reward_mean_target': 0.05667304992675781, 'standard deviation': 3.0, 'reward_a1': tensor([0.0995], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329672485589981, 'denominator': 0.1329672485589981}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07012367248535156, 'dpo_reward_mean_target': 0.22729110717773438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1994], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9908552169799805, 'numerator': 0.13164237141609192, 'denominator': 0.1328573226928711}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.024383544921875, 'dpo_reward_mean_target': -0.07465362548828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3203], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015133619308472, 'numerator': 0.13253574073314667, 'denominator': 0.13233546912670135}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0035076141357421875, 'dpo_reward_mean_target': 0.0035076141357421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0083], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329805999994278, 'denominator': 0.1329805999994278}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.015253067016601562, 'dpo_reward_mean_target': 0.015253067016601562, 'standard deviation': 3.0, 'reward_a1': tensor([0.1949], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327424794435501, 'denominator': 0.1327424794435501}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.095458984375, 'dpo_reward_mean_target': -0.38764190673828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0930], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9971534609794617, 'numerator': 0.13234089314937592, 'denominator': 0.13271868228912354}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3332862854003906, 'dpo_reward_mean_target': 0.21358108520507812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2057], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9911779165267944, 'numerator': 0.13168840110301971, 'denominator': 0.13286051154136658}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.46849822998046875, 'dpo_reward_mean_target': 0.5706787109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3789], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9898512959480286, 'numerator': 0.12648358941078186, 'denominator': 0.12778039276599884}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1774730682373047, 'dpo_reward_mean_target': 0.03626441955566406, 'standard deviation': 3.0, 'reward_a1': tensor([0.6091], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9921509623527527, 'numerator': 0.13057847321033478, 'denominator': 0.13161149621009827}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.632598876953125, 'dpo_reward_mean_target': 0.3041229248046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5718], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9962329268455505, 'numerator': 0.1324525773525238, 'denominator': 0.13295342028141022}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2705841064453125, 'dpo_reward_mean_target': 0.29433631896972656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1212], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9989352822303772, 'numerator': 0.13171140849590302, 'denominator': 0.13185179233551025}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01647186279296875, 'dpo_reward_mean_target': 0.1837139129638672, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3285], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9920680522918701, 'numerator': 0.1310568004846573, 'denominator': 0.13210465013980865}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06223106384277344, 'dpo_reward_mean_target': -0.06223106384277344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0622], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.24001312255859375, 'dpo_reward_mean_target': 0.238189697265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3476], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0186883211135864, 'numerator': 0.1328924149274826, 'denominator': 0.1304544359445572}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0222930908203125, 'dpo_reward_mean_target': -0.23314285278320312, 'standard deviation': 3.0, 'reward_a1': tensor([0.5500], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9815709590911865, 'numerator': 0.12852652370929718, 'denominator': 0.13093961775302887}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1963348388671875, 'dpo_reward_mean_target': -0.1963348388671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2458], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1315440833568573, 'denominator': 0.1315440833568573}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07359695434570312, 'dpo_reward_mean_target': 0.07359695434570312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0952], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13277052342891693, 'denominator': 0.13277052342891693}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.024394989013671875, 'dpo_reward_mean_target': 0.024394989013671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0341], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295546174049377, 'denominator': 0.13295546174049377}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.017358779907226562, 'dpo_reward_mean_target': 0.017358779907226562, 'standard deviation': 3.0, 'reward_a1': tensor([0.1500], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13285085558891296, 'denominator': 0.13285085558891296}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10973930358886719, 'dpo_reward_mean_target': 0.10973930358886719, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3528], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13140985369682312, 'denominator': 0.13140985369682312}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.15810012817382812, 'dpo_reward_mean_target': -0.09116363525390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5631], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9854379892349243, 'numerator': 0.12985552847385406, 'denominator': 0.1317744255065918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3271636962890625, 'dpo_reward_mean_target': 0.61175537109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0567], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9794566631317139, 'numerator': 0.1297205537557602, 'denominator': 0.13244134187698364}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1561145782470703, 'dpo_reward_mean_target': 0.057201385498046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1487], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0047088861465454, 'numerator': 0.13291887938976288, 'denominator': 0.1322959065437317}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0012836456298828125, 'dpo_reward_mean_target': 0.0012836456298828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0018], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298067450523376, 'denominator': 0.13298067450523376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2900848388671875, 'dpo_reward_mean_target': -0.2900848388671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4485], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12901091575622559, 'denominator': 0.12901091575622559}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.27129364013671875, 'dpo_reward_mean_target': -0.18248748779296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6729], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9956088662147522, 'numerator': 0.13121598958969116, 'denominator': 0.13179472088813782}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0831146240234375, 'dpo_reward_mean_target': 0.280487060546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3310], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9888173341751099, 'numerator': 0.13024690747261047, 'denominator': 0.13171988725662231}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06377410888671875, 'dpo_reward_mean_target': -0.25189208984375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1014], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0002590417861938, 'numerator': 0.13281364738941193, 'denominator': 0.13277925550937653}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10013389587402344, 'dpo_reward_mean_target': 0.04540252685546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4608], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.007925033569336, 'numerator': 0.1317121386528015, 'denominator': 0.1306765228509903}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3403587341308594, 'dpo_reward_mean_target': 0.3403587341308594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2636], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13031324744224548, 'denominator': 0.13031324744224548}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08044052124023438, 'dpo_reward_mean_target': 0.05275535583496094, 'standard deviation': 3.0, 'reward_a1': tensor([0.0476], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000581741333008, 'numerator': 0.1329805552959442, 'denominator': 0.1329728215932846}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.015958786010742188, 'dpo_reward_mean_target': 0.015958786010742188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0386], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297700881958008, 'denominator': 0.13297700881958008}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03315925598144531, 'dpo_reward_mean_target': -0.03315925598144531, 'standard deviation': 3.0, 'reward_a1': tensor([0.2385], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13243669271469116, 'denominator': 0.13243669271469116}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.380767822265625, 'dpo_reward_mean_target': -0.0081787109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2390], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9981576800346375, 'numerator': 0.13258767127990723, 'denominator': 0.1328323930501938}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.25846099853515625, 'dpo_reward_mean_target': -0.25846099853515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3265], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294661045074463, 'denominator': 0.13294661045074463}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2979087829589844, 'dpo_reward_mean_target': -0.2979087829589844, 'standard deviation': 3.0, 'reward_a1': tensor([0.0191], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13224047422409058, 'denominator': 0.13224047422409058}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09261703491210938, 'dpo_reward_mean_target': 0.26165008544921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4481], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0144153833389282, 'numerator': 0.1327241212129593, 'denominator': 0.1308380365371704}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11372184753417969, 'dpo_reward_mean_target': -0.16488265991210938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1193], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9998863935470581, 'numerator': 0.13296541571617126, 'denominator': 0.13298052549362183}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0039081573486328125, 'dpo_reward_mean_target': 0.0039081573486328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1436], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283665478229523, 'denominator': 0.13283665478229523}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15320968627929688, 'dpo_reward_mean_target': -0.15320968627929688, 'standard deviation': 3.0, 'reward_a1': tensor([0.3585], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13106010854244232, 'denominator': 0.13106010854244232}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01848602294921875, 'dpo_reward_mean_target': -0.060394287109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3253], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9969695806503296, 'numerator': 0.13188600540161133, 'denominator': 0.1322868913412094}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00791168212890625, 'dpo_reward_mean_target': 0.0077648162841796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1517], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9997361302375793, 'numerator': 0.1327930986881256, 'denominator': 0.13282814621925354}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12788772583007812, 'dpo_reward_mean_target': -0.12788772583007812, 'standard deviation': 3.0, 'reward_a1': tensor([0.1102], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132562518119812, 'denominator': 0.132562518119812}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0022430419921875, 'dpo_reward_mean_target': 0.832061767578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3947], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9981240034103394, 'numerator': 0.13157476484775543, 'denominator': 0.13182206451892853}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07535362243652344, 'dpo_reward_mean_target': -0.07535362243652344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2681], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13270652294158936, 'denominator': 0.13270652294158936}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02823638916015625, 'dpo_reward_mean_target': 0.02823638916015625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3209], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13208304345607758, 'denominator': 0.13208304345607758}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.34600830078125, 'dpo_reward_mean_target': 0.5015068054199219, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2678], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.98812335729599, 'numerator': 0.12867994606494904, 'denominator': 0.13022659718990326}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.37697601318359375, 'dpo_reward_mean_target': 0.2955474853515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0563], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0072780847549438, 'numerator': 0.1325586587190628, 'denominator': 0.13160085678100586}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2728729248046875, 'dpo_reward_mean_target': 0.2728729248046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0622], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13215386867523193, 'denominator': 0.13215386867523193}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.135009765625, 'dpo_reward_mean_target': 0.095916748046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1861], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9957355856895447, 'numerator': 0.132394477725029, 'denominator': 0.1329614818096161}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09165763854980469, 'dpo_reward_mean_target': -0.09165763854980469, 'standard deviation': 3.0, 'reward_a1': tensor([0.0926], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13273020088672638, 'denominator': 0.13273020088672638}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.48052978515625, 'dpo_reward_mean_target': -0.5098190307617188, 'standard deviation': 3.0, 'reward_a1': tensor([1.2561], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8695046901702881, 'numerator': 0.11182728409767151, 'denominator': 0.12861032783985138}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8041534423828125, 'dpo_reward_mean_target': 1.3480224609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2346], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9504212737083435, 'numerator': 0.12413057684898376, 'denominator': 0.13060584664344788}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8454742431640625, 'dpo_reward_mean_target': 0.06925201416015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.6284], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1088353395462036, 'numerator': 0.1306912899017334, 'denominator': 0.11786357313394547}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3443164825439453, 'dpo_reward_mean_target': -0.3443164825439453, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2315], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13288667798042297, 'denominator': 0.13288667798042297}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07414627075195312, 'dpo_reward_mean_target': 0.42984771728515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2698], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.975245475769043, 'numerator': 0.12941348552703857, 'denominator': 0.13269837200641632}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.021728515625, 'dpo_reward_mean_target': 0.021728515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0611], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296933472156525, 'denominator': 0.13296933472156525}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.012834548950195312, 'dpo_reward_mean_target': -0.012834548950195312, 'standard deviation': 3.0, 'reward_a1': tensor([0.1048], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13287851214408875, 'denominator': 0.13287851214408875}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11061859130859375, 'dpo_reward_mean_target': 0.1442108154296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2415], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9986239075660706, 'numerator': 0.13188612461090088, 'denominator': 0.13206785917282104}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09010696411132812, 'dpo_reward_mean_target': -0.09010696411132812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0178], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294216990470886, 'denominator': 0.13294216990470886}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.057373046875, 'dpo_reward_mean_target': -0.057373046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0298], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329246461391449, 'denominator': 0.1329246461391449}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.020887374877929688, 'dpo_reward_mean_target': 0.020887374877929688, 'standard deviation': 3.0, 'reward_a1': tensor([0.0209], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.020183563232421875, 'dpo_reward_mean_target': 0.009502410888671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2465], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9997252225875854, 'numerator': 0.13256660103797913, 'denominator': 0.13260303437709808}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06939697265625, 'dpo_reward_mean_target': 0.06939697265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2964], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13199590146541595, 'denominator': 0.13199590146541595}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08172035217285156, 'dpo_reward_mean_target': 0.08172035217285156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1539], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13257133960723877, 'denominator': 0.13257133960723877}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07818031311035156, 'dpo_reward_mean_target': -0.07818031311035156, 'standard deviation': 3.0, 'reward_a1': tensor([0.0985], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13275033235549927, 'denominator': 0.13275033235549927}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.01766204833984375, 'dpo_reward_mean_target': 0.12305450439453125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3354], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.993950605392456, 'numerator': 0.13143718242645264, 'denominator': 0.13223713636398315}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03225898742675781, 'dpo_reward_mean_target': -0.03225898742675781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0323], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.27269744873046875, 'dpo_reward_mean_target': -0.11871337890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2473], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0076087713241577, 'numerator': 0.13199454545974731, 'denominator': 0.13099780678749084}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12540054321289062, 'dpo_reward_mean_target': 0.12540054321289062, 'standard deviation': 3.0, 'reward_a1': tensor([0.3518], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326027363538742, 'denominator': 0.1326027363538742}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0069293975830078125, 'dpo_reward_mean_target': -0.1466846466064453, 'standard deviation': 3.0, 'reward_a1': tensor([0.0435], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9981334805488586, 'numerator': 0.13271380960941315, 'denominator': 0.13296198844909668}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.281982421875, 'dpo_reward_mean_target': 0.4312095642089844, 'standard deviation': 3.0, 'reward_a1': tensor([-1.4071], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9711798429489136, 'numerator': 0.11021798849105835, 'denominator': 0.11348874866962433}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23235321044921875, 'dpo_reward_mean_target': -0.1897125244140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0013405084609985, 'numerator': 0.1324767917394638, 'denominator': 0.13229943811893463}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.009016036987304688, 'dpo_reward_mean_target': 0.009016036987304688, 'standard deviation': 3.0, 'reward_a1': tensor([0.1518], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283029198646545, 'denominator': 0.13283029198646545}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.39359283447265625, 'dpo_reward_mean_target': -0.12030029296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0294], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0095255374908447, 'numerator': 0.13291969895362854, 'denominator': 0.13166551291942596}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1020965576171875, 'dpo_reward_mean_target': 0.09771156311035156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7421], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9837066531181335, 'numerator': 0.1278706043958664, 'denominator': 0.12998855113983154}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2120037078857422, 'dpo_reward_mean_target': -0.2120037078857422, 'standard deviation': 3.0, 'reward_a1': tensor([0.5069], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12921680510044098, 'denominator': 0.12921680510044098}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07961273193359375, 'dpo_reward_mean_target': -0.31237030029296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1942], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.986562192440033, 'numerator': 0.13109803199768066, 'denominator': 0.1328836977481842}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5447845458984375, 'dpo_reward_mean_target': 0.8287811279296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4257], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9917960166931152, 'numerator': 0.13178595900535583, 'denominator': 0.13287606835365295}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.170257568359375, 'dpo_reward_mean_target': 0.170257568359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1112], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295504450798035, 'denominator': 0.13295504450798035}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.669708251953125, 'dpo_reward_mean_target': 0.4690704345703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.9171], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1373909711837769, 'numerator': 0.13150595128536224, 'denominator': 0.11562070995569229}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3261566162109375, 'dpo_reward_mean_target': 0.34613037109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2736], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0198894739151, 'numerator': 0.13294194638729095, 'denominator': 0.13034936785697937}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2929363250732422, 'dpo_reward_mean_target': 0.2929363250732422, 'standard deviation': 3.0, 'reward_a1': tensor([0.3912], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13290943205356598, 'denominator': 0.13290943205356598}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21143722534179688, 'dpo_reward_mean_target': 0.21143722534179688, 'standard deviation': 3.0, 'reward_a1': tensor([0.3067], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132913738489151, 'denominator': 0.132913738489151}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.029722213745117188, 'dpo_reward_mean_target': 0.029722213745117188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0297], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.035686492919921875, 'dpo_reward_mean_target': 0.13167762756347656, 'standard deviation': 3.0, 'reward_a1': tensor([0.2665], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0019514560699463, 'numerator': 0.13284660875797272, 'denominator': 0.13258786499500275}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.021026611328125, 'dpo_reward_mean_target': 0.717041015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.1212], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.059897541999817, 'numerator': 0.13177919387817383, 'denominator': 0.1243320107460022}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.19244384765625, 'dpo_reward_mean_target': 0.15256500244140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0004], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0007481575012207, 'numerator': 0.13280794024467468, 'denominator': 0.13270865380764008}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03631782531738281, 'dpo_reward_mean_target': 0.03631782531738281, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5982], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13003931939601898, 'denominator': 0.13003931939601898}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2130584716796875, 'dpo_reward_mean_target': 0.1252288818359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2331], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9993762969970703, 'numerator': 0.1328948587179184, 'denominator': 0.13297779858112335}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16926193237304688, 'dpo_reward_mean_target': 0.5204353332519531, 'standard deviation': 3.0, 'reward_a1': tensor([0.2544], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0060564279556274, 'numerator': 0.13245882093906403, 'denominator': 0.13166143000125885}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1927337646484375, 'dpo_reward_mean_target': 0.4271240234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4290], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0217061042785645, 'numerator': 0.13298074901103973, 'denominator': 0.13015557825565338}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.048580169677734375, 'dpo_reward_mean_target': 0.048580169677734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0962], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296401500701904, 'denominator': 0.13296401500701904}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3017120361328125, 'dpo_reward_mean_target': 0.0814056396484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0490], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0034958124160767, 'numerator': 0.13297301530838013, 'denominator': 0.13250978291034698}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08979415893554688, 'dpo_reward_mean_target': 0.3500251770019531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4590], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9805614352226257, 'numerator': 0.1282321959733963, 'denominator': 0.13077425956726074}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13235092163085938, 'dpo_reward_mean_target': 0.33045196533203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3541], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0027042627334595, 'numerator': 0.13297662138938904, 'denominator': 0.1326179951429367}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.43747711181640625, 'dpo_reward_mean_target': 0.1438751220703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.6091], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0500410795211792, 'numerator': 0.13139115273952484, 'denominator': 0.12512953579425812}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15821075439453125, 'dpo_reward_mean_target': -0.6362419128417969, 'standard deviation': 3.0, 'reward_a1': tensor([-1.0677], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0362564325332642, 'numerator': 0.13161225616931915, 'denominator': 0.12700742483139038}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4778594970703125, 'dpo_reward_mean_target': 0.5825347900390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1797], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9959312677383423, 'numerator': 0.13178704679012299, 'denominator': 0.1323254406452179}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1881256103515625, 'dpo_reward_mean_target': -0.1334991455078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2525], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0100486278533936, 'numerator': 0.1328762322664261, 'denominator': 0.13155429065227509}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03737926483154297, 'dpo_reward_mean_target': 0.03737926483154297, 'standard deviation': 3.0, 'reward_a1': tensor([0.0374], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05948448181152344, 'dpo_reward_mean_target': -0.1859264373779297, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6247], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0154294967651367, 'numerator': 0.13156577944755554, 'denominator': 0.12956662476062775}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07721900939941406, 'dpo_reward_mean_target': -0.07721900939941406, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0881], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329798549413681, 'denominator': 0.1329798549413681}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5982475280761719, 'dpo_reward_mean_target': -0.436279296875, 'standard deviation': 3.0, 'reward_a1': tensor([-1.3220], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.985621452331543, 'numerator': 0.12730911374092102, 'denominator': 0.12916633486747742}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11383056640625, 'dpo_reward_mean_target': 0.15673828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.6099], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017846941947937, 'numerator': 0.13147249817848206, 'denominator': 0.12916725873947144}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05945587158203125, 'dpo_reward_mean_target': 0.19298553466796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8818], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0112736225128174, 'numerator': 0.12952090799808502, 'denominator': 0.12807701528072357}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11017990112304688, 'dpo_reward_mean_target': 0.11017990112304688, 'standard deviation': 3.0, 'reward_a1': tensor([0.0826], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297513127326965, 'denominator': 0.13297513127326965}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20737075805664062, 'dpo_reward_mean_target': 0.20737075805664062, 'standard deviation': 3.0, 'reward_a1': tensor([0.3239], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328805685043335, 'denominator': 0.1328805685043335}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14406585693359375, 'dpo_reward_mean_target': 0.02374267578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.4440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.009445309638977, 'numerator': 0.13168205320835114, 'denominator': 0.13044990599155426}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04676055908203125, 'dpo_reward_mean_target': -0.04676055908203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2399], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13237519562244415, 'denominator': 0.13237519562244415}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11025238037109375, 'dpo_reward_mean_target': -0.21355628967285156, 'standard deviation': 3.0, 'reward_a1': tensor([0.3454], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9858161807060242, 'numerator': 0.1306924968957901, 'denominator': 0.13257288932800293}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03394317626953125, 'dpo_reward_mean_target': 0.0339813232421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1998], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015087127685547, 'numerator': 0.13277789950370789, 'denominator': 0.13257788121700287}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.050262451171875, 'dpo_reward_mean_target': 0.167388916015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8719], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0099807977676392, 'numerator': 0.1293635219335556, 'denominator': 0.12808513641357422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5082015991210938, 'dpo_reward_mean_target': -0.5082015991210938, 'standard deviation': 3.0, 'reward_a1': tensor([0.1747], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12958015501499176, 'denominator': 0.12958015501499176}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.357696533203125, 'dpo_reward_mean_target': -0.38327789306640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2837], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0225582122802734, 'numerator': 0.13290759921073914, 'denominator': 0.1299755871295929}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.069427490234375, 'dpo_reward_mean_target': -0.069427490234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0209], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292048871517181, 'denominator': 0.13292048871517181}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11977005004882812, 'dpo_reward_mean_target': 0.06851577758789062, 'standard deviation': 3.0, 'reward_a1': tensor([0.6659], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0145719051361084, 'numerator': 0.13037042319774628, 'denominator': 0.12849795818328857}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01770782470703125, 'dpo_reward_mean_target': -0.28185272216796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1909], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9893060326576233, 'numerator': 0.13133950531482697, 'denominator': 0.132759228348732}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16337203979492188, 'dpo_reward_mean_target': 0.16337203979492188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0165], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327420324087143, 'denominator': 0.1327420324087143}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14617919921875, 'dpo_reward_mean_target': -0.5727691650390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4153], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9510292410850525, 'numerator': 0.12596093118190765, 'denominator': 0.13244695961475372}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4758453369140625, 'dpo_reward_mean_target': 0.0246734619140625, 'standard deviation': 3.0, 'reward_a1': tensor([-1.1037], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.952338457107544, 'numerator': 0.12389938533306122, 'denominator': 0.13010016083717346}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1173248291015625, 'dpo_reward_mean_target': 0.15324783325195312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1561], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9988372921943665, 'numerator': 0.13227562606334686, 'denominator': 0.1324295997619629}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16413116455078125, 'dpo_reward_mean_target': 0.2610664367675781, 'standard deviation': 3.0, 'reward_a1': tensor([0.6058], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0042442083358765, 'numerator': 0.13210558891296387, 'denominator': 0.13154727220535278}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.172454833984375, 'dpo_reward_mean_target': 0.184783935546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4118], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0162327289581299, 'numerator': 0.13260045647621155, 'denominator': 0.13048237562179565}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05920600891113281, 'dpo_reward_mean_target': 0.092620849609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2070], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9962334036827087, 'numerator': 0.13231925666332245, 'denominator': 0.1328195333480835}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06459236145019531, 'dpo_reward_mean_target': 0.06459236145019531, 'standard deviation': 3.0, 'reward_a1': tensor([0.1990], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13284745812416077, 'denominator': 0.13284745812416077}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01137542724609375, 'dpo_reward_mean_target': 0.01137542724609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0817], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13291676342487335, 'denominator': 0.13291676342487335}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.028863906860351562, 'dpo_reward_mean_target': 0.028863906860351562, 'standard deviation': 3.0, 'reward_a1': tensor([0.1420], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13288630545139313, 'denominator': 0.13288630545139313}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0709991455078125, 'dpo_reward_mean_target': 0.34990692138671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5510], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0194330215454102, 'numerator': 0.13268235325813293, 'denominator': 0.130153089761734}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20555686950683594, 'dpo_reward_mean_target': 0.20555686950683594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1449], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13207636773586273, 'denominator': 0.13207636773586273}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01824188232421875, 'dpo_reward_mean_target': 0.001972198486328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0443], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999382495880127, 'numerator': 0.13296756148338318, 'denominator': 0.132975772023201}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01700592041015625, 'dpo_reward_mean_target': -0.1046295166015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4879], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9928403496742249, 'numerator': 0.1304124891757965, 'denominator': 0.13135293126106262}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7215576171875, 'dpo_reward_mean_target': -0.22161865234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0479600429534912, 'numerator': 0.130404993891716, 'denominator': 0.12443699687719345}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17925071716308594, 'dpo_reward_mean_target': -0.17925071716308594, 'standard deviation': 3.0, 'reward_a1': tensor([0.0715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132517009973526, 'denominator': 0.132517009973526}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13322830200195312, 'dpo_reward_mean_target': -0.15509605407714844, 'standard deviation': 3.0, 'reward_a1': tensor([0.2770], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.990818977355957, 'numerator': 0.13160870969295502, 'denominator': 0.13282820582389832}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2348175048828125, 'dpo_reward_mean_target': 0.2348175048828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0937], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1321856528520584, 'denominator': 0.1321856528520584}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.22454833984375, 'dpo_reward_mean_target': 0.27850341796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.6057], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.914124071598053, 'numerator': 0.1205836832523346, 'denominator': 0.1319117248058319}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25745391845703125, 'dpo_reward_mean_target': 0.047210693359375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0922], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0057278871536255, 'numerator': 0.13283732533454895, 'denominator': 0.1320807784795761}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16527938842773438, 'dpo_reward_mean_target': -0.16527938842773438, 'standard deviation': 3.0, 'reward_a1': tensor([0.6760], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12785328924655914, 'denominator': 0.12785328924655914}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0760650634765625, 'dpo_reward_mean_target': 0.17325592041015625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0270], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983633756637573, 'numerator': 0.1326848268508911, 'denominator': 0.1329023391008377}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3461761474609375, 'dpo_reward_mean_target': -0.41347503662109375, 'standard deviation': 3.0, 'reward_a1': tensor([1.1515], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9048103094100952, 'numerator': 0.11606471240520477, 'denominator': 0.12827518582344055}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2210063934326172, 'dpo_reward_mean_target': 0.2210063934326172, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1583], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13192220032215118, 'denominator': 0.13192220032215118}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1978302001953125, 'dpo_reward_mean_target': 0.22259140014648438, 'standard deviation': 3.0, 'reward_a1': tensor([0.1905], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.008357048034668, 'numerator': 0.13297317922115326, 'denominator': 0.13187111914157867}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0196380615234375, 'dpo_reward_mean_target': 0.0196380615234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2733], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13234812021255493, 'denominator': 0.13234812021255493}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00122833251953125, 'dpo_reward_mean_target': -0.00122833251953125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0012], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.028947830200195312, 'dpo_reward_mean_target': 0.049224853515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3495], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9968806505203247, 'numerator': 0.1318112462759018, 'denominator': 0.1322236955165863}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.066009521484375, 'dpo_reward_mean_target': 0.19974517822265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7020], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.023589849472046, 'numerator': 0.11731167882680893, 'denominator': 0.11460808664560318}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.397003173828125, 'dpo_reward_mean_target': -0.0952301025390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1202], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0042290687561035, 'numerator': 0.13297611474990845, 'denominator': 0.13241611421108246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04503631591796875, 'dpo_reward_mean_target': -0.04503631591796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1258], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293252885341644, 'denominator': 0.13293252885341644}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14281082153320312, 'dpo_reward_mean_target': -0.20097732543945312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1380], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9997805953025818, 'numerator': 0.13295142352581024, 'denominator': 0.1329805999994278}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03956413269042969, 'dpo_reward_mean_target': -0.03956413269042969, 'standard deviation': 3.0, 'reward_a1': tensor([0.0570], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13291187584400177, 'denominator': 0.13291187584400177}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0680999755859375, 'dpo_reward_mean_target': -0.06231689453125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0729], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999949336051941, 'numerator': 0.13297992944717407, 'denominator': 0.1329805999994278}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.37286376953125, 'dpo_reward_mean_target': 0.5801620483398438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2936], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9824187755584717, 'numerator': 0.12745875120162964, 'denominator': 0.12973973155021667}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.41105079650878906, 'dpo_reward_mean_target': -0.41105079650878906, 'standard deviation': 3.0, 'reward_a1': tensor([0.0522], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13140496611595154, 'denominator': 0.13140496611595154}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2193145751953125, 'dpo_reward_mean_target': -0.29571533203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1245], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9907353520393372, 'numerator': 0.13168290257453918, 'denominator': 0.13291430473327637}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10958099365234375, 'dpo_reward_mean_target': -0.2400684356689453, 'standard deviation': 3.0, 'reward_a1': tensor([0.1738], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9949575662612915, 'numerator': 0.13172107934951782, 'denominator': 0.132388636469841}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04858207702636719, 'dpo_reward_mean_target': -0.04858207702636719, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5419], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13119517266750336, 'denominator': 0.13119517266750336}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18193435668945312, 'dpo_reward_mean_target': 0.22080612182617188, 'standard deviation': 3.0, 'reward_a1': tensor([1.4761], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0055208206176758, 'numerator': 0.12183474749326706, 'denominator': 0.12116581946611404}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.583526611328125, 'dpo_reward_mean_target': 0.44268798828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1833], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0051742792129517, 'numerator': 0.13248470425605774, 'denominator': 0.1318027228116989}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0207061767578125, 'dpo_reward_mean_target': 0.0207061767578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0207], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.270050048828125, 'dpo_reward_mean_target': -0.70489501953125, 'standard deviation': 3.0, 'reward_a1': tensor([1.4930], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8308634161949158, 'numerator': 0.10167888551950455, 'denominator': 0.12237738072872162}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12187004089355469, 'dpo_reward_mean_target': 0.135040283203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1361], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9996128678321838, 'numerator': 0.1324387490749359, 'denominator': 0.13249003887176514}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03371429443359375, 'dpo_reward_mean_target': 0.1873760223388672, 'standard deviation': 3.0, 'reward_a1': tensor([0.1127], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000371932983398, 'numerator': 0.13293960690498352, 'denominator': 0.13293465971946716}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06537628173828125, 'dpo_reward_mean_target': -0.06537628173828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0192], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296499848365784, 'denominator': 0.13296499848365784}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.47292518615722656, 'dpo_reward_mean_target': 0.47292518615722656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2521], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12915311753749847, 'denominator': 0.12915311753749847}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.010440826416015625, 'dpo_reward_mean_target': -0.010440826416015625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1059], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13291342556476593, 'denominator': 0.13291342556476593}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02202606201171875, 'dpo_reward_mean_target': 0.02202606201171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0730], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296154141426086, 'denominator': 0.13296154141426086}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1334075927734375, 'dpo_reward_mean_target': -0.012451171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5575], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9919766187667847, 'numerator': 0.13060207664966583, 'denominator': 0.1316584199666977}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.24462127685546875, 'dpo_reward_mean_target': 0.24462127685546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3839], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283757865428925, 'denominator': 0.13283757865428925}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21125030517578125, 'dpo_reward_mean_target': -0.022533416748046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4601], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9928297400474548, 'numerator': 0.13157391548156738, 'denominator': 0.13252414762973785}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2619209289550781, 'dpo_reward_mean_target': 0.22049522399902344, 'standard deviation': 3.0, 'reward_a1': tensor([0.0359], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0030385255813599, 'numerator': 0.13272921741008759, 'denominator': 0.132327139377594}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05049896240234375, 'dpo_reward_mean_target': 0.1467132568359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0877], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9998831152915955, 'numerator': 0.13295499980449677, 'denominator': 0.13297054171562195}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0697021484375, 'dpo_reward_mean_target': 0.053863525390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4207], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0059022903442383, 'numerator': 0.1319902092218399, 'denominator': 0.13121573626995087}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16159439086914062, 'dpo_reward_mean_target': 0.16159439086914062, 'standard deviation': 3.0, 'reward_a1': tensor([0.2274], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329488307237625, 'denominator': 0.1329488307237625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.26650238037109375, 'dpo_reward_mean_target': -0.26650238037109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2654], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07199478149414062, 'dpo_reward_mean_target': 0.4110374450683594, 'standard deviation': 3.0, 'reward_a1': tensor([0.2238], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9993319511413574, 'numerator': 0.132721945643425, 'denominator': 0.13281066715717316}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08870887756347656, 'dpo_reward_mean_target': 0.08870887756347656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0240], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328868716955185, 'denominator': 0.1328868716955185}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09326171875, 'dpo_reward_mean_target': 1.029571533203125, 'standard deviation': 3.0, 'reward_a1': tensor([2.3589], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2660349607467651, 'numerator': 0.12054568529129028, 'denominator': 0.09521513432264328}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1055450439453125, 'dpo_reward_mean_target': 0.52435302734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.2496], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0755201578140259, 'numerator': 0.12915074825286865, 'denominator': 0.12008213251829147}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4022979736328125, 'dpo_reward_mean_target': -0.3455810546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.9258], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9281341433525085, 'numerator': 0.12155875563621521, 'denominator': 0.13097110390663147}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0771484375, 'dpo_reward_mean_target': -0.556427001953125, 'standard deviation': 3.0, 'reward_a1': tensor([4.3766], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7225492000579834, 'numerator': 0.03440771624445915, 'denominator': 0.04761989414691925}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5730819702148438, 'dpo_reward_mean_target': -0.6361846923828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0280], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9955740571022034, 'numerator': 0.1297612339258194, 'denominator': 0.13033810257911682}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11893653869628906, 'dpo_reward_mean_target': -0.0619964599609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1224], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9997979402542114, 'numerator': 0.13295380771160126, 'denominator': 0.13298067450523376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.41527557373046875, 'dpo_reward_mean_target': -0.25346946716308594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0830], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0045297145843506, 'numerator': 0.1327662169933319, 'denominator': 0.1321675330400467}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.011304855346679688, 'dpo_reward_mean_target': 0.011304855346679688, 'standard deviation': 3.0, 'reward_a1': tensor([0.0625], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296137750148773, 'denominator': 0.13296137750148773}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07668113708496094, 'dpo_reward_mean_target': -0.07668113708496094, 'standard deviation': 3.0, 'reward_a1': tensor([0.0957], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327614039182663, 'denominator': 0.1327614039182663}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13562774658203125, 'dpo_reward_mean_target': 0.25089263916015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0589], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0439379215240479, 'numerator': 0.12824423611164093, 'denominator': 0.12284661829471588}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.31870269775390625, 'dpo_reward_mean_target': 0.25048828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3548], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.004858374595642, 'numerator': 0.13030104339122772, 'denominator': 0.12967105209827423}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.411895751953125, 'dpo_reward_mean_target': 0.5500869750976562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0785], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9914457201957703, 'numerator': 0.1300932765007019, 'denominator': 0.13121573626995087}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05667877197265625, 'dpo_reward_mean_target': -0.053829193115234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3558], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0001299381256104, 'numerator': 0.13174693286418915, 'denominator': 0.13172981142997742}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.203033447265625, 'dpo_reward_mean_target': 0.377471923828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1143], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9965959787368774, 'numerator': 0.13247013092041016, 'denominator': 0.13292260468006134}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.15941619873046875, 'dpo_reward_mean_target': 0.15941619873046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4267], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13046681880950928, 'denominator': 0.13046681880950928}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11023521423339844, 'dpo_reward_mean_target': 0.11023521423339844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1063], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13263475894927979, 'denominator': 0.13263475894927979}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.037445068359375, 'dpo_reward_mean_target': 0.3785591125488281, 'standard deviation': 3.0, 'reward_a1': tensor([0.6493], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0168648958206177, 'numerator': 0.1324404776096344, 'denominator': 0.1302439272403717}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.060787200927734375, 'dpo_reward_mean_target': -0.060787200927734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13255341351032257, 'denominator': 0.13255341351032257}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0081939697265625, 'dpo_reward_mean_target': -0.0081939697265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6576], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1299012452363968, 'denominator': 0.1299012452363968}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3823089599609375, 'dpo_reward_mean_target': -0.063385009765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6410], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0404373407363892, 'numerator': 0.13053841888904572, 'denominator': 0.12546494603157043}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19631385803222656, 'dpo_reward_mean_target': 0.10456657409667969, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0546], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0020925998687744, 'numerator': 0.13279369473457336, 'denominator': 0.13251638412475586}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03023529052734375, 'dpo_reward_mean_target': -0.2961883544921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4690], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9814913868904114, 'numerator': 0.12872475385665894, 'denominator': 0.1311521977186203}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21858882904052734, 'dpo_reward_mean_target': 0.21858882904052734, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1153], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13215947151184082, 'denominator': 0.13215947151184082}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.004150390625, 'dpo_reward_mean_target': -0.4031524658203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1910], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9826570153236389, 'numerator': 0.13039834797382355, 'denominator': 0.13269975781440735}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.263763427734375, 'dpo_reward_mean_target': 0.263763427734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2638], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0797281265258789, 'dpo_reward_mean_target': -0.0797281265258789, 'standard deviation': 3.0, 'reward_a1': tensor([0.0212], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13290546834468842, 'denominator': 0.13290546834468842}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.33979034423828125, 'dpo_reward_mean_target': 0.1358642578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.4025], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0827597379684448, 'numerator': 0.1216403990983963, 'denominator': 0.11234293133020401}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2822074890136719, 'dpo_reward_mean_target': 0.2822074890136719, 'standard deviation': 3.0, 'reward_a1': tensor([0.2876], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329805552959442, 'denominator': 0.1329805552959442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.01068878173828125, 'dpo_reward_mean_target': 0.034152984619140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0031], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999572038650513, 'numerator': 0.13297365605831146, 'denominator': 0.1329793483018875}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3207969665527344, 'dpo_reward_mean_target': -0.3207969665527344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3919], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294340670108795, 'denominator': 0.13294340670108795}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15326690673828125, 'dpo_reward_mean_target': 0.026166915893554688, 'standard deviation': 3.0, 'reward_a1': tensor([0.3078], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0074301958084106, 'numerator': 0.13239622116088867, 'denominator': 0.13141974806785583}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22759437561035156, 'dpo_reward_mean_target': 0.22759437561035156, 'standard deviation': 3.0, 'reward_a1': tensor([0.6217], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13183802366256714, 'denominator': 0.13183802366256714}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.057102203369140625, 'dpo_reward_mean_target': 0.38486480712890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4021], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9775633215904236, 'numerator': 0.12848301231861115, 'denominator': 0.13143190741539001}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.032772064208984375, 'dpo_reward_mean_target': -0.032772064208984375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0328], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.24468612670898438, 'dpo_reward_mean_target': -0.24468612670898438, 'standard deviation': 3.0, 'reward_a1': tensor([0.1947], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13156193494796753, 'denominator': 0.13156193494796753}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8694610595703125, 'dpo_reward_mean_target': 0.7450027465820312, 'standard deviation': 3.0, 'reward_a1': tensor([1.1263], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2376468181610107, 'numerator': 0.1319107562303543, 'denominator': 0.1065819039940834}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3290081024169922, 'dpo_reward_mean_target': -0.12850570678710938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0800], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0092030763626099, 'numerator': 0.1329633742570877, 'denominator': 0.13175086677074432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.27176666259765625, 'dpo_reward_mean_target': 0.204925537109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1178], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000895619392395, 'numerator': 0.13292473554611206, 'denominator': 0.13280579447746277}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03936195373535156, 'dpo_reward_mean_target': 0.03936195373535156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3948], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13159552216529846, 'denominator': 0.13159552216529846}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09222602844238281, 'dpo_reward_mean_target': 0.09222602844238281, 'standard deviation': 3.0, 'reward_a1': tensor([0.0211], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294337689876556, 'denominator': 0.13294337689876556}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10858917236328125, 'dpo_reward_mean_target': -0.11917877197265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0644], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9732926487922668, 'numerator': 0.12302456051111221, 'denominator': 0.12640038132667542}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14981460571289062, 'dpo_reward_mean_target': -0.14981460571289062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0334], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13288073241710663, 'denominator': 0.13288073241710663}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0158538818359375, 'dpo_reward_mean_target': 0.19448089599609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1028], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9958814382553101, 'numerator': 0.13232959806919098, 'denominator': 0.13287685811519623}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.28775787353515625, 'dpo_reward_mean_target': -0.27355194091796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6949], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9993464350700378, 'numerator': 0.13167589902877808, 'denominator': 0.13176201283931732}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1270275115966797, 'dpo_reward_mean_target': 0.1270275115966797, 'standard deviation': 3.0, 'reward_a1': tensor([0.1060], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297751545906067, 'denominator': 0.13297751545906067}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1143798828125, 'dpo_reward_mean_target': 0.8507308959960938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2136], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9446329474449158, 'numerator': 0.12486941367387772, 'denominator': 0.13218829035758972}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3116569519042969, 'dpo_reward_mean_target': 0.3116569519042969, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3091], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13016383349895477, 'denominator': 0.13016383349895477}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4311027526855469, 'dpo_reward_mean_target': -0.1252269744873047, 'standard deviation': 3.0, 'reward_a1': tensor([0.1847], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0158543586730957, 'numerator': 0.13227321207523346, 'denominator': 0.1302088350057602}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2542304992675781, 'dpo_reward_mean_target': -0.202362060546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1142], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9955336451530457, 'numerator': 0.1322426199913025, 'denominator': 0.13283590972423553}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0980682373046875, 'dpo_reward_mean_target': -0.0444183349609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.5711], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9914193749427795, 'numerator': 0.13021069765090942, 'denominator': 0.13133765757083893}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.034099578857421875, 'dpo_reward_mean_target': 0.034099578857421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1256], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327924281358719, 'denominator': 0.1327924281358719}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.041229248046875, 'dpo_reward_mean_target': 0.7715072631835938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3992], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9333077073097229, 'numerator': 0.12323169410228729, 'denominator': 0.13203758001327515}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.026235580444335938, 'dpo_reward_mean_target': -0.11138343811035156, 'standard deviation': 3.0, 'reward_a1': tensor([0.0537], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.998529314994812, 'numerator': 0.13277964293956757, 'denominator': 0.13297520577907562}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02176666259765625, 'dpo_reward_mean_target': 0.02176666259765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0218], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.040798187255859375, 'dpo_reward_mean_target': -0.17246246337890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3163], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0059536695480347, 'numerator': 0.13282789289951324, 'denominator': 0.13204176723957062}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.056224822998046875, 'dpo_reward_mean_target': 0.4981575012207031, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0104], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9858500361442566, 'numerator': 0.13108380138874054, 'denominator': 0.13296525180339813}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12622451782226562, 'dpo_reward_mean_target': 0.4580955505371094, 'standard deviation': 3.0, 'reward_a1': tensor([0.5688], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.010252594947815, 'numerator': 0.13289028406143188, 'denominator': 0.1315416395664215}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11789703369140625, 'dpo_reward_mean_target': 0.04009246826171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3627], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9975503087043762, 'numerator': 0.13221408426761627, 'denominator': 0.13253876566886902}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23065185546875, 'dpo_reward_mean_target': -0.3909645080566406, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2318], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9985929131507874, 'numerator': 0.1327936351299286, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0929718017578125, 'dpo_reward_mean_target': -0.246124267578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.9130], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9817470908164978, 'numerator': 0.1234162300825119, 'denominator': 0.12571081519126892}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2448883056640625, 'dpo_reward_mean_target': 0.38488006591796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6432], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0051192045211792, 'numerator': 0.1324888914823532, 'denominator': 0.131814107298851}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.060672760009765625, 'dpo_reward_mean_target': -0.026506423950195312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1088], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9997523427009583, 'numerator': 0.132930725812912, 'denominator': 0.1329636573791504}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.037545204162597656, 'dpo_reward_mean_target': -0.037545204162597656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13254772126674652, 'denominator': 0.13254772126674652}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09033203125, 'dpo_reward_mean_target': -0.451690673828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0085], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.989514946937561, 'numerator': 0.13153749704360962, 'denominator': 0.13293129205703735}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.29132080078125, 'dpo_reward_mean_target': 0.39263916015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1053], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9973394274711609, 'numerator': 0.13237226009368896, 'denominator': 0.13272538781166077}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.022762298583984375, 'dpo_reward_mean_target': 0.09813690185546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0518], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.998798131942749, 'numerator': 0.13281472027301788, 'denominator': 0.1329745352268219}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0063648223876953125, 'dpo_reward_mean_target': -0.0063648223876953125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1056], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13290803134441376, 'denominator': 0.13290803134441376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12388038635253906, 'dpo_reward_mean_target': -0.25732994079589844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5962], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0226809978485107, 'numerator': 0.13213501870632172, 'denominator': 0.12920452654361725}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1212921142578125, 'dpo_reward_mean_target': -0.114501953125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1817], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0009808540344238, 'numerator': 0.1211298331618309, 'denominator': 0.12101113796234131}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21941757202148438, 'dpo_reward_mean_target': -0.22096633911132812, 'standard deviation': 3.0, 'reward_a1': tensor([0.1268], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9937762022018433, 'numerator': 0.13209016621112823, 'denominator': 0.13291741907596588}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.29892539978027344, 'dpo_reward_mean_target': -0.02826690673828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6354], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9859113693237305, 'numerator': 0.13028515875339508, 'denominator': 0.1321469247341156}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4597930908203125, 'dpo_reward_mean_target': 0.3775482177734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.8105], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9964258670806885, 'numerator': 0.1316032111644745, 'denominator': 0.1320752650499344}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3744354248046875, 'dpo_reward_mean_target': 0.19075393676757812, 'standard deviation': 3.0, 'reward_a1': tensor([1.2893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0906050205230713, 'numerator': 0.12435777485370636, 'denominator': 0.11402641236782074}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06662178039550781, 'dpo_reward_mean_target': 0.06662178039550781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2017], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13244979083538055, 'denominator': 0.13244979083538055}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.013151168823242188, 'dpo_reward_mean_target': -0.013151168823242188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0627], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296261429786682, 'denominator': 0.13296261429786682}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2012195587158203, 'dpo_reward_mean_target': 0.2012195587158203, 'standard deviation': 3.0, 'reward_a1': tensor([0.2425], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296820223331451, 'denominator': 0.13296820223331451}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05075263977050781, 'dpo_reward_mean_target': -0.05075263977050781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1409], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292071223258972, 'denominator': 0.13292071223258972}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03700828552246094, 'dpo_reward_mean_target': -0.03700828552246094, 'standard deviation': 3.0, 'reward_a1': tensor([0.1276], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13278083503246307, 'denominator': 0.13278083503246307}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.031951904296875, 'dpo_reward_mean_target': -0.4082183837890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4166], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9708568453788757, 'numerator': 0.12804840505123138, 'denominator': 0.1318921595811844}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.035373687744140625, 'dpo_reward_mean_target': 0.035373687744140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0966], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13285225629806519, 'denominator': 0.13285225629806519}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09542083740234375, 'dpo_reward_mean_target': 0.0402374267578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1628], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9979643225669861, 'numerator': 0.1326766014099121, 'denominator': 0.13294723629951477}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4401206970214844, 'dpo_reward_mean_target': 0.4401206970214844, 'standard deviation': 3.0, 'reward_a1': tensor([0.4317], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298024237155914, 'denominator': 0.13298024237155914}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08775520324707031, 'dpo_reward_mean_target': 0.030979156494140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4314], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9946969151496887, 'numerator': 0.1314103603363037, 'denominator': 0.13211095333099365}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.27080345153808594, 'dpo_reward_mean_target': 0.27080345153808594, 'standard deviation': 3.0, 'reward_a1': tensor([0.5000], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13259336352348328, 'denominator': 0.13259336352348328}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0053882598876953125, 'dpo_reward_mean_target': 0.0053882598876953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0055], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8389701843261719, 'dpo_reward_mean_target': -0.4051017761230469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7613], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9933071732521057, 'numerator': 0.13204652070999146, 'denominator': 0.1329362392425537}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3845672607421875, 'dpo_reward_mean_target': 0.7012481689453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1404], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9859362840652466, 'numerator': 0.13067695498466492, 'denominator': 0.1325409710407257}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10067939758300781, 'dpo_reward_mean_target': 0.05809783935546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5137], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0094830989837646, 'numerator': 0.1314559131860733, 'denominator': 0.13022100925445557}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.017353057861328125, 'dpo_reward_mean_target': 0.017353057861328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1602], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283003866672516, 'denominator': 0.13283003866672516}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.025726318359375, 'dpo_reward_mean_target': 0.025726318359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0257], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10481834411621094, 'dpo_reward_mean_target': 0.10481834411621094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0149], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132874995470047, 'denominator': 0.132874995470047}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.012273788452148438, 'dpo_reward_mean_target': -0.012273788452148438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0173], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329805552959442, 'denominator': 0.1329805552959442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02245330810546875, 'dpo_reward_mean_target': 0.02245330810546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2243], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13253186643123627, 'denominator': 0.13253186643123627}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08746147155761719, 'dpo_reward_mean_target': 0.3638172149658203, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1268], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9892369508743286, 'numerator': 0.1312144249677658, 'denominator': 0.13264206051826477}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18048095703125, 'dpo_reward_mean_target': 0.011749267578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0117], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0020244121551514, 'numerator': 0.132976695895195, 'denominator': 0.13270804286003113}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11211013793945312, 'dpo_reward_mean_target': 0.024930953979492188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0366], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0010188817977905, 'numerator': 0.13295277953147888, 'denominator': 0.13281744718551636}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.056842803955078125, 'dpo_reward_mean_target': 0.0676422119140625, 'standard deviation': 3.0, 'reward_a1': tensor([2.1849], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0025503635406494, 'numerator': 0.10366524010896683, 'denominator': 0.1034015342593193}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.3405094146728516, 'dpo_reward_mean_target': 0.5624580383300781, 'standard deviation': 3.0, 'reward_a1': tensor([1.3061], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4310622215270996, 'numerator': 0.12895746529102325, 'denominator': 0.09011311084032059}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.458099365234375, 'dpo_reward_mean_target': 0.458099365234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0842], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13195210695266724, 'denominator': 0.13195210695266724}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.052509307861328125, 'dpo_reward_mean_target': 0.052509307861328125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1333], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1246248185634613, 'denominator': 0.1246248185634613}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.41378021240234375, 'dpo_reward_mean_target': 0.42691802978515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1193], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9995608925819397, 'numerator': 0.13228347897529602, 'denominator': 0.13234159350395203}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.24343490600585938, 'dpo_reward_mean_target': 0.0338134765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0920], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0010855197906494, 'numerator': 0.13295571506023407, 'denominator': 0.1328115463256836}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6763381958007812, 'dpo_reward_mean_target': 0.38156890869140625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0922], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9817225337028503, 'numerator': 0.12930215895175934, 'denominator': 0.13170947134494781}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4084014892578125, 'dpo_reward_mean_target': 0.3812408447265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8361], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9986690878868103, 'numerator': 0.1314612478017807, 'denominator': 0.13163644075393677}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04049110412597656, 'dpo_reward_mean_target': -0.04049110412597656, 'standard deviation': 3.0, 'reward_a1': tensor([1.1598], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1227526068687439, 'denominator': 0.1227526068687439}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15613937377929688, 'dpo_reward_mean_target': -0.40737152099609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9065], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017592430114746, 'numerator': 0.13115294277668, 'denominator': 0.12888553738594055}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.39801025390625, 'dpo_reward_mean_target': 1.0572662353515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4247], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9780505895614624, 'numerator': 0.13005678355693817, 'denominator': 0.1329755187034607}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13401412963867188, 'dpo_reward_mean_target': 0.15141677856445312, 'standard deviation': 3.0, 'reward_a1': tensor([0.2103], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0064140558242798, 'numerator': 0.1329551339149475, 'denominator': 0.13210779428482056}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4745979309082031, 'dpo_reward_mean_target': 0.6180648803710938, 'standard deviation': 3.0, 'reward_a1': tensor([0.8520], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0048847198486328, 'numerator': 0.13257700204849243, 'denominator': 0.13193254172801971}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15938568115234375, 'dpo_reward_mean_target': 0.20343017578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0565], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9968392848968506, 'numerator': 0.13248252868652344, 'denominator': 0.132902592420578}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0337066650390625, 'dpo_reward_mean_target': 0.03624725341796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1914], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9985038042068481, 'numerator': 0.13259848952293396, 'denominator': 0.13279718160629272}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.094635009765625, 'dpo_reward_mean_target': 0.4461517333984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.8329], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9955026507377625, 'numerator': 0.1318800300359726, 'denominator': 0.1324758231639862}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.044902801513671875, 'dpo_reward_mean_target': 0.073974609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1617], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0019457340240479, 'numerator': 0.1329239457845688, 'denominator': 0.13266581296920776}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09479522705078125, 'dpo_reward_mean_target': 0.3526458740234375, 'standard deviation': 3.0, 'reward_a1': tensor([1.7422], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.083510160446167, 'numerator': 0.1194540411233902, 'denominator': 0.11024727672338486}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.00740814208984375, 'dpo_reward_mean_target': 0.290924072265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0971], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983624815940857, 'numerator': 0.13270361721515656, 'denominator': 0.1329212784767151}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25635528564453125, 'dpo_reward_mean_target': 0.25635528564453125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2380], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13118764758110046, 'denominator': 0.13118764758110046}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14479827880859375, 'dpo_reward_mean_target': 0.3982391357421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2377], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0067170858383179, 'numerator': 0.1327904313802719, 'denominator': 0.13190442323684692}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04936408996582031, 'dpo_reward_mean_target': 0.04936408996582031, 'standard deviation': 3.0, 'reward_a1': tensor([0.8475], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1283571720123291, 'denominator': 0.1283571720123291}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05997467041015625, 'dpo_reward_mean_target': -0.2384033203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.9669], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9655941128730774, 'numerator': 0.12267045676708221, 'denominator': 0.12704142928123474}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03977775573730469, 'dpo_reward_mean_target': 0.03977775573730469, 'standard deviation': 3.0, 'reward_a1': tensor([0.0301], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329800933599472, 'denominator': 0.1329800933599472}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2030925750732422, 'dpo_reward_mean_target': 0.18395233154296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3391], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0151054859161377, 'numerator': 0.13280317187309265, 'denominator': 0.13082696497440338}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03129386901855469, 'dpo_reward_mean_target': 0.03129386901855469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2448], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1324189156293869, 'denominator': 0.1324189156293869}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -3.814697265625e-05, 'dpo_reward_mean_target': -3.814697265625e-05, 'standard deviation': 3.0, 'reward_a1': tensor([0.4859], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1312475949525833, 'denominator': 0.1312475949525833}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04143333435058594, 'dpo_reward_mean_target': 0.04143333435058594, 'standard deviation': 3.0, 'reward_a1': tensor([0.0414], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.645233154296875, 'dpo_reward_mean_target': 0.215179443359375, 'standard deviation': 3.0, 'reward_a1': tensor([1.9106], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9317060708999634, 'numerator': 0.11335401982069016, 'denominator': 0.12166285514831543}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2916374206542969, 'dpo_reward_mean_target': -0.04953575134277344, 'standard deviation': 3.0, 'reward_a1': tensor([0.2161], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.010455846786499, 'numerator': 0.1324605643749237, 'denominator': 0.13108991086483002}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09153556823730469, 'dpo_reward_mean_target': 0.09153556823730469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1282], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13262441754341125, 'denominator': 0.13262441754341125}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.709869384765625, 'dpo_reward_mean_target': 0.32633209228515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.7833], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9887608885765076, 'numerator': 0.13144676387310028, 'denominator': 0.13294090330600739}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.042499542236328125, 'dpo_reward_mean_target': -0.116790771484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.8299], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.992519736289978, 'numerator': 0.12652111053466797, 'denominator': 0.12747465074062347}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.44297027587890625, 'dpo_reward_mean_target': 0.22147369384765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0932], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0105243921279907, 'numerator': 0.13225138187408447, 'denominator': 0.13087400794029236}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8954429626464844, 'dpo_reward_mean_target': 0.5651283264160156, 'standard deviation': 3.0, 'reward_a1': tensor([0.6504], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0029356479644775, 'numerator': 0.1329270452260971, 'denominator': 0.13253796100616455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.035491943359375, 'dpo_reward_mean_target': -0.035491943359375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0624], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12436755746603012, 'denominator': 0.12436755746603012}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.368988037109375, 'dpo_reward_mean_target': 0.9411468505859375, 'standard deviation': 3.0, 'reward_a1': tensor([2.3537], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1140340566635132, 'numerator': 0.11902748793363571, 'denominator': 0.10684367269277573}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20081520080566406, 'dpo_reward_mean_target': 0.09036636352539062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3094], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9918090105056763, 'numerator': 0.13180509209632874, 'denominator': 0.1328936219215393}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1954345703125, 'dpo_reward_mean_target': 0.04993438720703125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3654], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0079212188720703, 'numerator': 0.13171270489692688, 'denominator': 0.13067758083343506}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.37229156494140625, 'dpo_reward_mean_target': -0.37229156494140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1020], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13244196772575378, 'denominator': 0.13244196772575378}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.034313201904296875, 'dpo_reward_mean_target': -0.000438690185546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0966], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0004292726516724, 'numerator': 0.13291114568710327, 'denominator': 0.13285411894321442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2874126434326172, 'dpo_reward_mean_target': 0.12203598022460938, 'standard deviation': 3.0, 'reward_a1': tensor([0.4294], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9958809614181519, 'numerator': 0.13228486478328705, 'denominator': 0.13283200562000275}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.17877578735351562, 'dpo_reward_mean_target': 0.17877578735351562, 'standard deviation': 3.0, 'reward_a1': tensor([0.3615], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13273420929908752, 'denominator': 0.13273420929908752}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05469322204589844, 'dpo_reward_mean_target': 0.05469322204589844, 'standard deviation': 3.0, 'reward_a1': tensor([0.1837], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13285791873931885, 'denominator': 0.13285791873931885}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.008586883544921875, 'dpo_reward_mean_target': -0.008586883544921875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0086], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.039947509765625, 'dpo_reward_mean_target': -0.039947509765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2643], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13260948657989502, 'denominator': 0.13260948657989502}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10991668701171875, 'dpo_reward_mean_target': -0.10991668701171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5023], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13024075329303741, 'denominator': 0.13024075329303741}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.048969268798828125, 'dpo_reward_mean_target': -0.048969268798828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1814], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325892210006714, 'denominator': 0.1325892210006714}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09307479858398438, 'dpo_reward_mean_target': -0.12848281860351562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0918], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.001826524734497, 'numerator': 0.13297085464000702, 'denominator': 0.1327284276485443}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04100799560546875, 'dpo_reward_mean_target': 0.4540252685546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3563], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.008272647857666, 'numerator': 0.13291022181510925, 'denominator': 0.1318197250366211}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12298965454101562, 'dpo_reward_mean_target': -0.12298965454101562, 'standard deviation': 3.0, 'reward_a1': tensor([0.1748], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13232733309268951, 'denominator': 0.13232733309268951}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.34521484375, 'dpo_reward_mean_target': 0.34521484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4087], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132951021194458, 'denominator': 0.132951021194458}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03619194030761719, 'dpo_reward_mean_target': 0.03619194030761719, 'standard deviation': 3.0, 'reward_a1': tensor([0.2484], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264857232570648, 'denominator': 0.13264857232570648}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.053073883056640625, 'dpo_reward_mean_target': 0.053073883056640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0598], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13288670778274536, 'denominator': 0.13288670778274536}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21807861328125, 'dpo_reward_mean_target': -0.19768714904785156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0242], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015922784805298, 'numerator': 0.1327587068080902, 'denominator': 0.13254766166210175}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5808258056640625, 'dpo_reward_mean_target': 1.857666015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7604], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0012661218643188, 'numerator': 0.1329108625650406, 'denominator': 0.13274279236793518}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.6373443603515625, 'dpo_reward_mean_target': 0.303436279296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1965], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.106766700744629, 'numerator': 0.13114681839942932, 'denominator': 0.11849544942378998}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07413291931152344, 'dpo_reward_mean_target': -0.06141090393066406, 'standard deviation': 3.0, 'reward_a1': tensor([0.0829], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9988478422164917, 'numerator': 0.13282696902751923, 'denominator': 0.13298018276691437}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.97882080078125, 'dpo_reward_mean_target': 0.9576263427734375, 'standard deviation': 3.0, 'reward_a1': tensor([2.7853], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9957300424575806, 'numerator': 0.1104569137096405, 'denominator': 0.11093058437108994}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.00061798095703125, 'dpo_reward_mean_target': 0.00061798095703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0006], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1504058837890625, 'dpo_reward_mean_target': 0.1504058837890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0667], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13263295590877533, 'denominator': 0.13263295590877533}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.9315452575683594, 'dpo_reward_mean_target': -0.7052726745605469, 'standard deviation': 3.0, 'reward_a1': tensor([0.4782], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0331354141235352, 'numerator': 0.12302587926387787, 'denominator': 0.11908011138439178}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.38512229919433594, 'dpo_reward_mean_target': 0.38512229919433594, 'standard deviation': 3.0, 'reward_a1': tensor([0.2325], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13280878961086273, 'denominator': 0.13280878961086273}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16910743713378906, 'dpo_reward_mean_target': 0.16910743713378906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6987], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12753233313560486, 'denominator': 0.12753233313560486}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.007556915283203125, 'dpo_reward_mean_target': -0.007556915283203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0151], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329769790172577, 'denominator': 0.1329769790172577}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.82635498046875, 'dpo_reward_mean_target': 0.5545196533203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.0901], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9880006909370422, 'numerator': 0.1308782398700714, 'denominator': 0.13246776163578033}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2980499267578125, 'dpo_reward_mean_target': 0.44158172607421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7578], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9821784496307373, 'numerator': 0.1227676197886467, 'denominator': 0.12499523162841797}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.013507843017578125, 'dpo_reward_mean_target': 0.030652999877929688, 'standard deviation': 3.0, 'reward_a1': tensor([0.5458], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0009982585906982, 'numerator': 0.13103453814983368, 'denominator': 0.13090386986732483}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0210723876953125, 'dpo_reward_mean_target': 0.1042327880859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.7682], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0101679563522339, 'numerator': 0.1297631859779358, 'denominator': 0.12845703959465027}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3656044006347656, 'dpo_reward_mean_target': -0.18769073486328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3973], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0134119987487793, 'numerator': 0.13047634065151215, 'denominator': 0.1287495493888855}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0517578125, 'dpo_reward_mean_target': -0.0517578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1268], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12310454994440079, 'denominator': 0.12310454994440079}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03916168212890625, 'dpo_reward_mean_target': 0.03916168212890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0464], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329803615808487, 'denominator': 0.1329803615808487}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9605941772460938, 'dpo_reward_mean_target': 0.6398239135742188, 'standard deviation': 3.0, 'reward_a1': tensor([1.5586], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9733323454856873, 'numerator': 0.12688839435577393, 'denominator': 0.13036492466926575}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.023895263671875, 'dpo_reward_mean_target': 0.807891845703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2589], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9877753853797913, 'numerator': 0.13077276945114136, 'denominator': 0.13239119946956635}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08436203002929688, 'dpo_reward_mean_target': 0.11597061157226562, 'standard deviation': 3.0, 'reward_a1': tensor([0.1333], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002618670463562, 'numerator': 0.13297852873802185, 'denominator': 0.13263121247291565}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13055801391601562, 'dpo_reward_mean_target': 0.28055763244628906, 'standard deviation': 3.0, 'reward_a1': tensor([0.0717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.997771143913269, 'numerator': 0.13265882432460785, 'denominator': 0.1329551637172699}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21174049377441406, 'dpo_reward_mean_target': 0.041812896728515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5732], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9916065335273743, 'numerator': 0.1309109479188919, 'denominator': 0.13201904296875}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15052032470703125, 'dpo_reward_mean_target': 0.4795684814453125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4282], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9593534469604492, 'numerator': 0.12703023850917816, 'denominator': 0.13241234421730042}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.43987464904785156, 'dpo_reward_mean_target': 0.3791065216064453, 'standard deviation': 3.0, 'reward_a1': tensor([0.6423], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9984293580055237, 'numerator': 0.13246998190879822, 'denominator': 0.13267837464809418}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16570663452148438, 'dpo_reward_mean_target': 0.06076622009277344, 'standard deviation': 3.0, 'reward_a1': tensor([0.5273], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0146969556808472, 'numerator': 0.1313822865486145, 'denominator': 0.1294793337583542}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.193878173828125, 'dpo_reward_mean_target': -0.193878173828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0191], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264620304107666, 'denominator': 0.13264620304107666}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.00298309326171875, 'dpo_reward_mean_target': 0.00298309326171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0054], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298071920871735, 'denominator': 0.13298071920871735}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1130523681640625, 'dpo_reward_mean_target': 0.225555419921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4119], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0257052183151245, 'numerator': 0.13272440433502197, 'denominator': 0.1293981969356537}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.38043975830078125, 'dpo_reward_mean_target': 0.4369354248046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6193], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0013232231140137, 'numerator': 0.13273532688617706, 'denominator': 0.13255992531776428}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1191864013671875, 'dpo_reward_mean_target': 0.2154083251953125, 'standard deviation': 3.0, 'reward_a1': tensor([1.5109], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0558866262435913, 'numerator': 0.12114288657903671, 'denominator': 0.11473095417022705}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.26616668701171875, 'dpo_reward_mean_target': -0.24585342407226562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2000], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0001263618469238, 'numerator': 0.13296520709991455, 'denominator': 0.13294841349124908}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5404281616210938, 'dpo_reward_mean_target': 0.5404281616210938, 'standard deviation': 3.0, 'reward_a1': tensor([0.1647], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13194166123867035, 'denominator': 0.13194166123867035}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07114601135253906, 'dpo_reward_mean_target': 0.07474517822265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0262], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.999981164932251, 'numerator': 0.13296334445476532, 'denominator': 0.1329658478498459}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2332000732421875, 'dpo_reward_mean_target': 0.848114013671875, 'standard deviation': 3.0, 'reward_a1': tensor([2.9050], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3662750720977783, 'numerator': 0.10512547940015793, 'denominator': 0.07694312930107117}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.576019287109375, 'dpo_reward_mean_target': 0.168212890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3178], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0317566394805908, 'numerator': 0.13124677538871765, 'denominator': 0.12720710039138794}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6038589477539062, 'dpo_reward_mean_target': 0.14715576171875, 'standard deviation': 3.0, 'reward_a1': tensor([3.1010], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8708353638648987, 'numerator': 0.0818990170955658, 'denominator': 0.09404649585485458}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.049411773681640625, 'dpo_reward_mean_target': 0.049411773681640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5437], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13118797540664673, 'denominator': 0.13118797540664673}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.37667274475097656, 'dpo_reward_mean_target': 0.37667274475097656, 'standard deviation': 3.0, 'reward_a1': tensor([0.7029], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13219676911830902, 'denominator': 0.13219676911830902}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1944427490234375, 'dpo_reward_mean_target': 0.3216094970703125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1122], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9947817325592041, 'numerator': 0.1315973997116089, 'denominator': 0.13228771090507507}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.05084228515625, 'dpo_reward_mean_target': -0.769134521484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1151], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.005213975906372, 'numerator': 0.12732762098312378, 'denominator': 0.12666718661785126}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09210395812988281, 'dpo_reward_mean_target': -0.09210395812988281, 'standard deviation': 3.0, 'reward_a1': tensor([0.5799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12968610227108002, 'denominator': 0.12968610227108002}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.008810043334960938, 'dpo_reward_mean_target': 0.008810043334960938, 'standard deviation': 3.0, 'reward_a1': tensor([0.0088], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.32897186279296875, 'dpo_reward_mean_target': -0.32897186279296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2431], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292628526687622, 'denominator': 0.13292628526687622}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06807708740234375, 'dpo_reward_mean_target': -0.08807563781738281, 'standard deviation': 3.0, 'reward_a1': tensor([0.2908], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9991804957389832, 'numerator': 0.13192421197891235, 'denominator': 0.13203240931034088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5678520202636719, 'dpo_reward_mean_target': 1.021169662475586, 'standard deviation': 3.0, 'reward_a1': tensor([0.2489], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9728901982307434, 'numerator': 0.1286463588476181, 'denominator': 0.13223111629486084}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.204925537109375, 'dpo_reward_mean_target': -0.204925537109375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0298], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12218225002288818, 'denominator': 0.12218225002288818}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2564411163330078, 'dpo_reward_mean_target': 0.2564411163330078, 'standard deviation': 3.0, 'reward_a1': tensor([0.5709], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13225235044956207, 'denominator': 0.13225235044956207}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.026317596435546875, 'dpo_reward_mean_target': -0.026317596435546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8388], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12756527960300446, 'denominator': 0.12756527960300446}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23499298095703125, 'dpo_reward_mean_target': 0.0049381256103515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0230], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002455472946167, 'numerator': 0.13297498226165771, 'denominator': 0.1326492726802826}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0022373199462890625, 'dpo_reward_mean_target': -0.0022373199462890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0022], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10424423217773438, 'dpo_reward_mean_target': -0.017742156982421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5779], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9927798509597778, 'numerator': 0.1303853541612625, 'denominator': 0.1313336044549942}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.35076904296875, 'dpo_reward_mean_target': 0.35076904296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6032], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325106918811798, 'denominator': 0.1325106918811798}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.64404296875, 'dpo_reward_mean_target': 0.7813682556152344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0683], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.988154411315918, 'numerator': 0.12775316834449768, 'denominator': 0.12928462028503418}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1320648193359375, 'dpo_reward_mean_target': 0.121978759765625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5635], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983915090560913, 'numerator': 0.11848127096891403, 'denominator': 0.11867215484380722}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03962135314941406, 'dpo_reward_mean_target': 0.03962135314941406, 'standard deviation': 3.0, 'reward_a1': tensor([0.4627], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13166525959968567, 'denominator': 0.13166525959968567}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04034996032714844, 'dpo_reward_mean_target': 0.04034996032714844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0502], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292014598846436, 'denominator': 0.13292014598846436}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0731201171875, 'dpo_reward_mean_target': 1.370849609375, 'standard deviation': 3.0, 'reward_a1': tensor([3.3448], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.881061851978302, 'numerator': 0.10709710419178009, 'denominator': 0.12155458331108093}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1991119384765625, 'dpo_reward_mean_target': 1.04217529296875, 'standard deviation': 3.0, 'reward_a1': tensor([1.8511], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.987343966960907, 'numerator': 0.12823359668254852, 'denominator': 0.12987732887268066}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0002231597900390625, 'dpo_reward_mean_target': 0.0002231597900390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0002], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20494461059570312, 'dpo_reward_mean_target': 0.352874755859375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1739], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9925851821899414, 'numerator': 0.13094660639762878, 'denominator': 0.1319248080253601}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14404869079589844, 'dpo_reward_mean_target': 0.0451812744140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0061], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.001107096672058, 'numerator': 0.13296134769916534, 'denominator': 0.13281430304050446}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.24705123901367188, 'dpo_reward_mean_target': -0.24705123901367188, 'standard deviation': 3.0, 'reward_a1': tensor([0.2669], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13104373216629028, 'denominator': 0.13104373216629028}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.746368408203125, 'dpo_reward_mean_target': 0.6273193359375, 'standard deviation': 3.0, 'reward_a1': tensor([1.1899], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9996224045753479, 'numerator': 0.13066312670707703, 'denominator': 0.13071247935295105}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5096473693847656, 'dpo_reward_mean_target': 0.12952804565429688, 'standard deviation': 3.0, 'reward_a1': tensor([1.4142], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9548215866088867, 'numerator': 0.12133056670427322, 'denominator': 0.12707145512104034}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3417205810546875, 'dpo_reward_mean_target': 0.04209136962890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.7548], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.98143470287323, 'numerator': 0.12928064167499542, 'denominator': 0.13172617554664612}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6183433532714844, 'dpo_reward_mean_target': -0.46254920959472656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2710], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0046751499176025, 'numerator': 0.13270995020866394, 'denominator': 0.1320924013853073}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.19971466064453125, 'dpo_reward_mean_target': -0.18035125732421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.7315], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0019845962524414, 'numerator': 0.12697723507881165, 'denominator': 0.12672573328018188}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05328941345214844, 'dpo_reward_mean_target': 0.23130416870117188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2399], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9896535873413086, 'numerator': 0.1313505321741104, 'denominator': 0.13272374868392944}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.004467010498046875, 'dpo_reward_mean_target': -0.004467010498046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0045], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4401092529296875, 'dpo_reward_mean_target': 0.2350311279296875, 'standard deviation': 3.0, 'reward_a1': tensor([3.4926], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9306322932243347, 'numerator': 0.07374986261129379, 'denominator': 0.07924704998731613}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1495800018310547, 'dpo_reward_mean_target': -0.06560707092285156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1047], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0035128593444824, 'numerator': 0.132969468832016, 'denominator': 0.13250400125980377}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.032901763916015625, 'dpo_reward_mean_target': 0.032901763916015625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0147], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296401500701904, 'denominator': 0.13296401500701904}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07042312622070312, 'dpo_reward_mean_target': -0.07042312622070312, 'standard deviation': 3.0, 'reward_a1': tensor([0.0566], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13286158442497253, 'denominator': 0.13286158442497253}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.778717041015625, 'dpo_reward_mean_target': 0.2970123291015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8846], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9816137552261353, 'numerator': 0.13045449554920197, 'denominator': 0.1328979879617691}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5578727722167969, 'dpo_reward_mean_target': 0.5578727722167969, 'standard deviation': 3.0, 'reward_a1': tensor([0.2612], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323319971561432, 'denominator': 0.1323319971561432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06435585021972656, 'dpo_reward_mean_target': 0.07285308837890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3324], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0002490282058716, 'numerator': 0.13248421251773834, 'denominator': 0.13245122134685516}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13910865783691406, 'dpo_reward_mean_target': -0.13910865783691406, 'standard deviation': 3.0, 'reward_a1': tensor([0.2745], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13172295689582825, 'denominator': 0.13172295689582825}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05722999572753906, 'dpo_reward_mean_target': -0.05722999572753906, 'standard deviation': 3.0, 'reward_a1': tensor([0.0485], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328982412815094, 'denominator': 0.1328982412815094}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1185302734375, 'dpo_reward_mean_target': 0.1185302734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.5964], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13130424916744232, 'denominator': 0.13130424916744232}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6708450317382812, 'dpo_reward_mean_target': 0.612152099609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4886], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000997543334961, 'numerator': 0.13286805152893066, 'denominator': 0.13273563981056213}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4205303192138672, 'dpo_reward_mean_target': 0.4205303192138672, 'standard deviation': 3.0, 'reward_a1': tensor([0.5270], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13289707899093628, 'denominator': 0.13289707899093628}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.028356552124023438, 'dpo_reward_mean_target': -0.028356552124023438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0522], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329328417778015, 'denominator': 0.1329328417778015}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.40224456787109375, 'dpo_reward_mean_target': 0.5357894897460938, 'standard deviation': 3.0, 'reward_a1': tensor([0.3157], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9977282881736755, 'numerator': 0.13262349367141724, 'denominator': 0.13292546570301056}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.247650146484375, 'dpo_reward_mean_target': 1.2235870361328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8339], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0107170343399048, 'numerator': 0.13186374306678772, 'denominator': 0.1304655373096466}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.43817138671875, 'dpo_reward_mean_target': 0.14011764526367188, 'standard deviation': 3.0, 'reward_a1': tensor([0.5285], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9921056032180786, 'numerator': 0.13187119364738464, 'denominator': 0.1329205185174942}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5747222900390625, 'dpo_reward_mean_target': 0.84674072265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1154], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9821680188179016, 'numerator': 0.1290876269340515, 'denominator': 0.13143131136894226}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06287193298339844, 'dpo_reward_mean_target': -0.06287193298339844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13164986670017242, 'denominator': 0.13164986670017242}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1361083984375, 'dpo_reward_mean_target': 0.762054443359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4588], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0205860137939453, 'numerator': 0.13230310380458832, 'denominator': 0.12963443994522095}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2088775634765625, 'dpo_reward_mean_target': 0.37839508056640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.7180], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0080257654190063, 'numerator': 0.1321311742067337, 'denominator': 0.13107916712760925}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.42424774169921875, 'dpo_reward_mean_target': 0.44567108154296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0499], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.998846173286438, 'numerator': 0.13117839395999908, 'denominator': 0.13132992386817932}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.24687957763671875, 'dpo_reward_mean_target': 0.15961456298828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.5688], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9964621663093567, 'numerator': 0.13174979388713837, 'denominator': 0.13221755623817444}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11718559265136719, 'dpo_reward_mean_target': -0.11718559265136719, 'standard deviation': 3.0, 'reward_a1': tensor([0.0552], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13276152312755585, 'denominator': 0.13276152312755585}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.38712310791015625, 'dpo_reward_mean_target': 0.31295013427734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0555], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9942025542259216, 'numerator': 0.1289687156677246, 'denominator': 0.1297207623720169}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.012678146362304688, 'dpo_reward_mean_target': 0.012678146362304688, 'standard deviation': 3.0, 'reward_a1': tensor([0.5870], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13056580722332, 'denominator': 0.13056580722332}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0071163177490234375, 'dpo_reward_mean_target': 0.0636749267578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0637], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000278353691101, 'numerator': 0.13298074901103973, 'denominator': 0.1329437494277954}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14480018615722656, 'dpo_reward_mean_target': 0.1359882354736328, 'standard deviation': 3.0, 'reward_a1': tensor([0.0335], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0011839866638184, 'numerator': 0.13290321826934814, 'denominator': 0.13274605572223663}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.29810333251953125, 'dpo_reward_mean_target': 0.6901168823242188, 'standard deviation': 3.0, 'reward_a1': tensor([0.9894], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0218085050582886, 'numerator': 0.13232064247131348, 'denominator': 0.1294965147972107}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1131134033203125, 'dpo_reward_mean_target': 1.3337554931640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5135], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0071377754211426, 'numerator': 0.13274215161800385, 'denominator': 0.13180138170719147}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.52276611328125, 'dpo_reward_mean_target': 0.660064697265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3572], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9964335560798645, 'numerator': 0.13230492174625397, 'denominator': 0.13277846574783325}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0811004638671875, 'dpo_reward_mean_target': 0.6761245727539062, 'standard deviation': 3.0, 'reward_a1': tensor([1.1625], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9873056411743164, 'numerator': 0.13124428689479828, 'denominator': 0.13293176889419556}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.41039466857910156, 'dpo_reward_mean_target': 0.420623779296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4104], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999942779541016, 'numerator': 0.13297998905181885, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5344772338867188, 'dpo_reward_mean_target': 0.5344772338867188, 'standard deviation': 3.0, 'reward_a1': tensor([0.6817], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328207403421402, 'denominator': 0.1328207403421402}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0056476593017578125, 'dpo_reward_mean_target': -0.0056476593017578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1369], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283079862594604, 'denominator': 0.13283079862594604}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2698860168457031, 'dpo_reward_mean_target': 0.2698860168457031, 'standard deviation': 3.0, 'reward_a1': tensor([1.1559], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12730525434017181, 'denominator': 0.12730525434017181}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08655929565429688, 'dpo_reward_mean_target': 0.11899185180664062, 'standard deviation': 3.0, 'reward_a1': tensor([0.2203], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0004234313964844, 'numerator': 0.1329049915075302, 'denominator': 0.13284873962402344}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4907684326171875, 'dpo_reward_mean_target': 0.9950103759765625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5814], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9815252423286438, 'numerator': 0.13046440482139587, 'denominator': 0.1329200714826584}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4992637634277344, 'dpo_reward_mean_target': 0.5701446533203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.6312], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.008672833442688, 'numerator': 0.1249181255698204, 'denominator': 0.12384404242038727}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.042148590087890625, 'dpo_reward_mean_target': 0.042148590087890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3479], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13229206204414368, 'denominator': 0.13229206204414368}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.17775344848632812, 'dpo_reward_mean_target': 0.17775344848632812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3702], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13078097999095917, 'denominator': 0.13078097999095917}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0072765350341796875, 'dpo_reward_mean_target': -0.0072765350341796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0073], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.00730133056640625, 'dpo_reward_mean_target': 0.00730133056640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0073], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.055755615234375, 'dpo_reward_mean_target': -0.15981292724609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3679], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9945157766342163, 'numerator': 0.1309395432472229, 'denominator': 0.13166160881519318}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04270172119140625, 'dpo_reward_mean_target': 0.9714508056640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3412], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9161815643310547, 'numerator': 0.12084086239337921, 'denominator': 0.13189619779586792}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.1453094482421875, 'dpo_reward_mean_target': 2.1732940673828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.5982], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9951575994491577, 'numerator': 0.11585929244756699, 'denominator': 0.11642306298017502}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.733428955078125, 'dpo_reward_mean_target': 0.8765716552734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5047], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0111908912658691, 'numerator': 0.1300976574420929, 'denominator': 0.12865786254405975}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1504535675048828, 'dpo_reward_mean_target': 0.25275421142578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.4607], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.014414668083191, 'numerator': 0.1226264163851738, 'denominator': 0.12088391184806824}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09633827209472656, 'dpo_reward_mean_target': 0.09633827209472656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1617], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13248984515666962, 'denominator': 0.13248984515666962}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1125068664550781, 'dpo_reward_mean_target': 1.093353271484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.4197], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9993259310722351, 'numerator': 0.1321960836648941, 'denominator': 0.1322852522134781}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0706939697265625, 'dpo_reward_mean_target': 0.0706939697265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1386], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13265767693519592, 'denominator': 0.13265767693519592}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.78265380859375, 'dpo_reward_mean_target': 0.818634033203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1840], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015336275100708, 'numerator': 0.13199841976165771, 'denominator': 0.13179628551006317}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6338119506835938, 'dpo_reward_mean_target': -0.10529327392578125, 'standard deviation': 3.0, 'reward_a1': tensor([2.4733], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8340891003608704, 'numerator': 0.09190905094146729, 'denominator': 0.11019092798233032}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8489303588867188, 'dpo_reward_mean_target': 0.9168472290039062, 'standard deviation': 3.0, 'reward_a1': tensor([1.0682], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0013995170593262, 'numerator': 0.1328115463256836, 'denominator': 0.13262593746185303}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.40050506591796875, 'dpo_reward_mean_target': 0.5246505737304688, 'standard deviation': 3.0, 'reward_a1': tensor([0.3563], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0307033061981201, 'numerator': 0.13277146220207214, 'denominator': 0.1288163661956787}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.009523391723632812, 'dpo_reward_mean_target': 0.009523391723632812, 'standard deviation': 3.0, 'reward_a1': tensor([0.8832], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12745895981788635, 'denominator': 0.12745895981788635}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11981582641601562, 'dpo_reward_mean_target': 0.37071990966796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1276], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0001155138015747, 'numerator': 0.13254469633102417, 'denominator': 0.1325293928384781}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6430282592773438, 'dpo_reward_mean_target': 0.6014022827148438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0223], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0027786493301392, 'numerator': 0.13052581250667572, 'denominator': 0.13016413152217865}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4035148620605469, 'dpo_reward_mean_target': 0.748870849609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0533], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9761322140693665, 'numerator': 0.12831030786037445, 'denominator': 0.1314476728439331}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03888511657714844, 'dpo_reward_mean_target': -0.03888511657714844, 'standard deviation': 3.0, 'reward_a1': tensor([0.5944], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1300504505634308, 'denominator': 0.1300504505634308}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.488922119140625, 'dpo_reward_mean_target': 0.488922119140625, 'standard deviation': 3.0, 'reward_a1': tensor([1.2568], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12869547307491302, 'denominator': 0.12869547307491302}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6057548522949219, 'dpo_reward_mean_target': 0.2843132019042969, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0226], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0168436765670776, 'numerator': 0.13228647410869598, 'denominator': 0.1300951987504959}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0700225830078125, 'dpo_reward_mean_target': 0.0700225830078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0700], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2030029296875, 'dpo_reward_mean_target': -0.2030029296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0101], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264566659927368, 'denominator': 0.13264566659927368}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5940170288085938, 'dpo_reward_mean_target': 0.5940170288085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.2742], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13222716748714447, 'denominator': 0.13222716748714447}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13631439208984375, 'dpo_reward_mean_target': -0.13631439208984375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3108], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13275595009326935, 'denominator': 0.13275595009326935}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2016277313232422, 'dpo_reward_mean_target': 0.2016277313232422, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9214], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12398146837949753, 'denominator': 0.12398146837949753}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5997314453125, 'dpo_reward_mean_target': 0.764373779296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4727], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9961778521537781, 'numerator': 0.13235384225845337, 'denominator': 0.1328616589307785}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.32906341552734375, 'dpo_reward_mean_target': 0.5275344848632812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1695], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9869025349617004, 'numerator': 0.1294388622045517, 'denominator': 0.13115668296813965}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.041423797607421875, 'dpo_reward_mean_target': 0.041423797607421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5313], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1305796355009079, 'denominator': 0.1305796355009079}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.385009765625, 'dpo_reward_mean_target': 0.385009765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1927], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13270795345306396, 'denominator': 0.13270795345306396}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7184562683105469, 'dpo_reward_mean_target': 0.6156349182128906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1753], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0096700191497803, 'numerator': 0.12843838334083557, 'denominator': 0.12720827758312225}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.52520751953125, 'dpo_reward_mean_target': 0.52520751953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1373], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1318737417459488, 'denominator': 0.1318737417459488}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3657417297363281, 'dpo_reward_mean_target': 0.22177886962890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.9683], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9892679452896118, 'numerator': 0.12892642617225647, 'denominator': 0.1303250789642334}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0669403076171875, 'dpo_reward_mean_target': 0.72930908203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.6566], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.009103775024414, 'numerator': 0.13294166326522827, 'denominator': 0.13174231350421906}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2849884033203125, 'dpo_reward_mean_target': 0.7026329040527344, 'standard deviation': 3.0, 'reward_a1': tensor([0.0708], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9805637001991272, 'numerator': 0.13006433844566345, 'denominator': 0.13264241814613342}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3132476806640625, 'dpo_reward_mean_target': 1.1415252685546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8224], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0077581405639648, 'numerator': 0.13223031163215637, 'denominator': 0.13121235370635986}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8379058837890625, 'dpo_reward_mean_target': 0.4964752197265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5730], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.00357985496521, 'numerator': 0.1329375058412552, 'denominator': 0.13246330618858337}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09018707275390625, 'dpo_reward_mean_target': 0.09018707275390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4173], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13219262659549713, 'denominator': 0.13219262659549713}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25125885009765625, 'dpo_reward_mean_target': 0.16982269287109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.7685], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9949642419815063, 'numerator': 0.13035905361175537, 'denominator': 0.13101883232593536}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08310508728027344, 'dpo_reward_mean_target': -0.08310508728027344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1128], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297425210475922, 'denominator': 0.13297425210475922}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5668258666992188, 'dpo_reward_mean_target': 0.2468414306640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.9796], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9798423647880554, 'numerator': 0.12907272577285767, 'denominator': 0.13172805309295654}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6264381408691406, 'dpo_reward_mean_target': 0.7984657287597656, 'standard deviation': 3.0, 'reward_a1': tensor([0.1741], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9897622466087341, 'numerator': 0.13013146817684174, 'denominator': 0.1314775049686432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09994888305664062, 'dpo_reward_mean_target': -0.09994888305664062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1637], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295073807239532, 'denominator': 0.13295073807239532}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.19493484497070312, 'dpo_reward_mean_target': 0.2906379699707031, 'standard deviation': 3.0, 'reward_a1': tensor([0.2035], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0084341764450073, 'numerator': 0.13292470574378967, 'denominator': 0.13181297481060028}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14034271240234375, 'dpo_reward_mean_target': -0.03708648681640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1139], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9987728595733643, 'numerator': 0.13281244039535522, 'denominator': 0.13297562301158905}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07647705078125, 'dpo_reward_mean_target': 0.37020111083984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1957], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9990981221199036, 'numerator': 0.13275595009326935, 'denominator': 0.13287578523159027}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.36446380615234375, 'dpo_reward_mean_target': 0.5435867309570312, 'standard deviation': 3.0, 'reward_a1': tensor([1.1673], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0142977237701416, 'numerator': 0.13013732433319092, 'denominator': 0.12830288708209991}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03440093994140625, 'dpo_reward_mean_target': -0.033355712890625, 'standard deviation': 3.0, 'reward_a1': tensor([3.0632], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9772059321403503, 'numerator': 0.07806095480918884, 'denominator': 0.07988178730010986}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10197830200195312, 'dpo_reward_mean_target': 0.041534423828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8243], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0137193202972412, 'numerator': 0.12853050231933594, 'denominator': 0.12679101526737213}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05039215087890625, 'dpo_reward_mean_target': 0.4206123352050781, 'standard deviation': 3.0, 'reward_a1': tensor([0.1228], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.995375394821167, 'numerator': 0.13232719898223877, 'denominator': 0.13294200599193573}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.61492919921875, 'dpo_reward_mean_target': 1.396484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3844], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0275905132293701, 'numerator': 0.1256241500377655, 'denominator': 0.12225118279457092}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9577102661132812, 'dpo_reward_mean_target': 0.3653755187988281, 'standard deviation': 3.0, 'reward_a1': tensor([0.6310], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0020115375518799, 'numerator': 0.13246047496795654, 'denominator': 0.13219456374645233}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.361541748046875, 'dpo_reward_mean_target': 0.13672637939453125, 'standard deviation': 3.0, 'reward_a1': tensor([1.9353], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9587559103965759, 'numerator': 0.11110716313123703, 'denominator': 0.11588680744171143}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6320533752441406, 'dpo_reward_mean_target': 0.6320533752441406, 'standard deviation': 3.0, 'reward_a1': tensor([1.2875], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12984409928321838, 'denominator': 0.12984409928321838}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1274261474609375, 'dpo_reward_mean_target': 0.1263599395751953, 'standard deviation': 3.0, 'reward_a1': tensor([0.0850], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002415657043457, 'numerator': 0.13296812772750854, 'denominator': 0.13264769315719604}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10822868347167969, 'dpo_reward_mean_target': 0.10822868347167969, 'standard deviation': 3.0, 'reward_a1': tensor([0.0458], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295193016529083, 'denominator': 0.13295193016529083}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.15489959716796875, 'dpo_reward_mean_target': 0.2583198547363281, 'standard deviation': 3.0, 'reward_a1': tensor([0.3395], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015281438827515, 'numerator': 0.13293205201625824, 'denominator': 0.13272921741008759}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04386138916015625, 'dpo_reward_mean_target': 0.6853561401367188, 'standard deviation': 3.0, 'reward_a1': tensor([1.4770], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0982095003128052, 'numerator': 0.12843094766139984, 'denominator': 0.1169457659125328}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14259910583496094, 'dpo_reward_mean_target': 0.17269134521484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4954], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0011298656463623, 'numerator': 0.13221348822116852, 'denominator': 0.13206426799297333}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13665771484375, 'dpo_reward_mean_target': 0.375244140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.9955], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.019799828529358, 'numerator': 0.13016830384731293, 'denominator': 0.12764103710651398}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03189849853515625, 'dpo_reward_mean_target': 0.03243255615234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0075], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999447464942932, 'numerator': 0.13296902179718018, 'denominator': 0.13297636806964874}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5486297607421875, 'dpo_reward_mean_target': 1.2307205200195312, 'standard deviation': 3.0, 'reward_a1': tensor([1.3112], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.032458782196045, 'numerator': 0.13293297588825226, 'denominator': 0.12875378131866455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.07666015625, 'dpo_reward_mean_target': 1.141143798828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1340], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.993039608001709, 'numerator': 0.12569470703601837, 'denominator': 0.12657572329044342}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0344085693359375, 'dpo_reward_mean_target': -0.10379791259765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.6776], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9891219735145569, 'numerator': 0.12854589521884918, 'denominator': 0.1299595981836319}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0171356201171875, 'dpo_reward_mean_target': 1.641693115234375, 'standard deviation': 3.0, 'reward_a1': tensor([2.0561], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.257659673690796, 'numerator': 0.13171812891960144, 'denominator': 0.10473272949457169}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26589202880859375, 'dpo_reward_mean_target': 0.18979454040527344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1263], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0029988288879395, 'numerator': 0.13224466145038605, 'denominator': 0.1318492740392685}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3945655822753906, 'dpo_reward_mean_target': 0.15850067138671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0117], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0065525770187378, 'numerator': 0.13276678323745728, 'denominator': 0.13190248608589172}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05725288391113281, 'dpo_reward_mean_target': -0.05725288391113281, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0864], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329745054244995, 'denominator': 0.1329745054244995}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.95794677734375, 'dpo_reward_mean_target': 1.4681854248046875, 'standard deviation': 3.0, 'reward_a1': tensor([1.8788], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0384628772735596, 'numerator': 0.13174104690551758, 'denominator': 0.1268615871667862}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6132888793945312, 'dpo_reward_mean_target': 1.0703887939453125, 'standard deviation': 3.0, 'reward_a1': tensor([1.7706], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0483026504516602, 'numerator': 0.12940713763237, 'denominator': 0.12344444543123245}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.033481597900390625, 'dpo_reward_mean_target': -0.033481597900390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0911], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13286621868610382, 'denominator': 0.13286621868610382}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1008758544921875, 'dpo_reward_mean_target': -0.1008758544921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2242], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1322023868560791, 'denominator': 0.1322023868560791}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14064407348632812, 'dpo_reward_mean_target': 0.14064407348632812, 'standard deviation': 3.0, 'reward_a1': tensor([0.1807], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132968932390213, 'denominator': 0.132968932390213}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03972625732421875, 'dpo_reward_mean_target': -0.03972625732421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2595], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13262435793876648, 'denominator': 0.13262435793876648}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.029691696166992188, 'dpo_reward_mean_target': 0.029691696166992188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0873], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295625150203705, 'denominator': 0.13295625150203705}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.41306495666503906, 'dpo_reward_mean_target': 0.056915283203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1859], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9995887875556946, 'numerator': 0.13254570960998535, 'denominator': 0.13260023295879364}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02449798583984375, 'dpo_reward_mean_target': -0.43450927734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4460], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9673411250114441, 'numerator': 0.1273740530014038, 'denominator': 0.1316743940114975}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00782012939453125, 'dpo_reward_mean_target': -0.00782012939453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0355], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296692073345184, 'denominator': 0.13296692073345184}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1898345947265625, 'dpo_reward_mean_target': 0.30419921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6236], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.031579613685608, 'numerator': 0.13222931325435638, 'denominator': 0.12818139791488647}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.44365692138671875, 'dpo_reward_mean_target': 0.4284210205078125, 'standard deviation': 3.0, 'reward_a1': tensor([3.3542], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9950722455978394, 'numerator': 0.08265069127082825, 'denominator': 0.08305998891592026}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.012544631958007812, 'dpo_reward_mean_target': -0.012544631958007812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0125], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8464508056640625, 'dpo_reward_mean_target': -0.1503753662109375, 'standard deviation': 3.0, 'reward_a1': tensor([3.5012], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7052225470542908, 'numerator': 0.06339710205793381, 'denominator': 0.08989658951759338}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0539398193359375, 'dpo_reward_mean_target': 1.00054931640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6789], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8737210631370544, 'numerator': 0.11369439959526062, 'denominator': 0.13012665510177612}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0007343292236328125, 'dpo_reward_mean_target': 0.0007343292236328125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2048], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13266907632350922, 'denominator': 0.13266907632350922}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3890380859375, 'dpo_reward_mean_target': 0.4210205078125, 'standard deviation': 3.0, 'reward_a1': tensor([1.0715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002371072769165, 'numerator': 0.12989120185375214, 'denominator': 0.1295839548110962}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06241798400878906, 'dpo_reward_mean_target': -0.06241798400878906, 'standard deviation': 3.0, 'reward_a1': tensor([0.2547], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1322399079799652, 'denominator': 0.1322399079799652}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1893291473388672, 'dpo_reward_mean_target': -0.1893291473388672, 'standard deviation': 3.0, 'reward_a1': tensor([0.5295], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12921784818172455, 'denominator': 0.12921784818172455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06181907653808594, 'dpo_reward_mean_target': -0.06181907653808594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2290], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13277457654476166, 'denominator': 0.13277457654476166}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2268085479736328, 'dpo_reward_mean_target': -0.2268085479736328, 'standard deviation': 3.0, 'reward_a1': tensor([0.6906], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12690605223178864, 'denominator': 0.12690605223178864}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0593414306640625, 'dpo_reward_mean_target': 0.13774490356445312, 'standard deviation': 3.0, 'reward_a1': tensor([0.7654], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0058263540267944, 'numerator': 0.13010196387767792, 'denominator': 0.12934833765029907}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.404449462890625, 'dpo_reward_mean_target': 1.2961807250976562, 'standard deviation': 3.0, 'reward_a1': tensor([0.3792], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9543949961662292, 'numerator': 0.1269116848707199, 'denominator': 0.13297605514526367}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8292083740234375, 'dpo_reward_mean_target': 1.1942901611328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1354], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1019667387008667, 'numerator': 0.1249498799443245, 'denominator': 0.1133880689740181}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.49109649658203125, 'dpo_reward_mean_target': 0.6758575439453125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9802276492118835, 'numerator': 0.12485732138156891, 'denominator': 0.12737584114074707}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.056911468505859375, 'dpo_reward_mean_target': -0.056911468505859375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0361], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297754526138306, 'denominator': 0.13297754526138306}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5810928344726562, 'dpo_reward_mean_target': 0.7587203979492188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0392], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9876288175582886, 'numerator': 0.12921032309532166, 'denominator': 0.1308288276195526}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.017587661743164062, 'dpo_reward_mean_target': 0.017587661743164062, 'standard deviation': 3.0, 'reward_a1': tensor([0.0666], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296306133270264, 'denominator': 0.13296306133270264}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.029596328735351562, 'dpo_reward_mean_target': -0.029596328735351562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296768069267273, 'denominator': 0.13296768069267273}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.956207275390625, 'dpo_reward_mean_target': 2.122833251953125, 'standard deviation': 3.0, 'reward_a1': tensor([4.0566], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3858062028884888, 'numerator': 0.10803424566984177, 'denominator': 0.07795768976211548}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.029083251953125, 'dpo_reward_mean_target': 1.792449951171875, 'standard deviation': 3.0, 'reward_a1': tensor([2.5811], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.98252934217453, 'numerator': 0.12846408784389496, 'denominator': 0.13074834644794464}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0006561279296875, 'dpo_reward_mean_target': 0.8460235595703125, 'standard deviation': 3.0, 'reward_a1': tensor([1.0315], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0587869882583618, 'numerator': 0.13272687792778015, 'denominator': 0.1253574937582016}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.036022186279296875, 'dpo_reward_mean_target': 0.036022186279296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3230], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13237352669239044, 'denominator': 0.13237352669239044}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0010528564453125, 'dpo_reward_mean_target': 0.0010528564453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0011], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -9.5367431640625e-06, 'dpo_reward_mean_target': -9.5367431640625e-06, 'standard deviation': 3.0, 'reward_a1': tensor([-7.6294e-06], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0156097412109375, 'dpo_reward_mean_target': -0.0156097412109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1224], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13284005224704742, 'denominator': 0.13284005224704742}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3895912170410156, 'dpo_reward_mean_target': -0.063232421875, 'standard deviation': 3.0, 'reward_a1': tensor([1.0002], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9587628841400146, 'numerator': 0.12488370388746262, 'denominator': 0.13025504350662231}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11398506164550781, 'dpo_reward_mean_target': 0.11398506164550781, 'standard deviation': 3.0, 'reward_a1': tensor([0.1183], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298062980175018, 'denominator': 0.13298062980175018}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0060787200927734375, 'dpo_reward_mean_target': 0.0060787200927734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4248], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13169169425964355, 'denominator': 0.13169169425964355}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.011249542236328125, 'dpo_reward_mean_target': -0.011249542236328125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0112], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.075469970703125, 'dpo_reward_mean_target': 0.65673828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0260], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9782739281654358, 'numerator': 0.1300739198923111, 'denominator': 0.1329626739025116}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0894775390625, 'dpo_reward_mean_target': 0.43920135498046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2109], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9979240298271179, 'numerator': 0.13259612023830414, 'denominator': 0.13287195563316345}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.071990966796875, 'dpo_reward_mean_target': 0.11173439025878906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0396], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9987862706184387, 'numerator': 0.13281162083148956, 'denominator': 0.13297301530838013}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12042808532714844, 'dpo_reward_mean_target': 0.34527015686035156, 'standard deviation': 3.0, 'reward_a1': tensor([0.1726], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9984947443008423, 'numerator': 0.13276058435440063, 'denominator': 0.1329607218503952}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3232555389404297, 'dpo_reward_mean_target': 0.43140411376953125, 'standard deviation': 3.0, 'reward_a1': tensor([1.3440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0116831064224243, 'numerator': 0.12696857750415802, 'denominator': 0.1255023181438446}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5048732757568359, 'dpo_reward_mean_target': 0.5048732757568359, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0408], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13079917430877686, 'denominator': 0.13079917430877686}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8243789672851562, 'dpo_reward_mean_target': 0.8243789672851562, 'standard deviation': 3.0, 'reward_a1': tensor([1.3176], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13119544088840485, 'denominator': 0.13119544088840485}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.637939453125, 'dpo_reward_mean_target': 0.11409759521484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3538], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0012956857681274, 'numerator': 0.13255704939365387, 'denominator': 0.1323855221271515}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2918243408203125, 'dpo_reward_mean_target': 0.03607940673828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1258], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0092846155166626, 'numerator': 0.13292130827903748, 'denominator': 0.13169853389263153}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.051422119140625, 'dpo_reward_mean_target': 0.0971832275390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0160], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9995411038398743, 'numerator': 0.13288620114326477, 'denominator': 0.13294720649719238}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00359344482421875, 'dpo_reward_mean_target': -0.00359344482421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0036], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13621997833251953, 'dpo_reward_mean_target': 0.13621997833251953, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1153], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13251429796218872, 'denominator': 0.13251429796218872}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.022031784057617188, 'dpo_reward_mean_target': -0.022031784057617188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0323], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297998905181885, 'denominator': 0.13297998905181885}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.351165771484375, 'dpo_reward_mean_target': 1.276458740234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1392], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9329952597618103, 'numerator': 0.12376121431589127, 'denominator': 0.13264934718608856}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.022844314575195312, 'dpo_reward_mean_target': -0.022844314575195312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0228], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04720878601074219, 'dpo_reward_mean_target': 0.070098876953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0211], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999040961265564, 'numerator': 0.13296298682689667, 'denominator': 0.1329757422208786}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.000850677490234375, 'dpo_reward_mean_target': -0.000850677490234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0009], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0214691162109375, 'dpo_reward_mean_target': 0.1539325714111328, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0547], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9976457953453064, 'numerator': 0.13265955448150635, 'denominator': 0.1329725980758667}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.052978515625, 'dpo_reward_mean_target': -0.052978515625, 'standard deviation': 3.0, 'reward_a1': tensor([1.2554], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12091608345508575, 'denominator': 0.12091608345508575}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.738555908203125, 'dpo_reward_mean_target': 1.1443328857421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.7533], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9915528297424316, 'numerator': 0.13185587525367737, 'denominator': 0.13297916948795319}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5555229187011719, 'dpo_reward_mean_target': 0.45877838134765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0429], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0103386640548706, 'numerator': 0.13170908391475677, 'denominator': 0.13036131858825684}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.053768157958984375, 'dpo_reward_mean_target': 0.015148162841796875, 'standard deviation': 3.0, 'reward_a1': tensor([2.9036], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.987763524055481, 'numerator': 0.08365549892187119, 'denominator': 0.08469182997941971}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08343029022216797, 'dpo_reward_mean_target': 0.1656961441040039, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2956], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9961670637130737, 'numerator': 0.13141804933547974, 'denominator': 0.13192370533943176}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05292320251464844, 'dpo_reward_mean_target': 0.05292320251464844, 'standard deviation': 3.0, 'reward_a1': tensor([0.0529], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3826866149902344, 'dpo_reward_mean_target': 0.4491558074951172, 'standard deviation': 3.0, 'reward_a1': tensor([1.0971], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1033289432525635, 'numerator': 0.1299148052930832, 'denominator': 0.11774802953004837}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2253742218017578, 'dpo_reward_mean_target': 0.3621807098388672, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2395], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9919270277023315, 'numerator': 0.13033320009708405, 'denominator': 0.1313939392566681}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6066741943359375, 'dpo_reward_mean_target': 0.7881011962890625, 'standard deviation': 3.0, 'reward_a1': tensor([3.1487], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0506556034088135, 'numerator': 0.0975758358836174, 'denominator': 0.09287138283252716}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6897544860839844, 'dpo_reward_mean_target': 0.6897544860839844, 'standard deviation': 3.0, 'reward_a1': tensor([0.4215], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13245037198066711, 'denominator': 0.13245037198066711}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0564422607421875, 'dpo_reward_mean_target': 0.4181365966796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2968], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0023926496505737, 'numerator': 0.13287195563316345, 'denominator': 0.1325547993183136}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0950775146484375, 'dpo_reward_mean_target': -0.0950775146484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2389], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13282814621925354, 'denominator': 0.13282814621925354}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4691658020019531, 'dpo_reward_mean_target': 0.4691658020019531, 'standard deviation': 3.0, 'reward_a1': tensor([0.4615], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329803466796875, 'denominator': 0.1329803466796875}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.060184478759765625, 'dpo_reward_mean_target': 0.060184478759765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0816], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297735154628754, 'denominator': 0.13297735154628754}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25029754638671875, 'dpo_reward_mean_target': 0.25029754638671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2334], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329786628484726, 'denominator': 0.1329786628484726}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.589691162109375, 'dpo_reward_mean_target': 0.161529541015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.6905], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9393629431724548, 'numerator': 0.11678458005189896, 'denominator': 0.12432317435741425}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2830657958984375, 'dpo_reward_mean_target': 1.6569442749023438, 'standard deviation': 3.0, 'reward_a1': tensor([0.3296], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9537295699119568, 'numerator': 0.12058112770318985, 'denominator': 0.1264311522245407}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06142997741699219, 'dpo_reward_mean_target': -0.06142997741699219, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2789], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326318085193634, 'denominator': 0.1326318085193634}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17518234252929688, 'dpo_reward_mean_target': 0.05042076110839844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0341], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0007085800170898, 'numerator': 0.13292796909809113, 'denominator': 0.1328338384628296}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5486793518066406, 'dpo_reward_mean_target': 0.717620849609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.9770], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0064752101898193, 'numerator': 0.13248470425605774, 'denominator': 0.13163235783576965}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08253860473632812, 'dpo_reward_mean_target': 0.3104534149169922, 'standard deviation': 3.0, 'reward_a1': tensor([0.3465], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0038057565689087, 'numerator': 0.13297118246555328, 'denominator': 0.13246704638004303}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02529144287109375, 'dpo_reward_mean_target': -0.02529144287109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0129], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329796314239502, 'denominator': 0.1329796314239502}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9122848510742188, 'dpo_reward_mean_target': 0.7091293334960938, 'standard deviation': 3.0, 'reward_a1': tensor([2.2706], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9675825238227844, 'numerator': 0.11613393574953079, 'denominator': 0.12002483755350113}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08475875854492188, 'dpo_reward_mean_target': 0.15715789794921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1572], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0032563209533691, 'numerator': 0.13298074901103973, 'denominator': 0.13254912197589874}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02164459228515625, 'dpo_reward_mean_target': -0.02164459228515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0559], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329720914363861, 'denominator': 0.1329720914363861}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02391815185546875, 'dpo_reward_mean_target': 0.02391815185546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2529], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13259388506412506, 'denominator': 0.13259388506412506}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.019287109375, 'dpo_reward_mean_target': 1.206695556640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5553], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.005225419998169, 'numerator': 0.1320861279964447, 'denominator': 0.1313995122909546}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10569572448730469, 'dpo_reward_mean_target': 0.38385772705078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2043], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9987497925758362, 'numerator': 0.13274279236793518, 'denominator': 0.13290895521640778}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06562042236328125, 'dpo_reward_mean_target': 0.06562042236328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297732174396515, 'denominator': 0.13297732174396515}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.17034339904785156, 'dpo_reward_mean_target': -0.1670684814453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0245], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9991437792778015, 'numerator': 0.13270995020866394, 'denominator': 0.13282367587089539}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19487571716308594, 'dpo_reward_mean_target': 0.19487571716308594, 'standard deviation': 3.0, 'reward_a1': tensor([0.1908], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298062980175018, 'denominator': 0.13298062980175018}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.427154541015625, 'dpo_reward_mean_target': 1.1483230590820312, 'standard deviation': 3.0, 'reward_a1': tensor([3.2801], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9401408433914185, 'numerator': 0.10331019014120102, 'denominator': 0.10988799482584}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6756973266601562, 'dpo_reward_mean_target': 0.2091217041015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5452], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9444330930709839, 'numerator': 0.12042564153671265, 'denominator': 0.12751103937625885}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11563873291015625, 'dpo_reward_mean_target': 0.09515953063964844, 'standard deviation': 3.0, 'reward_a1': tensor([0.0009], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000261664390564, 'numerator': 0.1329151690006256, 'denominator': 0.13288040459156036}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.504425048828125, 'dpo_reward_mean_target': 0.0081329345703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8796], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0663340091705322, 'numerator': 0.1274866759777069, 'denominator': 0.11955604702234268}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7060928344726562, 'dpo_reward_mean_target': 0.6522674560546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2243], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0054181814193726, 'numerator': 0.12742328643798828, 'denominator': 0.1267366111278534}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.035022735595703125, 'dpo_reward_mean_target': 0.035022735595703125, 'standard deviation': 3.0, 'reward_a1': tensor([1.0190], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12601616978645325, 'denominator': 0.12601616978645325}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6372489929199219, 'dpo_reward_mean_target': 0.4148101806640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4734], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0013006925582886, 'numerator': 0.1329553872346878, 'denominator': 0.1327826827764511}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.004241943359375, 'dpo_reward_mean_target': 0.14178466796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.8016], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.026769757270813, 'numerator': 0.11410772055387497, 'denominator': 0.11113272607326508}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.013553619384765625, 'dpo_reward_mean_target': -0.013553619384765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2492], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1324717253446579, 'denominator': 0.1324717253446579}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15576934814453125, 'dpo_reward_mean_target': -0.01592254638671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.7042], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0123512744903564, 'numerator': 0.129204660654068, 'denominator': 0.12762828171253204}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08008575439453125, 'dpo_reward_mean_target': 0.31101226806640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.6757], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0246437788009644, 'numerator': 0.13200189173221588, 'denominator': 0.12882710993289948}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20376205444335938, 'dpo_reward_mean_target': -0.09978485107421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1336], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9972505569458008, 'numerator': 0.13257889449596405, 'denominator': 0.13294441998004913}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10393905639648438, 'dpo_reward_mean_target': 0.10393905639648438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0325], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13284337520599365, 'denominator': 0.13284337520599365}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09212303161621094, 'dpo_reward_mean_target': 0.09212303161621094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0483], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283517956733704, 'denominator': 0.13283517956733704}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23415756225585938, 'dpo_reward_mean_target': 0.3154449462890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2177], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9994838833808899, 'numerator': 0.1329101175069809, 'denominator': 0.13297875225543976}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.96771240234375, 'dpo_reward_mean_target': 2.3465728759765625, 'standard deviation': 3.0, 'reward_a1': tensor([3.0746], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2425448894500732, 'numerator': 0.1291222721338272, 'denominator': 0.10391759127378464}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1310253143310547, 'dpo_reward_mean_target': -0.1310253143310547, 'standard deviation': 3.0, 'reward_a1': tensor([0.1229], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325051337480545, 'denominator': 0.1325051337480545}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.40239715576171875, 'dpo_reward_mean_target': 0.8075103759765625, 'standard deviation': 3.0, 'reward_a1': tensor([1.6481], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2144925594329834, 'numerator': 0.12786206603050232, 'denominator': 0.10528023540973663}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2559700012207031, 'dpo_reward_mean_target': 0.2559700012207031, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2365], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1312008649110794, 'denominator': 0.1312008649110794}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.015550613403320312, 'dpo_reward_mean_target': -0.007928848266601562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0587], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000162959098816, 'numerator': 0.13296173512935638, 'denominator': 0.13294006884098053}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00518035888671875, 'dpo_reward_mean_target': -0.00518035888671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0456], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329687088727951, 'denominator': 0.1329687088727951}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09743309020996094, 'dpo_reward_mean_target': 0.27571868896484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.6307], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.022708773612976, 'numerator': 0.13205288350582123, 'denominator': 0.12912070751190186}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3463764190673828, 'dpo_reward_mean_target': 0.3463764190673828, 'standard deviation': 3.0, 'reward_a1': tensor([0.6766], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13217736780643463, 'denominator': 0.13217736780643463}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12039566040039062, 'dpo_reward_mean_target': 0.12039566040039062, 'standard deviation': 3.0, 'reward_a1': tensor([0.4718], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1320713311433792, 'denominator': 0.1320713311433792}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06059074401855469, 'dpo_reward_mean_target': -0.06059074401855469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0293], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329735517501831, 'denominator': 0.1329735517501831}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03823280334472656, 'dpo_reward_mean_target': -0.03823280334472656, 'standard deviation': 3.0, 'reward_a1': tensor([0.2884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13219483196735382, 'denominator': 0.13219483196735382}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5010757446289062, 'dpo_reward_mean_target': 0.656219482421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3600], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9839493036270142, 'numerator': 0.12556585669517517, 'denominator': 0.12761415541172028}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04935264587402344, 'dpo_reward_mean_target': 0.04935264587402344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1198], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327696293592453, 'denominator': 0.1327696293592453}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.996124267578125, 'dpo_reward_mean_target': 0.8846588134765625, 'standard deviation': 3.0, 'reward_a1': tensor([2.2196], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9842819571495056, 'numerator': 0.1204463392496109, 'denominator': 0.12236975133419037}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.51531982421875, 'dpo_reward_mean_target': 0.3102264404296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0400], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0103707313537598, 'numerator': 0.13207775354385376, 'denominator': 0.1307220757007599}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.015535354614257812, 'dpo_reward_mean_target': 0.20148849487304688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0547], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9966331720352173, 'numerator': 0.1324966996908188, 'denominator': 0.13294430077075958}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.006305694580078125, 'dpo_reward_mean_target': 0.47564697265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4649], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0117437839508057, 'numerator': 0.13297992944717407, 'denominator': 0.13143636286258698}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0964202880859375, 'dpo_reward_mean_target': -0.0964202880859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1320], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325957030057907, 'denominator': 0.1325957030057907}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.46503448486328125, 'dpo_reward_mean_target': 0.3913459777832031, 'standard deviation': 3.0, 'reward_a1': tensor([0.3489], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0006495714187622, 'numerator': 0.13296742737293243, 'denominator': 0.13288110494613647}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.87396240234375, 'dpo_reward_mean_target': 0.91937255859375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5861], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0034843683242798, 'numerator': 0.12973728775978088, 'denominator': 0.12928681075572968}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8897476196289062, 'dpo_reward_mean_target': 1.2687454223632812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0179], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.954849362373352, 'numerator': 0.12129612267017365, 'denominator': 0.12703168392181396}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06075286865234375, 'dpo_reward_mean_target': 0.019073486328125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3543], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.001827359199524, 'numerator': 0.1319545954465866, 'denominator': 0.13171391189098358}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2489490509033203, 'dpo_reward_mean_target': 0.2489490509033203, 'standard deviation': 3.0, 'reward_a1': tensor([0.8804], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13006778061389923, 'denominator': 0.13006778061389923}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.011804580688476562, 'dpo_reward_mean_target': -0.011804580688476562, 'standard deviation': 3.0, 'reward_a1': tensor([0.6964], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12932632863521576, 'denominator': 0.12932632863521576}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0950164794921875, 'dpo_reward_mean_target': 1.4742584228515625, 'standard deviation': 3.0, 'reward_a1': tensor([3.4083], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0936142206192017, 'numerator': 0.10802821815013885, 'denominator': 0.09878092259168625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.37653350830078125, 'dpo_reward_mean_target': 1.290679931640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0334], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.020499348640442, 'numerator': 0.13249258697032928, 'denominator': 0.12983113527297974}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03787994384765625, 'dpo_reward_mean_target': -0.03787994384765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0379], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4121818542480469, 'dpo_reward_mean_target': 0.5423507690429688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0879], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9918588399887085, 'numerator': 0.1300782561302185, 'denominator': 0.13114593923091888}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10349082946777344, 'dpo_reward_mean_target': -0.10349082946777344, 'standard deviation': 3.0, 'reward_a1': tensor([1.0617], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.123319111764431, 'denominator': 0.123319111764431}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2786865234375, 'dpo_reward_mean_target': 0.6631393432617188, 'standard deviation': 3.0, 'reward_a1': tensor([2.6226], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0962684154510498, 'numerator': 0.10743745416402817, 'denominator': 0.09800287336111069}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7769775390625, 'dpo_reward_mean_target': 0.7769775390625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0776], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13231457769870758, 'denominator': 0.13231457769870758}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.41284942626953125, 'dpo_reward_mean_target': 0.6068992614746094, 'standard deviation': 3.0, 'reward_a1': tensor([1.6521], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0249332189559937, 'numerator': 0.12514999508857727, 'denominator': 0.12210551649332047}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01941680908203125, 'dpo_reward_mean_target': 0.01941680908203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2844], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323005110025406, 'denominator': 0.1323005110025406}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.010471343994140625, 'dpo_reward_mean_target': 0.010471343994140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0165], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297538459300995, 'denominator': 0.13297538459300995}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.056461334228515625, 'dpo_reward_mean_target': 0.0564422607421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0885], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999998807907104, 'numerator': 0.13297317922115326, 'denominator': 0.13297319412231445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.050357818603515625, 'dpo_reward_mean_target': 0.09639930725097656, 'standard deviation': 3.0, 'reward_a1': tensor([0.1680], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0004843473434448, 'numerator': 0.13294290006160736, 'denominator': 0.13287854194641113}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3701629638671875, 'dpo_reward_mean_target': 0.3701629638671875, 'standard deviation': 3.0, 'reward_a1': tensor([2.7297], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09760318696498871, 'denominator': 0.09760318696498871}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5219955444335938, 'dpo_reward_mean_target': 0.5219955444335938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0535], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13055597245693207, 'denominator': 0.13055597245693207}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6969146728515625, 'dpo_reward_mean_target': 1.2212677001953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.5519], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9765543341636658, 'numerator': 0.12971127033233643, 'denominator': 0.13282544910907745}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4905261993408203, 'dpo_reward_mean_target': 0.4905261993408203, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0085], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1311536580324173, 'denominator': 0.1311536580324173}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10218238830566406, 'dpo_reward_mean_target': 0.10218238830566406, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0023], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132900208234787, 'denominator': 0.132900208234787}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.28871917724609375, 'dpo_reward_mean_target': 1.1358871459960938, 'standard deviation': 3.0, 'reward_a1': tensor([4.5464], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4346301555633545, 'numerator': 0.06968510150909424, 'denominator': 0.0485735647380352}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.004245758056640625, 'dpo_reward_mean_target': 0.060901641845703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7575], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0045742988586426, 'numerator': 0.12944349646568298, 'denominator': 0.12885408103466034}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5610618591308594, 'dpo_reward_mean_target': 0.6716842651367188, 'standard deviation': 3.0, 'reward_a1': tensor([0.5216], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9988357424736023, 'numerator': 0.1328144669532776, 'denominator': 0.13296927511692047}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0011138916015625, 'dpo_reward_mean_target': -0.0011138916015625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0011], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8885498046875, 'dpo_reward_mean_target': 1.252838134765625, 'standard deviation': 3.0, 'reward_a1': tensor([5.3738], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.764423131942749, 'numerator': 0.05176547169685364, 'denominator': 0.06771834939718246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.46756744384765625, 'dpo_reward_mean_target': 0.46756744384765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3565], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13288959860801697, 'denominator': 0.13288959860801697}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11793899536132812, 'dpo_reward_mean_target': -0.11793899536132812, 'standard deviation': 3.0, 'reward_a1': tensor([0.0496], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327735036611557, 'denominator': 0.1327735036611557}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6923789978027344, 'dpo_reward_mean_target': 0.6923789978027344, 'standard deviation': 3.0, 'reward_a1': tensor([0.8480], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13280199468135834, 'denominator': 0.13280199468135834}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0825653076171875, 'dpo_reward_mean_target': -0.10103988647460938, 'standard deviation': 3.0, 'reward_a1': tensor([0.5014], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9896373152732849, 'numerator': 0.13032659888267517, 'denominator': 0.13169127702713013}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2385101318359375, 'dpo_reward_mean_target': 0.28311920166015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1290], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9993467330932617, 'numerator': 0.13280531764030457, 'denominator': 0.13289213180541992}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8528671264648438, 'dpo_reward_mean_target': 1.41595458984375, 'standard deviation': 3.0, 'reward_a1': tensor([2.0172], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0567885637283325, 'numerator': 0.13033635914325714, 'denominator': 0.12333247810602188}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3258094787597656, 'dpo_reward_mean_target': 0.06166839599609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0214], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0066291093826294, 'numerator': 0.1329687535762787, 'denominator': 0.13209310173988342}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.93621826171875, 'dpo_reward_mean_target': 0.951263427734375, 'standard deviation': 3.0, 'reward_a1': tensor([4.1278], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0053372383117676, 'numerator': 0.07591582089662552, 'denominator': 0.07551278918981552}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15907859802246094, 'dpo_reward_mean_target': 0.14776611328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7583], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0263885259628296, 'numerator': 0.13025525212287903, 'denominator': 0.1269063800573349}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16743850708007812, 'dpo_reward_mean_target': -0.16743850708007812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0059], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13278809189796448, 'denominator': 0.13278809189796448}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.37390899658203125, 'dpo_reward_mean_target': 0.795684814453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7135], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0674912929534912, 'numerator': 0.13293083012104034, 'denominator': 0.12452638149261475}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0124969482421875, 'dpo_reward_mean_target': 0.1142120361328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8987], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.012007474899292, 'numerator': 0.12851153314113617, 'denominator': 0.12698674201965332}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6554527282714844, 'dpo_reward_mean_target': 0.14439773559570312, 'standard deviation': 3.0, 'reward_a1': tensor([0.3489], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002902865409851, 'numerator': 0.1326722353696823, 'denominator': 0.13228821754455566}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12257194519042969, 'dpo_reward_mean_target': 0.12257194519042969, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0274], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328146904706955, 'denominator': 0.1328146904706955}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6773681640625, 'dpo_reward_mean_target': 1.556884765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7165], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.031733751296997, 'numerator': 0.09979043155908585, 'denominator': 0.0967211052775383}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5760955810546875, 'dpo_reward_mean_target': 0.0742340087890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5269], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9888139367103577, 'numerator': 0.1314755380153656, 'denominator': 0.13296286761760712}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5185432434082031, 'dpo_reward_mean_target': 0.45892333984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0954], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0026092529296875, 'numerator': 0.13200783729553223, 'denominator': 0.13166429102420807}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.009588241577148438, 'dpo_reward_mean_target': 0.0836639404296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2712], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002428650856018, 'numerator': 0.13272127509117126, 'denominator': 0.13239972293376923}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.46990966796875, 'dpo_reward_mean_target': 1.9386749267578125, 'standard deviation': 3.0, 'reward_a1': tensor([3.9719], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.125367283821106, 'numerator': 0.1056923121213913, 'denominator': 0.09391805529594421}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10936737060546875, 'dpo_reward_mean_target': 0.8218765258789062, 'standard deviation': 3.0, 'reward_a1': tensor([0.3445], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9904559254646301, 'numerator': 0.1313076913356781, 'denominator': 0.1325729787349701}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7514266967773438, 'dpo_reward_mean_target': 0.533355712890625, 'standard deviation': 3.0, 'reward_a1': tensor([1.9402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9690432548522949, 'numerator': 0.11913418024778366, 'denominator': 0.12294000387191772}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.84759521484375, 'dpo_reward_mean_target': 0.15505218505859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.6152], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9912728071212769, 'numerator': 0.1314254254102707, 'denominator': 0.13258250057697296}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.013507843017578125, 'dpo_reward_mean_target': 0.7192306518554688, 'standard deviation': 3.0, 'reward_a1': tensor([1.7511], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1146931648254395, 'numerator': 0.12534314393997192, 'denominator': 0.11244631558656693}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20218849182128906, 'dpo_reward_mean_target': -0.20218849182128906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0091], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327054798603058, 'denominator': 0.1327054798603058}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.775634765625, 'dpo_reward_mean_target': 1.72430419921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8155], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0053437948226929, 'numerator': 0.12701690196990967, 'denominator': 0.12634176015853882}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.420074462890625, 'dpo_reward_mean_target': 0.5461502075195312, 'standard deviation': 3.0, 'reward_a1': tensor([1.9334], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0205243825912476, 'numerator': 0.1194966584444046, 'denominator': 0.11709338426589966}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7534713745117188, 'dpo_reward_mean_target': 0.809967041015625, 'standard deviation': 3.0, 'reward_a1': tensor([4.0966], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0210262537002563, 'numerator': 0.07297459989786148, 'denominator': 0.07147181779146194}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3360748291015625, 'dpo_reward_mean_target': 0.2206268310546875, 'standard deviation': 3.0, 'reward_a1': tensor([2.6631], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7916790843009949, 'numerator': 0.09546554088592529, 'denominator': 0.12058616429567337}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.396392822265625, 'dpo_reward_mean_target': -0.2296295166015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8534], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9478486180305481, 'numerator': 0.12459160387516022, 'denominator': 0.1314467340707779}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07345771789550781, 'dpo_reward_mean_target': 0.43738746643066406, 'standard deviation': 3.0, 'reward_a1': tensor([0.4522], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0079882144927979, 'numerator': 0.13297916948795319, 'denominator': 0.1319253146648407}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8141860961914062, 'dpo_reward_mean_target': 0.36774444580078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2025], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.019455909729004, 'numerator': 0.13277919590473175, 'denominator': 0.1302451640367508}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06020164489746094, 'dpo_reward_mean_target': 0.3204841613769531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0734], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9914295077323914, 'numerator': 0.131839781999588, 'denominator': 0.13297948241233826}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7836074829101562, 'dpo_reward_mean_target': 0.7961273193359375, 'standard deviation': 3.0, 'reward_a1': tensor([2.1587], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0019057989120483, 'numerator': 0.11994776129722595, 'denominator': 0.11971960216760635}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0577774047851562, 'dpo_reward_mean_target': 0.52593994140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4507], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0203657150268555, 'numerator': 0.1329389363527298, 'denominator': 0.1302855759859085}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6318817138671875, 'dpo_reward_mean_target': 0.7385406494140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.9301], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0029058456420898, 'numerator': 0.13271008431911469, 'denominator': 0.13232555985450745}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.46190643310546875, 'dpo_reward_mean_target': -0.46190643310546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2721], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13271494209766388, 'denominator': 0.13271494209766388}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.026956558227539062, 'dpo_reward_mean_target': -0.026956558227539062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0270], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04319000244140625, 'dpo_reward_mean_target': 0.04319000244140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1645], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13287201523780823, 'denominator': 0.13287201523780823}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3402366638183594, 'dpo_reward_mean_target': 0.8648605346679688, 'standard deviation': 3.0, 'reward_a1': tensor([0.3211], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9837296009063721, 'numerator': 0.13081447780132294, 'denominator': 0.13297808170318604}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08018302917480469, 'dpo_reward_mean_target': 0.08018302917480469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1328], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264617323875427, 'denominator': 0.13264617323875427}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4307060241699219, 'dpo_reward_mean_target': 0.6505126953125, 'standard deviation': 3.0, 'reward_a1': tensor([1.3326], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0195302963256836, 'numerator': 0.12958800792694092, 'denominator': 0.12710559368133545}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7286529541015625, 'dpo_reward_mean_target': 1.4308242797851562, 'standard deviation': 3.0, 'reward_a1': tensor([1.0281], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9959761500358582, 'numerator': 0.13178770244121552, 'denominator': 0.13232013583183289}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.176055908203125, 'dpo_reward_mean_target': 0.5707626342773438, 'standard deviation': 3.0, 'reward_a1': tensor([1.7150], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0606017112731934, 'numerator': 0.12365173548460007, 'denominator': 0.11658639460802078}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.63006591796875, 'dpo_reward_mean_target': 0.73822021484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.4337], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.009048342704773, 'numerator': 0.12945474684238434, 'denominator': 0.12829390168190002}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.005169868469238281, 'dpo_reward_mean_target': 0.20030975341796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1206], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0003880262374878, 'numerator': 0.1329338699579239, 'denominator': 0.13288231194019318}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1412181854248047, 'dpo_reward_mean_target': -0.08785820007324219, 'standard deviation': 3.0, 'reward_a1': tensor([0.5217], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9874790906906128, 'numerator': 0.1302637904882431, 'denominator': 0.13191549479961395}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.653900146484375, 'dpo_reward_mean_target': 1.1153411865234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.7043], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.041549563407898, 'numerator': 0.13173848390579224, 'denominator': 0.12648315727710724}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3496246337890625, 'dpo_reward_mean_target': 2.1319046020507812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0138], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8585501313209534, 'numerator': 0.10296761989593506, 'denominator': 0.11993198096752167}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2410125732421875, 'dpo_reward_mean_target': 0.31375885009765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3562], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0006376504898071, 'numerator': 0.13296742737293243, 'denominator': 0.13288269937038422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0077686309814453125, 'dpo_reward_mean_target': 0.0077686309814453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0078], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6588535308837891, 'dpo_reward_mean_target': 0.6588535308837891, 'standard deviation': 3.0, 'reward_a1': tensor([0.8308], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13276241719722748, 'denominator': 0.13276241719722748}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0120849609375, 'dpo_reward_mean_target': 0.6887893676757812, 'standard deviation': 3.0, 'reward_a1': tensor([4.6223], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3787806034088135, 'numerator': 0.0562954843044281, 'denominator': 0.04082990810275078}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07242774963378906, 'dpo_reward_mean_target': -0.07242774963378906, 'standard deviation': 3.0, 'reward_a1': tensor([0.4232], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13117849826812744, 'denominator': 0.13117849826812744}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03569602966308594, 'dpo_reward_mean_target': -0.03569602966308594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0357], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.735565185546875, 'dpo_reward_mean_target': 0.56207275390625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7585], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9788362383842468, 'numerator': 0.12281537055969238, 'denominator': 0.12547080218791962}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8447189331054688, 'dpo_reward_mean_target': 1.118804931640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1323], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9666420221328735, 'numerator': 0.12190576642751694, 'denominator': 0.12611262500286102}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.073150634765625, 'dpo_reward_mean_target': 0.4456939697265625, 'standard deviation': 3.0, 'reward_a1': tensor([2.4759], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0960899591445923, 'numerator': 0.10576443374156952, 'denominator': 0.0964924767613411}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2389678955078125, 'dpo_reward_mean_target': 0.2389678955078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.6068], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13198477029800415, 'denominator': 0.13198477029800415}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.18406963348388672, 'dpo_reward_mean_target': -0.18406963348388672, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0211], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13278482854366302, 'denominator': 0.13278482854366302}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22849273681640625, 'dpo_reward_mean_target': 0.1460113525390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1456], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0078036785125732, 'numerator': 0.13298074901103973, 'denominator': 0.13195104897022247}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.471771240234375, 'dpo_reward_mean_target': 0.471771240234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0303], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1311315894126892, 'denominator': 0.1311315894126892}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.000446319580078125, 'dpo_reward_mean_target': -0.000446319580078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0004], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5379180908203125, 'dpo_reward_mean_target': 2.1869659423828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8915], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9323763251304626, 'numerator': 0.12114317715167999, 'denominator': 0.12992948293685913}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3451042175292969, 'dpo_reward_mean_target': 0.3451042175292969, 'standard deviation': 3.0, 'reward_a1': tensor([-1.0014], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12023907154798508, 'denominator': 0.12023907154798508}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10065269470214844, 'dpo_reward_mean_target': -0.10065269470214844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1861], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292686641216278, 'denominator': 0.13292686641216278}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.01110076904296875, 'dpo_reward_mean_target': 0.26796722412109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.8073], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.021273136138916, 'numerator': 0.13084910809993744, 'denominator': 0.12812352180480957}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05072784423828125, 'dpo_reward_mean_target': -0.05072784423828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.4149], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1313885748386383, 'denominator': 0.1313885748386383}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.67779541015625, 'dpo_reward_mean_target': 0.99761962890625, 'standard deviation': 3.0, 'reward_a1': tensor([1.4403], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.021645188331604, 'numerator': 0.13154082000255585, 'denominator': 0.1287539154291153}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0010395050048828125, 'dpo_reward_mean_target': 0.0010395050048828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0010], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2577667236328125, 'dpo_reward_mean_target': 1.5811843872070312, 'standard deviation': 3.0, 'reward_a1': tensor([3.3065], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.070163607597351, 'numerator': 0.112711101770401, 'denominator': 0.10532137751579285}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08528900146484375, 'dpo_reward_mean_target': 0.08528900146484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2628], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13274817168712616, 'denominator': 0.13274817168712616}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4186553955078125, 'dpo_reward_mean_target': 0.68017578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7473], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0057657957077026, 'numerator': 0.13294751942157745, 'denominator': 0.13218536972999573}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6523971557617188, 'dpo_reward_mean_target': 0.15032196044921875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5705], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.05571448802948, 'numerator': 0.12919676303863525, 'denominator': 0.12237851321697235}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04076385498046875, 'dpo_reward_mean_target': 0.11340713500976562, 'standard deviation': 3.0, 'reward_a1': tensor([0.4348], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.006850004196167, 'numerator': 0.132219597697258, 'denominator': 0.131320059299469}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.758270263671875, 'dpo_reward_mean_target': 1.0928421020507812, 'standard deviation': 3.0, 'reward_a1': tensor([5.3665], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1795004606246948, 'numerator': 0.048207834362983704, 'denominator': 0.04087140038609505}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9911651611328125, 'dpo_reward_mean_target': 1.2632598876953125, 'standard deviation': 3.0, 'reward_a1': tensor([3.8955], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.087294340133667, 'numerator': 0.09049403667449951, 'denominator': 0.08322864770889282}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5091476440429688, 'dpo_reward_mean_target': 0.4139595031738281, 'standard deviation': 3.0, 'reward_a1': tensor([0.0849], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0039914846420288, 'numerator': 0.1321832686662674, 'denominator': 0.13165776431560516}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0030002593994140625, 'dpo_reward_mean_target': 0.0030002593994140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8541], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12773573398590088, 'denominator': 0.12773573398590088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6003875732421875, 'dpo_reward_mean_target': 0.6003875732421875, 'standard deviation': 3.0, 'reward_a1': tensor([4.8135], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.04960476979613304, 'denominator': 0.04960476979613304}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.36040496826171875, 'dpo_reward_mean_target': 0.27909088134765625, 'standard deviation': 3.0, 'reward_a1': tensor([2.3727], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9816226363182068, 'numerator': 0.10423911362886429, 'denominator': 0.10619061440229416}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7942657470703125, 'dpo_reward_mean_target': 0.228912353515625, 'standard deviation': 3.0, 'reward_a1': tensor([3.5873], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8243123292922974, 'numerator': 0.07106632739305496, 'denominator': 0.08621286600828171}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21929550170898438, 'dpo_reward_mean_target': 0.07326126098632812, 'standard deviation': 3.0, 'reward_a1': tensor([1.5429], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9775936007499695, 'numerator': 0.1179448738694191, 'denominator': 0.12064816802740097}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0857162475585938, 'dpo_reward_mean_target': 1.067626953125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3558], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0028830766677856, 'numerator': 0.11882461607456207, 'denominator': 0.11848302185535431}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11670684814453125, 'dpo_reward_mean_target': 0.13080215454101562, 'standard deviation': 3.0, 'reward_a1': tensor([0.4086], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.00044584274292, 'numerator': 0.13241203129291534, 'denominator': 0.1323530226945877}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00365447998046875, 'dpo_reward_mean_target': -0.00365447998046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0037], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5011138916015625, 'dpo_reward_mean_target': 0.67657470703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.005524754524231, 'numerator': 0.13270044326782227, 'denominator': 0.1319713294506073}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13490676879882812, 'dpo_reward_mean_target': 0.13490676879882812, 'standard deviation': 3.0, 'reward_a1': tensor([0.2412], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328973025083542, 'denominator': 0.1328973025083542}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.50970458984375, 'dpo_reward_mean_target': 1.1093063354492188, 'standard deviation': 3.0, 'reward_a1': tensor([1.3748], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0383764505386353, 'numerator': 0.13246123492717743, 'denominator': 0.12756571173667908}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03218650817871094, 'dpo_reward_mean_target': -0.11197662353515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9140], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0074920654296875, 'numerator': 0.12831231951713562, 'denominator': 0.12735813856124878}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.17884445190429688, 'dpo_reward_mean_target': 0.414398193359375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2008], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9870649576187134, 'numerator': 0.13021370768547058, 'denominator': 0.13192009925842285}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08404922485351562, 'dpo_reward_mean_target': 0.9224777221679688, 'standard deviation': 3.0, 'reward_a1': tensor([4.2642], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.537274718284607, 'numerator': 0.07150743901729584, 'denominator': 0.04651571810245514}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08504104614257812, 'dpo_reward_mean_target': -0.08504104614257812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0014], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292904198169708, 'denominator': 0.13292904198169708}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9848709106445312, 'dpo_reward_mean_target': 1.130218505859375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1719], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9803415536880493, 'numerator': 0.12102735787630081, 'denominator': 0.12345428019762039}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0026683807373046875, 'dpo_reward_mean_target': -0.0026683807373046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2030], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13266853988170624, 'denominator': 0.13266853988170624}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0467071533203125, 'dpo_reward_mean_target': 1.3530654907226562, 'standard deviation': 3.0, 'reward_a1': tensor([1.0954], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9964485168457031, 'numerator': 0.13249105215072632, 'denominator': 0.13296326994895935}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15476226806640625, 'dpo_reward_mean_target': 0.49733734130859375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5009], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1011346578598022, 'numerator': 0.12574389576911926, 'denominator': 0.1141948401927948}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.856842041015625, 'dpo_reward_mean_target': 2.089324951171875, 'standard deviation': 3.0, 'reward_a1': tensor([3.3439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0360437631607056, 'numerator': 0.121847003698349, 'denominator': 0.1176079660654068}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2126998901367188, 'dpo_reward_mean_target': 1.2126998901367188, 'standard deviation': 3.0, 'reward_a1': tensor([4.0435], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.08520057797431946, 'denominator': 0.08520057797431946}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3892364501953125, 'dpo_reward_mean_target': 0.3665924072265625, 'standard deviation': 3.0, 'reward_a1': tensor([2.4584], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2304903268814087, 'numerator': 0.10428256541490555, 'denominator': 0.08474878966808319}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.900909423828125, 'dpo_reward_mean_target': 0.7666549682617188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1758], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0151736736297607, 'numerator': 0.1265784651041031, 'denominator': 0.12468651682138443}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7307777404785156, 'dpo_reward_mean_target': 0.840484619140625, 'standard deviation': 3.0, 'reward_a1': tensor([2.3489], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0192389488220215, 'numerator': 0.11718940734863281, 'denominator': 0.11497736722230911}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.28766632080078125, 'dpo_reward_mean_target': 0.39778900146484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.6655], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0039576292037964, 'numerator': 0.13245221972465515, 'denominator': 0.13193008303642273}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4402923583984375, 'dpo_reward_mean_target': 1.990020751953125, 'standard deviation': 3.0, 'reward_a1': tensor([2.9035], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0752848386764526, 'numerator': 0.12695671617984772, 'denominator': 0.11806798726320267}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.25609397888183594, 'dpo_reward_mean_target': -0.25609397888183594, 'standard deviation': 3.0, 'reward_a1': tensor([1.2645], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1169506162405014, 'denominator': 0.1169506162405014}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0001354217529296875, 'dpo_reward_mean_target': 0.0001354217529296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0001], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4049224853515625, 'dpo_reward_mean_target': 0.7815055847167969, 'standard deviation': 3.0, 'reward_a1': tensor([1.4597], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.182461142539978, 'numerator': 0.12962622940540314, 'denominator': 0.10962409526109695}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9248275756835938, 'dpo_reward_mean_target': 1.0994110107421875, 'standard deviation': 3.0, 'reward_a1': tensor([3.9658], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0589697360992432, 'numerator': 0.0842461958527565, 'denominator': 0.07955486327409744}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26807403564453125, 'dpo_reward_mean_target': 0.4833564758300781, 'standard deviation': 3.0, 'reward_a1': tensor([1.0448], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0161335468292236, 'numerator': 0.13067224621772766, 'denominator': 0.12859751284122467}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.047504425048828125, 'dpo_reward_mean_target': 0.18976593017578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.9474], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0233697891235352, 'numerator': 0.12880711257457733, 'denominator': 0.1258656531572342}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23844528198242188, 'dpo_reward_mean_target': -0.23844528198242188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4517], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264510035514832, 'denominator': 0.13264510035514832}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09050941467285156, 'dpo_reward_mean_target': -0.09050941467285156, 'standard deviation': 3.0, 'reward_a1': tensor([0.0509], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283315300941467, 'denominator': 0.13283315300941467}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1677932739257812, 'dpo_reward_mean_target': 1.3863067626953125, 'standard deviation': 3.0, 'reward_a1': tensor([4.0233], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0689492225646973, 'numerator': 0.0903688371181488, 'denominator': 0.08453987538814545}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.071136474609375, 'dpo_reward_mean_target': 0.7667694091796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.8011], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9461272954940796, 'numerator': 0.12530791759490967, 'denominator': 0.13244298100471497}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13616943359375, 'dpo_reward_mean_target': -0.13616943359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2522], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13187134265899658, 'denominator': 0.13187134265899658}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.007732391357421875, 'dpo_reward_mean_target': 0.007732391357421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0176], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297602534294128, 'denominator': 0.13297602534294128}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2338695526123047, 'dpo_reward_mean_target': 0.4418468475341797, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2225], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9757867455482483, 'numerator': 0.12975992262363434, 'denominator': 0.13297979533672333}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3678436279296875, 'dpo_reward_mean_target': 0.3678436279296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3664], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.191802978515625, 'dpo_reward_mean_target': 1.012451171875, 'standard deviation': 3.0, 'reward_a1': tensor([2.3203], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9101860523223877, 'numerator': 0.12092629075050354, 'denominator': 0.13285887241363525}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.560150146484375, 'dpo_reward_mean_target': 0.5643997192382812, 'standard deviation': 3.0, 'reward_a1': tensor([5.4113], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0022919178009033, 'numerator': 0.03605552390217781, 'denominator': 0.03597307577729225}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.586212158203125, 'dpo_reward_mean_target': 1.0258026123046875, 'standard deviation': 3.0, 'reward_a1': tensor([6.3528], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.45460546016693115, 'numerator': 0.027486195787787437, 'denominator': 0.06046164780855179}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.31671142578125, 'dpo_reward_mean_target': 0.5190200805664062, 'standard deviation': 3.0, 'reward_a1': tensor([1.4940], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9502187967300415, 'numerator': 0.12614040076732635, 'denominator': 0.1327487975358963}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9861640930175781, 'dpo_reward_mean_target': 1.5911102294921875, 'standard deviation': 3.0, 'reward_a1': tensor([4.8937], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2741984128952026, 'numerator': 0.07254906743764877, 'denominator': 0.05693702772259712}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2779083251953125, 'dpo_reward_mean_target': 0.8460845947265625, 'standard deviation': 3.0, 'reward_a1': tensor([8.1967], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7101182341575623, 'numerator': 0.006609369069337845, 'denominator': 0.009307420812547207}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.249664306640625, 'dpo_reward_mean_target': 0.249664306640625, 'standard deviation': 3.0, 'reward_a1': tensor([2.1273], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10932575911283493, 'denominator': 0.10932575911283493}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1755523681640625, 'dpo_reward_mean_target': -0.1755523681640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3012], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13131235539913177, 'denominator': 0.13131235539913177}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8303451538085938, 'dpo_reward_mean_target': 0.8303451538085938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4977], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12056850641965866, 'denominator': 0.12056850641965866}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.467529296875, 'dpo_reward_mean_target': 0.03312492370605469, 'standard deviation': 3.0, 'reward_a1': tensor([0.1757], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.003610372543335, 'numerator': 0.13283073902130127, 'denominator': 0.13235288858413696}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.233367919921875, 'dpo_reward_mean_target': 1.228790283203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.9119], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9996538758277893, 'numerator': 0.1295773833990097, 'denominator': 0.12962225079536438}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.44995880126953125, 'dpo_reward_mean_target': 0.773834228515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0939], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9749215841293335, 'numerator': 0.12753301858901978, 'denominator': 0.1308136135339737}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5093994140625, 'dpo_reward_mean_target': 0.36724090576171875, 'standard deviation': 3.0, 'reward_a1': tensor([5.5105], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.923008143901825, 'numerator': 0.030587393790483475, 'denominator': 0.033138811588287354}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04007148742675781, 'dpo_reward_mean_target': 0.04007148742675781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1204], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327907294034958, 'denominator': 0.1327907294034958}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5856094360351562, 'dpo_reward_mean_target': 0.6301078796386719, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6994], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.993557333946228, 'numerator': 0.1205422431230545, 'denominator': 0.12132389098405838}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.02503204345703125, 'dpo_reward_mean_target': 0.07492256164550781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1134], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9990948438644409, 'numerator': 0.13271912932395935, 'denominator': 0.1328393667936325}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.325653076171875, 'dpo_reward_mean_target': 1.338958740234375, 'standard deviation': 3.0, 'reward_a1': tensor([2.9904], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8807686567306519, 'numerator': 0.11428529769182205, 'denominator': 0.12975631654262543}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4230232238769531, 'dpo_reward_mean_target': -0.11163711547851562, 'standard deviation': 3.0, 'reward_a1': tensor([0.3616], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0219987630844116, 'numerator': 0.13133646547794342, 'denominator': 0.12850941717624664}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05050849914550781, 'dpo_reward_mean_target': 0.2999000549316406, 'standard deviation': 3.0, 'reward_a1': tensor([0.4890], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0087323188781738, 'numerator': 0.13271693885326385, 'denominator': 0.131568044424057}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0325775146484375, 'dpo_reward_mean_target': 0.4724578857421875, 'standard deviation': 3.0, 'reward_a1': tensor([1.4508], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9574707746505737, 'numerator': 0.12609371542930603, 'denominator': 0.13169458508491516}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.106292724609375, 'dpo_reward_mean_target': 1.1522216796875, 'standard deviation': 3.0, 'reward_a1': tensor([3.7126], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.5620962381362915, 'numerator': 0.09238777309656143, 'denominator': 0.05914345756173134}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6318511962890625, 'dpo_reward_mean_target': 0.46961212158203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8684], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.994289755821228, 'numerator': 0.1318109929561615, 'denominator': 0.13256798684597015}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5791015625, 'dpo_reward_mean_target': 0.42406463623046875, 'standard deviation': 3.0, 'reward_a1': tensor([1.6043], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9811838865280151, 'numerator': 0.12307853996753693, 'denominator': 0.12543880939483643}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.570220947265625, 'dpo_reward_mean_target': 0.4332275390625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0221], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9921104311943054, 'numerator': 0.13044343888759613, 'denominator': 0.13148076832294464}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10052108764648438, 'dpo_reward_mean_target': -0.10052108764648438, 'standard deviation': 3.0, 'reward_a1': tensor([0.9845], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12456130981445312, 'denominator': 0.12456130981445312}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.445159912109375, 'dpo_reward_mean_target': 1.165771484375, 'standard deviation': 3.0, 'reward_a1': tensor([12.8571], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6986559629440308, 'numerator': 6.697278877254575e-05, 'denominator': 9.585946827428415e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.626861572265625, 'dpo_reward_mean_target': 0.31536865234375, 'standard deviation': 3.0, 'reward_a1': tensor([1.1490], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9768096208572388, 'numerator': 0.12794393301010132, 'denominator': 0.1309814453125}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08405494689941406, 'dpo_reward_mean_target': -0.08405494689941406, 'standard deviation': 3.0, 'reward_a1': tensor([0.3882], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13134343922138214, 'denominator': 0.13134343922138214}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4257965087890625, 'dpo_reward_mean_target': 0.5797958374023438, 'standard deviation': 3.0, 'reward_a1': tensor([1.2926], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9731270670890808, 'numerator': 0.12927968800067902, 'denominator': 0.13284975290298462}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5836639404296875, 'dpo_reward_mean_target': 0.8468170166015625, 'standard deviation': 3.0, 'reward_a1': tensor([4.4568], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7669057250022888, 'numerator': 0.06447070837020874, 'denominator': 0.08406601846218109}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08193778991699219, 'dpo_reward_mean_target': 0.18005943298339844, 'standard deviation': 3.0, 'reward_a1': tensor([0.1838], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0005759000778198, 'numerator': 0.13298064470291138, 'denominator': 0.13290411233901978}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14098358154296875, 'dpo_reward_mean_target': 0.14098358154296875, 'standard deviation': 3.0, 'reward_a1': tensor([1.0557], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1269412487745285, 'denominator': 0.1269412487745285}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12264060974121094, 'dpo_reward_mean_target': 0.15901756286621094, 'standard deviation': 3.0, 'reward_a1': tensor([1.1884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0042431354522705, 'numerator': 0.1253790408372879, 'denominator': 0.12484928220510483}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.005367279052734375, 'dpo_reward_mean_target': 0.005367279052734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0406], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297158479690552, 'denominator': 0.13297158479690552}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05097389221191406, 'dpo_reward_mean_target': -0.05097389221191406, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0616], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297992944717407, 'denominator': 0.13297992944717407}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14446449279785156, 'dpo_reward_mean_target': 0.14446449279785156, 'standard deviation': 3.0, 'reward_a1': tensor([0.1903], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296528160572052, 'denominator': 0.13296528160572052}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.386993408203125, 'dpo_reward_mean_target': 0.9276504516601562, 'standard deviation': 3.0, 'reward_a1': tensor([4.4302], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8461629748344421, 'numerator': 0.06726519018411636, 'denominator': 0.07949436455965042}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.009973526000976562, 'dpo_reward_mean_target': -0.009973526000976562, 'standard deviation': 3.0, 'reward_a1': tensor([0.6691], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12961673736572266, 'denominator': 0.12961673736572266}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.43405914306640625, 'dpo_reward_mean_target': 0.5020370483398438, 'standard deviation': 3.0, 'reward_a1': tensor([1.3186], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0064451694488525, 'numerator': 0.12814484536647797, 'denominator': 0.12732422351837158}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5916290283203125, 'dpo_reward_mean_target': 0.5789070129394531, 'standard deviation': 3.0, 'reward_a1': tensor([0.1121], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0006691217422485, 'numerator': 0.1313803344964981, 'denominator': 0.13129247725009918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04294776916503906, 'dpo_reward_mean_target': -0.04294776916503906, 'standard deviation': 3.0, 'reward_a1': tensor([0.1715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264130055904388, 'denominator': 0.13264130055904388}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5800819396972656, 'dpo_reward_mean_target': 0.5800819396972656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0167], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13037517666816711, 'denominator': 0.13037517666816711}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.17750930786132812, 'dpo_reward_mean_target': 0.7527351379394531, 'standard deviation': 3.0, 'reward_a1': tensor([0.5326], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0043203830718994, 'numerator': 0.13262315094470978, 'denominator': 0.13205263018608093}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03939628601074219, 'dpo_reward_mean_target': 0.03939628601074219, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2353], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13242460787296295, 'denominator': 0.13242460787296295}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12435436248779297, 'dpo_reward_mean_target': 0.17508792877197266, 'standard deviation': 3.0, 'reward_a1': tensor([0.1375], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999308586120605, 'numerator': 0.13297028839588165, 'denominator': 0.13297948241233826}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.027435302734375, 'dpo_reward_mean_target': 0.5827102661132812, 'standard deviation': 3.0, 'reward_a1': tensor([0.1897], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9929050803184509, 'numerator': 0.13184437155723572, 'denominator': 0.13278648257255554}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.63427734375, 'dpo_reward_mean_target': 1.80377197265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.9206], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0279215574264526, 'numerator': 0.13287997245788574, 'denominator': 0.129270538687706}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.020610809326171875, 'dpo_reward_mean_target': -0.020610809326171875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0206], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11253929138183594, 'dpo_reward_mean_target': -0.11253929138183594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329421103000641, 'denominator': 0.1329421103000641}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05776023864746094, 'dpo_reward_mean_target': -0.05776023864746094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2221], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327814757823944, 'denominator': 0.1327814757823944}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.31777381896972656, 'dpo_reward_mean_target': 0.31777381896972656, 'standard deviation': 3.0, 'reward_a1': tensor([1.3499], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1253385692834854, 'denominator': 0.1253385692834854}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.24072265625, 'dpo_reward_mean_target': 1.45843505859375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1250], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8764567375183105, 'numerator': 0.11568881571292877, 'denominator': 0.1319960355758667}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6119117736816406, 'dpo_reward_mean_target': 0.42247772216796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1953], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0151093006134033, 'numerator': 0.13019105792045593, 'denominator': 0.12825323641300201}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05938529968261719, 'dpo_reward_mean_target': -0.1674213409423828, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4974], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0112367868423462, 'numerator': 0.13217861950397491, 'denominator': 0.13070985674858093}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.523895263671875, 'dpo_reward_mean_target': 0.5536041259765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.9188], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0012550354003906, 'numerator': 0.13199912011623383, 'denominator': 0.13183365762233734}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6262283325195312, 'dpo_reward_mean_target': -0.5173416137695312, 'standard deviation': 3.0, 'reward_a1': tensor([-2.1469], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9811244010925293, 'numerator': 0.11474219709634781, 'denominator': 0.11694969236850739}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4379005432128906, 'dpo_reward_mean_target': 0.4379005432128906, 'standard deviation': 3.0, 'reward_a1': tensor([0.7532], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13224853575229645, 'denominator': 0.13224853575229645}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3072090148925781, 'dpo_reward_mean_target': -0.3072090148925781, 'standard deviation': 3.0, 'reward_a1': tensor([0.8257], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12382856756448746, 'denominator': 0.12382856756448746}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6586990356445312, 'dpo_reward_mean_target': 0.7681503295898438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9366], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9801328778266907, 'numerator': 0.11315307766199112, 'denominator': 0.11544667184352875}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.023649215698242188, 'dpo_reward_mean_target': 0.023649215698242188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0236], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4006614685058594, 'dpo_reward_mean_target': 1.0231704711914062, 'standard deviation': 3.0, 'reward_a1': tensor([0.0815], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9643076658248901, 'numerator': 0.12658865749835968, 'denominator': 0.13127413392066956}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.009882926940917969, 'dpo_reward_mean_target': 0.07934951782226562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0850], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9988133907318115, 'numerator': 0.13278132677078247, 'denominator': 0.13293907046318054}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.40545082092285156, 'dpo_reward_mean_target': 0.5371341705322266, 'standard deviation': 3.0, 'reward_a1': tensor([0.3374], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9980427622795105, 'numerator': 0.13268634676933289, 'denominator': 0.13294655084609985}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.005428314208984375, 'dpo_reward_mean_target': -0.005428314208984375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0054], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7445220947265625, 'dpo_reward_mean_target': 0.2265472412109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3452], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0081112384796143, 'numerator': 0.13287685811519623, 'denominator': 0.13180772960186005}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8078079223632812, 'dpo_reward_mean_target': 0.9789657592773438, 'standard deviation': 3.0, 'reward_a1': tensor([0.7480], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9972380995750427, 'numerator': 0.13258710503578186, 'denominator': 0.13295431435108185}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08278656005859375, 'dpo_reward_mean_target': -0.19305419921875, 'standard deviation': 3.0, 'reward_a1': tensor([1.5717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9792718291282654, 'numerator': 0.1118532046675682, 'denominator': 0.11422079056501389}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04083061218261719, 'dpo_reward_mean_target': 0.04083061218261719, 'standard deviation': 3.0, 'reward_a1': tensor([0.5620], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1309891939163208, 'denominator': 0.1309891939163208}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.052295684814453125, 'dpo_reward_mean_target': 0.5316619873046875, 'standard deviation': 3.0, 'reward_a1': tensor([1.3907], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.060270071029663, 'numerator': 0.12763862311840057, 'denominator': 0.1203831285238266}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.80322265625, 'dpo_reward_mean_target': 1.7742843627929688, 'standard deviation': 3.0, 'reward_a1': tensor([5.5440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.582687258720398, 'numerator': 0.06038348376750946, 'denominator': 0.038152504712343216}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.550750732421875, 'dpo_reward_mean_target': 0.84844970703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8955], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0240106582641602, 'numerator': 0.1329643875360489, 'denominator': 0.12984667718410492}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.052234649658203125, 'dpo_reward_mean_target': -0.052234649658203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0495], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298067450523376, 'denominator': 0.13298067450523376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.26160430908203125, 'dpo_reward_mean_target': -0.26160430908203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0026], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1324859857559204, 'denominator': 0.1324859857559204}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04930877685546875, 'dpo_reward_mean_target': 0.480438232421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1037], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.993435800075531, 'numerator': 0.1319361925125122, 'denominator': 0.13280797004699707}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0004444122314453125, 'dpo_reward_mean_target': 0.0004444122314453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0004], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05755805969238281, 'dpo_reward_mean_target': 0.05755805969238281, 'standard deviation': 3.0, 'reward_a1': tensor([0.8653], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12824741005897522, 'denominator': 0.12824741005897522}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21987533569335938, 'dpo_reward_mean_target': 0.4981536865234375, 'standard deviation': 3.0, 'reward_a1': tensor([2.0990], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0552730560302734, 'numerator': 0.1153339371085167, 'denominator': 0.10929297655820847}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4600372314453125, 'dpo_reward_mean_target': 0.056362152099609375, 'standard deviation': 3.0, 'reward_a1': tensor([1.9677], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8280802369117737, 'numerator': 0.10855305194854736, 'denominator': 0.13109001517295837}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5841331481933594, 'dpo_reward_mean_target': 0.15563583374023438, 'standard deviation': 3.0, 'reward_a1': tensor([0.8191], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9788395166397095, 'numerator': 0.12976819276809692, 'denominator': 0.13257351517677307}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0252342224121094, 'dpo_reward_mean_target': 1.0875282287597656, 'standard deviation': 3.0, 'reward_a1': tensor([1.1613], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0007264614105225, 'numerator': 0.13294054567813873, 'denominator': 0.13284404575824738}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3420562744140625, 'dpo_reward_mean_target': 0.12335586547851562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2016], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0106085538864136, 'numerator': 0.1322030872106552, 'denominator': 0.130815327167511}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10487747192382812, 'dpo_reward_mean_target': -0.10487747192382812, 'standard deviation': 3.0, 'reward_a1': tensor([0.9018], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12570130825042725, 'denominator': 0.12570130825042725}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.187286376953125, 'dpo_reward_mean_target': 0.03968620300292969, 'standard deviation': 3.0, 'reward_a1': tensor([1.2528], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9814881086349487, 'numerator': 0.12254045903682709, 'denominator': 0.12485170364379883}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8208541870117188, 'dpo_reward_mean_target': 0.573944091796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.3454], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9823805689811707, 'numerator': 0.12865620851516724, 'denominator': 0.13096371293067932}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.86663818359375, 'dpo_reward_mean_target': 1.2882537841796875, 'standard deviation': 3.0, 'reward_a1': tensor([2.5463], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0712311267852783, 'numerator': 0.12178810685873032, 'denominator': 0.1136898547410965}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6222991943359375, 'dpo_reward_mean_target': 0.7461128234863281, 'standard deviation': 3.0, 'reward_a1': tensor([1.4370], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0104103088378906, 'numerator': 0.1295005977153778, 'denominator': 0.1281663477420807}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17116546630859375, 'dpo_reward_mean_target': 0.1291332244873047, 'standard deviation': 3.0, 'reward_a1': tensor([0.7668], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.026634693145752, 'numerator': 0.13001056015491486, 'denominator': 0.12663760781288147}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18773841857910156, 'dpo_reward_mean_target': 0.18773841857910156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0026], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327134072780609, 'denominator': 0.1327134072780609}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.183135986328125, 'dpo_reward_mean_target': 2.2870941162109375, 'standard deviation': 3.0, 'reward_a1': tensor([6.5279], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0508357286453247, 'numerator': 0.04896337911486626, 'denominator': 0.04659470170736313}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1999187469482422, 'dpo_reward_mean_target': -0.030750274658203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2756], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.007375717163086, 'numerator': 0.1322891265153885, 'denominator': 0.1313205510377884}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0301666259765625, 'dpo_reward_mean_target': 0.3626251220703125, 'standard deviation': 3.0, 'reward_a1': tensor([1.7595], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0720088481903076, 'numerator': 0.11931981146335602, 'denominator': 0.111304871737957}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.054229736328125, 'dpo_reward_mean_target': -0.054229736328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.5904], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12994599342346191, 'denominator': 0.12994599342346191}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0574531555175781, 'dpo_reward_mean_target': 0.6567726135253906, 'standard deviation': 3.0, 'reward_a1': tensor([0.7388], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0052818059921265, 'numerator': 0.13293108344078064, 'denominator': 0.1322326511144638}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9030075073242188, 'dpo_reward_mean_target': 1.2839813232421875, 'standard deviation': 3.0, 'reward_a1': tensor([3.1685], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8973320722579956, 'numerator': 0.10916999727487564, 'denominator': 0.12166064232587814}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.032581329345703125, 'dpo_reward_mean_target': 0.09792709350585938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1191], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9978021383285522, 'numerator': 0.13263334333896637, 'denominator': 0.13292549550533295}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2193984985351562, 'dpo_reward_mean_target': 0.7358245849609375, 'standard deviation': 3.0, 'reward_a1': tensor([2.7098], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9111266136169434, 'numerator': 0.10709531605243683, 'denominator': 0.11754164099693298}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0554046630859375, 'dpo_reward_mean_target': 0.18228912353515625, 'standard deviation': 3.0, 'reward_a1': tensor([-1.3871], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9789917469024658, 'numerator': 0.11597377061843872, 'denominator': 0.1184624582529068}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.012897491455078125, 'dpo_reward_mean_target': -0.012897491455078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0129], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.18271446228027344, 'dpo_reward_mean_target': -0.18271446228027344, 'standard deviation': 3.0, 'reward_a1': tensor([0.5180], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12940195202827454, 'denominator': 0.12940195202827454}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10581779479980469, 'dpo_reward_mean_target': 0.10581779479980469, 'standard deviation': 3.0, 'reward_a1': tensor([0.0793], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297554850578308, 'denominator': 0.13297554850578308}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0320892333984375, 'dpo_reward_mean_target': -0.065948486328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2035], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9976014494895935, 'numerator': 0.13244543969631195, 'denominator': 0.13276387751102448}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6043014526367188, 'dpo_reward_mean_target': 0.8653030395507812, 'standard deviation': 3.0, 'reward_a1': tensor([0.9869], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0073384046554565, 'numerator': 0.13287153840065002, 'denominator': 0.13190357387065887}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08095359802246094, 'dpo_reward_mean_target': 0.08095359802246094, 'standard deviation': 3.0, 'reward_a1': tensor([0.6334], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13074520230293274, 'denominator': 0.13074520230293274}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21801376342773438, 'dpo_reward_mean_target': -0.11028289794921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2080], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9943926334381104, 'numerator': 0.1322343498468399, 'denominator': 0.13298001885414124}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.004657745361328125, 'dpo_reward_mean_target': 0.2621421813964844, 'standard deviation': 3.0, 'reward_a1': tensor([1.5153], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0403269529342651, 'numerator': 0.12187077105045319, 'denominator': 0.11714661121368408}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09670829772949219, 'dpo_reward_mean_target': -0.09670829772949219, 'standard deviation': 3.0, 'reward_a1': tensor([0.8484], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12654243409633636, 'denominator': 0.12654243409633636}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7068252563476562, 'dpo_reward_mean_target': 0.5485038757324219, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0007], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0111154317855835, 'numerator': 0.13077083230018616, 'denominator': 0.1293332427740097}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3882369995117188, 'dpo_reward_mean_target': 0.9931983947753906, 'standard deviation': 3.0, 'reward_a1': tensor([0.5414], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0289117097854614, 'numerator': 0.1314810812473297, 'denominator': 0.1277865469455719}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00042724609375, 'dpo_reward_mean_target': -0.00042724609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3007], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323125660419464, 'denominator': 0.1323125660419464}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0030765533447265625, 'dpo_reward_mean_target': 0.47083282470703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2564], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0011887550354004, 'numerator': 0.13264162838459015, 'denominator': 0.13248413801193237}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0682373046875, 'dpo_reward_mean_target': 0.8000679016113281, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2558], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9454405903816223, 'numerator': 0.12499409914016724, 'denominator': 0.1322072446346283}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6356582641601562, 'dpo_reward_mean_target': 0.6356582641601562, 'standard deviation': 3.0, 'reward_a1': tensor([1.1805], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13080544769763947, 'denominator': 0.13080544769763947}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07523727416992188, 'dpo_reward_mean_target': 0.07523727416992188, 'standard deviation': 3.0, 'reward_a1': tensor([0.3803], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1322949081659317, 'denominator': 0.1322949081659317}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22328758239746094, 'dpo_reward_mean_target': 0.22328758239746094, 'standard deviation': 3.0, 'reward_a1': tensor([2.9740], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.08734265714883804, 'denominator': 0.08734265714883804}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3131065368652344, 'dpo_reward_mean_target': -0.3131065368652344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0682], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132538303732872, 'denominator': 0.132538303732872}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5608367919921875, 'dpo_reward_mean_target': 0.5849685668945312, 'standard deviation': 3.0, 'reward_a1': tensor([1.9949], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0038200616836548, 'numerator': 0.11907574534416199, 'denominator': 0.11862259358167648}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06122398376464844, 'dpo_reward_mean_target': 0.06122398376464844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1280], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13271640241146088, 'denominator': 0.13271640241146088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.054744720458984375, 'dpo_reward_mean_target': 0.03148841857910156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0547], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9995871782302856, 'numerator': 0.1329258531332016, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5332050323486328, 'dpo_reward_mean_target': 0.8595848083496094, 'standard deviation': 3.0, 'reward_a1': tensor([0.9491], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0092064142227173, 'numerator': 0.13292156159877777, 'denominator': 0.1317089945077896}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4931297302246094, 'dpo_reward_mean_target': 0.4255542755126953, 'standard deviation': 3.0, 'reward_a1': tensor([1.4391], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9926702976226807, 'numerator': 0.12560369074344635, 'denominator': 0.12653112411499023}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00872802734375, 'dpo_reward_mean_target': -0.00872802734375, 'standard deviation': 3.0, 'reward_a1': tensor([7.2208], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.007290427573025227, 'denominator': 0.007290427573025227}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00135040283203125, 'dpo_reward_mean_target': -0.00135040283203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0014], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09828758239746094, 'dpo_reward_mean_target': -0.09828758239746094, 'standard deviation': 3.0, 'reward_a1': tensor([0.6155], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12926898896694183, 'denominator': 0.12926898896694183}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00885772705078125, 'dpo_reward_mean_target': -0.00885772705078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0089], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4354515075683594, 'dpo_reward_mean_target': 0.4354515075683594, 'standard deviation': 3.0, 'reward_a1': tensor([0.1597], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13242018222808838, 'denominator': 0.13242018222808838}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.37129974365234375, 'dpo_reward_mean_target': 0.057392120361328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2638], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9982768297195435, 'numerator': 0.1326664239168167, 'denominator': 0.13289542496204376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.881378173828125, 'dpo_reward_mean_target': 1.956329345703125, 'standard deviation': 3.0, 'reward_a1': tensor([4.3120], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.020129919052124, 'numerator': 0.09770035743713379, 'denominator': 0.09577246010303497}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.377197265625, 'dpo_reward_mean_target': 0.865936279296875, 'standard deviation': 3.0, 'reward_a1': tensor([3.7718], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1865746974945068, 'numerator': 0.08318726718425751, 'denominator': 0.07010706514120102}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05972099304199219, 'dpo_reward_mean_target': 0.05972099304199219, 'standard deviation': 3.0, 'reward_a1': tensor([0.1961], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132843479514122, 'denominator': 0.132843479514122}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8461532592773438, 'dpo_reward_mean_target': 1.193450927734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.8511], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9935110211372375, 'numerator': 0.13211768865585327, 'denominator': 0.1329805999994278}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0613250732421875, 'dpo_reward_mean_target': -0.0613250732421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6340], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12945687770843506, 'denominator': 0.12945687770843506}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.025386810302734375, 'dpo_reward_mean_target': -0.025386810302734375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0254], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08718490600585938, 'dpo_reward_mean_target': -0.08718490600585938, 'standard deviation': 3.0, 'reward_a1': tensor([0.2727], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13202731311321259, 'denominator': 0.13202731311321259}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.051433563232421875, 'dpo_reward_mean_target': -0.2755393981933594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1971], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0008370876312256, 'numerator': 0.1329352855682373, 'denominator': 0.1328240931034088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0081329345703125, 'dpo_reward_mean_target': -0.09058380126953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1871], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9978359937667847, 'numerator': 0.13241228461265564, 'denominator': 0.13269944489002228}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02277374267578125, 'dpo_reward_mean_target': 0.4734077453613281, 'standard deviation': 3.0, 'reward_a1': tensor([0.6394], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0230886936187744, 'numerator': 0.13277748227119446, 'denominator': 0.12978100776672363}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4708213806152344, 'dpo_reward_mean_target': 0.5476875305175781, 'standard deviation': 3.0, 'reward_a1': tensor([0.6963], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015987157821655, 'numerator': 0.13281770050525665, 'denominator': 0.13260570168495178}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12750625610351562, 'dpo_reward_mean_target': 0.16660690307617188, 'standard deviation': 3.0, 'reward_a1': tensor([0.8956], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0032572746276855, 'numerator': 0.12911178171634674, 'denominator': 0.1286925971508026}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.001033782958984375, 'dpo_reward_mean_target': 0.001033782958984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0010], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.308380126953125, 'dpo_reward_mean_target': 1.2509841918945312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6077], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8647589087486267, 'numerator': 0.10975807160139084, 'denominator': 0.1269233226776123}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15775680541992188, 'dpo_reward_mean_target': 0.09745407104492188, 'standard deviation': 3.0, 'reward_a1': tensor([0.4646], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.014129638671875, 'numerator': 0.13198839128017426, 'denominator': 0.13014942407608032}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08212661743164062, 'dpo_reward_mean_target': 0.08212661743164062, 'standard deviation': 3.0, 'reward_a1': tensor([0.1303], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296359777450562, 'denominator': 0.13296359777450562}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.348663330078125, 'dpo_reward_mean_target': 0.6289901733398438, 'standard deviation': 3.0, 'reward_a1': tensor([0.5007], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000370740890503, 'numerator': 0.1328592747449875, 'denominator': 0.13281004130840302}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02178192138671875, 'dpo_reward_mean_target': -0.02178192138671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0180], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296908140182495, 'denominator': 0.13296908140182495}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.027675628662109375, 'dpo_reward_mean_target': -0.13689613342285156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2417], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0034273862838745, 'numerator': 0.13289961218833923, 'denominator': 0.13244566321372986}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22542190551757812, 'dpo_reward_mean_target': -0.12341117858886719, 'standard deviation': 3.0, 'reward_a1': tensor([2.8206], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0345298051834106, 'numerator': 0.08216238021850586, 'denominator': 0.07942002266645432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3402557373046875, 'dpo_reward_mean_target': -0.3402557373046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4555], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12838385999202728, 'denominator': 0.12838385999202728}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06334304809570312, 'dpo_reward_mean_target': -0.06334304809570312, 'standard deviation': 3.0, 'reward_a1': tensor([0.0763], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283687829971313, 'denominator': 0.13283687829971313}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12264633178710938, 'dpo_reward_mean_target': 0.0065975189208984375, 'standard deviation': 3.0, 'reward_a1': tensor([1.6745], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0251920223236084, 'numerator': 0.11393804103136063, 'denominator': 0.1111382395029068}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5817794799804688, 'dpo_reward_mean_target': 0.7287139892578125, 'standard deviation': 3.0, 'reward_a1': tensor([6.0601], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0922493934631348, 'numerator': 0.02741624414920807, 'denominator': 0.025100719183683395}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06216239929199219, 'dpo_reward_mean_target': -0.06216239929199219, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1408], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293510675430298, 'denominator': 0.13293510675430298}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1633682250976562, 'dpo_reward_mean_target': 1.3249435424804688, 'standard deviation': 3.0, 'reward_a1': tensor([5.3601], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0766916275024414, 'numerator': 0.05381856858730316, 'denominator': 0.0499851256608963}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4613456726074219, 'dpo_reward_mean_target': 0.8536834716796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4404], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9111867547035217, 'numerator': 0.12116734683513641, 'denominator': 0.13297751545906067}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.49631500244140625, 'dpo_reward_mean_target': 0.8742637634277344, 'standard deviation': 3.0, 'reward_a1': tensor([2.3545], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0726137161254883, 'numerator': 0.11773931980133057, 'denominator': 0.10976861417293549}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03156089782714844, 'dpo_reward_mean_target': -0.03156089782714844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0316], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1966094970703125, 'dpo_reward_mean_target': 1.64404296875, 'standard deviation': 3.0, 'reward_a1': tensor([11.1129], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6190918684005737, 'numerator': 0.0009131263941526413, 'denominator': 0.0005639744340442121}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9939041137695312, 'dpo_reward_mean_target': 1.2100143432617188, 'standard deviation': 3.0, 'reward_a1': tensor([0.5530], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9869043231010437, 'numerator': 0.12982940673828125, 'denominator': 0.13155217468738556}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8196315765380859, 'dpo_reward_mean_target': 0.8196315765380859, 'standard deviation': 3.0, 'reward_a1': tensor([0.4671], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13206560909748077, 'denominator': 0.13206560909748077}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.009073257446289062, 'dpo_reward_mean_target': 0.009073257446289062, 'standard deviation': 3.0, 'reward_a1': tensor([0.0723], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295122981071472, 'denominator': 0.13295122981071472}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5867691040039062, 'dpo_reward_mean_target': 0.6923065185546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.9031], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0030959844589233, 'numerator': 0.13265278935432434, 'denominator': 0.13224336504936218}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4319305419921875, 'dpo_reward_mean_target': 0.2046051025390625, 'standard deviation': 3.0, 'reward_a1': tensor([2.6326], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9432192444801331, 'numerator': 0.09584150463342667, 'denominator': 0.10161105543375015}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19581985473632812, 'dpo_reward_mean_target': 0.19581985473632812, 'standard deviation': 3.0, 'reward_a1': tensor([0.9546], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12879449129104614, 'denominator': 0.12879449129104614}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01441192626953125, 'dpo_reward_mean_target': 0.33533477783203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7269], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0198774337768555, 'numerator': 0.1318531185388565, 'denominator': 0.12928329408168793}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0737762451171875, 'dpo_reward_mean_target': 0.6076087951660156, 'standard deviation': 3.0, 'reward_a1': tensor([2.7180], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2039090394973755, 'numerator': 0.10383079200983047, 'denominator': 0.08624471724033356}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.28318023681640625, 'dpo_reward_mean_target': 0.31354522705078125, 'standard deviation': 3.0, 'reward_a1': tensor([2.4178], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0071762800216675, 'numerator': 0.1039823517203331, 'denominator': 0.103241465985775}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4956951141357422, 'dpo_reward_mean_target': 0.3373832702636719, 'standard deviation': 3.0, 'reward_a1': tensor([3.2325], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9516738057136536, 'numerator': 0.08347606658935547, 'denominator': 0.08771499991416931}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.195465087890625, 'dpo_reward_mean_target': 0.8310470581054688, 'standard deviation': 3.0, 'reward_a1': tensor([1.3010], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0572092533111572, 'numerator': 0.13135892152786255, 'denominator': 0.1242506355047226}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1971435546875, 'dpo_reward_mean_target': 0.9364013671875, 'standard deviation': 3.0, 'reward_a1': tensor([4.8069], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8973064422607422, 'numerator': 0.05785387381911278, 'denominator': 0.06447504460811615}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2598228454589844, 'dpo_reward_mean_target': 0.2598228454589844, 'standard deviation': 3.0, 'reward_a1': tensor([0.3206], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329534947872162, 'denominator': 0.1329534947872162}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1298370361328125, 'dpo_reward_mean_target': 0.1298370361328125, 'standard deviation': 3.0, 'reward_a1': tensor([1.5050], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11971917003393173, 'denominator': 0.11971917003393173}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09047889709472656, 'dpo_reward_mean_target': -0.09047889709472656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297992944717407, 'denominator': 0.13297992944717407}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22193145751953125, 'dpo_reward_mean_target': 0.3986358642578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.7108], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.027880072593689, 'numerator': 0.12084921449422836, 'denominator': 0.11757132411003113}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2509613037109375, 'dpo_reward_mean_target': 0.779083251953125, 'standard deviation': 3.0, 'reward_a1': tensor([5.6570], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.853761076927185, 'numerator': 0.03545701503753662, 'denominator': 0.019127069041132927}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05026817321777344, 'dpo_reward_mean_target': 0.05026817321777344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13213589787483215, 'denominator': 0.13213589787483215}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11275482177734375, 'dpo_reward_mean_target': 0.1568603515625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0244], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0043689012527466, 'numerator': 0.1275351494550705, 'denominator': 0.12698037922382355}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3153057098388672, 'dpo_reward_mean_target': 0.6796665191650391, 'standard deviation': 3.0, 'reward_a1': tensor([1.1061], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.024947166442871, 'numerator': 0.1316438466310501, 'denominator': 0.12843963503837585}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0008563995361328125, 'dpo_reward_mean_target': -0.0008563995361328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1560], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13279910385608673, 'denominator': 0.13279910385608673}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.342559814453125, 'dpo_reward_mean_target': 1.0188217163085938, 'standard deviation': 3.0, 'reward_a1': tensor([5.0415], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.38773775100708, 'numerator': 0.054119374603033066, 'denominator': 0.03899827226996422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04392242431640625, 'dpo_reward_mean_target': -0.04392242431640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0134], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297390937805176, 'denominator': 0.13297390937805176}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04499626159667969, 'dpo_reward_mean_target': 0.04499626159667969, 'standard deviation': 3.0, 'reward_a1': tensor([0.7570], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12928809225559235, 'denominator': 0.12928809225559235}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0042285919189453125, 'dpo_reward_mean_target': 0.0042285919189453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0042], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.600555419921875, 'dpo_reward_mean_target': 0.8118972778320312, 'standard deviation': 3.0, 'reward_a1': tensor([4.7731], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1002088785171509, 'numerator': 0.05561619997024536, 'denominator': 0.05055058375000954}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0892181396484375, 'dpo_reward_mean_target': -0.0892181396484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1817], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13291755318641663, 'denominator': 0.13291755318641663}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11974143981933594, 'dpo_reward_mean_target': 0.16627883911132812, 'standard deviation': 3.0, 'reward_a1': tensor([0.0415], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9994755387306213, 'numerator': 0.13286587595939636, 'denominator': 0.13293559849262238}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06221771240234375, 'dpo_reward_mean_target': 0.5696220397949219, 'standard deviation': 3.0, 'reward_a1': tensor([2.5941], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1370539665222168, 'numerator': 0.10590111464262009, 'denominator': 0.09313639998435974}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.044147491455078125, 'dpo_reward_mean_target': -0.044147491455078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0934], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13284112513065338, 'denominator': 0.13284112513065338}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01883697509765625, 'dpo_reward_mean_target': 0.01883697509765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0188], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22417068481445312, 'dpo_reward_mean_target': 0.371429443359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0174], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.99542236328125, 'numerator': 0.13205792009830475, 'denominator': 0.13266521692276}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3455219268798828, 'dpo_reward_mean_target': 0.8749217987060547, 'standard deviation': 3.0, 'reward_a1': tensor([0.8749], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0862711668014526, 'numerator': 0.13298074901103973, 'denominator': 0.12241947650909424}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.022691726684570312, 'dpo_reward_mean_target': -0.022691726684570312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0227], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08054161071777344, 'dpo_reward_mean_target': 0.16848373413085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.7211], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0058470964431763, 'numerator': 0.13074329495429993, 'denominator': 0.12998327612876892}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3358917236328125, 'dpo_reward_mean_target': 0.7450103759765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2652], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0895031690597534, 'numerator': 0.1256508082151413, 'denominator': 0.11532854288816452}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8332481384277344, 'dpo_reward_mean_target': 0.6372814178466797, 'standard deviation': 3.0, 'reward_a1': tensor([0.3882], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0075867176055908, 'numerator': 0.1325230449438095, 'denominator': 0.1315252035856247}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4690704345703125, 'dpo_reward_mean_target': 0.42659759521484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.4731], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9951729774475098, 'numerator': 0.12513042986392975, 'denominator': 0.12573736906051636}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03247642517089844, 'dpo_reward_mean_target': -0.03247642517089844, 'standard deviation': 3.0, 'reward_a1': tensor([0.2190], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13251449167728424, 'denominator': 0.13251449167728424}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7979621887207031, 'dpo_reward_mean_target': 0.345916748046875, 'standard deviation': 3.0, 'reward_a1': tensor([2.3729], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9135124087333679, 'numerator': 0.10584121942520142, 'denominator': 0.11586183309555054}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3253707885742188, 'dpo_reward_mean_target': 1.86053466796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.6902], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0057984590530396, 'numerator': 0.13276652991771698, 'denominator': 0.132001131772995}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6520233154296875, 'dpo_reward_mean_target': 0.859344482421875, 'standard deviation': 3.0, 'reward_a1': tensor([6.4886], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1411783695220947, 'numerator': 0.022868024185299873, 'denominator': 0.02003895677626133}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.35349464416503906, 'dpo_reward_mean_target': -0.07090950012207031, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0023], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0066124200820923, 'numerator': 0.13294601440429688, 'denominator': 0.13207268714904785}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22833824157714844, 'dpo_reward_mean_target': 0.22833824157714844, 'standard deviation': 3.0, 'reward_a1': tensor([0.2257], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298071920871735, 'denominator': 0.13298071920871735}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.015094757080078125, 'dpo_reward_mean_target': -0.015094757080078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1617], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13282209634780884, 'denominator': 0.13282209634780884}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9483642578125, 'dpo_reward_mean_target': 0.8344039916992188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2766], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0148985385894775, 'numerator': 0.12416789680719376, 'denominator': 0.12234513461589813}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7293434143066406, 'dpo_reward_mean_target': 0.6259536743164062, 'standard deviation': 3.0, 'reward_a1': tensor([2.3214], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9812943935394287, 'numerator': 0.11335353553295135, 'denominator': 0.11551430076360703}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.016897201538085938, 'dpo_reward_mean_target': 0.016897201538085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.6564], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12999317049980164, 'denominator': 0.12999317049980164}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4586639404296875, 'dpo_reward_mean_target': 1.1985244750976562, 'standard deviation': 3.0, 'reward_a1': tensor([1.6422], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.990975022315979, 'numerator': 0.1315341293811798, 'denominator': 0.13273203372955322}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.151092529296875, 'dpo_reward_mean_target': 0.30420494079589844, 'standard deviation': 3.0, 'reward_a1': tensor([1.2341], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017270565032959, 'numerator': 0.126742884516716, 'denominator': 0.12459111958742142}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11635589599609375, 'dpo_reward_mean_target': 0.38021087646484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5182], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.037930965423584, 'numerator': 0.12375012040138245, 'denominator': 0.11922769993543625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5127792358398438, 'dpo_reward_mean_target': 0.5127792358398438, 'standard deviation': 3.0, 'reward_a1': tensor([0.7900], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13241423666477203, 'denominator': 0.13241423666477203}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16508865356445312, 'dpo_reward_mean_target': 1.6332321166992188, 'standard deviation': 3.0, 'reward_a1': tensor([5.9677], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.8455023765563965, 'numerator': 0.04682755097746849, 'denominator': 0.016456689685583115}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06518173217773438, 'dpo_reward_mean_target': 0.06518173217773438, 'standard deviation': 3.0, 'reward_a1': tensor([0.2503], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13272778689861298, 'denominator': 0.13272778689861298}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5112762451171875, 'dpo_reward_mean_target': 0.491851806640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3718], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0002800226211548, 'numerator': 0.1328742653131485, 'denominator': 0.13283707201480865}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5520954132080078, 'dpo_reward_mean_target': 0.5520954132080078, 'standard deviation': 3.0, 'reward_a1': tensor([0.8583], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13228969275951385, 'denominator': 0.13228969275951385}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20478057861328125, 'dpo_reward_mean_target': 0.5075225830078125, 'standard deviation': 3.0, 'reward_a1': tensor([1.7127], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.046689510345459, 'numerator': 0.1226714476943016, 'denominator': 0.11719946563243866}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0308837890625, 'dpo_reward_mean_target': 1.12274169921875, 'standard deviation': 3.0, 'reward_a1': tensor([1.6444], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1382758617401123, 'numerator': 0.13098563253879547, 'denominator': 0.11507371813058853}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23188018798828125, 'dpo_reward_mean_target': 0.5422401428222656, 'standard deviation': 3.0, 'reward_a1': tensor([4.0680], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.135341763496399, 'numerator': 0.06666025519371033, 'denominator': 0.058713823556900024}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2111835479736328, 'dpo_reward_mean_target': 0.2111835479736328, 'standard deviation': 3.0, 'reward_a1': tensor([0.0904], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13287299871444702, 'denominator': 0.13287299871444702}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16710853576660156, 'dpo_reward_mean_target': 0.16710853576660156, 'standard deviation': 3.0, 'reward_a1': tensor([1.5358], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11983691900968552, 'denominator': 0.11983691900968552}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1599559783935547, 'dpo_reward_mean_target': -0.1599559783935547, 'standard deviation': 3.0, 'reward_a1': tensor([0.5698], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1291041225194931, 'denominator': 0.1291041225194931}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.502471923828125, 'dpo_reward_mean_target': 0.6777801513671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3318], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9949817061424255, 'numerator': 0.13209961354732513, 'denominator': 0.13276587426662445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.060649871826171875, 'dpo_reward_mean_target': -0.060649871826171875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0606], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5999813079833984, 'dpo_reward_mean_target': 0.5999813079833984, 'standard deviation': 3.0, 'reward_a1': tensor([0.8695], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1324450522661209, 'denominator': 0.1324450522661209}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.391204833984375, 'dpo_reward_mean_target': 1.4626693725585938, 'standard deviation': 3.0, 'reward_a1': tensor([1.5454], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000941276550293, 'numerator': 0.1329302191734314, 'denominator': 0.1328052133321762}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06258583068847656, 'dpo_reward_mean_target': -0.06258583068847656, 'standard deviation': 3.0, 'reward_a1': tensor([1.3480], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11906489729881287, 'denominator': 0.11906489729881287}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3572578430175781, 'dpo_reward_mean_target': 0.3071327209472656, 'standard deviation': 3.0, 'reward_a1': tensor([1.5972], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9929792881011963, 'numerator': 0.12123696506023407, 'denominator': 0.12209415435791016}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.36956787109375, 'dpo_reward_mean_target': 1.4229049682617188, 'standard deviation': 3.0, 'reward_a1': tensor([0.4975], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9946883916854858, 'numerator': 0.1268027424812317, 'denominator': 0.12747986614704132}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3062973022460938, 'dpo_reward_mean_target': 0.8403472900390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2122], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0688197612762451, 'numerator': 0.12504267692565918, 'denominator': 0.11699136346578598}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25760650634765625, 'dpo_reward_mean_target': 0.6314849853515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4797], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0014617443084717, 'numerator': 0.13281066715717316, 'denominator': 0.1326168179512024}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.031558990478515625, 'dpo_reward_mean_target': 0.031558990478515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8317], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12833380699157715, 'denominator': 0.12833380699157715}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2175579071044922, 'dpo_reward_mean_target': -0.020540237426757812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0311], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0019268989562988, 'numerator': 0.13297992944717407, 'denominator': 0.13272418081760406}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.27256011962890625, 'dpo_reward_mean_target': 0.27256011962890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0489], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13221965730190277, 'denominator': 0.13221965730190277}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5850372314453125, 'dpo_reward_mean_target': 1.5850372314453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.9667], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13018567860126495, 'denominator': 0.13018567860126495}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11339569091796875, 'dpo_reward_mean_target': -0.3044891357421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0806], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9938707947731018, 'numerator': 0.13188961148262024, 'denominator': 0.13270297646522522}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.651092529296875, 'dpo_reward_mean_target': 0.794586181640625, 'standard deviation': 3.0, 'reward_a1': tensor([4.5315], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7298792600631714, 'numerator': 0.06121546030044556, 'denominator': 0.08387066423892975}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03721046447753906, 'dpo_reward_mean_target': -0.03721046447753906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0372], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.012416839599609375, 'dpo_reward_mean_target': 0.012416839599609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0585], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132965087890625, 'denominator': 0.132965087890625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11620140075683594, 'dpo_reward_mean_target': -0.11620140075683594, 'standard deviation': 3.0, 'reward_a1': tensor([1.7662], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1092173308134079, 'denominator': 0.1092173308134079}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.85809326171875, 'dpo_reward_mean_target': 2.17108154296875, 'standard deviation': 3.0, 'reward_a1': tensor([8.5863], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.805762767791748, 'numerator': 0.013515149243175983, 'denominator': 0.004816925153136253}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1706085205078125, 'dpo_reward_mean_target': 0.586639404296875, 'standard deviation': 3.0, 'reward_a1': tensor([1.5961], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0578945875167847, 'numerator': 0.1256609708070755, 'denominator': 0.11878402531147003}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14478492736816406, 'dpo_reward_mean_target': -0.2517051696777344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2264], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0076475143432617, 'numerator': 0.13297602534294128, 'denominator': 0.13196679949760437}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2523536682128906, 'dpo_reward_mean_target': 0.39192771911621094, 'standard deviation': 3.0, 'reward_a1': tensor([1.0020], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0105993747711182, 'numerator': 0.1302592009305954, 'denominator': 0.12889301776885986}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.15576171875, 'dpo_reward_mean_target': 1.510772705078125, 'standard deviation': 3.0, 'reward_a1': tensor([7.8614], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2936995029449463, 'numerator': 0.014148809015750885, 'denominator': 0.010936704464256763}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9010543823242188, 'dpo_reward_mean_target': 1.5142898559570312, 'standard deviation': 3.0, 'reward_a1': tensor([4.2977], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.234349250793457, 'numerator': 0.08647061884403229, 'denominator': 0.07005360722541809}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.34197998046875, 'dpo_reward_mean_target': 0.6268272399902344, 'standard deviation': 3.0, 'reward_a1': tensor([1.6971], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0391278266906738, 'numerator': 0.12478158622980118, 'denominator': 0.12008299678564072}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19805335998535156, 'dpo_reward_mean_target': 0.19805335998535156, 'standard deviation': 3.0, 'reward_a1': tensor([0.5645], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13199257850646973, 'denominator': 0.13199257850646973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8882522583007812, 'dpo_reward_mean_target': 1.1757659912109375, 'standard deviation': 3.0, 'reward_a1': tensor([4.9842], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7608701586723328, 'numerator': 0.05940750613808632, 'denominator': 0.07807837426662445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6696357727050781, 'dpo_reward_mean_target': 1.3997802734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.2521], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9743664264678955, 'numerator': 0.12359782308340073, 'denominator': 0.12684942781925201}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3343658447265625, 'dpo_reward_mean_target': 1.920654296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.2002], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8492193222045898, 'numerator': 0.11281700432300568, 'denominator': 0.13284790515899658}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3928050994873047, 'dpo_reward_mean_target': -0.3928050994873047, 'standard deviation': 3.0, 'reward_a1': tensor([1.2784], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11386829614639282, 'denominator': 0.11386829614639282}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1277751922607422, 'dpo_reward_mean_target': -0.1277751922607422, 'standard deviation': 3.0, 'reward_a1': tensor([0.5865], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12926414608955383, 'denominator': 0.12926414608955383}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07905960083007812, 'dpo_reward_mean_target': 0.14325332641601562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1569], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9980898499488831, 'numerator': 0.1323169469833374, 'denominator': 0.13257017731666565}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8396148681640625, 'dpo_reward_mean_target': 0.94537353515625, 'standard deviation': 3.0, 'reward_a1': tensor([4.2537], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0402873754501343, 'numerator': 0.07239677011966705, 'denominator': 0.06959304958581924}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.027738571166992188, 'dpo_reward_mean_target': -0.027738571166992188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0277], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19715118408203125, 'dpo_reward_mean_target': 0.3232536315917969, 'standard deviation': 3.0, 'reward_a1': tensor([0.9427], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0096088647842407, 'numerator': 0.13017581403255463, 'denominator': 0.12893687188625336}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2627716064453125, 'dpo_reward_mean_target': -0.2627716064453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1572], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13168418407440186, 'denominator': 0.13168418407440186}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0610580444335938, 'dpo_reward_mean_target': 1.5934982299804688, 'standard deviation': 3.0, 'reward_a1': tensor([4.9141], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.236387848854065, 'numerator': 0.07207020372152328, 'denominator': 0.058290932327508926}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.42009735107421875, 'dpo_reward_mean_target': 0.44129180908203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2300], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9984452128410339, 'numerator': 0.12969277799129486, 'denominator': 0.12989473342895508}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.745452880859375, 'dpo_reward_mean_target': 1.03082275390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3944], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9844658970832825, 'numerator': 0.13002164661884308, 'denominator': 0.1320732831954956}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8319473266601562, 'dpo_reward_mean_target': 0.6269149780273438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0140], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.016432762145996, 'numerator': 0.1302337944507599, 'denominator': 0.1281282901763916}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6501617431640625, 'dpo_reward_mean_target': 0.55950927734375, 'standard deviation': 3.0, 'reward_a1': tensor([5.3749], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9530894160270691, 'numerator': 0.03667011111974716, 'denominator': 0.03847499564290047}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04412078857421875, 'dpo_reward_mean_target': 0.174713134765625, 'standard deviation': 3.0, 'reward_a1': tensor([5.3885], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0796113014221191, 'numerator': 0.029370617121458054, 'denominator': 0.027204807847738266}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21883010864257812, 'dpo_reward_mean_target': 0.0395050048828125, 'standard deviation': 3.0, 'reward_a1': tensor([4.6823], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9132720232009888, 'numerator': 0.04015182703733444, 'denominator': 0.04396480694413185}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4064407348632812, 'dpo_reward_mean_target': 2.6560516357421875, 'standard deviation': 3.0, 'reward_a1': tensor([5.4330], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.603704571723938, 'numerator': 0.08664299547672272, 'denominator': 0.054026782512664795}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15717124938964844, 'dpo_reward_mean_target': -0.15717124938964844, 'standard deviation': 3.0, 'reward_a1': tensor([0.0849], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13254867494106293, 'denominator': 0.13254867494106293}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06232452392578125, 'dpo_reward_mean_target': 0.529571533203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5825], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9477434754371643, 'numerator': 0.12415143847465515, 'denominator': 0.13099688291549683}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.29436302185058594, 'dpo_reward_mean_target': -0.29436302185058594, 'standard deviation': 3.0, 'reward_a1': tensor([1.3607], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11420738697052002, 'denominator': 0.11420738697052002}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.01265716552734375, 'dpo_reward_mean_target': 0.5994834899902344, 'standard deviation': 3.0, 'reward_a1': tensor([1.4289], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0802947282791138, 'numerator': 0.1279938966035843, 'denominator': 0.11848054081201553}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04182624816894531, 'dpo_reward_mean_target': 0.04182624816894531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1055], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13282054662704468, 'denominator': 0.13282054662704468}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0023441314697265625, 'dpo_reward_mean_target': -0.0023441314697265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0723], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294467329978943, 'denominator': 0.13294467329978943}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.27678680419921875, 'dpo_reward_mean_target': 0.3956489562988281, 'standard deviation': 3.0, 'reward_a1': tensor([2.1370], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.024067997932434, 'numerator': 0.11236342042684555, 'denominator': 0.10972261428833008}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0047817230224609375, 'dpo_reward_mean_target': -0.0047817230224609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1529], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13281874358654022, 'denominator': 0.13281874358654022}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.427215576171875, 'dpo_reward_mean_target': 0.47051239013671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8618], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0019885301589966, 'numerator': 0.1318543404340744, 'denominator': 0.13159266114234924}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.018857955932617188, 'dpo_reward_mean_target': 0.018857955932617188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0371], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295763731002808, 'denominator': 0.13295763731002808}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0006008148193359375, 'dpo_reward_mean_target': -0.0006008148193359375, 'standard deviation': 3.0, 'reward_a1': tensor([1.3791], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11963508278131485, 'denominator': 0.11963508278131485}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4643707275390625, 'dpo_reward_mean_target': 1.5789794921875, 'standard deviation': 3.0, 'reward_a1': tensor([6.7808], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.06926429271698, 'numerator': 0.02957567758858204, 'denominator': 0.027659837156534195}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12531280517578125, 'dpo_reward_mean_target': 0.2512702941894531, 'standard deviation': 3.0, 'reward_a1': tensor([0.2661], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0010895729064941, 'numerator': 0.1329791396856308, 'denominator': 0.13283440470695496}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3252410888671875, 'dpo_reward_mean_target': -0.3252410888671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8513], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12313807010650635, 'denominator': 0.12313807010650635}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0305328369140625, 'dpo_reward_mean_target': 0.20061492919921875, 'standard deviation': 3.0, 'reward_a1': tensor([1.2542], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0304815769195557, 'numerator': 0.12502843141555786, 'denominator': 0.12133010476827621}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.688690185546875, 'dpo_reward_mean_target': 2.1370086669921875, 'standard deviation': 3.0, 'reward_a1': tensor([4.6419], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6814029216766357, 'numerator': 0.09384352713823318, 'denominator': 0.05581263452768326}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07476425170898438, 'dpo_reward_mean_target': -0.2614612579345703, 'standard deviation': 3.0, 'reward_a1': tensor([1.6814], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9623612761497498, 'numerator': 0.10782494395971298, 'denominator': 0.11204206198453903}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6948928833007812, 'dpo_reward_mean_target': 1.2750091552734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5894], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9951380491256714, 'numerator': 0.13225245475769043, 'denominator': 0.13289859890937805}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26540374755859375, 'dpo_reward_mean_target': 0.5425262451171875, 'standard deviation': 3.0, 'reward_a1': tensor([2.4689], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0656479597091675, 'numerator': 0.10820583254098892, 'denominator': 0.10153994709253311}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7933502197265625, 'dpo_reward_mean_target': -0.044158935546875, 'standard deviation': 3.0, 'reward_a1': tensor([4.4855], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6821237802505493, 'numerator': 0.04253551363945007, 'denominator': 0.06235747039318085}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10595130920410156, 'dpo_reward_mean_target': 0.04780006408691406, 'standard deviation': 3.0, 'reward_a1': tensor([0.8092], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0144232511520386, 'numerator': 0.12876634299755096, 'denominator': 0.12693552672863007}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21284103393554688, 'dpo_reward_mean_target': -0.21284103393554688, 'standard deviation': 3.0, 'reward_a1': tensor([3.1030], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.07219671458005905, 'denominator': 0.07219671458005905}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7408065795898438, 'dpo_reward_mean_target': 0.75732421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0653], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9878320097923279, 'numerator': 0.1280742883682251, 'denominator': 0.12965188920497894}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1152496337890625, 'dpo_reward_mean_target': 0.85009765625, 'standard deviation': 3.0, 'reward_a1': tensor([5.0583], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8868536949157715, 'numerator': 0.04971713200211525, 'denominator': 0.056060127913951874}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.038593292236328125, 'dpo_reward_mean_target': 0.038593292236328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.1815], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328299194574356, 'denominator': 0.1328299194574356}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5688552856445312, 'dpo_reward_mean_target': 0.4793052673339844, 'standard deviation': 3.0, 'reward_a1': tensor([0.0519], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.004709005355835, 'numerator': 0.1316380649805069, 'denominator': 0.13102108240127563}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.023540496826171875, 'dpo_reward_mean_target': -0.023540496826171875, 'standard deviation': 3.0, 'reward_a1': tensor([2.0155], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10555312782526016, 'denominator': 0.10555312782526016}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11782073974609375, 'dpo_reward_mean_target': 0.11782073974609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0764], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13270226120948792, 'denominator': 0.13270226120948792}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.033447265625, 'dpo_reward_mean_target': 0.6981582641601562, 'standard deviation': 3.0, 'reward_a1': tensor([4.9634], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8584257364273071, 'numerator': 0.048400141298770905, 'denominator': 0.0563824437558651}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12413978576660156, 'dpo_reward_mean_target': 0.12413978576660156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13104748725891113, 'denominator': 0.13104748725891113}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.389862060546875, 'dpo_reward_mean_target': 0.03009033203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0863], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0049573183059692, 'numerator': 0.13295744359493256, 'denominator': 0.13230158388614655}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.10546875, 'dpo_reward_mean_target': 1.3719100952148438, 'standard deviation': 3.0, 'reward_a1': tensor([0.4478], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9768592119216919, 'numerator': 0.12681958079338074, 'denominator': 0.12982380390167236}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7365570068359375, 'dpo_reward_mean_target': -0.030975341796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4935], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9914316534996033, 'numerator': 0.1314094364643097, 'denominator': 0.1325451284646988}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.558868408203125, 'dpo_reward_mean_target': 0.6674423217773438, 'standard deviation': 3.0, 'reward_a1': tensor([2.9651], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0287801027297974, 'numerator': 0.09917652606964111, 'denominator': 0.09640206396579742}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11433219909667969, 'dpo_reward_mean_target': 0.2825431823730469, 'standard deviation': 3.0, 'reward_a1': tensor([1.1504], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0481419563293457, 'numerator': 0.1275317370891571, 'denominator': 0.12167410552501678}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06714630126953125, 'dpo_reward_mean_target': 0.12045478820800781, 'standard deviation': 3.0, 'reward_a1': tensor([1.1411], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0062224864959717, 'numerator': 0.12550365924835205, 'denominator': 0.12472753971815109}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5631103515625, 'dpo_reward_mean_target': 0.6045913696289062, 'standard deviation': 3.0, 'reward_a1': tensor([0.8170], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0010751485824585, 'numerator': 0.13264794647693634, 'denominator': 0.13250547647476196}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.024261474609375, 'dpo_reward_mean_target': -0.333404541015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5792], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9334177374839783, 'numerator': 0.10852428525686264, 'denominator': 0.11626550555229187}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.794708251953125, 'dpo_reward_mean_target': 2.74688720703125, 'standard deviation': 3.0, 'reward_a1': tensor([2.2577], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9986177086830139, 'numerator': 0.13122469186782837, 'denominator': 0.13140633702278137}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1131591796875, 'dpo_reward_mean_target': 0.1131591796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1568], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296666741371155, 'denominator': 0.13296666741371155}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.35404014587402344, 'dpo_reward_mean_target': 0.6373958587646484, 'standard deviation': 3.0, 'reward_a1': tensor([1.3829], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0283260345458984, 'numerator': 0.1289374977350235, 'denominator': 0.1253858208656311}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05664253234863281, 'dpo_reward_mean_target': 0.11423110961914062, 'standard deviation': 3.0, 'reward_a1': tensor([0.3133], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.001458764076233, 'numerator': 0.13268843293190002, 'denominator': 0.13249514997005463}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.48919677734375, 'dpo_reward_mean_target': 1.941375732421875, 'standard deviation': 3.0, 'reward_a1': tensor([3.4149], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.426063060760498, 'numerator': 0.11786919832229614, 'denominator': 0.08265356719493866}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1764850616455078, 'dpo_reward_mean_target': 0.1764850616455078, 'standard deviation': 3.0, 'reward_a1': tensor([1.8295], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11425205320119858, 'denominator': 0.11425205320119858}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5300712585449219, 'dpo_reward_mean_target': 0.15109634399414062, 'standard deviation': 3.0, 'reward_a1': tensor([0.0480], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0181372165679932, 'numerator': 0.13290226459503174, 'denominator': 0.13053472340106964}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1741790771484375, 'dpo_reward_mean_target': 0.7618331909179688, 'standard deviation': 3.0, 'reward_a1': tensor([17.1445], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.970968723297119, 'numerator': 4.448337875828656e-08, 'denominator': 1.4972684425629268e-08}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.002948760986328125, 'dpo_reward_mean_target': -0.002948760986328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2089], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264977931976318, 'denominator': 0.13264977931976318}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2252216339111328, 'dpo_reward_mean_target': 0.413330078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2299], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.998134195804596, 'numerator': 0.13273251056671143, 'denominator': 0.13298062980175018}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6471099853515625, 'dpo_reward_mean_target': 0.1394805908203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.2720], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9516445398330688, 'numerator': 0.12383431941270828, 'denominator': 0.13012665510177612}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9737625122070312, 'dpo_reward_mean_target': 2.178009033203125, 'standard deviation': 3.0, 'reward_a1': tensor([3.1001], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0235165357589722, 'numerator': 0.12684494256973267, 'denominator': 0.12393052130937576}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5873699188232422, 'dpo_reward_mean_target': 0.21080970764160156, 'standard deviation': 3.0, 'reward_a1': tensor([0.6831], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.080340027809143, 'numerator': 0.1313433200120926, 'denominator': 0.12157591432332993}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04135894775390625, 'dpo_reward_mean_target': 0.04135894775390625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0350], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293777406215668, 'denominator': 0.13293777406215668}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3289375305175781, 'dpo_reward_mean_target': 0.4058361053466797, 'standard deviation': 3.0, 'reward_a1': tensor([0.4892], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0374857187271118, 'numerator': 0.13292942941188812, 'denominator': 0.12812651693820953}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00380706787109375, 'dpo_reward_mean_target': -0.00380706787109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0038], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14569473266601562, 'dpo_reward_mean_target': 0.2473163604736328, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0690], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9947793483734131, 'numerator': 0.13224336504936218, 'denominator': 0.13293738663196564}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5136222839355469, 'dpo_reward_mean_target': 0.617431640625, 'standard deviation': 3.0, 'reward_a1': tensor([2.9186], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.027513027191162, 'numerator': 0.09908860176801682, 'denominator': 0.09643537551164627}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14857864379882812, 'dpo_reward_mean_target': -0.14857864379882812, 'standard deviation': 3.0, 'reward_a1': tensor([0.0277], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13275133073329926, 'denominator': 0.13275133073329926}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.45870208740234375, 'dpo_reward_mean_target': 1.4523239135742188, 'standard deviation': 3.0, 'reward_a1': tensor([3.4803], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3214740753173828, 'numerator': 0.10581701993942261, 'denominator': 0.0800749883055687}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21729278564453125, 'dpo_reward_mean_target': -0.21729278564453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.9401], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12344430387020111, 'denominator': 0.12344430387020111}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.780548095703125, 'dpo_reward_mean_target': 0.11637496948242188, 'standard deviation': 3.0, 'reward_a1': tensor([0.9532], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9634401798248291, 'numerator': 0.12790712714195251, 'denominator': 0.13276083767414093}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03306770324707031, 'dpo_reward_mean_target': 0.31165504455566406, 'standard deviation': 3.0, 'reward_a1': tensor([0.0577], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9964579343795776, 'numerator': 0.13250526785850525, 'denominator': 0.13297627866268158}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21636199951171875, 'dpo_reward_mean_target': 0.21636199951171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0369], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327429860830307, 'denominator': 0.1327429860830307}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.363311767578125, 'dpo_reward_mean_target': 2.3433990478515625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0230], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9135412573814392, 'numerator': 0.12070420384407043, 'denominator': 0.1321278065443039}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09557533264160156, 'dpo_reward_mean_target': -0.09557533264160156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0249], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294385373592377, 'denominator': 0.13294385373592377}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08465003967285156, 'dpo_reward_mean_target': -0.038478851318359375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0522], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000476837158203, 'numerator': 0.1329793483018875, 'denominator': 0.13297301530838013}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13768959045410156, 'dpo_reward_mean_target': -0.3758068084716797, 'standard deviation': 3.0, 'reward_a1': tensor([0.8383], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9468441605567932, 'numerator': 0.12252506613731384, 'denominator': 0.12940362095832825}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9044723510742188, 'dpo_reward_mean_target': 1.249969482421875, 'standard deviation': 3.0, 'reward_a1': tensor([2.3839], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0514402389526367, 'numerator': 0.12381315976381302, 'denominator': 0.11775577068328857}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.48383522033691406, 'dpo_reward_mean_target': 1.2824249267578125, 'standard deviation': 3.0, 'reward_a1': tensor([4.2439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.1265766620635986, 'numerator': 0.08169188350439072, 'denominator': 0.038414739072322845}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12925148010253906, 'dpo_reward_mean_target': 0.12925148010253906, 'standard deviation': 3.0, 'reward_a1': tensor([2.7780], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09005662053823471, 'denominator': 0.09005662053823471}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6445178985595703, 'dpo_reward_mean_target': -0.04044151306152344, 'standard deviation': 3.0, 'reward_a1': tensor([0.7685], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.077422022819519, 'numerator': 0.12823262810707092, 'denominator': 0.11901801079511642}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.023097991943359375, 'dpo_reward_mean_target': -0.023097991943359375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0231], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1518230438232422, 'dpo_reward_mean_target': -0.03673553466796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.7040], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9660933017730713, 'numerator': 0.1123771145939827, 'denominator': 0.11632118374109268}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.634765625, 'dpo_reward_mean_target': 0.7582931518554688, 'standard deviation': 3.0, 'reward_a1': tensor([1.5911], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0123543739318848, 'numerator': 0.12795397639274597, 'denominator': 0.12639248371124268}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2158660888671875, 'dpo_reward_mean_target': 1.6600418090820312, 'standard deviation': 3.0, 'reward_a1': tensor([7.8336], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.0237443447113037, 'numerator': 0.016004256904125214, 'denominator': 0.005292860325425863}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0197296142578125, 'dpo_reward_mean_target': 0.0197296142578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1501], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12386789917945862, 'denominator': 0.12386789917945862}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8281707763671875, 'dpo_reward_mean_target': 0.8281707763671875, 'standard deviation': 3.0, 'reward_a1': tensor([0.7094], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13287657499313354, 'denominator': 0.13287657499313354}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0672760009765625, 'dpo_reward_mean_target': 0.0672760009765625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1431], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13265405595302582, 'denominator': 0.13265405595302582}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8765106201171875, 'dpo_reward_mean_target': 1.81488037109375, 'standard deviation': 3.0, 'reward_a1': tensor([5.0684], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4742321968078613, 'numerator': 0.07385782897472382, 'denominator': 0.05009918287396431}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04358482360839844, 'dpo_reward_mean_target': -0.04358482360839844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0556], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297970592975616, 'denominator': 0.13297970592975616}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25350379943847656, 'dpo_reward_mean_target': 0.25350379943847656, 'standard deviation': 3.0, 'reward_a1': tensor([0.9906], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1290273517370224, 'denominator': 0.1290273517370224}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.38006591796875, 'dpo_reward_mean_target': 0.6962928771972656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2361], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.973160982131958, 'numerator': 0.12671056389808655, 'denominator': 0.13020513951778412}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.26409912109375, 'dpo_reward_mean_target': 0.5313816070556641, 'standard deviation': 3.0, 'reward_a1': tensor([0.3423], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0186138153076172, 'numerator': 0.13271690905094147, 'denominator': 0.13029168546199799}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1468429565429688, 'dpo_reward_mean_target': 1.1468429565429688, 'standard deviation': 3.0, 'reward_a1': tensor([2.9298], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.111452117562294, 'denominator': 0.111452117562294}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.083587646484375, 'dpo_reward_mean_target': 1.0261917114257812, 'standard deviation': 3.0, 'reward_a1': tensor([9.1931], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9494234323501587, 'numerator': 0.003269784850999713, 'denominator': 0.0034439689479768276}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9933013916015625, 'dpo_reward_mean_target': 1.5421600341796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5626], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9579089283943176, 'numerator': 0.12607738375663757, 'denominator': 0.13161729276180267}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0001697540283203125, 'dpo_reward_mean_target': -0.0001697540283203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0002], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.40749359130859375, 'dpo_reward_mean_target': 0.5586624145507812, 'standard deviation': 3.0, 'reward_a1': tensor([1.4594], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0165339708328247, 'numerator': 0.12712015211582184, 'denominator': 0.1250525414943695}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8205718994140625, 'dpo_reward_mean_target': 0.760345458984375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9983053207397461, 'numerator': 0.13238801062107086, 'denominator': 0.13261274993419647}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1057891845703125, 'dpo_reward_mean_target': 0.1057891845703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0347], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294340670108795, 'denominator': 0.13294340670108795}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2860679626464844, 'dpo_reward_mean_target': 0.4930877685546875, 'standard deviation': 3.0, 'reward_a1': tensor([2.4993], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0497249364852905, 'numerator': 0.1063356027007103, 'denominator': 0.10129854083061218}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4917526245117188, 'dpo_reward_mean_target': 1.3319931030273438, 'standard deviation': 3.0, 'reward_a1': tensor([8.3803], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8836477398872375, 'numerator': 0.008417163044214249, 'denominator': 0.009525473229587078}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2730064392089844, 'dpo_reward_mean_target': 0.3602485656738281, 'standard deviation': 3.0, 'reward_a1': tensor([0.0934], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9978387355804443, 'numerator': 0.13245579600334167, 'denominator': 0.13274268805980682}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.014322280883789062, 'dpo_reward_mean_target': -0.010303497314453125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0103], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000009536743164, 'numerator': 0.13298074901103973, 'denominator': 0.13298062980175018}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.032867431640625, 'dpo_reward_mean_target': 0.032867431640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2891], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13249658048152924, 'denominator': 0.13249658048152924}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.40952491760253906, 'dpo_reward_mean_target': 0.21019744873046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0588], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0055760145187378, 'numerator': 0.13281148672103882, 'denominator': 0.13207504153251648}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1567840576171875, 'dpo_reward_mean_target': 1.2834625244140625, 'standard deviation': 3.0, 'reward_a1': tensor([7.1162], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.8537654876708984, 'numerator': 0.02008858136832714, 'denominator': 0.007039324380457401}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4767951965332031, 'dpo_reward_mean_target': 1.3693695068359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.6972], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9778444170951843, 'numerator': 0.12968412041664124, 'denominator': 0.13262245059013367}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9084930419921875, 'dpo_reward_mean_target': 0.9222183227539062, 'standard deviation': 3.0, 'reward_a1': tensor([1.2442], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0005017518997192, 'numerator': 0.13221688568592072, 'denominator': 0.1321505755186081}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10089492797851562, 'dpo_reward_mean_target': -0.10089492797851562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1150], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297928869724274, 'denominator': 0.13297928869724274}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8946170806884766, 'dpo_reward_mean_target': 0.7585391998291016, 'standard deviation': 3.0, 'reward_a1': tensor([0.0483], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.011837124824524, 'numerator': 0.1293056458234787, 'denominator': 0.12779293954372406}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14952850341796875, 'dpo_reward_mean_target': -0.14952850341796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.8244], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12615442276000977, 'denominator': 0.12615442276000977}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0015869140625, 'dpo_reward_mean_target': 1.41754150390625, 'standard deviation': 3.0, 'reward_a1': tensor([6.3906], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2705535888671875, 'numerator': 0.03365783020853996, 'denominator': 0.026490680873394012}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1602001190185547, 'dpo_reward_mean_target': 0.14934349060058594, 'standard deviation': 3.0, 'reward_a1': tensor([0.1353], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.000023365020752, 'numerator': 0.13297928869724274, 'denominator': 0.13297618925571442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4251861572265625, 'dpo_reward_mean_target': 1.167938232421875, 'standard deviation': 3.0, 'reward_a1': tensor([4.5575], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9110045433044434, 'numerator': 0.07024025171995163, 'denominator': 0.07710197567939758}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10882186889648438, 'dpo_reward_mean_target': 0.10882186889648438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0250], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292889297008514, 'denominator': 0.13292889297008514}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0267333984375, 'dpo_reward_mean_target': 1.1683349609375, 'standard deviation': 3.0, 'reward_a1': tensor([5.6612], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6786977052688599, 'numerator': 0.043327488005161285, 'denominator': 0.0638391524553299}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4670448303222656, 'dpo_reward_mean_target': 0.7498397827148438, 'standard deviation': 3.0, 'reward_a1': tensor([2.7507], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0696314573287964, 'numerator': 0.1064615249633789, 'denominator': 0.09953103214502335}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7483787536621094, 'dpo_reward_mean_target': 1.2795181274414062, 'standard deviation': 3.0, 'reward_a1': tensor([1.3984], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0229462385177612, 'numerator': 0.1328764110803604, 'denominator': 0.12989579141139984}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3742523193359375, 'dpo_reward_mean_target': 0.892974853515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.7910], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0184910297393799, 'numerator': 0.13290391862392426, 'denominator': 0.1304910033941269}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4323883056640625, 'dpo_reward_mean_target': 0.4323883056640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.9960], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11609194427728653, 'denominator': 0.11609194427728653}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05690765380859375, 'dpo_reward_mean_target': 0.227325439453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2032], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0037318468093872, 'numerator': 0.1329764425754547, 'denominator': 0.13248203694820404}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.012887954711914062, 'dpo_reward_mean_target': -0.04471015930175781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2365], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0014127492904663, 'numerator': 0.13270924985408783, 'denominator': 0.13252203166484833}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11956024169921875, 'dpo_reward_mean_target': 0.031421661376953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.5575], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9952905178070068, 'numerator': 0.13095159828662872, 'denominator': 0.1315712332725525}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1472625732421875, 'dpo_reward_mean_target': 0.1472625732421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6429], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12844738364219666, 'denominator': 0.12844738364219666}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9364337921142578, 'dpo_reward_mean_target': 1.2352275848388672, 'standard deviation': 3.0, 'reward_a1': tensor([1.5639], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0159976482391357, 'numerator': 0.13218511641025543, 'denominator': 0.13010376691818237}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.47503662109375, 'dpo_reward_mean_target': 2.944000244140625, 'standard deviation': 3.0, 'reward_a1': tensor([8.9629], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3852003812789917, 'numerator': 0.017771411687135696, 'denominator': 0.012829488143324852}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.012315750122070312, 'dpo_reward_mean_target': -0.012315750122070312, 'standard deviation': 3.0, 'reward_a1': tensor([0.2109], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13261303305625916, 'denominator': 0.13261303305625916}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2509899139404297, 'dpo_reward_mean_target': -0.2509899139404297, 'standard deviation': 3.0, 'reward_a1': tensor([0.8622], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1241341233253479, 'denominator': 0.1241341233253479}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05025482177734375, 'dpo_reward_mean_target': 0.05025482177734375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1895], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13255682587623596, 'denominator': 0.13255682587623596}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10211563110351562, 'dpo_reward_mean_target': -0.10211563110351562, 'standard deviation': 3.0, 'reward_a1': tensor([0.7596], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1276063621044159, 'denominator': 0.1276063621044159}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4098930358886719, 'dpo_reward_mean_target': 0.43132781982421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5286], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9508387446403503, 'numerator': 0.12634427845478058, 'denominator': 0.1328766644001007}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2265853881835938, 'dpo_reward_mean_target': 0.8334197998046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6911], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0149171352386475, 'numerator': 0.13283111155033112, 'denominator': 0.1308787763118744}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0469589233398438, 'dpo_reward_mean_target': 0.8250732421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6067], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1191813945770264, 'numerator': 0.13262878358364105, 'denominator': 0.11850517243146896}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.15671539306640625, 'dpo_reward_mean_target': 0.52288818359375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0011], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0272717475891113, 'numerator': 0.13130180537700653, 'denominator': 0.12781603634357452}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0057048797607421875, 'dpo_reward_mean_target': -0.0057048797607421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0057], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13641357421875, 'dpo_reward_mean_target': 0.5355720520019531, 'standard deviation': 3.0, 'reward_a1': tensor([2.2370], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0879666805267334, 'numerator': 0.113225556910038, 'denominator': 0.10407079756259918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08451652526855469, 'dpo_reward_mean_target': 0.08451652526855469, 'standard deviation': 3.0, 'reward_a1': tensor([0.2721], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327209174633026, 'denominator': 0.1327209174633026}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3651924133300781, 'dpo_reward_mean_target': 0.673248291015625, 'standard deviation': 3.0, 'reward_a1': tensor([2.2841], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0622704029083252, 'numerator': 0.11512845754623413, 'denominator': 0.10837961733341217}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9232864379882812, 'dpo_reward_mean_target': 0.7252883911132812, 'standard deviation': 3.0, 'reward_a1': tensor([2.6332], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9609864950180054, 'numerator': 0.10863383859395981, 'denominator': 0.11304408311843872}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.030599594116210938, 'dpo_reward_mean_target': -0.030599594116210938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0306], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5717678070068359, 'dpo_reward_mean_target': 0.4673347473144531, 'standard deviation': 3.0, 'reward_a1': tensor([1.3395], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.990530788898468, 'numerator': 0.12747822701931, 'denominator': 0.12869688868522644}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4625110626220703, 'dpo_reward_mean_target': 0.4625110626220703, 'standard deviation': 3.0, 'reward_a1': tensor([1.6299], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12328365445137024, 'denominator': 0.12328365445137024}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16658782958984375, 'dpo_reward_mean_target': 0.16658782958984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.6244], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13144156336784363, 'denominator': 0.13144156336784363}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4013595581054688, 'dpo_reward_mean_target': 2.081573486328125, 'standard deviation': 3.0, 'reward_a1': tensor([1.7969], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0041977167129517, 'numerator': 0.13238336145877838, 'denominator': 0.13182997703552246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.283294677734375, 'dpo_reward_mean_target': 2.571868896484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.3600], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9219567775726318, 'numerator': 0.12256240099668503, 'denominator': 0.1329372525215149}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0979957580566406, 'dpo_reward_mean_target': 1.0979957580566406, 'standard deviation': 3.0, 'reward_a1': tensor([1.0745], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132976695895195, 'denominator': 0.132976695895195}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7095470428466797, 'dpo_reward_mean_target': 0.8445777893066406, 'standard deviation': 3.0, 'reward_a1': tensor([0.3572], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.993720293045044, 'numerator': 0.13123740255832672, 'denominator': 0.1320667415857315}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.37241363525390625, 'dpo_reward_mean_target': -0.37241363525390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1327], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13110879063606262, 'denominator': 0.13110879063606262}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13307571411132812, 'dpo_reward_mean_target': 0.13307571411132812, 'standard deviation': 3.0, 'reward_a1': tensor([0.6985], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13063938915729523, 'denominator': 0.13063938915729523}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07266616821289062, 'dpo_reward_mean_target': -0.07266616821289062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6860], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1302306354045868, 'denominator': 0.1302306354045868}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9968490600585938, 'dpo_reward_mean_target': 1.2664108276367188, 'standard deviation': 3.0, 'reward_a1': tensor([1.1123], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9994195699691772, 'numerator': 0.13280531764030457, 'denominator': 0.13288244605064392}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8265533447265625, 'dpo_reward_mean_target': 0.37120819091796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6161], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9991273283958435, 'numerator': 0.132538303732872, 'denominator': 0.132654070854187}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.26934051513671875, 'dpo_reward_mean_target': -0.26934051513671875, 'standard deviation': 3.0, 'reward_a1': tensor([1.9517], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10110404342412949, 'denominator': 0.10110404342412949}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.015985488891601562, 'dpo_reward_mean_target': 0.015985488891601562, 'standard deviation': 3.0, 'reward_a1': tensor([2.9383], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.08274400979280472, 'denominator': 0.08274400979280472}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.46900177001953125, 'dpo_reward_mean_target': 0.664764404296875, 'standard deviation': 3.0, 'reward_a1': tensor([4.4308], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0876792669296265, 'numerator': 0.06047632172703743, 'denominator': 0.05560124292969704}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9066314697265625, 'dpo_reward_mean_target': 0.13427734375, 'standard deviation': 3.0, 'reward_a1': tensor([2.0749], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8751162886619568, 'numerator': 0.1078757494688034, 'denominator': 0.1232701912522316}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2512931823730469, 'dpo_reward_mean_target': 0.28001976013183594, 'standard deviation': 3.0, 'reward_a1': tensor([1.4478], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0037803649902344, 'numerator': 0.12327759712934494, 'denominator': 0.12281332165002823}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04041290283203125, 'dpo_reward_mean_target': 0.04041290283203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.9822], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10784967243671417, 'denominator': 0.10784967243671417}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5355567932128906, 'dpo_reward_mean_target': 0.7207565307617188, 'standard deviation': 3.0, 'reward_a1': tensor([2.4152], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0374577045440674, 'numerator': 0.113374724984169, 'denominator': 0.10928130149841309}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5955581665039062, 'dpo_reward_mean_target': 1.9259033203125, 'standard deviation': 3.0, 'reward_a1': tensor([2.6268], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0322989225387573, 'numerator': 0.12940065562725067, 'denominator': 0.1253519207239151}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03828620910644531, 'dpo_reward_mean_target': 0.459564208984375, 'standard deviation': 3.0, 'reward_a1': tensor([3.1349], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.175581932067871, 'numerator': 0.08934949338436127, 'denominator': 0.07600448280572891}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11026763916015625, 'dpo_reward_mean_target': 0.19736671447753906, 'standard deviation': 3.0, 'reward_a1': tensor([0.3713], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0112649202346802, 'numerator': 0.1327575445175171, 'denominator': 0.13127869367599487}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14782142639160156, 'dpo_reward_mean_target': -0.11103630065917969, 'standard deviation': 3.0, 'reward_a1': tensor([0.1843], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.995239794254303, 'numerator': 0.13233798742294312, 'denominator': 0.13297095894813538}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9664154052734375, 'dpo_reward_mean_target': 1.5662918090820312, 'standard deviation': 3.0, 'reward_a1': tensor([4.0996], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2078533172607422, 'numerator': 0.0930991992354393, 'denominator': 0.07707823067903519}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.41828155517578125, 'dpo_reward_mean_target': 0.5145721435546875, 'standard deviation': 3.0, 'reward_a1': tensor([2.7575], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0248147249221802, 'numerator': 0.10055705904960632, 'denominator': 0.09812217950820923}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05298805236816406, 'dpo_reward_mean_target': 0.05298805236816406, 'standard deviation': 3.0, 'reward_a1': tensor([0.0224], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329738348722458, 'denominator': 0.1329738348722458}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6907501220703125, 'dpo_reward_mean_target': 2.74041748046875, 'standard deviation': 3.0, 'reward_a1': tensor([4.4968], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3048142194747925, 'numerator': 0.11203565448522568, 'denominator': 0.08586329966783524}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.960906982421875, 'dpo_reward_mean_target': 2.262237548828125, 'standard deviation': 3.0, 'reward_a1': tensor([5.0312], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1026920080184937, 'numerator': 0.08685465902090073, 'denominator': 0.07876601815223694}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.175750732421875, 'dpo_reward_mean_target': 0.854736328125, 'standard deviation': 3.0, 'reward_a1': tensor([3.3614], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9197219014167786, 'numerator': 0.09379610419273376, 'denominator': 0.10198311507701874}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7113189697265625, 'dpo_reward_mean_target': 0.589447021484375, 'standard deviation': 3.0, 'reward_a1': tensor([5.5432], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9358921051025391, 'numerator': 0.034018322825431824, 'denominator': 0.036348551511764526}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3178844451904297, 'dpo_reward_mean_target': 0.4208488464355469, 'standard deviation': 3.0, 'reward_a1': tensor([0.7964], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0048967599868774, 'numerator': 0.13194312155246735, 'denominator': 0.1313001811504364}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3065605163574219, 'dpo_reward_mean_target': 0.4078216552734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.3850], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0116311311721802, 'numerator': 0.12611021101474762, 'denominator': 0.12466026842594147}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1607742309570312, 'dpo_reward_mean_target': 1.6241378784179688, 'standard deviation': 3.0, 'reward_a1': tensor([1.4627], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0036227703094482, 'numerator': 0.13278834521770477, 'denominator': 0.13230901956558228}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3955841064453125, 'dpo_reward_mean_target': 2.663238525390625, 'standard deviation': 3.0, 'reward_a1': tensor([10.1093], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 8.686675071716309, 'numerator': 0.0061106327921152115, 'denominator': 0.0007034489535726607}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6157283782958984, 'dpo_reward_mean_target': 0.32777976989746094, 'standard deviation': 3.0, 'reward_a1': tensor([2.0440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9509426951408386, 'numerator': 0.11290854215621948, 'denominator': 0.11873327940702438}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5917739868164062, 'dpo_reward_mean_target': 0.5917739868164062, 'standard deviation': 3.0, 'reward_a1': tensor([3.8692], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.07321963459253311, 'denominator': 0.07321963459253311}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.748748779296875, 'dpo_reward_mean_target': 0.5912742614746094, 'standard deviation': 3.0, 'reward_a1': tensor([7.5160], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8871108293533325, 'numerator': 0.009264787659049034, 'denominator': 0.010443776845932007}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.021833419799804688, 'dpo_reward_mean_target': -0.021833419799804688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0218], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7059173583984375, 'dpo_reward_mean_target': 0.8866653442382812, 'standard deviation': 3.0, 'reward_a1': tensor([6.1657], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.641944408416748, 'numerator': 0.028274621814489365, 'denominator': 0.044045280665159225}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.28456878662109375, 'dpo_reward_mean_target': 1.1500396728515625, 'standard deviation': 3.0, 'reward_a1': tensor([1.4439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.174890398979187, 'numerator': 0.13234449923038483, 'denominator': 0.11264412850141525}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0030002593994140625, 'dpo_reward_mean_target': 0.2051258087158203, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4806], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.986638605594635, 'numerator': 0.12955161929130554, 'denominator': 0.13130605220794678}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7606124877929688, 'dpo_reward_mean_target': 0.5847854614257812, 'standard deviation': 3.0, 'reward_a1': tensor([1.2066], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9896243214607239, 'numerator': 0.13015495240688324, 'denominator': 0.13151955604553223}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20020294189453125, 'dpo_reward_mean_target': 0.6818695068359375, 'standard deviation': 3.0, 'reward_a1': tensor([3.0034], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.146983027458191, 'numerator': 0.09857159107923508, 'denominator': 0.08593988418579102}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09990692138671875, 'dpo_reward_mean_target': 0.2240924835205078, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0776], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9966990351676941, 'numerator': 0.1323099136352539, 'denominator': 0.13274811208248138}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.44760894775390625, 'dpo_reward_mean_target': 0.7704505920410156, 'standard deviation': 3.0, 'reward_a1': tensor([0.1663], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.984243631362915, 'numerator': 0.1303112953901291, 'denominator': 0.13239739835262299}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6671829223632812, 'dpo_reward_mean_target': 0.9222412109375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0261], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0065778493881226, 'numerator': 0.1329011619091034, 'denominator': 0.13203267753124237}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.1014251708984375, 'dpo_reward_mean_target': 1.452880859375, 'standard deviation': 3.0, 'reward_a1': tensor([5.1209], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.785878598690033, 'numerator': 0.06297484785318375, 'denominator': 0.08013305068016052}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07406234741210938, 'dpo_reward_mean_target': -0.054218292236328125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3566], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0052374601364136, 'numerator': 0.1323070228099823, 'denominator': 0.1316176801919937}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5514297485351562, 'dpo_reward_mean_target': 0.5514297485351562, 'standard deviation': 3.0, 'reward_a1': tensor([1.8559], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12098535150289536, 'denominator': 0.12098535150289536}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.42852783203125, 'dpo_reward_mean_target': 0.49977874755859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1052], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9971621036529541, 'numerator': 0.13183526694774628, 'denominator': 0.13221046328544617}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4822502136230469, 'dpo_reward_mean_target': 0.23511505126953125, 'standard deviation': 3.0, 'reward_a1': tensor([4.7755], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.477701187133789, 'numerator': 0.04230671748518944, 'denominator': 0.0286300890147686}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.617218017578125, 'dpo_reward_mean_target': 1.3422622680664062, 'standard deviation': 3.0, 'reward_a1': tensor([3.5235], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8035662174224854, 'numerator': 0.10209209471940994, 'denominator': 0.12704876065254211}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.708099365234375, 'dpo_reward_mean_target': 4.35540771484375, 'standard deviation': 3.0, 'reward_a1': tensor([11.8063], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.7492238283157349, 'numerator': 0.0060858349315822124, 'denominator': 0.0034791631624102592}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1346893310546875, 'dpo_reward_mean_target': -0.1346893310546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1292], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329805552959442, 'denominator': 0.1329805552959442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2598495483398438, 'dpo_reward_mean_target': 3.047119140625, 'standard deviation': 3.0, 'reward_a1': tensor([8.7480], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.7045676708221436, 'numerator': 0.021860355511307716, 'denominator': 0.005900919437408447}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.48773193359375, 'dpo_reward_mean_target': 0.5807342529296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.9006], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0037932395935059, 'numerator': 0.13222701847553253, 'denominator': 0.13172735273838043}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8165512084960938, 'dpo_reward_mean_target': 2.876068115234375, 'standard deviation': 3.0, 'reward_a1': tensor([2.8689], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.063456416130066, 'numerator': 0.1329803615808487, 'denominator': 0.12504543364048004}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3126716613769531, 'dpo_reward_mean_target': 0.9780349731445312, 'standard deviation': 3.0, 'reward_a1': tensor([1.8116], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0900484323501587, 'numerator': 0.1279449462890625, 'denominator': 0.1173754632472992}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20584678649902344, 'dpo_reward_mean_target': 0.20584678649902344, 'standard deviation': 3.0, 'reward_a1': tensor([0.4524], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325322687625885, 'denominator': 0.1325322687625885}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20247268676757812, 'dpo_reward_mean_target': 0.21467208862304688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0692], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9965174794197083, 'numerator': 0.13238690793514252, 'denominator': 0.1328495591878891}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0645904541015625, 'dpo_reward_mean_target': 1.8087615966796875, 'standard deviation': 3.0, 'reward_a1': tensor([6.1051], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8882613182067871, 'numerator': 0.04769038408994675, 'denominator': 0.053689587861299515}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26161956787109375, 'dpo_reward_mean_target': 0.14313507080078125, 'standard deviation': 3.0, 'reward_a1': tensor([1.4724], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9834187030792236, 'numerator': 0.1205463707447052, 'denominator': 0.12257888913154602}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0077800750732421875, 'dpo_reward_mean_target': -0.0077800750732421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0078], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5979995727539062, 'dpo_reward_mean_target': 0.7831344604492188, 'standard deviation': 3.0, 'reward_a1': tensor([2.3535], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.034799337387085, 'numerator': 0.11595495790243149, 'denominator': 0.11205549538135529}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14726829528808594, 'dpo_reward_mean_target': -0.14726829528808594, 'standard deviation': 3.0, 'reward_a1': tensor([0.6265], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12863044440746307, 'denominator': 0.12863044440746307}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3303718566894531, 'dpo_reward_mean_target': -0.3303718566894531, 'standard deviation': 3.0, 'reward_a1': tensor([1.3313], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11406876891851425, 'denominator': 0.11406876891851425}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.38491058349609375, 'dpo_reward_mean_target': 1.72564697265625, 'standard deviation': 3.0, 'reward_a1': tensor([11.1635], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 4.507894992828369, 'numerator': 0.0009433542145416141, 'denominator': 0.00020926712022628635}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26451873779296875, 'dpo_reward_mean_target': 0.26451873779296875, 'standard deviation': 3.0, 'reward_a1': tensor([2.1484], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10918449610471725, 'denominator': 0.10918449610471725}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8172531127929688, 'dpo_reward_mean_target': 2.5005569458007812, 'standard deviation': 3.0, 'reward_a1': tensor([2.8656], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.055117130279541, 'numerator': 0.13200005888938904, 'denominator': 0.1251046508550644}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4906272888183594, 'dpo_reward_mean_target': 1.2230873107910156, 'standard deviation': 3.0, 'reward_a1': tensor([0.9388], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0125051736831665, 'numerator': 0.1323850154876709, 'denominator': 0.13074997067451477}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10559272766113281, 'dpo_reward_mean_target': -0.10559272766113281, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0137], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13291843235492706, 'denominator': 0.13291843235492706}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5659103393554688, 'dpo_reward_mean_target': 0.9713363647460938, 'standard deviation': 3.0, 'reward_a1': tensor([4.0351], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1585228443145752, 'numerator': 0.07894296944141388, 'denominator': 0.06814105808734894}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4533348083496094, 'dpo_reward_mean_target': 0.4533348083496094, 'standard deviation': 3.0, 'reward_a1': tensor([0.9707], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13101798295974731, 'denominator': 0.13101798295974731}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.339874267578125, 'dpo_reward_mean_target': 1.6132888793945312, 'standard deviation': 3.0, 'reward_a1': tensor([0.7240], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9773963689804077, 'numerator': 0.12726467847824097, 'denominator': 0.1302078515291214}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23080062866210938, 'dpo_reward_mean_target': 0.23080062866210938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4040], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13003702461719513, 'denominator': 0.13003702461719513}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13663291931152344, 'dpo_reward_mean_target': 0.13663291931152344, 'standard deviation': 3.0, 'reward_a1': tensor([0.2792], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283061981201172, 'denominator': 0.13283061981201172}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13985061645507812, 'dpo_reward_mean_target': 0.7101669311523438, 'standard deviation': 3.0, 'reward_a1': tensor([2.7538], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1590144634246826, 'numerator': 0.1054447665810585, 'denominator': 0.09097795188426971}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.25754547119140625, 'dpo_reward_mean_target': 0.25754547119140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3904], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13285040855407715, 'denominator': 0.13285040855407715}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.2343902587890625, 'dpo_reward_mean_target': 2.878143310546875, 'standard deviation': 3.0, 'reward_a1': tensor([2.5880], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0022754669189453, 'numerator': 0.13236045837402344, 'denominator': 0.1320599615573883}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.023912429809570312, 'dpo_reward_mean_target': 0.023912429809570312, 'standard deviation': 3.0, 'reward_a1': tensor([0.7449], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1291956603527069, 'denominator': 0.1291956603527069}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9900283813476562, 'dpo_reward_mean_target': 2.4976043701171875, 'standard deviation': 3.0, 'reward_a1': tensor([3.8134], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.414342999458313, 'numerator': 0.12078677117824554, 'denominator': 0.08540132641792297}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04105186462402344, 'dpo_reward_mean_target': -0.04105186462402344, 'standard deviation': 3.0, 'reward_a1': tensor([0.2500], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13235647976398468, 'denominator': 0.13235647976398468}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0321502685546875, 'dpo_reward_mean_target': 3.372344970703125, 'standard deviation': 3.0, 'reward_a1': tensor([4.3366], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2755392789840698, 'numerator': 0.1262863427400589, 'denominator': 0.09900623559951782}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0523681640625, 'dpo_reward_mean_target': 0.0523681640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0108], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329512745141983, 'denominator': 0.1329512745141983}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9078826904296875, 'dpo_reward_mean_target': 2.2458953857421875, 'standard deviation': 3.0, 'reward_a1': tensor([5.7755], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1490169763565063, 'numerator': 0.06656016409397125, 'denominator': 0.057927921414375305}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2125415802001953, 'dpo_reward_mean_target': 0.2125415802001953, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0257], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13256217539310455, 'denominator': 0.13256217539310455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.280548095703125, 'dpo_reward_mean_target': 3.683502197265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1545], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6132842898368835, 'numerator': 0.058666858822107315, 'denominator': 0.09566013514995575}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6692104339599609, 'dpo_reward_mean_target': 0.8774700164794922, 'standard deviation': 3.0, 'reward_a1': tensor([1.0246], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.00583016872406, 'numerator': 0.1328209936618805, 'denominator': 0.13205111026763916}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0150299072265625, 'dpo_reward_mean_target': -0.0150299072265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0150], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.36297607421875, 'dpo_reward_mean_target': 2.191986083984375, 'standard deviation': 3.0, 'reward_a1': tensor([8.4535], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.8495432138442993, 'numerator': 0.015060712583363056, 'denominator': 0.008142936043441296}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.020320892333984375, 'dpo_reward_mean_target': 0.020320892333984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0539], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297240436077118, 'denominator': 0.13297240436077118}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7697525024414062, 'dpo_reward_mean_target': 1.1104354858398438, 'standard deviation': 3.0, 'reward_a1': tensor([2.4462], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0586665868759155, 'numerator': 0.12043175846338272, 'denominator': 0.11375796794891357}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.028900146484375, 'dpo_reward_mean_target': 0.909912109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.9339], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0465203523635864, 'numerator': 0.1329765021800995, 'denominator': 0.12706537544727325}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.345489501953125, 'dpo_reward_mean_target': 1.3521766662597656, 'standard deviation': 3.0, 'reward_a1': tensor([0.3557], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9992623925209045, 'numerator': 0.12584419548511505, 'denominator': 0.1259370893239975}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4224052429199219, 'dpo_reward_mean_target': 0.2549552917480469, 'standard deviation': 3.0, 'reward_a1': tensor([0.9440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0804153680801392, 'numerator': 0.12951946258544922, 'denominator': 0.11987932026386261}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.45703887939453125, 'dpo_reward_mean_target': 0.23786544799804688, 'standard deviation': 3.0, 'reward_a1': tensor([0.7032], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9913727641105652, 'numerator': 0.1313903033733368, 'denominator': 0.1325336992740631}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.63824462890625, 'dpo_reward_mean_target': 0.3488197326660156, 'standard deviation': 3.0, 'reward_a1': tensor([2.2582], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9448326826095581, 'numerator': 0.10860001295804977, 'denominator': 0.11494100093841553}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.012163162231445312, 'dpo_reward_mean_target': -0.012163162231445312, 'standard deviation': 3.0, 'reward_a1': tensor([0.2214], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13257819414138794, 'denominator': 0.13257819414138794}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21167755126953125, 'dpo_reward_mean_target': -0.01102447509765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1354], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0055171251296997, 'numerator': 0.1328224539756775, 'denominator': 0.13209368288516998}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.5969390869140625, 'dpo_reward_mean_target': 2.923919677734375, 'standard deviation': 3.0, 'reward_a1': tensor([4.4980], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9116047024726868, 'numerator': 0.11587879061698914, 'denominator': 0.1271151751279831}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1812458038330078, 'dpo_reward_mean_target': -0.1812458038330078, 'standard deviation': 3.0, 'reward_a1': tensor([0.8956], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12468339502811432, 'denominator': 0.12468339502811432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11587715148925781, 'dpo_reward_mean_target': -0.11587715148925781, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1159], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.009674072265625, 'dpo_reward_mean_target': 0.009674072265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.7687], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12879155576229095, 'denominator': 0.12879155576229095}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8136367797851562, 'dpo_reward_mean_target': 0.68475341796875, 'standard deviation': 3.0, 'reward_a1': tensor([6.7000], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.5047476291656494, 'numerator': 0.01781533472239971, 'denominator': 0.035295531153678894}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3827972412109375, 'dpo_reward_mean_target': 0.966705322265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2241], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.044931411743164, 'numerator': 0.12896832823753357, 'denominator': 0.1234227642416954}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09967803955078125, 'dpo_reward_mean_target': 0.1651592254638672, 'standard deviation': 3.0, 'reward_a1': tensor([0.2376], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0060473680496216, 'numerator': 0.13294194638729095, 'denominator': 0.1321428269147873}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04671478271484375, 'dpo_reward_mean_target': -0.04671478271484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.7061], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11211317032575607, 'denominator': 0.11211317032575607}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7980422973632812, 'dpo_reward_mean_target': -0.06352996826171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5856], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9793115854263306, 'numerator': 0.12990358471870422, 'denominator': 0.13264785706996918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.41902923583984375, 'dpo_reward_mean_target': 0.24733734130859375, 'standard deviation': 3.0, 'reward_a1': tensor([1.6556], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9750875234603882, 'numerator': 0.11910731345415115, 'denominator': 0.12215038388967514}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3156585693359375, 'dpo_reward_mean_target': -0.3452301025390625, 'standard deviation': 3.0, 'reward_a1': tensor([4.8166], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7013292908668518, 'numerator': 0.03026406653225422, 'denominator': 0.04315243661403656}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0364875793457031, 'dpo_reward_mean_target': 1.0364875793457031, 'standard deviation': 3.0, 'reward_a1': tensor([2.8613], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11052128672599792, 'denominator': 0.11052128672599792}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9418869018554688, 'dpo_reward_mean_target': 2.0211257934570312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4258], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7955588698387146, 'numerator': 0.09535244107246399, 'denominator': 0.11985591799020767}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6549835205078125, 'dpo_reward_mean_target': 2.47052001953125, 'standard deviation': 3.0, 'reward_a1': tensor([2.7337], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2664345502853394, 'numerator': 0.13247013092041016, 'denominator': 0.10460084676742554}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5685958862304688, 'dpo_reward_mean_target': 0.7501983642578125, 'standard deviation': 3.0, 'reward_a1': tensor([5.7295], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1077207326889038, 'numerator': 0.033541884273290634, 'denominator': 0.030280090868473053}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.016727447509765625, 'dpo_reward_mean_target': 0.23647308349609375, 'standard deviation': 3.0, 'reward_a1': tensor([0.7698], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.018738031387329, 'numerator': 0.13089615106582642, 'denominator': 0.12848852574825287}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0028800964355469, 'dpo_reward_mean_target': 1.1337966918945312, 'standard deviation': 3.0, 'reward_a1': tensor([2.2346], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0171093940734863, 'numerator': 0.1243232935667038, 'denominator': 0.12223198264837265}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6672401428222656, 'dpo_reward_mean_target': 0.6672401428222656, 'standard deviation': 3.0, 'reward_a1': tensor([0.3731], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13234303891658783, 'denominator': 0.13234303891658783}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17748260498046875, 'dpo_reward_mean_target': 1.1707611083984375, 'standard deviation': 3.0, 'reward_a1': tensor([3.1811], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4950264692306519, 'numerator': 0.10623788833618164, 'denominator': 0.07106087356805801}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22829246520996094, 'dpo_reward_mean_target': 0.1527118682861328, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3632], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0046610832214355, 'numerator': 0.13102860748767853, 'denominator': 0.13042069971561432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04382514953613281, 'dpo_reward_mean_target': 0.04382514953613281, 'standard deviation': 3.0, 'reward_a1': tensor([0.1194], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293859362602234, 'denominator': 0.13293859362602234}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.303802490234375, 'dpo_reward_mean_target': 0.3997039794921875, 'standard deviation': 3.0, 'reward_a1': tensor([6.0626], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.5924674272537231, 'numerator': 0.022389978170394897, 'denominator': 0.037791069597005844}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5289306640625, 'dpo_reward_mean_target': 1.02044677734375, 'standard deviation': 3.0, 'reward_a1': tensor([7.1394], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.276456117630005, 'numerator': 0.016611801460385323, 'denominator': 0.005070051643997431}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8091354370117188, 'dpo_reward_mean_target': -0.07942962646484375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5258], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8917050957679749, 'numerator': 0.11524388194084167, 'denominator': 0.12923990190029144}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.2886962890625, 'dpo_reward_mean_target': 2.3935623168945312, 'standard deviation': 3.0, 'reward_a1': tensor([4.8492], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0296553373336792, 'numerator': 0.0951242595911026, 'denominator': 0.09238456189632416}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.11507415771484375, 'dpo_reward_mean_target': 0.11507415771484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3645], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1325218826532364, 'denominator': 0.1325218826532364}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.248077392578125, 'dpo_reward_mean_target': 0.8698806762695312, 'standard deviation': 3.0, 'reward_a1': tensor([1.3554], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9876208901405334, 'numerator': 0.1312505453824997, 'denominator': 0.13289567828178406}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.45172119140625, 'dpo_reward_mean_target': 1.2124252319335938, 'standard deviation': 3.0, 'reward_a1': tensor([12.9609], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7340394258499146, 'numerator': 6.216813926585019e-05, 'denominator': 8.46931870910339e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1394805908203125, 'dpo_reward_mean_target': 0.8315582275390625, 'standard deviation': 3.0, 'reward_a1': tensor([1.2003], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9926779270172119, 'numerator': 0.13197992742061615, 'denominator': 0.13295342028141022}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20638465881347656, 'dpo_reward_mean_target': -0.20638465881347656, 'standard deviation': 3.0, 'reward_a1': tensor([6.8881], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.008117163553833961, 'denominator': 0.008117163553833961}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.041187286376953125, 'dpo_reward_mean_target': 0.041187286376953125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3113], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13244284689426422, 'denominator': 0.13244284689426422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07084465026855469, 'dpo_reward_mean_target': -0.015155792236328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.4220], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.002881407737732, 'numerator': 0.13157613575458527, 'denominator': 0.13119810819625854}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11305999755859375, 'dpo_reward_mean_target': -0.11305999755859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.3325], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1315222829580307, 'denominator': 0.1315222829580307}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4814109802246094, 'dpo_reward_mean_target': -0.1871185302734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0820], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9329174757003784, 'numerator': 0.12159910053014755, 'denominator': 0.13034282624721527}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06866836547851562, 'dpo_reward_mean_target': 0.6356925964355469, 'standard deviation': 3.0, 'reward_a1': tensor([1.7550], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0924029350280762, 'numerator': 0.12404009699821472, 'denominator': 0.11354793608188629}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.34479331970214844, 'dpo_reward_mean_target': -0.34479331970214844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0235], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13222050666809082, 'denominator': 0.13222050666809082}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0175933837890625, 'dpo_reward_mean_target': -0.0175933837890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0176], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21328163146972656, 'dpo_reward_mean_target': -0.21328163146972656, 'standard deviation': 3.0, 'reward_a1': tensor([1.1220], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12044049054384232, 'denominator': 0.12044049054384232}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9500350952148438, 'dpo_reward_mean_target': 2.2984085083007812, 'standard deviation': 3.0, 'reward_a1': tensor([4.4367], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.093640685081482, 'numerator': 0.10315026342868805, 'denominator': 0.09431824088096619}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5323143005371094, 'dpo_reward_mean_target': -0.5323143005371094, 'standard deviation': 3.0, 'reward_a1': tensor([1.6215], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10276945680379868, 'denominator': 0.10276945680379868}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.017004013061523438, 'dpo_reward_mean_target': -0.017004013061523438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0813], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13290929794311523, 'denominator': 0.13290929794311523}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20409393310546875, 'dpo_reward_mean_target': 0.47672080993652344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3360], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9797201752662659, 'numerator': 0.12818998098373413, 'denominator': 0.13084346055984497}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3211402893066406, 'dpo_reward_mean_target': 0.5517349243164062, 'standard deviation': 3.0, 'reward_a1': tensor([0.1194], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9919099807739258, 'numerator': 0.1316070705652237, 'denominator': 0.13268046081066132}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12102317810058594, 'dpo_reward_mean_target': 0.12102317810058594, 'standard deviation': 3.0, 'reward_a1': tensor([0.4249], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13230013847351074, 'denominator': 0.13230013847351074}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6583175659179688, 'dpo_reward_mean_target': 1.4730758666992188, 'standard deviation': 3.0, 'reward_a1': tensor([5.3053], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9259175658226013, 'numerator': 0.058810461312532425, 'denominator': 0.06351587176322937}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4493331909179688, 'dpo_reward_mean_target': 1.8992385864257812, 'standard deviation': 3.0, 'reward_a1': tensor([5.2285], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1944361925125122, 'numerator': 0.0718386098742485, 'denominator': 0.060144368559122086}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8618316650390625, 'dpo_reward_mean_target': 0.5872573852539062, 'standard deviation': 3.0, 'reward_a1': tensor([2.4070], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9499666094779968, 'numerator': 0.11063513159751892, 'denominator': 0.11646212637424469}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1732177734375, 'dpo_reward_mean_target': 4.78277587890625, 'standard deviation': 3.0, 'reward_a1': tensor([3.5922], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2793384790420532, 'numerator': 0.12291084229946136, 'denominator': 0.09607374668121338}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.3503265380859375, 'dpo_reward_mean_target': 3.6412353515625, 'standard deviation': 3.0, 'reward_a1': tensor([6.2487], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.5945218801498413, 'numerator': 0.09114955365657806, 'denominator': 0.05716419219970703}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04350471496582031, 'dpo_reward_mean_target': 0.563812255859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1598], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9932494759559631, 'numerator': 0.131780207157135, 'denominator': 0.13267584145069122}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.75091552734375, 'dpo_reward_mean_target': 3.04833984375, 'standard deviation': 3.0, 'reward_a1': tensor([7.5173], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1648592948913574, 'numerator': 0.04384603723883629, 'denominator': 0.03764063119888306}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6159591674804688, 'dpo_reward_mean_target': 0.7174072265625, 'standard deviation': 3.0, 'reward_a1': tensor([7.3564], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0783230066299438, 'numerator': 0.011490171775221825, 'denominator': 0.010655594058334827}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.58184814453125, 'dpo_reward_mean_target': 2.3481979370117188, 'standard deviation': 3.0, 'reward_a1': tensor([2.1426], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.015238881111145, 'numerator': 0.13266895711421967, 'denominator': 0.13067758083343506}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.47408294677734375, 'dpo_reward_mean_target': 0.6564483642578125, 'standard deviation': 3.0, 'reward_a1': tensor([2.0143], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0297960042953491, 'numerator': 0.12003467231988907, 'denominator': 0.11656159907579422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.655242919921875, 'dpo_reward_mean_target': 2.911834716796875, 'standard deviation': 3.0, 'reward_a1': tensor([4.9037], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0623098611831665, 'numerator': 0.10667525231838226, 'denominator': 0.10041820257902145}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.2212295532226562, 'dpo_reward_mean_target': 2.2352371215820312, 'standard deviation': 3.0, 'reward_a1': tensor([1.5844], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9989986419677734, 'numerator': 0.12988772988319397, 'denominator': 0.13001792132854462}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09994888305664062, 'dpo_reward_mean_target': 0.09749412536621094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0999], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9978365302085876, 'numerator': 0.13269305229187012, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.086029052734375, 'dpo_reward_mean_target': 1.93402099609375, 'standard deviation': 3.0, 'reward_a1': tensor([2.9810], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1486632823944092, 'numerator': 0.12512385845184326, 'denominator': 0.10892996937036514}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.037693023681640625, 'dpo_reward_mean_target': 1.0804634094238281, 'standard deviation': 3.0, 'reward_a1': tensor([1.1982], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0768582820892334, 'numerator': 0.13287843763828278, 'denominator': 0.12339454889297485}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9461479187011719, 'dpo_reward_mean_target': 1.7163314819335938, 'standard deviation': 3.0, 'reward_a1': tensor([1.0121], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9730600714683533, 'numerator': 0.12936697900295258, 'denominator': 0.1329486072063446}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0676422119140625, 'dpo_reward_mean_target': 0.44864654541015625, 'standard deviation': 3.0, 'reward_a1': tensor([2.3039], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8991425633430481, 'numerator': 0.10983531922101974, 'denominator': 0.12215562164783478}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19886398315429688, 'dpo_reward_mean_target': 0.19886398315429688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1002], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323217749595642, 'denominator': 0.1323217749595642}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.004856109619140625, 'dpo_reward_mean_target': 0.004856109619140625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0340], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296958804130554, 'denominator': 0.13296958804130554}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.49493408203125, 'dpo_reward_mean_target': 1.3552017211914062, 'standard deviation': 3.0, 'reward_a1': tensor([3.9929], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9609168767929077, 'numerator': 0.0903492346405983, 'denominator': 0.09402398765087128}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2668876647949219, 'dpo_reward_mean_target': 0.5641212463378906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0830], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9836723208427429, 'numerator': 0.12992295622825623, 'denominator': 0.13207951188087463}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3781929016113281, 'dpo_reward_mean_target': 0.38307762145996094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2025], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9828140139579773, 'numerator': 0.13047142326831818, 'denominator': 0.1327529102563858}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20940208435058594, 'dpo_reward_mean_target': -0.3767585754394531, 'standard deviation': 3.0, 'reward_a1': tensor([2.3858], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9514056444168091, 'numerator': 0.08702657371759415, 'denominator': 0.09147157520055771}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.04559326171875, 'dpo_reward_mean_target': 1.6085281372070312, 'standard deviation': 3.0, 'reward_a1': tensor([2.5378], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9669038653373718, 'numerator': 0.126751109957695, 'denominator': 0.13108967244625092}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.052455902099609375, 'dpo_reward_mean_target': 0.3325920104980469, 'standard deviation': 3.0, 'reward_a1': tensor([0.8895], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0219327211380005, 'numerator': 0.1307087391614914, 'denominator': 0.12790346145629883}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01334381103515625, 'dpo_reward_mean_target': 0.01334381103515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3667], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1320614516735077, 'denominator': 0.1320614516735077}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13775634765625, 'dpo_reward_mean_target': 1.0385513305664062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1592], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9279283285140991, 'numerator': 0.12279358506202698, 'denominator': 0.13233089447021484}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.815765380859375, 'dpo_reward_mean_target': 0.933837890625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7073], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0109812021255493, 'numerator': 0.12863394618034363, 'denominator': 0.1272367388010025}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08351707458496094, 'dpo_reward_mean_target': 0.6544990539550781, 'standard deviation': 3.0, 'reward_a1': tensor([0.8642], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0319136381149292, 'numerator': 0.13265635073184967, 'denominator': 0.12855373322963715}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.011737823486328125, 'dpo_reward_mean_target': -0.011737823486328125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0117], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.98046875, 'dpo_reward_mean_target': 2.34136962890625, 'standard deviation': 3.0, 'reward_a1': tensor([6.7225], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2007167339324951, 'numerator': 0.045780912041664124, 'denominator': 0.03812798857688904}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7506494522094727, 'dpo_reward_mean_target': 0.457855224609375, 'standard deviation': 3.0, 'reward_a1': tensor([1.1055], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1830652952194214, 'numerator': 0.12991739809513092, 'denominator': 0.10981421917676926}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7098846435546875, 'dpo_reward_mean_target': 1.4587860107421875, 'standard deviation': 3.0, 'reward_a1': tensor([2.7103], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.144875168800354, 'numerator': 0.12189904600381851, 'denominator': 0.1064736545085907}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3231964111328125, 'dpo_reward_mean_target': -0.16215133666992188, 'standard deviation': 3.0, 'reward_a1': tensor([0.1746], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0074938535690308, 'numerator': 0.13214579224586487, 'denominator': 0.1311628818511963}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21840476989746094, 'dpo_reward_mean_target': 0.21840476989746094, 'standard deviation': 3.0, 'reward_a1': tensor([1.6198], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11923503130674362, 'denominator': 0.11923503130674362}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16100692749023438, 'dpo_reward_mean_target': 0.39646148681640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.3195], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0772762298583984, 'numerator': 0.12683363258838654, 'denominator': 0.11773547530174255}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8864059448242188, 'dpo_reward_mean_target': 1.64263916015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.6285], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0036917924880981, 'numerator': 0.13297928869724274, 'denominator': 0.1324901580810547}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12670326232910156, 'dpo_reward_mean_target': 0.12670326232910156, 'standard deviation': 3.0, 'reward_a1': tensor([0.0977], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329745650291443, 'denominator': 0.1329745650291443}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0362701416015625, 'dpo_reward_mean_target': 0.1589508056640625, 'standard deviation': 3.0, 'reward_a1': tensor([3.4285], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.075770616531372, 'numerator': 0.07342983782291412, 'denominator': 0.0682578980922699}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2824249267578125, 'dpo_reward_mean_target': 1.6481285095214844, 'standard deviation': 3.0, 'reward_a1': tensor([2.1667], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1999781131744385, 'numerator': 0.13100916147232056, 'denominator': 0.10917629301548004}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5461883544921875, 'dpo_reward_mean_target': -0.13956451416015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.6168], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.044311285018921, 'numerator': 0.12882100045681, 'denominator': 0.12335498631000519}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0824127197265625, 'dpo_reward_mean_target': 1.0114364624023438, 'standard deviation': 3.0, 'reward_a1': tensor([5.4380], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9659639000892639, 'numerator': 0.04477406293153763, 'denominator': 0.04635169357061386}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4771270751953125, 'dpo_reward_mean_target': 1.702239990234375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5680], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9994568228721619, 'numerator': 0.1328476220369339, 'denominator': 0.1329198181629181}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.753570556640625, 'dpo_reward_mean_target': 2.763946533203125, 'standard deviation': 3.0, 'reward_a1': tensor([3.2529], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1180686950683594, 'numerator': 0.13122642040252686, 'denominator': 0.11736883223056793}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4515228271484375, 'dpo_reward_mean_target': 1.8340988159179688, 'standard deviation': 3.0, 'reward_a1': tensor([4.0834], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1093182563781738, 'numerator': 0.10039655864238739, 'denominator': 0.09050293266773224}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8192138671875, 'dpo_reward_mean_target': 3.1168899536132812, 'standard deviation': 3.0, 'reward_a1': tensor([4.6797], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.375590205192566, 'numerator': 0.11610841751098633, 'denominator': 0.08440625667572021}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08257579803466797, 'dpo_reward_mean_target': 0.19652557373046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.9735], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0288301706314087, 'numerator': 0.12859481573104858, 'denominator': 0.12499129772186279}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2626228332519531, 'dpo_reward_mean_target': 1.6672821044921875, 'standard deviation': 3.0, 'reward_a1': tensor([1.1822], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9873669743537903, 'numerator': 0.1312536597251892, 'denominator': 0.13293300569057465}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5678787231445312, 'dpo_reward_mean_target': 0.6896743774414062, 'standard deviation': 3.0, 'reward_a1': tensor([2.3738], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0238969326019287, 'numerator': 0.1135934591293335, 'denominator': 0.11094227433204651}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8249626159667969, 'dpo_reward_mean_target': 0.8365936279296875, 'standard deviation': 3.0, 'reward_a1': tensor([2.4507], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0020954608917236, 'numerator': 0.11506142467260361, 'denominator': 0.11482081562280655}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.01611328125, 'dpo_reward_mean_target': 1.7981719970703125, 'standard deviation': 3.0, 'reward_a1': tensor([15.4249], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.3806726932525635, 'numerator': 4.401858404889936e-06, 'denominator': 1.3020658116147388e-06}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2059326171875, 'dpo_reward_mean_target': 0.2059326171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6133], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13176052272319794, 'denominator': 0.13176052272319794}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.47830963134765625, 'dpo_reward_mean_target': 0.7976531982421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4927], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9948576092720032, 'numerator': 0.1322953999042511, 'denominator': 0.13297922909259796}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.19599151611328125, 'dpo_reward_mean_target': -0.19599151611328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7298], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12679722905158997, 'denominator': 0.12679722905158997}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.98876953125, 'dpo_reward_mean_target': 0.98876953125, 'standard deviation': 3.0, 'reward_a1': tensor([1.2738], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13238176703453064, 'denominator': 0.13238176703453064}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1279277801513672, 'dpo_reward_mean_target': 0.1279277801513672, 'standard deviation': 3.0, 'reward_a1': tensor([0.2368], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13289324939250946, 'denominator': 0.13289324939250946}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1932182312011719, 'dpo_reward_mean_target': 0.88934326171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4085], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0215946435928345, 'numerator': 0.13128359615802765, 'denominator': 0.12850850820541382}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.39654541015625, 'dpo_reward_mean_target': -0.499359130859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.7322], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9866088032722473, 'numerator': 0.12223468720912933, 'denominator': 0.12389377504587173}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.022724151611328125, 'dpo_reward_mean_target': 0.022724151611328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7072], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12956371903419495, 'denominator': 0.12956371903419495}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9534759521484375, 'dpo_reward_mean_target': 1.368011474609375, 'standard deviation': 3.0, 'reward_a1': tensor([3.0153], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0891753435134888, 'numerator': 0.11437099426984787, 'denominator': 0.10500696301460266}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14349746704101562, 'dpo_reward_mean_target': -0.14349746704101562, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1435], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.24639129638671875, 'dpo_reward_mean_target': -0.08053398132324219, 'standard deviation': 3.0, 'reward_a1': tensor([0.4223], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0108534097671509, 'numerator': 0.13112592697143555, 'denominator': 0.12971805036067963}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1026763916015625, 'dpo_reward_mean_target': 0.40625762939453125, 'standard deviation': 3.0, 'reward_a1': tensor([3.3159], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1087851524353027, 'numerator': 0.08308623731136322, 'denominator': 0.0749344751238823}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2877616882324219, 'dpo_reward_mean_target': -0.2877616882324219, 'standard deviation': 3.0, 'reward_a1': tensor([3.7782], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.053078509867191315, 'denominator': 0.053078509867191315}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10881423950195312, 'dpo_reward_mean_target': 0.2041473388671875, 'standard deviation': 3.0, 'reward_a1': tensor([5.4346], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2060173749923706, 'numerator': 0.02908775582909584, 'denominator': 0.024118853732943535}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12220191955566406, 'dpo_reward_mean_target': 0.06779861450195312, 'standard deviation': 3.0, 'reward_a1': tensor([1.1532], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0252336263656616, 'numerator': 0.12455548346042633, 'denominator': 0.12148986011743546}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.61572265625, 'dpo_reward_mean_target': 0.6001205444335938, 'standard deviation': 3.0, 'reward_a1': tensor([1.0197], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9992864727973938, 'numerator': 0.13168667256832123, 'denominator': 0.1317806988954544}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2526969909667969, 'dpo_reward_mean_target': 0.3373584747314453, 'standard deviation': 3.0, 'reward_a1': tensor([0.0167], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9973853230476379, 'numerator': 0.13222329318523407, 'denominator': 0.13256992399692535}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.264862060546875, 'dpo_reward_mean_target': 1.92218017578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.9346], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0060702562332153, 'numerator': 0.1329796016216278, 'denominator': 0.13217724859714508}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3684043884277344, 'dpo_reward_mean_target': 0.3684043884277344, 'standard deviation': 3.0, 'reward_a1': tensor([1.1253], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1288154423236847, 'denominator': 0.1288154423236847}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.30356788635253906, 'dpo_reward_mean_target': -0.30356788635253906, 'standard deviation': 3.0, 'reward_a1': tensor([1.2065], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11715683341026306, 'denominator': 0.11715683341026306}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0124359130859375, 'dpo_reward_mean_target': -0.0124359130859375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0124], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.001018524169921875, 'dpo_reward_mean_target': -0.001018524169921875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0010], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4343719482421875, 'dpo_reward_mean_target': 2.1192474365234375, 'standard deviation': 3.0, 'reward_a1': tensor([2.4101], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.049371361732483, 'numerator': 0.13235726952552795, 'denominator': 0.12613005936145782}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2583026885986328, 'dpo_reward_mean_target': -0.2583026885986328, 'standard deviation': 3.0, 'reward_a1': tensor([1.8804], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10314029455184937, 'denominator': 0.10314029455184937}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5870800018310547, 'dpo_reward_mean_target': 1.062479019165039, 'standard deviation': 3.0, 'reward_a1': tensor([1.9959], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3802218437194824, 'numerator': 0.12669770419597626, 'denominator': 0.0917951762676239}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6448974609375, 'dpo_reward_mean_target': 2.36224365234375, 'standard deviation': 3.0, 'reward_a1': tensor([8.4684], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.7772626876831055, 'numerator': 0.016757041215896606, 'denominator': 0.004436292219907045}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8984413146972656, 'dpo_reward_mean_target': 1.300872802734375, 'standard deviation': 3.0, 'reward_a1': tensor([2.4893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0641062259674072, 'numerator': 0.12294603884220123, 'denominator': 0.11553925275802612}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.5757598876953125, 'dpo_reward_mean_target': 3.68341064453125, 'standard deviation': 3.0, 'reward_a1': tensor([5.8216], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8455545902252197, 'numerator': 0.10315306484699249, 'denominator': 0.12199456244707108}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4856300354003906, 'dpo_reward_mean_target': 0.7338027954101562, 'standard deviation': 3.0, 'reward_a1': tensor([4.2566], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1057912111282349, 'numerator': 0.06673768162727356, 'denominator': 0.060352880507707596}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.75762939453125, 'dpo_reward_mean_target': 2.4583892822265625, 'standard deviation': 3.0, 'reward_a1': tensor([9.6154], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7921627759933472, 'numerator': 0.007725134491920471, 'denominator': 0.009751953184604645}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7464103698730469, 'dpo_reward_mean_target': 0.7464103698730469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12667842209339142, 'denominator': 0.12667842209339142}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07705116271972656, 'dpo_reward_mean_target': 0.8639602661132812, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1470], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9474408030509949, 'numerator': 0.12564054131507874, 'denominator': 0.13261044025421143}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5164794921875, 'dpo_reward_mean_target': 1.5164794921875, 'standard deviation': 3.0, 'reward_a1': tensor([2.4591], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12657614052295685, 'denominator': 0.12657614052295685}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.755584716796875, 'dpo_reward_mean_target': 2.669403076171875, 'standard deviation': 3.0, 'reward_a1': tensor([1.1522], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0150529146194458, 'numerator': 0.11701761186122894, 'denominator': 0.11528227478265762}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.3603515625, 'dpo_reward_mean_target': 4.0421295166015625, 'standard deviation': 3.0, 'reward_a1': tensor([4.5934], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2971211671829224, 'numerator': 0.13075433671474457, 'denominator': 0.10080348700284958}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.45589256286621094, 'dpo_reward_mean_target': 0.2243671417236328, 'standard deviation': 3.0, 'reward_a1': tensor([1.5235], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9700164794921875, 'numerator': 0.1210787296295166, 'denominator': 0.12482131272554398}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.493621826171875, 'dpo_reward_mean_target': 4.629547119140625, 'standard deviation': 3.0, 'reward_a1': tensor([4.6155], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0008147954940796, 'numerator': 0.13297928869724274, 'denominator': 0.13287103176116943}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.00885009765625, 'dpo_reward_mean_target': 3.99896240234375, 'standard deviation': 3.0, 'reward_a1': tensor([1.7245], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0025074481964111, 'numerator': 0.09976504743099213, 'denominator': 0.09951551258563995}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.013153076171875, 'dpo_reward_mean_target': 0.25083160400390625, 'standard deviation': 3.0, 'reward_a1': tensor([2.9343], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0768128633499146, 'numerator': 0.08913442492485046, 'denominator': 0.08277615159749985}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.657684326171875, 'dpo_reward_mean_target': 2.1142730712890625, 'standard deviation': 3.0, 'reward_a1': tensor([12.1609], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6841609477996826, 'numerator': 0.00048808433348312974, 'denominator': 0.0002898086095228791}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.551300048828125, 'dpo_reward_mean_target': 0.3351726531982422, 'standard deviation': 3.0, 'reward_a1': tensor([4.8034], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9005894064903259, 'numerator': 0.043861281126737595, 'denominator': 0.04870286211371422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.45013999938964844, 'dpo_reward_mean_target': 0.15105247497558594, 'standard deviation': 3.0, 'reward_a1': tensor([0.9954], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9771751165390015, 'numerator': 0.1278168261051178, 'denominator': 0.13080237805843353}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.31386566162109375, 'dpo_reward_mean_target': 0.12743568420410156, 'standard deviation': 3.0, 'reward_a1': tensor([1.8555], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9667011499404907, 'numerator': 0.11265110969543457, 'denominator': 0.11653147637844086}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16423606872558594, 'dpo_reward_mean_target': -0.16423606872558594, 'standard deviation': 3.0, 'reward_a1': tensor([4.9917], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.030366310849785805, 'denominator': 0.030366310849785805}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3604011535644531, 'dpo_reward_mean_target': 0.4158782958984375, 'standard deviation': 3.0, 'reward_a1': tensor([6.6598], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0394160747528076, 'numerator': 0.015245604328811169, 'denominator': 0.014667470939457417}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.22125244140625, 'dpo_reward_mean_target': 0.3006019592285156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2335], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9956506490707397, 'numerator': 0.13089019060134888, 'denominator': 0.131461963057518}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.50201416015625, 'dpo_reward_mean_target': 1.263092041015625, 'standard deviation': 3.0, 'reward_a1': tensor([1.4606], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0500993728637695, 'numerator': 0.13269279897212982, 'denominator': 0.1263621300458908}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.32584571838378906, 'dpo_reward_mean_target': 0.04113578796386719, 'standard deviation': 3.0, 'reward_a1': tensor([0.2596], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0165261030197144, 'numerator': 0.13262858986854553, 'denominator': 0.13047239184379578}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5995883941650391, 'dpo_reward_mean_target': 0.5995883941650391, 'standard deviation': 3.0, 'reward_a1': tensor([0.3604], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13255876302719116, 'denominator': 0.13255876302719116}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1579132080078125, 'dpo_reward_mean_target': -0.1679859161376953, 'standard deviation': 3.0, 'reward_a1': tensor([0.7351], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9735559821128845, 'numerator': 0.12709036469459534, 'denominator': 0.13054242730140686}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0530548095703125, 'dpo_reward_mean_target': 3.05963134765625, 'standard deviation': 3.0, 'reward_a1': tensor([1.1538], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8548206686973572, 'numerator': 0.10868072509765625, 'denominator': 0.1271386295557022}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.258758544921875, 'dpo_reward_mean_target': 2.399688720703125, 'standard deviation': 3.0, 'reward_a1': tensor([2.6935], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0057207345962524, 'numerator': 0.13234449923038483, 'denominator': 0.13159169256687164}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0041599273681640625, 'dpo_reward_mean_target': -0.0041599273681640625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0042], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.683563232421875, 'dpo_reward_mean_target': 1.4155120849609375, 'standard deviation': 3.0, 'reward_a1': tensor([4.9762], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9029768109321594, 'numerator': 0.06574823707342148, 'denominator': 0.0728127658367157}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3751678466796875, 'dpo_reward_mean_target': 0.306365966796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6732], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0550462007522583, 'numerator': 0.1319902390241623, 'denominator': 0.12510375678539276}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6354217529296875, 'dpo_reward_mean_target': 0.873138427734375, 'standard deviation': 3.0, 'reward_a1': tensor([8.0970], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.560142457485199, 'numerator': 0.0073235551826655865, 'denominator': 0.013074451126158237}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.20848846435546875, 'dpo_reward_mean_target': 0.20848846435546875, 'standard deviation': 3.0, 'reward_a1': tensor([1.2436], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12529604136943817, 'denominator': 0.12529604136943817}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6455841064453125, 'dpo_reward_mean_target': 0.5933914184570312, 'standard deviation': 3.0, 'reward_a1': tensor([4.1465], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9797537922859192, 'numerator': 0.06594579666852951, 'denominator': 0.06730853766202927}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.9429187774658203, 'dpo_reward_mean_target': -1.5944957733154297, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9547], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.032013177871704, 'numerator': 0.12999117374420166, 'denominator': 0.12595883011817932}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.4294281005859375, 'dpo_reward_mean_target': 2.261505126953125, 'standard deviation': 3.0, 'reward_a1': tensor([3.8961], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9714835286140442, 'numerator': 0.1146373301744461, 'denominator': 0.11800234019756317}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10094833374023438, 'dpo_reward_mean_target': 0.10094833374023438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1595], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13248050212860107, 'denominator': 0.13248050212860107}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1771240234375, 'dpo_reward_mean_target': -0.35323333740234375, 'standard deviation': 3.0, 'reward_a1': tensor([2.4398], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8616030216217041, 'numerator': 0.08621348440647125, 'denominator': 0.10006172209978104}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5783843994140625, 'dpo_reward_mean_target': 0.6811904907226562, 'standard deviation': 3.0, 'reward_a1': tensor([1.6335], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0115307569503784, 'numerator': 0.1264474093914032, 'denominator': 0.12500599026679993}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.21028709411621094, 'dpo_reward_mean_target': 0.21028709411621094, 'standard deviation': 3.0, 'reward_a1': tensor([3.4337], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.07466159015893936, 'denominator': 0.07466159015893936}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4816131591796875, 'dpo_reward_mean_target': 0.24703598022460938, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0505], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0108706951141357, 'numerator': 0.13232840597629547, 'denominator': 0.1309053748846054}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08363914489746094, 'dpo_reward_mean_target': 0.08363914489746094, 'standard deviation': 3.0, 'reward_a1': tensor([0.9202], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12790974974632263, 'denominator': 0.12790974974632263}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.011693954467773438, 'dpo_reward_mean_target': -0.011693954467773438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0117], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04659271240234375, 'dpo_reward_mean_target': 0.3266754150390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.7779], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0185712575912476, 'numerator': 0.1314849704504013, 'denominator': 0.1290876567363739}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.7373809814453125, 'dpo_reward_mean_target': 1.495086669921875, 'standard deviation': 3.0, 'reward_a1': tensor([8.1873], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.4325714111328125, 'numerator': 0.011046307161450386, 'denominator': 0.02553637884557247}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02936553955078125, 'dpo_reward_mean_target': -0.23335647583007812, 'standard deviation': 3.0, 'reward_a1': tensor([0.4976], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9858461618423462, 'numerator': 0.12909184396266937, 'denominator': 0.13094522058963776}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.461395263671875, 'dpo_reward_mean_target': 1.0304641723632812, 'standard deviation': 3.0, 'reward_a1': tensor([1.1957], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0024076700210571, 'numerator': 0.13277919590473175, 'denominator': 0.13246028125286102}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23938941955566406, 'dpo_reward_mean_target': 0.23938941955566406, 'standard deviation': 3.0, 'reward_a1': tensor([0.5319], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323501020669937, 'denominator': 0.1323501020669937}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0025148391723632812, 'dpo_reward_mean_target': 0.27172088623046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0564], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9976198077201843, 'numerator': 0.13263867795467377, 'denominator': 0.1329551339149475}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4818458557128906, 'dpo_reward_mean_target': 0.4818458557128906, 'standard deviation': 3.0, 'reward_a1': tensor([0.4928], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329798549413681, 'denominator': 0.1329798549413681}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1060409545898438, 'dpo_reward_mean_target': 1.5007171630859375, 'standard deviation': 3.0, 'reward_a1': tensor([2.7735], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0665943622589111, 'numerator': 0.12153464555740356, 'denominator': 0.11394645273685455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4085674285888672, 'dpo_reward_mean_target': -0.1263256072998047, 'standard deviation': 3.0, 'reward_a1': tensor([0.9491], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0388882160186768, 'numerator': 0.12470515817403793, 'denominator': 0.12003713101148605}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16631507873535156, 'dpo_reward_mean_target': -0.16631507873535156, 'standard deviation': 3.0, 'reward_a1': tensor([1.7944], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10740633308887482, 'denominator': 0.10740633308887482}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.018566131591796875, 'dpo_reward_mean_target': 0.018566131591796875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1383], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327991783618927, 'denominator': 0.1327991783618927}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00296783447265625, 'dpo_reward_mean_target': -0.00296783447265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0030], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0296478271484375, 'dpo_reward_mean_target': -0.0296478271484375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0296], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.764862060546875, 'dpo_reward_mean_target': 1.2369823455810547, 'standard deviation': 3.0, 'reward_a1': tensor([0.4752], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9727970361709595, 'numerator': 0.12876158952713013, 'denominator': 0.1323622316122055}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.27161598205566406, 'dpo_reward_mean_target': 0.28411102294921875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0845], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9944109320640564, 'numerator': 0.13198061287403107, 'denominator': 0.132722407579422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.015203475952148438, 'dpo_reward_mean_target': -0.015203475952148438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0152], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19866180419921875, 'dpo_reward_mean_target': 0.19866180419921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1012], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329105794429779, 'denominator': 0.1329105794429779}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.1021194458007812, 'dpo_reward_mean_target': 3.0278778076171875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0005], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7680590152740479, 'numerator': 0.07989490032196045, 'denominator': 0.1040218248963356}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8433265686035156, 'dpo_reward_mean_target': 0.6310577392578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3963], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.008073329925537, 'numerator': 0.1325741559267044, 'denominator': 0.13151241838932037}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07879257202148438, 'dpo_reward_mean_target': -0.07879257202148438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0949], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297884166240692, 'denominator': 0.13297884166240692}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.46778106689453125, 'dpo_reward_mean_target': -0.5172748565673828, 'standard deviation': 3.0, 'reward_a1': tensor([0.2065], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9750102162361145, 'numerator': 0.12916670739650726, 'denominator': 0.1324772834777832}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.448577880859375, 'dpo_reward_mean_target': 4.3597412109375, 'standard deviation': 3.0, 'reward_a1': tensor([8.8659], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.1893272399902344, 'numerator': 0.043039578944444656, 'denominator': 0.013494877144694328}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2924957275390625, 'dpo_reward_mean_target': 2.1087799072265625, 'standard deviation': 3.0, 'reward_a1': tensor([3.7773], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.207260251045227, 'numerator': 0.1139240488409996, 'denominator': 0.09436577558517456}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0722503662109375, 'dpo_reward_mean_target': 2.10943603515625, 'standard deviation': 3.0, 'reward_a1': tensor([1.0089], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9351391196250916, 'numerator': 0.12432780861854553, 'denominator': 0.13295114040374756}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.72589111328125, 'dpo_reward_mean_target': 1.0910491943359375, 'standard deviation': 3.0, 'reward_a1': tensor([1.8581], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0392818450927734, 'numerator': 0.12870419025421143, 'denominator': 0.12383954972028732}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.60968017578125, 'dpo_reward_mean_target': 1.7183914184570312, 'standard deviation': 3.0, 'reward_a1': tensor([1.6057], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9992952942848206, 'numerator': 0.13288693130016327, 'denominator': 0.13298064470291138}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3179779052734375, 'dpo_reward_mean_target': 0.7985000610351562, 'standard deviation': 3.0, 'reward_a1': tensor([6.9478], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4065591096878052, 'numerator': 0.016271959990262985, 'denominator': 0.011568628251552582}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3507518768310547, 'dpo_reward_mean_target': 0.3507518768310547, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1441], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13118408620357513, 'denominator': 0.13118408620357513}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.05816650390625, 'dpo_reward_mean_target': 1.3466033935546875, 'standard deviation': 3.0, 'reward_a1': tensor([2.8145], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0530222654342651, 'numerator': 0.11797884851694107, 'denominator': 0.11203832179307938}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4450454711914062, 'dpo_reward_mean_target': 2.2840652465820312, 'standard deviation': 3.0, 'reward_a1': tensor([3.1893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1314455270767212, 'numerator': 0.12706294655799866, 'denominator': 0.11230142414569855}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5075836181640625, 'dpo_reward_mean_target': 0.5075836181640625, 'standard deviation': 3.0, 'reward_a1': tensor([2.3534], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1100483164191246, 'denominator': 0.1100483164191246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.42501258850097656, 'dpo_reward_mean_target': 0.2098541259765625, 'standard deviation': 3.0, 'reward_a1': tensor([3.2106], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2637251615524292, 'numerator': 0.08063803613185883, 'denominator': 0.06380978971719742}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.46761322021484375, 'dpo_reward_mean_target': 1.7199783325195312, 'standard deviation': 3.0, 'reward_a1': tensor([0.5337], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.925018310546875, 'numerator': 0.12297984212636948, 'denominator': 0.13294854760169983}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12614822387695312, 'dpo_reward_mean_target': -0.12614822387695312, 'standard deviation': 3.0, 'reward_a1': tensor([0.8653], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12591391801834106, 'denominator': 0.12591391801834106}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7634201049804688, 'dpo_reward_mean_target': 0.7634201049804688, 'standard deviation': 3.0, 'reward_a1': tensor([1.4424], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1296186000108719, 'denominator': 0.1296186000108719}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.602142333984375, 'dpo_reward_mean_target': 3.001434326171875, 'standard deviation': 3.0, 'reward_a1': tensor([10.7042], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4199122190475464, 'numerator': 0.00492330826818943, 'denominator': 0.0034673328045755625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7711944580078125, 'dpo_reward_mean_target': -0.1463642120361328, 'standard deviation': 3.0, 'reward_a1': tensor([1.3418], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9003767967224121, 'numerator': 0.1175868809223175, 'denominator': 0.13059741258621216}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.030099868774414062, 'dpo_reward_mean_target': 0.030099868774414062, 'standard deviation': 3.0, 'reward_a1': tensor([0.0301], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17663192749023438, 'dpo_reward_mean_target': -0.17663192749023438, 'standard deviation': 3.0, 'reward_a1': tensor([1.0639], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12208428978919983, 'denominator': 0.12208428978919983}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12061691284179688, 'dpo_reward_mean_target': 0.34820556640625, 'standard deviation': 3.0, 'reward_a1': tensor([3.0968], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0750682353973389, 'numerator': 0.08739869296550751, 'denominator': 0.08129595220088959}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9253997802734375, 'dpo_reward_mean_target': -0.10379791259765625, 'standard deviation': 3.0, 'reward_a1': tensor([1.2888], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9044716954231262, 'numerator': 0.11939811706542969, 'denominator': 0.13200868666172028}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06280708312988281, 'dpo_reward_mean_target': -0.06280708312988281, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4444], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13190951943397522, 'denominator': 0.13190951943397522}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9745407104492188, 'dpo_reward_mean_target': 2.1986007690429688, 'standard deviation': 3.0, 'reward_a1': tensor([10.6633], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2380324602127075, 'numerator': 0.002483224030584097, 'denominator': 0.00200578267686069}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1536407470703125, 'dpo_reward_mean_target': -0.4608917236328125, 'standard deviation': 3.0, 'reward_a1': tensor([1.2368], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9486513733863831, 'numerator': 0.11330462247133255, 'denominator': 0.119437575340271}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20885848999023438, 'dpo_reward_mean_target': -0.04351806640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3166], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0081677436828613, 'numerator': 0.1320260912179947, 'denominator': 0.1309564709663391}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2374954223632812, 'dpo_reward_mean_target': 1.2374954223632812, 'standard deviation': 3.0, 'reward_a1': tensor([1.5356], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13232578337192535, 'denominator': 0.13232578337192535}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.2187271118164062, 'dpo_reward_mean_target': 3.278411865234375, 'standard deviation': 3.0, 'reward_a1': tensor([5.4839], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3799678087234497, 'numerator': 0.10149232298135757, 'denominator': 0.0735468789935112}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.44373321533203125, 'dpo_reward_mean_target': 3.0129241943359375, 'standard deviation': 3.0, 'reward_a1': tensor([5.9818], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.367677688598633, 'numerator': 0.08149337023496628, 'denominator': 0.024198684841394424}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9290695190429688, 'dpo_reward_mean_target': 2.0274734497070312, 'standard deviation': 3.0, 'reward_a1': tensor([7.1776], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.058495044708252, 'numerator': 0.030467258766293526, 'denominator': 0.028783563524484634}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2716560363769531, 'dpo_reward_mean_target': -0.2716560363769531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2717], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3308258056640625, 'dpo_reward_mean_target': 0.8203964233398438, 'standard deviation': 3.0, 'reward_a1': tensor([0.8503], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.01285982131958, 'numerator': 0.13297414779663086, 'denominator': 0.13128583133220673}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.22650146484375, 'dpo_reward_mean_target': 1.9033660888671875, 'standard deviation': 3.0, 'reward_a1': tensor([2.1364], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9974379539489746, 'numerator': 0.1325802206993103, 'denominator': 0.1329207718372345}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07056045532226562, 'dpo_reward_mean_target': -0.07056045532226562, 'standard deviation': 3.0, 'reward_a1': tensor([0.2749], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13210207223892212, 'denominator': 0.13210207223892212}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1521587371826172, 'dpo_reward_mean_target': -0.11933326721191406, 'standard deviation': 3.0, 'reward_a1': tensor([0.0079], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0005241632461548, 'numerator': 0.1328611522912979, 'denominator': 0.13279154896736145}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4638671875, 'dpo_reward_mean_target': 2.254852294921875, 'standard deviation': 3.0, 'reward_a1': tensor([2.9068], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3606163263320923, 'numerator': 0.12987744808197021, 'denominator': 0.0954548642039299}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12652015686035156, 'dpo_reward_mean_target': 0.0673370361328125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1654], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0260732173919678, 'numerator': 0.12436512112617493, 'denominator': 0.1212049126625061}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.001758575439453125, 'dpo_reward_mean_target': 0.001758575439453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0018], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1077156066894531, 'dpo_reward_mean_target': 1.2920074462890625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7413], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.011149525642395, 'numerator': 0.13149741291999817, 'denominator': 0.13004744052886963}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2913532257080078, 'dpo_reward_mean_target': 0.2913532257080078, 'standard deviation': 3.0, 'reward_a1': tensor([0.2372], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13295909762382507, 'denominator': 0.13295909762382507}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03986549377441406, 'dpo_reward_mean_target': -0.03986549377441406, 'standard deviation': 3.0, 'reward_a1': tensor([0.8853], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1268048882484436, 'denominator': 0.1268048882484436}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.3409042358398438, 'dpo_reward_mean_target': 1.7332000732421875, 'standard deviation': 3.0, 'reward_a1': tensor([6.4554], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7420523166656494, 'numerator': 0.03852803632616997, 'denominator': 0.05192091688513756}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.431396484375, 'dpo_reward_mean_target': 4.4584503173828125, 'standard deviation': 3.0, 'reward_a1': tensor([6.1511], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.839534044265747, 'numerator': 0.11341230571269989, 'denominator': 0.061652734875679016}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.025279998779296875, 'dpo_reward_mean_target': -0.025279998779296875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0253], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.1475830078125, 'dpo_reward_mean_target': 0.31179046630859375, 'standard deviation': 3.0, 'reward_a1': tensor([2.7418], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7346009612083435, 'numerator': 0.09579043090343475, 'denominator': 0.13039791584014893}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.28766822814941406, 'dpo_reward_mean_target': -0.09671401977539062, 'standard deviation': 3.0, 'reward_a1': tensor([0.1055], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.006335735321045, 'numerator': 0.1326790750026703, 'denominator': 0.13184374570846558}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0395660400390625, 'dpo_reward_mean_target': 3.841339111328125, 'standard deviation': 3.0, 'reward_a1': tensor([15.1019], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 11.412296295166016, 'numerator': 0.00011598564015002921, 'denominator': 1.016321675706422e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6949386596679688, 'dpo_reward_mean_target': 1.2879562377929688, 'standard deviation': 3.0, 'reward_a1': tensor([8.2183], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6099143028259277, 'numerator': 0.009225020185112953, 'denominator': 0.005730131175369024}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0126190185546875, 'dpo_reward_mean_target': -0.0126190185546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0148], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.333648681640625, 'dpo_reward_mean_target': -0.1126861572265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0600], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0069775581359863, 'numerator': 0.13276052474975586, 'denominator': 0.13184060156345367}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09227752685546875, 'dpo_reward_mean_target': -0.09227752685546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1041], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326962113380432, 'denominator': 0.1326962113380432}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.48065757751464844, 'dpo_reward_mean_target': -0.48065757751464844, 'standard deviation': 3.0, 'reward_a1': tensor([2.5583], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.07961070537567139, 'denominator': 0.07961070537567139}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0019378662109375, 'dpo_reward_mean_target': 2.3082809448242188, 'standard deviation': 3.0, 'reward_a1': tensor([0.8139], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9553744792938232, 'numerator': 0.11746493726968765, 'denominator': 0.12295172363519669}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7848358154296875, 'dpo_reward_mean_target': 2.2335968017578125, 'standard deviation': 3.0, 'reward_a1': tensor([5.8468], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.210882544517517, 'numerator': 0.06438785046339035, 'denominator': 0.05317431688308716}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2937583923339844, 'dpo_reward_mean_target': 1.141326904296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3756], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9923527836799622, 'numerator': 0.12871922552585602, 'denominator': 0.12971115112304688}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13079071044921875, 'dpo_reward_mean_target': 0.34606170654296875, 'standard deviation': 3.0, 'reward_a1': tensor([2.5865], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1403534412384033, 'numerator': 0.10061918944120407, 'denominator': 0.08823508769273758}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9796676635742188, 'dpo_reward_mean_target': 1.0739994049072266, 'standard deviation': 3.0, 'reward_a1': tensor([3.0512], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0214442014694214, 'numerator': 0.10702093690633774, 'denominator': 0.10477413982152939}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09685134887695312, 'dpo_reward_mean_target': 0.09685134887695312, 'standard deviation': 3.0, 'reward_a1': tensor([0.0563], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296863436698914, 'denominator': 0.13296863436698914}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1875324249267578, 'dpo_reward_mean_target': -0.1875324249267578, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2578], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294430077075958, 'denominator': 0.13294430077075958}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9846649169921875, 'dpo_reward_mean_target': 1.5629501342773438, 'standard deviation': 3.0, 'reward_a1': tensor([8.1266], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.742541491985321, 'numerator': 0.012142998166382313, 'denominator': 0.016353292390704155}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.061916351318359375, 'dpo_reward_mean_target': 0.21242713928222656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0804], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9952676296234131, 'numerator': 0.13234896957874298, 'denominator': 0.13297827541828156}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2627067565917969, 'dpo_reward_mean_target': 1.2627067565917969, 'standard deviation': 3.0, 'reward_a1': tensor([0.4938], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12868443131446838, 'denominator': 0.12868443131446838}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.53070068359375, 'dpo_reward_mean_target': 1.190399169921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3142], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9607468247413635, 'numerator': 0.12742869555950165, 'denominator': 0.13263504207134247}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5646514892578125, 'dpo_reward_mean_target': 0.47953224182128906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2050], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0068998336791992, 'numerator': 0.1295638084411621, 'denominator': 0.12867596745491028}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7676239013671875, 'dpo_reward_mean_target': 2.4883880615234375, 'standard deviation': 3.0, 'reward_a1': tensor([6.2856], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3950903415679932, 'numerator': 0.059690091758966446, 'denominator': 0.042785827070474625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.789031982421875, 'dpo_reward_mean_target': 1.753143310546875, 'standard deviation': 3.0, 'reward_a1': tensor([3.6379], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2885817289352417, 'numerator': 0.10916408896446228, 'denominator': 0.08471646159887314}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4142494201660156, 'dpo_reward_mean_target': 3.8602142333984375, 'standard deviation': 3.0, 'reward_a1': tensor([4.0883], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.1107213497161865, 'numerator': 0.13259702920913696, 'denominator': 0.06282071769237518}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.203033447265625, 'dpo_reward_mean_target': 1.9671783447265625, 'standard deviation': 3.0, 'reward_a1': tensor([3.1286], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1400238275527954, 'numerator': 0.12337980419397354, 'denominator': 0.10822563618421555}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.7409515380859375, 'dpo_reward_mean_target': 1.9123764038085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.9535], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.134759783744812, 'numerator': 0.12635833024978638, 'denominator': 0.11135249584913254}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09711265563964844, 'dpo_reward_mean_target': -0.09711265563964844, 'standard deviation': 3.0, 'reward_a1': tensor([0.3344], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1316124051809311, 'denominator': 0.1316124051809311}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3038368225097656, 'dpo_reward_mean_target': 0.5303230285644531, 'standard deviation': 3.0, 'reward_a1': tensor([0.3556], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9984537363052368, 'numerator': 0.13275538384914398, 'denominator': 0.1329609751701355}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.42182350158691406, 'dpo_reward_mean_target': 0.42182350158691406, 'standard deviation': 3.0, 'reward_a1': tensor([1.9795], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11621168255805969, 'denominator': 0.11621168255805969}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0005435943603515625, 'dpo_reward_mean_target': -0.0005435943603515625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0005], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07311439514160156, 'dpo_reward_mean_target': 0.07311439514160156, 'standard deviation': 3.0, 'reward_a1': tensor([0.0731], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0500335693359375, 'dpo_reward_mean_target': 0.0500335693359375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0859], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297122716903687, 'denominator': 0.13297122716903687}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6500282287597656, 'dpo_reward_mean_target': 1.6500282287597656, 'standard deviation': 3.0, 'reward_a1': tensor([1.2884], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13201825320720673, 'denominator': 0.13201825320720673}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.3225555419921875, 'dpo_reward_mean_target': 2.8303070068359375, 'standard deviation': 3.0, 'reward_a1': tensor([6.0390], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.215734839439392, 'numerator': 0.07505408674478531, 'denominator': 0.061735574156045914}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6816329956054688, 'dpo_reward_mean_target': 0.892059326171875, 'standard deviation': 3.0, 'reward_a1': tensor([6.9209], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.154212474822998, 'numerator': 0.017653463408350945, 'denominator': 0.015294812619686127}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2688846588134766, 'dpo_reward_mean_target': 1.2688846588134766, 'standard deviation': 3.0, 'reward_a1': tensor([0.6074], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12978743016719818, 'denominator': 0.12978743016719818}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03542518615722656, 'dpo_reward_mean_target': 0.03542518615722656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2995], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13215447962284088, 'denominator': 0.13215447962284088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5732669830322266, 'dpo_reward_mean_target': -0.5732669830322266, 'standard deviation': 3.0, 'reward_a1': tensor([2.5106], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.0784037783741951, 'denominator': 0.0784037783741951}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13817214965820312, 'dpo_reward_mean_target': -0.19332313537597656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1382], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9998312592506409, 'numerator': 0.1329583078622818, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0057048797607421875, 'dpo_reward_mean_target': 0.024278640747070312, 'standard deviation': 3.0, 'reward_a1': tensor([1.8586], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0061800479888916, 'numerator': 0.11030719429254532, 'denominator': 0.10962968319654465}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6136856079101562, 'dpo_reward_mean_target': 1.6136856079101562, 'standard deviation': 3.0, 'reward_a1': tensor([1.4296], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13273051381111145, 'denominator': 0.13273051381111145}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5459556579589844, 'dpo_reward_mean_target': 0.5459556579589844, 'standard deviation': 3.0, 'reward_a1': tensor([0.3795], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13277629017829895, 'denominator': 0.13277629017829895}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09544563293457031, 'dpo_reward_mean_target': 0.27922630310058594, 'standard deviation': 3.0, 'reward_a1': tensor([2.7072], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1150221824645996, 'numerator': 0.09584283828735352, 'denominator': 0.08595599234104156}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08881378173828125, 'dpo_reward_mean_target': 0.27574920654296875, 'standard deviation': 3.0, 'reward_a1': tensor([3.2597], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0660064220428467, 'numerator': 0.0810890942811966, 'denominator': 0.07606811076402664}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7150650024414062, 'dpo_reward_mean_target': 0.7543220520019531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3631], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9952230453491211, 'numerator': 0.12406888604164124, 'denominator': 0.12466440349817276}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7045516967773438, 'dpo_reward_mean_target': 1.439910888671875, 'standard deviation': 3.0, 'reward_a1': tensor([3.1523], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9546013474464417, 'numerator': 0.11298982053995132, 'denominator': 0.11836335808038712}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.2092437744140625, 'dpo_reward_mean_target': 3.8908843994140625, 'standard deviation': 3.0, 'reward_a1': tensor([17.0063], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 13.568048477172852, 'numerator': 9.407905054104049e-06, 'denominator': 6.933867666703009e-07}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6189594268798828, 'dpo_reward_mean_target': -0.16343307495117188, 'standard deviation': 3.0, 'reward_a1': tensor([1.5277], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.101993441581726, 'numerator': 0.11344605684280396, 'denominator': 0.1029462218284607}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.78704833984375, 'dpo_reward_mean_target': 1.9294281005859375, 'standard deviation': 3.0, 'reward_a1': tensor([4.9105], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.5697165727615356, 'numerator': 0.08116518706083298, 'denominator': 0.05170690640807152}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0429534912109375, 'dpo_reward_mean_target': -0.0429534912109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1102], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294734060764313, 'denominator': 0.13294734060764313}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.441925048828125, 'dpo_reward_mean_target': 0.8626861572265625, 'standard deviation': 3.0, 'reward_a1': tensor([3.4799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1413270235061646, 'numerator': 0.09089196473360062, 'denominator': 0.0796370878815651}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.3222808837890625, 'dpo_reward_mean_target': 1.9476776123046875, 'standard deviation': 3.0, 'reward_a1': tensor([8.4187], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7698614001274109, 'numerator': 0.012985887005925179, 'denominator': 0.016867823898792267}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.9314651489257812, 'dpo_reward_mean_target': 2.9681320190429688, 'standard deviation': 3.0, 'reward_a1': tensor([0.0494], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9882530570030212, 'numerator': 0.08284110575914383, 'denominator': 0.08382580429315567}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8061294555664062, 'dpo_reward_mean_target': 0.8061294555664062, 'standard deviation': 3.0, 'reward_a1': tensor([6.7006], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.01929650828242302, 'denominator': 0.01929650828242302}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.040302276611328125, 'dpo_reward_mean_target': 0.040302276611328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0403], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2598609924316406, 'dpo_reward_mean_target': 0.2598609924316406, 'standard deviation': 3.0, 'reward_a1': tensor([0.9632], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12937603890895844, 'denominator': 0.12937603890895844}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16816329956054688, 'dpo_reward_mean_target': 0.316131591796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.3242], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0695878267288208, 'numerator': 0.12568160891532898, 'denominator': 0.11750470846891403}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3587303161621094, 'dpo_reward_mean_target': 0.3702392578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.7114], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9111739993095398, 'numerator': 0.12033411115407944, 'denominator': 0.13206490874290466}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0927886962890625, 'dpo_reward_mean_target': -0.0927886962890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6887], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13038279116153717, 'denominator': 0.13038279116153717}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0828094482421875, 'dpo_reward_mean_target': 0.05689430236816406, 'standard deviation': 3.0, 'reward_a1': tensor([1.1661], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0184705257415771, 'numerator': 0.12419507652521133, 'denominator': 0.12194273620843887}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5501136779785156, 'dpo_reward_mean_target': 0.6034698486328125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1543], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0034292936325073, 'numerator': 0.13075822591781616, 'denominator': 0.13031135499477386}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03971672058105469, 'dpo_reward_mean_target': -0.03971672058105469, 'standard deviation': 3.0, 'reward_a1': tensor([0.4064], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13151851296424866, 'denominator': 0.13151851296424866}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.15599822998046875, 'dpo_reward_mean_target': 1.3157386779785156, 'standard deviation': 3.0, 'reward_a1': tensor([2.5328], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.260554313659668, 'numerator': 0.12247594445943832, 'denominator': 0.09716038405895233}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.25421142578125, 'dpo_reward_mean_target': 1.7075233459472656, 'standard deviation': 3.0, 'reward_a1': tensor([2.9786], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0006310939788818, 'numerator': 0.12156409025192261, 'denominator': 0.12148742377758026}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0843658447265625, 'dpo_reward_mean_target': 0.052032470703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9989109039306641, 'numerator': 0.13222898542881012, 'denominator': 0.1323731541633606}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.052433013916015625, 'dpo_reward_mean_target': 0.12534332275390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.9688], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0185869932174683, 'numerator': 0.1278274953365326, 'denominator': 0.12549492716789246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.43137550354003906, 'dpo_reward_mean_target': 0.7727394104003906, 'standard deviation': 3.0, 'reward_a1': tensor([0.7488], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.080418586730957, 'numerator': 0.13297653198242188, 'denominator': 0.12307871133089066}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8807601928710938, 'dpo_reward_mean_target': 3.0144195556640625, 'standard deviation': 3.0, 'reward_a1': tensor([3.6972], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1704685688018799, 'numerator': 0.1295807957649231, 'denominator': 0.11070848256349564}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.012346267700195312, 'dpo_reward_mean_target': 0.012346267700195312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2300], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13254772126674652, 'denominator': 0.13254772126674652}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4181060791015625, 'dpo_reward_mean_target': 0.6329116821289062, 'standard deviation': 3.0, 'reward_a1': tensor([1.2015], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0162655115127563, 'numerator': 0.13061347603797913, 'denominator': 0.12852297723293304}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.642608642578125, 'dpo_reward_mean_target': 1.962127685546875, 'standard deviation': 3.0, 'reward_a1': tensor([9.6483], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3212097883224487, 'numerator': 0.004993426147848368, 'denominator': 0.003779434598982334}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.34197998046875, 'dpo_reward_mean_target': 2.70269775390625, 'standard deviation': 3.0, 'reward_a1': tensor([12.2689], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.2032894641160965, 'numerator': 0.0008237974834628403, 'denominator': 0.004052337259054184}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4192962646484375, 'dpo_reward_mean_target': -0.3585357666015625, 'standard deviation': 3.0, 'reward_a1': tensor([6.0975], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.591936469078064, 'numerator': 0.013126379810273647, 'denominator': 0.022175317630171776}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.050922393798828125, 'dpo_reward_mean_target': 0.050922393798828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0249], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132975772023201, 'denominator': 0.132975772023201}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3304557800292969, 'dpo_reward_mean_target': 0.3304557800292969, 'standard deviation': 3.0, 'reward_a1': tensor([0.1278], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326778382062912, 'denominator': 0.1326778382062912}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03131866455078125, 'dpo_reward_mean_target': 0.03131866455078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1134], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13282614946365356, 'denominator': 0.13282614946365356}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3827991485595703, 'dpo_reward_mean_target': 0.8254680633544922, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1687], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9627009034156799, 'numerator': 0.12587560713291168, 'denominator': 0.1307525634765625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2705211639404297, 'dpo_reward_mean_target': -0.2705211639404297, 'standard deviation': 3.0, 'reward_a1': tensor([0.5049], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12861210107803345, 'denominator': 0.12861210107803345}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8331146240234375, 'dpo_reward_mean_target': 1.340728759765625, 'standard deviation': 3.0, 'reward_a1': tensor([3.8069], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.3589367866516113, 'numerator': 0.09485138207674026, 'denominator': 0.04020937904715538}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.051548004150390625, 'dpo_reward_mean_target': 0.5149135589599609, 'standard deviation': 3.0, 'reward_a1': tensor([0.9196], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0333046913146973, 'numerator': 0.13177649676799774, 'denominator': 0.12752917408943176}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.450225830078125, 'dpo_reward_mean_target': 4.75848388671875, 'standard deviation': 3.0, 'reward_a1': tensor([10.1956], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.4240241050720215, 'numerator': 0.025735829025506973, 'denominator': 0.010616986081004143}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0605621337890625, 'dpo_reward_mean_target': -0.0605621337890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1051], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296610116958618, 'denominator': 0.13296610116958618}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.38776397705078125, 'dpo_reward_mean_target': 0.38776397705078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0786], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13138388097286224, 'denominator': 0.13138388097286224}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1856155395507812, 'dpo_reward_mean_target': 0.964691162109375, 'standard deviation': 3.0, 'reward_a1': tensor([2.4150], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.967646062374115, 'numerator': 0.11831523478031158, 'denominator': 0.12227118760347366}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16947174072265625, 'dpo_reward_mean_target': -0.16947174072265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.6913], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12761805951595306, 'denominator': 0.12761805951595306}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2413787841796875, 'dpo_reward_mean_target': 1.050048828125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8268], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0164036750793457, 'numerator': 0.13261303305625916, 'denominator': 0.130472794175148}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4832611083984375, 'dpo_reward_mean_target': 0.9935722351074219, 'standard deviation': 3.0, 'reward_a1': tensor([2.8954], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1300981044769287, 'numerator': 0.10877259820699692, 'denominator': 0.09625057876110077}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.33091163635253906, 'dpo_reward_mean_target': -0.33091163635253906, 'standard deviation': 3.0, 'reward_a1': tensor([0.1467], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13130609691143036, 'denominator': 0.13130609691143036}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4310302734375, 'dpo_reward_mean_target': 2.01397705078125, 'standard deviation': 3.0, 'reward_a1': tensor([11.0061], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.824519157409668, 'numerator': 0.0014889549929648638, 'denominator': 0.0008160807774402201}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.252960205078125, 'dpo_reward_mean_target': 1.9388427734375, 'standard deviation': 3.0, 'reward_a1': tensor([1.5382], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0863759517669678, 'numerator': 0.13180005550384521, 'denominator': 0.12132085114717484}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.047252655029296875, 'dpo_reward_mean_target': -0.083465576171875, 'standard deviation': 3.0, 'reward_a1': tensor([2.3228], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9665708541870117, 'numerator': 0.09640125930309296, 'denominator': 0.099735327064991}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.016338348388671875, 'dpo_reward_mean_target': -0.016338348388671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3033], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323738396167755, 'denominator': 0.1323738396167755}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7210693359375, 'dpo_reward_mean_target': 1.2360649108886719, 'standard deviation': 3.0, 'reward_a1': tensor([0.9899], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0266813039779663, 'numerator': 0.13253401219844818, 'denominator': 0.12908972799777985}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0358123779296875, 'dpo_reward_mean_target': -0.0358123779296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.3242], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13202662765979767, 'denominator': 0.13202662765979767}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23794937133789062, 'dpo_reward_mean_target': -0.13071060180664062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1635], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0002481937408447, 'numerator': 0.1329728215932846, 'denominator': 0.13293983042240143}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11478233337402344, 'dpo_reward_mean_target': -0.11478233337402344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1148], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4261627197265625, 'dpo_reward_mean_target': 0.6249465942382812, 'standard deviation': 3.0, 'reward_a1': tensor([1.1987], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.014978289604187, 'numerator': 0.13057111203670502, 'denominator': 0.12864424288272858}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8461799621582031, 'dpo_reward_mean_target': 0.7644157409667969, 'standard deviation': 3.0, 'reward_a1': tensor([1.2477], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9959887266159058, 'numerator': 0.13126631081104279, 'denominator': 0.1317949742078781}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06731414794921875, 'dpo_reward_mean_target': 0.8522872924804688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4136], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9209389686584473, 'numerator': 0.12165411561727524, 'denominator': 0.13209791481494904}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9180526733398438, 'dpo_reward_mean_target': 2.515228271484375, 'standard deviation': 3.0, 'reward_a1': tensor([6.5802], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.370513677597046, 'numerator': 0.053101468831300735, 'denominator': 0.022400828078389168}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.053607940673828125, 'dpo_reward_mean_target': 1.2547454833984375, 'standard deviation': 3.0, 'reward_a1': tensor([4.3306], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.719854474067688, 'numerator': 0.0786188542842865, 'denominator': 0.04571250453591347}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3997211456298828, 'dpo_reward_mean_target': -0.3997211456298828, 'standard deviation': 3.0, 'reward_a1': tensor([0.1558], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13072027266025543, 'denominator': 0.13072027266025543}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6354408264160156, 'dpo_reward_mean_target': 1.5753860473632812, 'standard deviation': 3.0, 'reward_a1': tensor([0.4893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9376839399337769, 'numerator': 0.1245461255311966, 'denominator': 0.1328231394290924}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5120716094970703, 'dpo_reward_mean_target': -0.5258674621582031, 'standard deviation': 3.0, 'reward_a1': tensor([1.6016], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9967547059059143, 'numerator': 0.10341618210077286, 'denominator': 0.10375288873910904}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5826225280761719, 'dpo_reward_mean_target': 0.5826225280761719, 'standard deviation': 3.0, 'reward_a1': tensor([0.4270], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1328018605709076, 'denominator': 0.1328018605709076}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2643871307373047, 'dpo_reward_mean_target': 0.2643871307373047, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1986], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13140647113323212, 'denominator': 0.13140647113323212}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12769508361816406, 'dpo_reward_mean_target': -0.12769508361816406, 'standard deviation': 3.0, 'reward_a1': tensor([0.3565], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1312597393989563, 'denominator': 0.1312597393989563}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3586139678955078, 'dpo_reward_mean_target': -0.3586139678955078, 'standard deviation': 3.0, 'reward_a1': tensor([0.2036], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13066630065441132, 'denominator': 0.13066630065441132}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1758441925048828, 'dpo_reward_mean_target': -0.1758441925048828, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0488], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13286149501800537, 'denominator': 0.13286149501800537}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04941368103027344, 'dpo_reward_mean_target': -0.04941368103027344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0494], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0352020263671875, 'dpo_reward_mean_target': -0.0352020263671875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0352], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.42128753662109375, 'dpo_reward_mean_target': 0.6774711608886719, 'standard deviation': 3.0, 'reward_a1': tensor([4.5570], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1208374500274658, 'numerator': 0.057630326598882675, 'denominator': 0.05141720175743103}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7001018524169922, 'dpo_reward_mean_target': 0.4390602111816406, 'standard deviation': 3.0, 'reward_a1': tensor([1.6653], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9687198400497437, 'numerator': 0.12232407927513123, 'denominator': 0.12627394497394562}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0459136962890625, 'dpo_reward_mean_target': 0.9926834106445312, 'standard deviation': 3.0, 'reward_a1': tensor([4.5094], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9795693159103394, 'numerator': 0.06689475476741791, 'denominator': 0.06828996539115906}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.047733306884765625, 'dpo_reward_mean_target': 0.047733306884765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0477], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.17522048950195312, 'dpo_reward_mean_target': 0.17522048950195312, 'standard deviation': 3.0, 'reward_a1': tensor([1.1607], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12599577009677887, 'denominator': 0.12599577009677887}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5054550170898438, 'dpo_reward_mean_target': 0.5054550170898438, 'standard deviation': 3.0, 'reward_a1': tensor([2.9893], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09439261257648468, 'denominator': 0.09439261257648468}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.73162841796875, 'dpo_reward_mean_target': 2.2806854248046875, 'standard deviation': 3.0, 'reward_a1': tensor([4.4896], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.163586974143982, 'numerator': 0.10140538960695267, 'denominator': 0.08714895695447922}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.36682701110839844, 'dpo_reward_mean_target': 0.022336959838867188, 'standard deviation': 3.0, 'reward_a1': tensor([0.7257], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0395915508270264, 'numerator': 0.12937554717063904, 'denominator': 0.12444844096899033}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3766250610351562, 'dpo_reward_mean_target': 0.8712959289550781, 'standard deviation': 3.0, 'reward_a1': tensor([1.6190], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9725865125656128, 'numerator': 0.12891381978988647, 'denominator': 0.13254740834236145}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.050560951232910156, 'dpo_reward_mean_target': 0.050560951232910156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3137], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13200390338897705, 'denominator': 0.13200390338897705}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4866294860839844, 'dpo_reward_mean_target': -0.3798408508300781, 'standard deviation': 3.0, 'reward_a1': tensor([2.3936], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0341105461120605, 'numerator': 0.08673525601625443, 'denominator': 0.08387426286935806}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18749427795410156, 'dpo_reward_mean_target': 0.18749427795410156, 'standard deviation': 3.0, 'reward_a1': tensor([0.4662], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13240812718868256, 'denominator': 0.13240812718868256}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8031234741210938, 'dpo_reward_mean_target': -0.09661102294921875, 'standard deviation': 3.0, 'reward_a1': tensor([2.6111], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7979395985603333, 'numerator': 0.08848864585161209, 'denominator': 0.11089642345905304}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0019779205322265625, 'dpo_reward_mean_target': 0.49461936950683594, 'standard deviation': 3.0, 'reward_a1': tensor([0.1810], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9963240027427673, 'numerator': 0.1322561800479889, 'denominator': 0.13274414837360382}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.021821975708007812, 'dpo_reward_mean_target': 0.021821975708007812, 'standard deviation': 3.0, 'reward_a1': tensor([0.5263], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13111379742622375, 'denominator': 0.13111379742622375}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08167266845703125, 'dpo_reward_mean_target': 0.08167266845703125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3945], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13225989043712616, 'denominator': 0.13225989043712616}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02439117431640625, 'dpo_reward_mean_target': -0.02439117431640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.2787], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323036402463913, 'denominator': 0.1323036402463913}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.24761962890625, 'dpo_reward_mean_target': 4.6348724365234375, 'standard deviation': 3.0, 'reward_a1': tensor([14.7135], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 19.885080337524414, 'numerator': 0.00047092357999645174, 'denominator': 2.368225614191033e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11046791076660156, 'dpo_reward_mean_target': -0.11046791076660156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1888], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293547928333282, 'denominator': 0.13293547928333282}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3079833984375, 'dpo_reward_mean_target': 1.875946044921875, 'standard deviation': 3.0, 'reward_a1': tensor([7.7944], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.479077935218811, 'numerator': 0.018995024263858795, 'denominator': 0.012842477299273014}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12702369689941406, 'dpo_reward_mean_target': -0.12702369689941406, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1270], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01229095458984375, 'dpo_reward_mean_target': 0.01229095458984375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297340273857117, 'denominator': 0.13297340273857117}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.4189071655273438, 'dpo_reward_mean_target': 2.4974288940429688, 'standard deviation': 3.0, 'reward_a1': tensor([1.1791], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9889028668403625, 'numerator': 0.12074173986911774, 'denominator': 0.12209666520357132}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9213066101074219, 'dpo_reward_mean_target': 1.1278076171875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1131], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9793041944503784, 'numerator': 0.12558747828006744, 'denominator': 0.12824153900146484}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.29808807373046875, 'dpo_reward_mean_target': 0.68658447265625, 'standard deviation': 3.0, 'reward_a1': tensor([2.4231], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2761574983596802, 'numerator': 0.11246905475854874, 'denominator': 0.08813101798295975}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.009349822998046875, 'dpo_reward_mean_target': 0.009349822998046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0093], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.30818939208984375, 'dpo_reward_mean_target': 0.30818939208984375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0784], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12866921722888947, 'denominator': 0.12866921722888947}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.013019561767578125, 'dpo_reward_mean_target': -0.013019561767578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0130], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.550018310546875, 'dpo_reward_mean_target': 1.5943679809570312, 'standard deviation': 3.0, 'reward_a1': tensor([2.9699], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.006911277770996, 'numerator': 0.11971235275268555, 'denominator': 0.1188906729221344}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6631546020507812, 'dpo_reward_mean_target': 3.2893218994140625, 'standard deviation': 3.0, 'reward_a1': tensor([5.6357], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.7697970867156982, 'numerator': 0.0979393869638443, 'denominator': 0.05533933266997337}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.010972976684570312, 'dpo_reward_mean_target': -0.010972976684570312, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0110], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06512928009033203, 'dpo_reward_mean_target': 0.06512928009033203, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0163], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13293179869651794, 'denominator': 0.13293179869651794}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.43753814697265625, 'dpo_reward_mean_target': 0.43753814697265625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1443], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1323470175266266, 'denominator': 0.1323470175266266}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8291015625, 'dpo_reward_mean_target': 1.8821258544921875, 'standard deviation': 3.0, 'reward_a1': tensor([1.4462], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9975907802581787, 'numerator': 0.1315842568874359, 'denominator': 0.1319020390510559}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6343326568603516, 'dpo_reward_mean_target': 1.480072021484375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1725], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0240182876586914, 'numerator': 0.12093029916286469, 'denominator': 0.11809389293193817}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5928001403808594, 'dpo_reward_mean_target': 0.7313041687011719, 'standard deviation': 3.0, 'reward_a1': tensor([2.7487], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0326331853866577, 'numerator': 0.10607033967971802, 'denominator': 0.1027183085680008}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3668479919433594, 'dpo_reward_mean_target': 0.3668479919433594, 'standard deviation': 3.0, 'reward_a1': tensor([0.1911], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13275276124477386, 'denominator': 0.13275276124477386}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.452545166015625, 'dpo_reward_mean_target': 2.124755859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.9377], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9368759989738464, 'numerator': 0.12296784669160843, 'denominator': 0.13125306367874146}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.037057876586914, 'dpo_reward_mean_target': 1.1603584289550781, 'standard deviation': 3.0, 'reward_a1': tensor([1.3809], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3800156116485596, 'numerator': 0.13262204825878143, 'denominator': 0.09610185027122498}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23241424560546875, 'dpo_reward_mean_target': -0.1842498779296875, 'standard deviation': 3.0, 'reward_a1': tensor([5.6720], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7699158191680908, 'numerator': 0.019784757867455482, 'denominator': 0.02569730021059513}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6168422698974609, 'dpo_reward_mean_target': -0.2333526611328125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2915], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.005710244178772, 'numerator': 0.13295583426952362, 'denominator': 0.1322009414434433}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6437530517578125, 'dpo_reward_mean_target': 1.4690780639648438, 'standard deviation': 3.0, 'reward_a1': tensor([1.9939], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0897719860076904, 'numerator': 0.13096152245998383, 'denominator': 0.12017332762479782}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.47614288330078125, 'dpo_reward_mean_target': 0.6070823669433594, 'standard deviation': 3.0, 'reward_a1': tensor([0.3788], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9976336359977722, 'numerator': 0.1325962394475937, 'denominator': 0.13291075825691223}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.014947891235351562, 'dpo_reward_mean_target': 0.014947891235351562, 'standard deviation': 3.0, 'reward_a1': tensor([0.3001], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13238123059272766, 'denominator': 0.13238123059272766}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13674545288085938, 'dpo_reward_mean_target': -0.13674545288085938, 'standard deviation': 3.0, 'reward_a1': tensor([1.5350], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11385705322027206, 'denominator': 0.11385705322027206}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.16177940368652344, 'dpo_reward_mean_target': 0.16177940368652344, 'standard deviation': 3.0, 'reward_a1': tensor([0.9975], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1279193013906479, 'denominator': 0.1279193013906479}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8441848754882812, 'dpo_reward_mean_target': 1.4339370727539062, 'standard deviation': 3.0, 'reward_a1': tensor([2.7106], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1084667444229126, 'numerator': 0.12146912515163422, 'denominator': 0.10958301275968552}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6234855651855469, 'dpo_reward_mean_target': 0.4862060546875, 'standard deviation': 3.0, 'reward_a1': tensor([2.7814], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9666079878807068, 'numerator': 0.09924005717039108, 'denominator': 0.10266835987567902}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.585693359375, 'dpo_reward_mean_target': 0.9939193725585938, 'standard deviation': 3.0, 'reward_a1': tensor([2.7099], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0909959077835083, 'numerator': 0.11291349679231644, 'denominator': 0.10349579900503159}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0370941162109375, 'dpo_reward_mean_target': 1.01324462890625, 'standard deviation': 3.0, 'reward_a1': tensor([3.6869], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4525424242019653, 'numerator': 0.0893956869840622, 'denominator': 0.06154428794980049}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.1270599365234375, 'dpo_reward_mean_target': 3.965179443359375, 'standard deviation': 3.0, 'reward_a1': tensor([9.4652], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.7099063396453857, 'numerator': 0.02476993016898632, 'denominator': 0.006676699500530958}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17874717712402344, 'dpo_reward_mean_target': 0.7647743225097656, 'standard deviation': 3.0, 'reward_a1': tensor([0.8759], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0630096197128296, 'numerator': 0.13288959860801697, 'denominator': 0.12501260638237}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06771659851074219, 'dpo_reward_mean_target': 0.06771659851074219, 'standard deviation': 3.0, 'reward_a1': tensor([0.0007], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294759392738342, 'denominator': 0.13294759392738342}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2163105010986328, 'dpo_reward_mean_target': -0.6358909606933594, 'standard deviation': 3.0, 'reward_a1': tensor([1.2041], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9268156290054321, 'numerator': 0.1101798489689827, 'denominator': 0.1188800111413002}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7060108184814453, 'dpo_reward_mean_target': -0.33205223083496094, 'standard deviation': 3.0, 'reward_a1': tensor([1.4351], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.084581971168518, 'numerator': 0.11180102080106735, 'denominator': 0.10308212786912918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.824859619140625, 'dpo_reward_mean_target': 1.129486083984375, 'standard deviation': 3.0, 'reward_a1': tensor([2.0520], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0370489358901978, 'numerator': 0.12684017419815063, 'denominator': 0.12230876833200455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.71905517578125, 'dpo_reward_mean_target': 1.1458053588867188, 'standard deviation': 3.0, 'reward_a1': tensor([5.8457], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7549504637718201, 'numerator': 0.03897886350750923, 'denominator': 0.05163102224469185}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5481433868408203, 'dpo_reward_mean_target': 0.5481433868408203, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0025], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13075947761535645, 'denominator': 0.13075947761535645}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09982490539550781, 'dpo_reward_mean_target': -0.09982490539550781, 'standard deviation': 3.0, 'reward_a1': tensor([0.1213], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326199322938919, 'denominator': 0.1326199322938919}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6933135986328125, 'dpo_reward_mean_target': 1.375885009765625, 'standard deviation': 3.0, 'reward_a1': tensor([9.0350], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7675591707229614, 'numerator': 0.005109862890094519, 'denominator': 0.006657288409769535}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.38042640686035156, 'dpo_reward_mean_target': -0.11091423034667969, 'standard deviation': 3.0, 'reward_a1': tensor([2.0362], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0707224607467651, 'numerator': 0.10293301194906235, 'denominator': 0.09613417088985443}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4067573547363281, 'dpo_reward_mean_target': 0.4973602294921875, 'standard deviation': 3.0, 'reward_a1': tensor([1.9383], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0150742530822754, 'numerator': 0.11849313974380493, 'denominator': 0.11673346906900406}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22716140747070312, 'dpo_reward_mean_target': 0.2807655334472656, 'standard deviation': 3.0, 'reward_a1': tensor([0.2917], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.015064001083374, 'numerator': 0.1329798549413681, 'denominator': 0.1310063749551773}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.71600341796875, 'dpo_reward_mean_target': 1.6690521240234375, 'standard deviation': 3.0, 'reward_a1': tensor([5.3474], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9811133146286011, 'numerator': 0.06270911544561386, 'denominator': 0.06391628086566925}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6645774841308594, 'dpo_reward_mean_target': 0.6645774841308594, 'standard deviation': 3.0, 'reward_a1': tensor([1.1361], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1313486248254776, 'denominator': 0.1313486248254776}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0989761352539062, 'dpo_reward_mean_target': 1.341766357421875, 'standard deviation': 3.0, 'reward_a1': tensor([1.2614], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.001107096672058, 'numerator': 0.13293305039405823, 'denominator': 0.13278603553771973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1236419677734375, 'dpo_reward_mean_target': 0.1236419677734375, 'standard deviation': 3.0, 'reward_a1': tensor([2.8250], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.08865862339735031, 'denominator': 0.08865862339735031}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26108551025390625, 'dpo_reward_mean_target': -0.15709304809570312, 'standard deviation': 3.0, 'reward_a1': tensor([1.5772], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.93158358335495, 'numerator': 0.11251631379127502, 'denominator': 0.1207796260714531}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1186981201171875, 'dpo_reward_mean_target': 0.1186981201171875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0366], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13280262053012848, 'denominator': 0.13280262053012848}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3626060485839844, 'dpo_reward_mean_target': 0.3117961883544922, 'standard deviation': 3.0, 'reward_a1': tensor([0.4503], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0362930297851562, 'numerator': 0.13283903896808624, 'denominator': 0.12818674743175507}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.8505859375, 'dpo_reward_mean_target': 1.117431640625, 'standard deviation': 3.0, 'reward_a1': tensor([1.6337], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0697816610336304, 'numerator': 0.13102595508098602, 'denominator': 0.1224791556596756}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7041454315185547, 'dpo_reward_mean_target': 0.7041454315185547, 'standard deviation': 3.0, 'reward_a1': tensor([1.6301], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12679524719715118, 'denominator': 0.12679524719715118}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3582134246826172, 'dpo_reward_mean_target': -0.2519702911376953, 'standard deviation': 3.0, 'reward_a1': tensor([1.9906], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0274711847305298, 'numerator': 0.10056506842374802, 'denominator': 0.09787628799676895}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9293785095214844, 'dpo_reward_mean_target': 0.9305191040039062, 'standard deviation': 3.0, 'reward_a1': tensor([1.4356], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0000643730163574, 'numerator': 0.1311091035604477, 'denominator': 0.13110066950321198}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.43428802490234375, 'dpo_reward_mean_target': 0.43428802490234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0778], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1320454478263855, 'denominator': 0.1320454478263855}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5662841796875, 'dpo_reward_mean_target': 1.5662841796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4237], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12367774546146393, 'denominator': 0.12367774546146393}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04795646667480469, 'dpo_reward_mean_target': -0.04795646667480469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0480], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6454734802246094, 'dpo_reward_mean_target': 0.8690338134765625, 'standard deviation': 3.0, 'reward_a1': tensor([2.9545], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0560952425003052, 'numerator': 0.10443811863660812, 'denominator': 0.09889081865549088}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1928386688232422, 'dpo_reward_mean_target': 0.07945442199707031, 'standard deviation': 3.0, 'reward_a1': tensor([0.0156], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0015196800231934, 'numerator': 0.13295063376426697, 'denominator': 0.13274890184402466}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4120044708251953, 'dpo_reward_mean_target': 0.9708709716796875, 'standard deviation': 3.0, 'reward_a1': tensor([1.4676], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2002867460250854, 'numerator': 0.1311705857515335, 'denominator': 0.1092827096581459}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0663909912109375, 'dpo_reward_mean_target': -0.0663909912109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0664], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14169692993164062, 'dpo_reward_mean_target': -0.14169692993164062, 'standard deviation': 3.0, 'reward_a1': tensor([0.1945], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1321483701467514, 'denominator': 0.1321483701467514}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.289093017578125, 'dpo_reward_mean_target': -0.26958465576171875, 'standard deviation': 3.0, 'reward_a1': tensor([1.7525], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0044140815734863, 'numerator': 0.10595770925283432, 'denominator': 0.10549206286668777}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6238269805908203, 'dpo_reward_mean_target': 0.8015708923339844, 'standard deviation': 3.0, 'reward_a1': tensor([0.7074], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1029181480407715, 'numerator': 0.13291525840759277, 'denominator': 0.12051235139369965}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5461578369140625, 'dpo_reward_mean_target': 0.5461578369140625, 'standard deviation': 3.0, 'reward_a1': tensor([0.4832], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1329515278339386, 'denominator': 0.1329515278339386}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16922760009765625, 'dpo_reward_mean_target': 0.8063125610351562, 'standard deviation': 3.0, 'reward_a1': tensor([2.8484], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3155019283294678, 'numerator': 0.10548127442598343, 'denominator': 0.08018328994512558}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8104629516601562, 'dpo_reward_mean_target': 0.3038673400878906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3755], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0538978576660156, 'numerator': 0.12961483001708984, 'denominator': 0.12298613786697388}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.558868408203125, 'dpo_reward_mean_target': -0.20650482177734375, 'standard deviation': 3.0, 'reward_a1': tensor([5.7568], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.271721601486206, 'numerator': 0.018440866842865944, 'denominator': 0.014500710181891918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.29085540771484375, 'dpo_reward_mean_target': -0.07724380493164062, 'standard deviation': 3.0, 'reward_a1': tensor([2.4407], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9089576005935669, 'numerator': 0.09350121766328812, 'denominator': 0.10286642611026764}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3692207336425781, 'dpo_reward_mean_target': -0.3692207336425781, 'standard deviation': 3.0, 'reward_a1': tensor([1.9333], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09905525296926498, 'denominator': 0.09905525296926498}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.21907615661621094, 'dpo_reward_mean_target': 0.07233238220214844, 'standard deviation': 3.0, 'reward_a1': tensor([0.7518], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0270792245864868, 'numerator': 0.12961304187774658, 'denominator': 0.1261957585811615}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5278148651123047, 'dpo_reward_mean_target': 0.3406181335449219, 'standard deviation': 3.0, 'reward_a1': tensor([0.5548], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.064563512802124, 'numerator': 0.13264231383800507, 'denominator': 0.12459784001111984}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.307647705078125, 'dpo_reward_mean_target': 2.1969146728515625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7178], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6417770385742188, 'numerator': 0.13129588961601257, 'denominator': 0.0799718126654625}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6576175689697266, 'dpo_reward_mean_target': 0.8087997436523438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1082], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9859651923179626, 'numerator': 0.12691113352775574, 'denominator': 0.12871766090393066}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6220855712890625, 'dpo_reward_mean_target': 2.385528564453125, 'standard deviation': 3.0, 'reward_a1': tensor([8.4852], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.927201747894287, 'numerator': 0.01683036796748638, 'denominator': 0.004285587929189205}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8007469177246094, 'dpo_reward_mean_target': 1.51263427734375, 'standard deviation': 3.0, 'reward_a1': tensor([3.6078], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.3067824840545654, 'numerator': 0.104203000664711, 'denominator': 0.04517244175076485}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.34124755859375, 'dpo_reward_mean_target': 3.432373046875, 'standard deviation': 3.0, 'reward_a1': tensor([3.3201], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6360511779785156, 'numerator': 0.13288772106170654, 'denominator': 0.08122467249631882}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5277862548828125, 'dpo_reward_mean_target': 0.3027610778808594, 'standard deviation': 3.0, 'reward_a1': tensor([1.0913], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1174980401992798, 'numerator': 0.12846539914608002, 'denominator': 0.11495804786682129}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2052078247070312, 'dpo_reward_mean_target': 3.49566650390625, 'standard deviation': 3.0, 'reward_a1': tensor([10.0449], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 7.0865864753723145, 'numerator': 0.012271707877516747, 'denominator': 0.0017316810553893447}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.024288177490234375, 'dpo_reward_mean_target': 0.024288177490234375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4106], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13188253343105316, 'denominator': 0.13188253343105316}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6849536895751953, 'dpo_reward_mean_target': -0.7376461029052734, 'standard deviation': 3.0, 'reward_a1': tensor([1.2859], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9883756041526794, 'numerator': 0.10592487454414368, 'denominator': 0.10717067122459412}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11539459228515625, 'dpo_reward_mean_target': 0.6232833862304688, 'standard deviation': 3.0, 'reward_a1': tensor([9.7349], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.1774446964263916, 'numerator': 0.0013203562702983618, 'denominator': 0.000606378773227334}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1294269561767578, 'dpo_reward_mean_target': -0.1294269561767578, 'standard deviation': 3.0, 'reward_a1': tensor([0.7582], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12728513777256012, 'denominator': 0.12728513777256012}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5120697021484375, 'dpo_reward_mean_target': 1.697479248046875, 'standard deviation': 3.0, 'reward_a1': tensor([6.5578], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.107425570487976, 'numerator': 0.035795822739601135, 'denominator': 0.032323457300662994}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09420585632324219, 'dpo_reward_mean_target': 0.09420585632324219, 'standard deviation': 3.0, 'reward_a1': tensor([0.0942], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7438163757324219, 'dpo_reward_mean_target': -0.11090087890625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3824], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0031654834747314, 'numerator': 0.13243722915649414, 'denominator': 0.13201932609081268}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2550506591796875, 'dpo_reward_mean_target': 1.250152587890625, 'standard deviation': 3.0, 'reward_a1': tensor([0.6335], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0003368854522705, 'numerator': 0.13020126521587372, 'denominator': 0.13015741109848022}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.86480712890625, 'dpo_reward_mean_target': 0.40019798278808594, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1057], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.038841724395752, 'numerator': 0.13110366463661194, 'denominator': 0.12620176374912262}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4012603759765625, 'dpo_reward_mean_target': 0.10538482666015625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5162], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9913952350616455, 'numerator': 0.1317397654056549, 'denominator': 0.1328831911087036}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2265625, 'dpo_reward_mean_target': 4.6614532470703125, 'standard deviation': 3.0, 'reward_a1': tensor([8.6978], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 8.988476753234863, 'numerator': 0.05379113927483559, 'denominator': 0.0059844558127224445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07283973693847656, 'dpo_reward_mean_target': -0.00403594970703125, 'standard deviation': 3.0, 'reward_a1': tensor([1.2939], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.010237455368042, 'numerator': 0.12109933793544769, 'denominator': 0.11987214535474777}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3064250946044922, 'dpo_reward_mean_target': 0.3064250946044922, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0334], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13213029503822327, 'denominator': 0.13213029503822327}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11961174011230469, 'dpo_reward_mean_target': -0.11961174011230469, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0174], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13290360569953918, 'denominator': 0.13290360569953918}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1938323974609375, 'dpo_reward_mean_target': 0.07279205322265625, 'standard deviation': 3.0, 'reward_a1': tensor([5.9225], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9250949025154114, 'numerator': 0.019868075847625732, 'denominator': 0.02147679775953293}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00115203857421875, 'dpo_reward_mean_target': -0.00115203857421875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0012], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10158920288085938, 'dpo_reward_mean_target': -0.10158920288085938, 'standard deviation': 3.0, 'reward_a1': tensor([0.7340], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12792134284973145, 'denominator': 0.12792134284973145}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.07264328002929688, 'dpo_reward_mean_target': -0.07264328002929688, 'standard deviation': 3.0, 'reward_a1': tensor([0.0665], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13283780217170715, 'denominator': 0.13283780217170715}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8192062377929688, 'dpo_reward_mean_target': -0.8192062377929688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3098], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13107772171497345, 'denominator': 0.13107772171497345}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1328964233398438, 'dpo_reward_mean_target': 1.9015731811523438, 'standard deviation': 3.0, 'reward_a1': tensor([1.8059], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0249627828598022, 'numerator': 0.13291317224502563, 'denominator': 0.12967608869075775}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.35265350341796875, 'dpo_reward_mean_target': 0.7928314208984375, 'standard deviation': 3.0, 'reward_a1': tensor([1.7977], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2223711013793945, 'numerator': 0.12572625279426575, 'denominator': 0.1028544083237648}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6327362060546875, 'dpo_reward_mean_target': 5.713531494140625, 'standard deviation': 3.0, 'reward_a1': tensor([4.3463], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.5731472969055176, 'numerator': 0.11986425518989563, 'denominator': 0.0335458479821682}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8244895935058594, 'dpo_reward_mean_target': 0.8263931274414062, 'standard deviation': 3.0, 'reward_a1': tensor([0.4849], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999279379844666, 'numerator': 0.13212215900421143, 'denominator': 0.1321316808462143}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10041236877441406, 'dpo_reward_mean_target': 0.11499977111816406, 'standard deviation': 3.0, 'reward_a1': tensor([1.4321], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0346914529800415, 'numerator': 0.12076212465763092, 'denominator': 0.11671318113803864}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2924365997314453, 'dpo_reward_mean_target': -0.2924365997314453, 'standard deviation': 3.0, 'reward_a1': tensor([0.1333], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13164855539798737, 'denominator': 0.13164855539798737}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2314777374267578, 'dpo_reward_mean_target': 1.395538330078125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3994], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9692582488059998, 'numerator': 0.11118748784065247, 'denominator': 0.11471399664878845}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.022230148315429688, 'dpo_reward_mean_target': -0.022230148315429688, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0222], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5542926788330078, 'dpo_reward_mean_target': 0.8853759765625, 'standard deviation': 3.0, 'reward_a1': tensor([1.1084], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0143967866897583, 'numerator': 0.13261379301548004, 'denominator': 0.13073167204856873}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4358386993408203, 'dpo_reward_mean_target': 0.4358386993408203, 'standard deviation': 3.0, 'reward_a1': tensor([1.0543], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13018444180488586, 'denominator': 0.13018444180488586}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11574172973632812, 'dpo_reward_mean_target': -0.025970458984375, 'standard deviation': 3.0, 'reward_a1': tensor([2.4395], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0253560543060303, 'numerator': 0.09486984461545944, 'denominator': 0.09252380579710007}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.07331466674804688, 'dpo_reward_mean_target': 0.7933769226074219, 'standard deviation': 3.0, 'reward_a1': tensor([0.3065], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9898995161056519, 'numerator': 0.13124065101146698, 'denominator': 0.1325797736644745}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04705810546875, 'dpo_reward_mean_target': -0.04705810546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5524], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13035261631011963, 'denominator': 0.13035261631011963}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04756927490234375, 'dpo_reward_mean_target': 0.04756927490234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0992], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13282166421413422, 'denominator': 0.13282166421413422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5786857604980469, 'dpo_reward_mean_target': 0.5764656066894531, 'standard deviation': 3.0, 'reward_a1': tensor([2.6270], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9994944930076599, 'numerator': 0.10527937114238739, 'denominator': 0.1053326204419136}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1626968383789062, 'dpo_reward_mean_target': 1.2169189453125, 'standard deviation': 3.0, 'reward_a1': tensor([7.5077], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0387972593307495, 'numerator': 0.014756148681044579, 'denominator': 0.014205032959580421}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6323356628417969, 'dpo_reward_mean_target': 1.6217079162597656, 'standard deviation': 3.0, 'reward_a1': tensor([1.7086], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9999039173126221, 'numerator': 0.13292504847049713, 'denominator': 0.13293781876564026}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.27494239807128906, 'dpo_reward_mean_target': -0.27494239807128906, 'standard deviation': 3.0, 'reward_a1': tensor([2.7060], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.08117027580738068, 'denominator': 0.08117027580738068}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9572982788085938, 'dpo_reward_mean_target': 0.731170654296875, 'standard deviation': 3.0, 'reward_a1': tensor([3.6723], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7282117009162903, 'numerator': 0.08224021643400192, 'denominator': 0.11293449252843857}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.408803939819336, 'dpo_reward_mean_target': 1.7144126892089844, 'standard deviation': 3.0, 'reward_a1': tensor([1.8482], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0097795724868774, 'numerator': 0.1328485757112503, 'denominator': 0.13156194984912872}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3028278350830078, 'dpo_reward_mean_target': -0.2881011962890625, 'standard deviation': 3.0, 'reward_a1': tensor([3.9644], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0069947242736816, 'numerator': 0.048692669719457626, 'denominator': 0.04835444316267967}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3858909606933594, 'dpo_reward_mean_target': 0.9041919708251953, 'standard deviation': 3.0, 'reward_a1': tensor([4.5500], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2521780729293823, 'numerator': 0.06354516744613647, 'denominator': 0.05074770748615265}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3138542175292969, 'dpo_reward_mean_target': -0.07956695556640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.3449], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.01419997215271, 'numerator': 0.13165609538555145, 'denominator': 0.12981276214122772}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6835784912109375, 'dpo_reward_mean_target': 2.461639404296875, 'standard deviation': 3.0, 'reward_a1': tensor([1.1944], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9268855452537537, 'numerator': 0.12162995338439941, 'denominator': 0.1312243491411209}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.1104736328125, 'dpo_reward_mean_target': 0.12077713012695312, 'standard deviation': 3.0, 'reward_a1': tensor([2.2051], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4468234777450562, 'numerator': 0.10446427017450333, 'denominator': 0.07220249623060226}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.13629150390625, 'dpo_reward_mean_target': 0.13629150390625, 'standard deviation': 3.0, 'reward_a1': tensor([0.1363], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.217681884765625, 'dpo_reward_mean_target': -0.217681884765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.5153], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1290697455406189, 'denominator': 0.1290697455406189}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2689781188964844, 'dpo_reward_mean_target': 0.7411460876464844, 'standard deviation': 3.0, 'reward_a1': tensor([1.9906], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2176495790481567, 'numerator': 0.12193296104669571, 'denominator': 0.10013797134160995}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2445068359375, 'dpo_reward_mean_target': 2.4099197387695312, 'standard deviation': 3.0, 'reward_a1': tensor([1.6287], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1747323274612427, 'numerator': 0.1285477876663208, 'denominator': 0.1094273030757904}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05162811279296875, 'dpo_reward_mean_target': -0.05162811279296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6955], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1289205253124237, 'denominator': 0.1289205253124237}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9548416137695312, 'dpo_reward_mean_target': 1.3568572998046875, 'standard deviation': 3.0, 'reward_a1': tensor([2.7843], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.07545006275177, 'numerator': 0.11874868720769882, 'denominator': 0.1104176715016365}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5175704956054688, 'dpo_reward_mean_target': 0.4137420654296875, 'standard deviation': 3.0, 'reward_a1': tensor([3.1665], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.969322144985199, 'numerator': 0.08728910982608795, 'denominator': 0.09005170315504074}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3484535217285156, 'dpo_reward_mean_target': 0.3484535217285156, 'standard deviation': 3.0, 'reward_a1': tensor([4.3264], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.05520813167095184, 'denominator': 0.05520813167095184}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6493377685546875, 'dpo_reward_mean_target': 0.6493377685546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.4333], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13263656198978424, 'denominator': 0.13263656198978424}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.45870208740234375, 'dpo_reward_mean_target': -0.222381591796875, 'standard deviation': 3.0, 'reward_a1': tensor([4.8325], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.699941098690033, 'numerator': 0.03215865418314934, 'denominator': 0.04594479873776436}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20605087280273438, 'dpo_reward_mean_target': -0.20605087280273438, 'standard deviation': 3.0, 'reward_a1': tensor([2.3447], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09264160692691803, 'denominator': 0.09264160692691803}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.652862548828125, 'dpo_reward_mean_target': 1.22540283203125, 'standard deviation': 3.0, 'reward_a1': tensor([1.3196], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.00569486618042, 'numerator': 0.13291525840759277, 'denominator': 0.13216261565685272}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6301422119140625, 'dpo_reward_mean_target': 1.91314697265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.8595], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0027663707733154, 'numerator': 0.1329595148563385, 'denominator': 0.13259270787239075}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0330657958984375, 'dpo_reward_mean_target': 1.55401611328125, 'standard deviation': 3.0, 'reward_a1': tensor([6.6149], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.6745545864105225, 'numerator': 0.03204909339547157, 'denominator': 0.011982965283095837}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0646018981933594, 'dpo_reward_mean_target': 1.0255012512207031, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4338], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.256680965423584, 'numerator': 0.118142269551754, 'denominator': 0.0940113440155983}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06155586242675781, 'dpo_reward_mean_target': 0.06155586242675781, 'standard deviation': 3.0, 'reward_a1': tensor([1.5108], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11833565682172775, 'denominator': 0.11833565682172775}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6362285614013672, 'dpo_reward_mean_target': 1.607736587524414, 'standard deviation': 3.0, 'reward_a1': tensor([0.3243], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0041165351867676, 'numerator': 0.12135159969329834, 'denominator': 0.12085409462451935}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5334625244140625, 'dpo_reward_mean_target': 0.62017822265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7957], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2518396377563477, 'numerator': 0.12315451353788376, 'denominator': 0.09837882965803146}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6128425598144531, 'dpo_reward_mean_target': 0.7122268676757812, 'standard deviation': 3.0, 'reward_a1': tensor([6.5667], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.610403299331665, 'numerator': 0.019806666299700737, 'denominator': 0.007587588857859373}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6552047729492188, 'dpo_reward_mean_target': 0.9354705810546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.5431], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.073819875717163, 'numerator': 0.1318482607603073, 'denominator': 0.1227843388915062}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.296234130859375, 'dpo_reward_mean_target': 0.08501434326171875, 'standard deviation': 3.0, 'reward_a1': tensor([1.0735], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0512174367904663, 'numerator': 0.1259542852640152, 'denominator': 0.11981754004955292}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1518096923828125, 'dpo_reward_mean_target': 1.0953941345214844, 'standard deviation': 3.0, 'reward_a1': tensor([3.7011], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9839729070663452, 'numerator': 0.091193787753582, 'denominator': 0.09267916530370712}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6082172393798828, 'dpo_reward_mean_target': 0.6723861694335938, 'standard deviation': 3.0, 'reward_a1': tensor([3.3699], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6079212427139282, 'numerator': 0.08875991404056549, 'denominator': 0.05520165339112282}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.14815521240234375, 'dpo_reward_mean_target': 0.14815521240234375, 'standard deviation': 3.0, 'reward_a1': tensor([1.1547], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12570250034332275, 'denominator': 0.12570250034332275}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09606742858886719, 'dpo_reward_mean_target': 0.09606742858886719, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3572], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13147176802158356, 'denominator': 0.13147176802158356}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.702178955078125, 'dpo_reward_mean_target': 5.4404296875, 'standard deviation': 3.0, 'reward_a1': tensor([10.0336], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 6.135058879852295, 'numerator': 0.04118822515010834, 'denominator': 0.006713582668453455}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.28784751892089844, 'dpo_reward_mean_target': 0.28784751892089844, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1992], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13123995065689087, 'denominator': 0.13123995065689087}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.1345081329345703, 'dpo_reward_mean_target': 0.1345081329345703, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2263], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13202250003814697, 'denominator': 0.13202250003814697}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5582962036132812, 'dpo_reward_mean_target': -0.08210372924804688, 'standard deviation': 3.0, 'reward_a1': tensor([1.8707], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1229057312011719, 'numerator': 0.1075926348567009, 'denominator': 0.09581626206636429}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6351318359375, 'dpo_reward_mean_target': 0.9381332397460938, 'standard deviation': 3.0, 'reward_a1': tensor([5.1382], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.742091178894043, 'numerator': 0.04990726709365845, 'denominator': 0.06725220382213593}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3336219787597656, 'dpo_reward_mean_target': 0.5653305053710938, 'standard deviation': 3.0, 'reward_a1': tensor([7.3634], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1948275566101074, 'numerator': 0.01020391471683979, 'denominator': 0.008540073409676552}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1257171630859375, 'dpo_reward_mean_target': -0.1257171630859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.1676], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13234664499759674, 'denominator': 0.13234664499759674}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1524677276611328, 'dpo_reward_mean_target': -0.1524677276611328, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297808170318604, 'denominator': 0.13297808170318604}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12678909301757812, 'dpo_reward_mean_target': 0.43923187255859375, 'standard deviation': 3.0, 'reward_a1': tensor([0.4377], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0053848028182983, 'numerator': 0.13298074901103973, 'denominator': 0.1322685033082962}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9161872863769531, 'dpo_reward_mean_target': 1.5060501098632812, 'standard deviation': 3.0, 'reward_a1': tensor([1.4400], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0151143074035645, 'numerator': 0.13294854760169983, 'denominator': 0.13096904754638672}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09769630432128906, 'dpo_reward_mean_target': 0.09769630432128906, 'standard deviation': 3.0, 'reward_a1': tensor([0.0970], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.175323486328125, 'dpo_reward_mean_target': 1.175323486328125, 'standard deviation': 3.0, 'reward_a1': tensor([3.7715], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09144517034292221, 'denominator': 0.09144517034292221}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.19028472900390625, 'dpo_reward_mean_target': 1.9736785888671875, 'standard deviation': 3.0, 'reward_a1': tensor([2.9229], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4402194023132324, 'numerator': 0.12648744881153107, 'denominator': 0.08782512694597244}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.18401718139648438, 'dpo_reward_mean_target': -0.18401718139648438, 'standard deviation': 3.0, 'reward_a1': tensor([1.2273], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11905024945735931, 'denominator': 0.11905024945735931}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.50341796875, 'dpo_reward_mean_target': 0.7126007080078125, 'standard deviation': 3.0, 'reward_a1': tensor([1.1947], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.013730764389038, 'numerator': 0.13127438724040985, 'denominator': 0.12949630618095398}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0534477233886719, 'dpo_reward_mean_target': 1.281494140625, 'standard deviation': 3.0, 'reward_a1': tensor([1.4792], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0079295635223389, 'numerator': 0.132692351937294, 'denominator': 0.13164843618869781}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2728919982910156, 'dpo_reward_mean_target': 0.2728919982910156, 'standard deviation': 3.0, 'reward_a1': tensor([0.2729], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.4050445556640625, 'dpo_reward_mean_target': 3.423248291015625, 'standard deviation': 3.0, 'reward_a1': tensor([7.0830], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6026138067245483, 'numerator': 0.06318802386522293, 'denominator': 0.03942810371518135}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4256134033203125, 'dpo_reward_mean_target': 0.6277847290039062, 'standard deviation': 3.0, 'reward_a1': tensor([4.0142], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7673351764678955, 'numerator': 0.07032440602779388, 'denominator': 0.09164757281541824}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.009447097778320312, 'dpo_reward_mean_target': 0.009447097778320312, 'standard deviation': 3.0, 'reward_a1': tensor([0.0094], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.35125732421875, 'dpo_reward_mean_target': 0.7885112762451172, 'standard deviation': 3.0, 'reward_a1': tensor([1.2919], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1455907821655273, 'numerator': 0.13112179934978485, 'denominator': 0.11445779353380203}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.40509033203125, 'dpo_reward_mean_target': 3.2850570678710938, 'standard deviation': 3.0, 'reward_a1': tensor([1.8559], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9028651118278503, 'numerator': 0.11871576309204102, 'denominator': 0.13148781657218933}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0448455810546875, 'dpo_reward_mean_target': -0.0448455810546875, 'standard deviation': 3.0, 'reward_a1': tensor([0.7439], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1284630447626114, 'denominator': 0.1284630447626114}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.08821868896484375, 'dpo_reward_mean_target': 1.5854263305664062, 'standard deviation': 3.0, 'reward_a1': tensor([2.3489], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2860138416290283, 'numerator': 0.12874317169189453, 'denominator': 0.1001102551817894}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.041290283203125, 'dpo_reward_mean_target': 0.041290283203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0413], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5095558166503906, 'dpo_reward_mean_target': 0.5095558166503906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4814], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12592050433158875, 'denominator': 0.12592050433158875}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 5.3775482177734375, 'dpo_reward_mean_target': 4.3311309814453125, 'standard deviation': 3.0, 'reward_a1': tensor([3.9020], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1170854568481445, 'numerator': 0.13162747025489807, 'denominator': 0.11783115565776825}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12116241455078125, 'dpo_reward_mean_target': 1.458099365234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0208], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.8865702152252197, 'numerator': 0.11776479333639145, 'denominator': 0.1328318864107132}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3333587646484375, 'dpo_reward_mean_target': 1.5825576782226562, 'standard deviation': 3.0, 'reward_a1': tensor([3.5567], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.434349775314331, 'numerator': 0.10709118843078613, 'denominator': 0.07466183602809906}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.021368026733398438, 'dpo_reward_mean_target': -0.021368026733398438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0214], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.6008071899414062, 'dpo_reward_mean_target': 4.14593505859375, 'standard deviation': 3.0, 'reward_a1': tensor([6.5487], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.7248659133911133, 'numerator': 0.09649169445037842, 'denominator': 0.055941563099622726}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.2902069091796875, 'dpo_reward_mean_target': 4.6268768310546875, 'standard deviation': 3.0, 'reward_a1': tensor([12.5023], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 10.464994430541992, 'numerator': 0.0042397514916956425, 'denominator': 0.000405136524932459}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.695037841796875, 'dpo_reward_mean_target': 0.448699951171875, 'standard deviation': 3.0, 'reward_a1': tensor([2.5806], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4100226163864136, 'numerator': 0.10330589860677719, 'denominator': 0.07326541841030121}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2783737182617188, 'dpo_reward_mean_target': 1.3688201904296875, 'standard deviation': 3.0, 'reward_a1': tensor([2.2407], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0092589855194092, 'numerator': 0.12748175859451294, 'denominator': 0.1263122409582138}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.104696273803711, 'dpo_reward_mean_target': 1.2380542755126953, 'standard deviation': 3.0, 'reward_a1': tensor([0.4103], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9887859225273132, 'numerator': 0.12801384925842285, 'denominator': 0.12946568429470062}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.05193519592285156, 'dpo_reward_mean_target': -0.05193519592285156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0519], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03249931335449219, 'dpo_reward_mean_target': 0.03249931335449219, 'standard deviation': 3.0, 'reward_a1': tensor([3.1581], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.07728086411952972, 'denominator': 0.07728086411952972}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.097686767578125, 'dpo_reward_mean_target': 2.240203857421875, 'standard deviation': 3.0, 'reward_a1': tensor([0.6242], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9758352041244507, 'numerator': 0.11502207070589066, 'denominator': 0.11787038296461105}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2834186553955078, 'dpo_reward_mean_target': 0.12063980102539062, 'standard deviation': 3.0, 'reward_a1': tensor([1.2678], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0624432563781738, 'numerator': 0.12360581010580063, 'denominator': 0.11634109169244766}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5421104431152344, 'dpo_reward_mean_target': 0.5421104431152344, 'standard deviation': 3.0, 'reward_a1': tensor([1.0368], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13118502497673035, 'denominator': 0.13118502497673035}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.019664764404296875, 'dpo_reward_mean_target': -0.019664764404296875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1764], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13269716501235962, 'denominator': 0.13269716501235962}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5750999450683594, 'dpo_reward_mean_target': 0.6606178283691406, 'standard deviation': 3.0, 'reward_a1': tensor([1.4402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2115225791931152, 'numerator': 0.12856526672840118, 'denominator': 0.10611875355243683}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.19112586975097656, 'dpo_reward_mean_target': -0.19112586975097656, 'standard deviation': 3.0, 'reward_a1': tensor([4.5830], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.03748643025755882, 'denominator': 0.03748643025755882}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.457366943359375, 'dpo_reward_mean_target': 1.5782394409179688, 'standard deviation': 3.0, 'reward_a1': tensor([9.2728], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.109768271446228, 'numerator': 0.004957970231771469, 'denominator': 0.0044675725512206554}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0489501953125, 'dpo_reward_mean_target': -0.0489501953125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1393], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13292042911052704, 'denominator': 0.13292042911052704}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.01911163330078125, 'dpo_reward_mean_target': 0.01911163330078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.0191], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7157135009765625, 'dpo_reward_mean_target': 1.5702667236328125, 'standard deviation': 3.0, 'reward_a1': tensor([2.4804], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.986557126045227, 'numerator': 0.12699928879737854, 'denominator': 0.12872979044914246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2921905517578125, 'dpo_reward_mean_target': -0.2921905517578125, 'standard deviation': 3.0, 'reward_a1': tensor([0.6748], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1262485384941101, 'denominator': 0.1262485384941101}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.12961959838867188, 'dpo_reward_mean_target': 0.12961959838867188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.3073], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13157789409160614, 'denominator': 0.13157789409160614}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.838165283203125, 'dpo_reward_mean_target': 1.1183624267578125, 'standard deviation': 3.0, 'reward_a1': tensor([1.6534], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3895577192306519, 'numerator': 0.1308823972940445, 'denominator': 0.09418997168540955}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.29129791259765625, 'dpo_reward_mean_target': 0.7405738830566406, 'standard deviation': 3.0, 'reward_a1': tensor([3.3279], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1507039070129395, 'numerator': 0.09167861938476562, 'denominator': 0.07967177033424377}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1100311279296875, 'dpo_reward_mean_target': 0.9620361328125, 'standard deviation': 3.0, 'reward_a1': tensor([8.7653], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.700305223464966, 'numerator': 0.0045149074867367744, 'denominator': 0.0016719989944249392}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4756641387939453, 'dpo_reward_mean_target': 0.4756641387939453, 'standard deviation': 3.0, 'reward_a1': tensor([3.8453], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.07076746225357056, 'denominator': 0.07076746225357056}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.23095130920410156, 'dpo_reward_mean_target': 0.23095130920410156, 'standard deviation': 3.0, 'reward_a1': tensor([0.8279], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13037385046482086, 'denominator': 0.13037385046482086}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.42987823486328125, 'dpo_reward_mean_target': 0.2972564697265625, 'standard deviation': 3.0, 'reward_a1': tensor([2.7344], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9656664729118347, 'numerator': 0.0956038385629654, 'denominator': 0.09900295734405518}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.05079460144042969, 'dpo_reward_mean_target': 0.12414741516113281, 'standard deviation': 3.0, 'reward_a1': tensor([0.9929], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.007407307624817, 'numerator': 0.1275198757648468, 'denominator': 0.12658223509788513}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.030797958374023438, 'dpo_reward_mean_target': 0.030797958374023438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1171], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13281933963298798, 'denominator': 0.13281933963298798}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.0400638580322266, 'dpo_reward_mean_target': 0.9230937957763672, 'standard deviation': 3.0, 'reward_a1': tensor([1.3037], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3459800481796265, 'numerator': 0.13191509246826172, 'denominator': 0.0980067253112793}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.104461669921875, 'dpo_reward_mean_target': 0.21897125244140625, 'standard deviation': 3.0, 'reward_a1': tensor([6.9920], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2830166816711426, 'numerator': 0.01039825938642025, 'denominator': 0.008104539476335049}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6903457641601562, 'dpo_reward_mean_target': 2.6041641235351562, 'standard deviation': 3.0, 'reward_a1': tensor([2.8186], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0705392360687256, 'numerator': 0.13264155387878418, 'denominator': 0.12390162795782089}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -2.4064788818359375, 'dpo_reward_mean_target': -0.5461044311523438, 'standard deviation': 3.0, 'reward_a1': tensor([5.0346], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.8414506912231445, 'numerator': 0.02357017621397972, 'denominator': 0.006135748699307442}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0042476654052734375, 'dpo_reward_mean_target': 0.0042476654052734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0042], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.62548828125, 'dpo_reward_mean_target': -0.16633987426757812, 'standard deviation': 3.0, 'reward_a1': tensor([0.4378], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.043452501296997, 'numerator': 0.13031116127967834, 'denominator': 0.12488460540771484}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.00018310546875, 'dpo_reward_mean_target': 0.49709320068359375, 'standard deviation': 3.0, 'reward_a1': tensor([3.3147], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1846168041229248, 'numerator': 0.08555439859628677, 'denominator': 0.07222115993499756}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.4580078125, 'dpo_reward_mean_target': 1.5399398803710938, 'standard deviation': 3.0, 'reward_a1': tensor([2.1261], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0057258605957031, 'numerator': 0.1304662674665451, 'denominator': 0.12972348928451538}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.045276641845703125, 'dpo_reward_mean_target': -0.045276641845703125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0453], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1650848388671875, 'dpo_reward_mean_target': 1.8907089233398438, 'standard deviation': 3.0, 'reward_a1': tensor([7.8777], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.66854727268219, 'numerator': 0.01815338432788849, 'denominator': 0.010879754088819027}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.020778656005859375, 'dpo_reward_mean_target': -0.020778656005859375, 'standard deviation': 3.0, 'reward_a1': tensor([1.8831], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.10872681438922882, 'denominator': 0.10872681438922882}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1214447021484375, 'dpo_reward_mean_target': 1.154327392578125, 'standard deviation': 3.0, 'reward_a1': tensor([4.9652], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0140814781188965, 'numerator': 0.059345755726099014, 'denominator': 0.058521680533885956}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.9533424377441406, 'dpo_reward_mean_target': 1.2883834838867188, 'standard deviation': 3.0, 'reward_a1': tensor([3.2642], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.1626040935516357, 'numerator': 0.10705391317605972, 'denominator': 0.04950231686234474}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8881454467773438, 'dpo_reward_mean_target': 2.0950393676757812, 'standard deviation': 3.0, 'reward_a1': tensor([3.0952], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0256953239440918, 'numerator': 0.1257917284965515, 'denominator': 0.1226404458284378}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.12207412719726562, 'dpo_reward_mean_target': 0.08808135986328125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3081], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0076212882995605, 'numerator': 0.13262346386909485, 'denominator': 0.1316203474998474}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.85980224609375, 'dpo_reward_mean_target': 1.85980224609375, 'standard deviation': 3.0, 'reward_a1': tensor([1.0577], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12831121683120728, 'denominator': 0.12831121683120728}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.63818359375, 'dpo_reward_mean_target': 4.681480407714844, 'standard deviation': 3.0, 'reward_a1': tensor([3.9593], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0703434944152832, 'numerator': 0.12918253242969513, 'denominator': 0.12069258838891983}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3661079406738281, 'dpo_reward_mean_target': 1.4706039428710938, 'standard deviation': 3.0, 'reward_a1': tensor([8.5327], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0861101150512695, 'numerator': 0.008326636627316475, 'denominator': 0.007666475139558315}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.725341796875, 'dpo_reward_mean_target': 2.9590377807617188, 'standard deviation': 3.0, 'reward_a1': tensor([5.6983], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.5841479301452637, 'numerator': 0.0876488909125328, 'denominator': 0.05532872676849365}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3303813934326172, 'dpo_reward_mean_target': -0.03901100158691406, 'standard deviation': 3.0, 'reward_a1': tensor([0.4295], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0200846195220947, 'numerator': 0.1313687562942505, 'denominator': 0.1287822127342224}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.017452239990234375, 'dpo_reward_mean_target': -0.017452239990234375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1232], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13289809226989746, 'denominator': 0.13289809226989746}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.38057899475097656, 'dpo_reward_mean_target': 6.348388671875, 'standard deviation': 3.0, 'reward_a1': tensor([1.2874], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.2812868654727936, 'numerator': 0.0320485420525074, 'denominator': 0.1139354333281517}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0046653747558594, 'dpo_reward_mean_target': 1.2096633911132812, 'standard deviation': 3.0, 'reward_a1': tensor([1.0007], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0550296306610107, 'numerator': 0.13265863060951233, 'denominator': 0.12573924660682678}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6928939819335938, 'dpo_reward_mean_target': 1.3412017822265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.7109], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0512492656707764, 'numerator': 0.1319749504327774, 'denominator': 0.1255410611629486}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4350471496582031, 'dpo_reward_mean_target': 0.8918724060058594, 'standard deviation': 3.0, 'reward_a1': tensor([1.4625], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1995497941970825, 'numerator': 0.13059692084789276, 'denominator': 0.10887160897254944}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.48615074157714844, 'dpo_reward_mean_target': 0.04963874816894531, 'standard deviation': 3.0, 'reward_a1': tensor([0.6083], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0504390001296997, 'numerator': 0.1306946575641632, 'denominator': 0.12441907823085785}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.036449432373046875, 'dpo_reward_mean_target': 0.036449432373046875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0776], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13296829164028168, 'denominator': 0.13296829164028168}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.9037284851074219, 'dpo_reward_mean_target': -0.7073478698730469, 'standard deviation': 3.0, 'reward_a1': tensor([3.5776], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.9138683080673218, 'numerator': 0.047949280589818954, 'denominator': 0.025053594261407852}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.47547149658203125, 'dpo_reward_mean_target': 0.2998542785644531, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1306], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9963217973709106, 'numerator': 0.13161899149417877, 'denominator': 0.13210490345954895}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5411529541015625, 'dpo_reward_mean_target': 1.9202728271484375, 'standard deviation': 3.0, 'reward_a1': tensor([7.9607], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 7.305322647094727, 'numerator': 0.017516551539301872, 'denominator': 0.0023977793753147125}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0098190307617188, 'dpo_reward_mean_target': 1.8867378234863281, 'standard deviation': 3.0, 'reward_a1': tensor([3.0016], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9856977462768555, 'numerator': 0.12410742789506912, 'denominator': 0.12590819597244263}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5752029418945312, 'dpo_reward_mean_target': 2.0553741455078125, 'standard deviation': 3.0, 'reward_a1': tensor([13.9317], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.9087309837341309, 'numerator': 5.2564839279511943e-05, 'denominator': 2.7539155780686997e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.23806476593017578, 'dpo_reward_mean_target': -0.23806476593017578, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4501], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13264904916286469, 'denominator': 0.13264904916286469}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2674484252929688, 'dpo_reward_mean_target': 0.48780059814453125, 'standard deviation': 3.0, 'reward_a1': tensor([2.6036], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.861120879650116, 'numerator': 0.10369927436113358, 'denominator': 0.12042360007762909}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8211860656738281, 'dpo_reward_mean_target': 2.02349853515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.8171], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0706405639648438, 'numerator': 0.12265188246965408, 'denominator': 0.1145593449473381}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.739471435546875, 'dpo_reward_mean_target': 1.2074508666992188, 'standard deviation': 3.0, 'reward_a1': tensor([2.0914], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0598530769348145, 'numerator': 0.1273316591978073, 'denominator': 0.12014086544513702}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8350105285644531, 'dpo_reward_mean_target': 0.34637451171875, 'standard deviation': 3.0, 'reward_a1': tensor([1.0839], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9735754728317261, 'numerator': 0.12902189791202545, 'denominator': 0.132523775100708}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.053089141845703, 'dpo_reward_mean_target': 2.9730224609375, 'standard deviation': 3.0, 'reward_a1': tensor([3.2393], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9979886412620544, 'numerator': 0.13245774805545807, 'denominator': 0.13272470235824585}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.80853271484375, 'dpo_reward_mean_target': 0.7673110961914062, 'standard deviation': 3.0, 'reward_a1': tensor([0.2225], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0434951782226562, 'numerator': 0.13080628216266632, 'denominator': 0.12535399198532104}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8371200561523438, 'dpo_reward_mean_target': 1.0722694396972656, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1560], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9713980555534363, 'numerator': 0.12229012697935104, 'denominator': 0.125890851020813}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.28476715087890625, 'dpo_reward_mean_target': 0.3435688018798828, 'standard deviation': 3.0, 'reward_a1': tensor([0.7043], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0025525093078613, 'numerator': 0.13202279806137085, 'denominator': 0.13168667256832123}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22857284545898438, 'dpo_reward_mean_target': -0.22857284545898438, 'standard deviation': 3.0, 'reward_a1': tensor([0.1171], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13210082054138184, 'denominator': 0.13210082054138184}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06185150146484375, 'dpo_reward_mean_target': 0.08477210998535156, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2201], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.99925297498703, 'numerator': 0.1322956532239914, 'denominator': 0.13239455223083496}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8080024719238281, 'dpo_reward_mean_target': 0.8591842651367188, 'standard deviation': 3.0, 'reward_a1': tensor([0.0704], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9956693649291992, 'numerator': 0.1284628063440323, 'denominator': 0.129021555185318}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.5258941650390625, 'dpo_reward_mean_target': 2.8822784423828125, 'standard deviation': 3.0, 'reward_a1': tensor([5.7220], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 11.82758617401123, 'numerator': 0.08496234565973282, 'denominator': 0.007183405105024576}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.464111328125, 'dpo_reward_mean_target': 2.7143402099609375, 'standard deviation': 3.0, 'reward_a1': tensor([6.1451], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 12.972041130065918, 'numerator': 0.06915106624364853, 'denominator': 0.0053307777270674706}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6393852233886719, 'dpo_reward_mean_target': 1.6393852233886719, 'standard deviation': 3.0, 'reward_a1': tensor([0.1804], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1181488111615181, 'denominator': 0.1181488111615181}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7381782531738281, 'dpo_reward_mean_target': 1.0207901000976562, 'standard deviation': 3.0, 'reward_a1': tensor([1.9606], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0345314741134644, 'numerator': 0.12661302089691162, 'denominator': 0.12238682061433792}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6244964599609375, 'dpo_reward_mean_target': 1.7545089721679688, 'standard deviation': 3.0, 'reward_a1': tensor([2.9463], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.8766001462936401, 'numerator': 0.12289035320281982, 'denominator': 0.0654856339097023}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.18712615966796875, 'dpo_reward_mean_target': 0.47022247314453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.7855], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.048155665397644, 'numerator': 0.13224850594997406, 'denominator': 0.12617258727550507}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1412792205810547, 'dpo_reward_mean_target': -0.1412792205810547, 'standard deviation': 3.0, 'reward_a1': tensor([0.3433], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13125689327716827, 'denominator': 0.13125689327716827}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.30057525634765625, 'dpo_reward_mean_target': 0.30057525634765625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0990], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326807737350464, 'denominator': 0.1326807737350464}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0439453125, 'dpo_reward_mean_target': 4.682403564453125, 'standard deviation': 3.0, 'reward_a1': tensor([5.1711], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6989787817001343, 'numerator': 0.13122768700122833, 'denominator': 0.07723915576934814}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.078125, 'dpo_reward_mean_target': 2.2657089233398438, 'standard deviation': 3.0, 'reward_a1': tensor([3.4082], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0261013507843018, 'numerator': 0.12367956340312958, 'denominator': 0.12053347378969193}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.03333854675292969, 'dpo_reward_mean_target': 0.03333854675292969, 'standard deviation': 3.0, 'reward_a1': tensor([0.2548], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13261878490447998, 'denominator': 0.13261878490447998}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4200916290283203, 'dpo_reward_mean_target': -0.4200916290283203, 'standard deviation': 3.0, 'reward_a1': tensor([0.5687], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12595078349113464, 'denominator': 0.12595078349113464}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.8005657196044922, 'dpo_reward_mean_target': -0.8005657196044922, 'standard deviation': 3.0, 'reward_a1': tensor([0.7852], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11564324051141739, 'denominator': 0.11564324051141739}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7548370361328125, 'dpo_reward_mean_target': 0.73736572265625, 'standard deviation': 3.0, 'reward_a1': tensor([1.5314], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9984767436981201, 'numerator': 0.12840290367603302, 'denominator': 0.12859879434108734}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6808395385742188, 'dpo_reward_mean_target': 0.9276313781738281, 'standard deviation': 3.0, 'reward_a1': tensor([2.6189], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0510191917419434, 'numerator': 0.1134423241019249, 'denominator': 0.10793554037809372}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8552017211914062, 'dpo_reward_mean_target': 1.1292839050292969, 'standard deviation': 3.0, 'reward_a1': tensor([0.9605], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9990344047546387, 'numerator': 0.13277052342891693, 'denominator': 0.13289885222911835}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.1309261322021484, 'dpo_reward_mean_target': 0.8505706787109375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2235], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.981816828250885, 'numerator': 0.12472493201494217, 'denominator': 0.12703482806682587}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.41443634033203125, 'dpo_reward_mean_target': 0.41443634033203125, 'standard deviation': 3.0, 'reward_a1': tensor([0.3476], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13294772803783417, 'denominator': 0.13294772803783417}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.0404472351074219, 'dpo_reward_mean_target': 0.2112579345703125, 'standard deviation': 3.0, 'reward_a1': tensor([5.5834], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.302961826324463, 'numerator': 0.026758579537272453, 'denominator': 0.011619201861321926}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.366302490234375, 'dpo_reward_mean_target': 6.05694580078125, 'standard deviation': 3.0, 'reward_a1': tensor([9.0149], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 7.1684064865112305, 'numerator': 0.08178745955228806, 'denominator': 0.01140943355858326}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.80511474609375, 'dpo_reward_mean_target': 1.7145004272460938, 'standard deviation': 3.0, 'reward_a1': tensor([6.9359], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9492201805114746, 'numerator': 0.02924138307571411, 'denominator': 0.030805690214037895}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.8632049560546875, 'dpo_reward_mean_target': 1.7953643798828125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4498], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017328143119812, 'numerator': 0.10049965977668762, 'denominator': 0.09878784418106079}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6524696350097656, 'dpo_reward_mean_target': 5.1007080078125, 'standard deviation': 3.0, 'reward_a1': tensor([2.9994], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.6414589881896973, 'numerator': 0.10405264049768448, 'denominator': 0.0633903369307518}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1371726989746094, 'dpo_reward_mean_target': 1.395050048828125, 'standard deviation': 3.0, 'reward_a1': tensor([3.4402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0642759799957275, 'numerator': 0.10540731996297836, 'denominator': 0.09904133528470993}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2570724487304688, 'dpo_reward_mean_target': 1.67633056640625, 'standard deviation': 3.0, 'reward_a1': tensor([2.4411], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.046436071395874, 'numerator': 0.12872959673404694, 'denominator': 0.12301716953516006}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.25638771057128906, 'dpo_reward_mean_target': 0.3355236053466797, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4940], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9655197262763977, 'numerator': 0.12799358367919922, 'denominator': 0.13256444036960602}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4262733459472656, 'dpo_reward_mean_target': -0.07615280151367188, 'standard deviation': 3.0, 'reward_a1': tensor([2.3400], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1060618162155151, 'numerator': 0.09614631533622742, 'denominator': 0.08692670613527298}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.9786605834960938, 'dpo_reward_mean_target': 1.5339431762695312, 'standard deviation': 3.0, 'reward_a1': tensor([1.9279], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9915556907653809, 'numerator': 0.13183896243572235, 'denominator': 0.13296173512935638}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.360565185546875, 'dpo_reward_mean_target': 4.2698211669921875, 'standard deviation': 3.0, 'reward_a1': tensor([14.9149], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.069112539291382, 'numerator': 0.0002453073102515191, 'denominator': 7.992776954779401e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8732833862304688, 'dpo_reward_mean_target': 0.8046417236328125, 'standard deviation': 3.0, 'reward_a1': tensor([3.3077], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9813472032546997, 'numerator': 0.0938897579908371, 'denominator': 0.09567435085773468}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10686111450195312, 'dpo_reward_mean_target': -0.070343017578125, 'standard deviation': 3.0, 'reward_a1': tensor([2.1437], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.009099006652832, 'numerator': 0.10127885639667511, 'denominator': 0.10036563128232956}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5701065063476562, 'dpo_reward_mean_target': 1.5504226684570312, 'standard deviation': 3.0, 'reward_a1': tensor([1.3894], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0003737211227417, 'numerator': 0.13278938829898834, 'denominator': 0.13273978233337402}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7162132263183594, 'dpo_reward_mean_target': -0.21222686767578125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1572], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0173438787460327, 'numerator': 0.13295836746692657, 'denominator': 0.13069166243076324}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.06287193298339844, 'dpo_reward_mean_target': 0.06287193298339844, 'standard deviation': 3.0, 'reward_a1': tensor([0.0716], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298018276691437, 'denominator': 0.13298018276691437}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.4391403198242188, 'dpo_reward_mean_target': 4.132209777832031, 'standard deviation': 3.0, 'reward_a1': tensor([-1.0790], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6875508427619934, 'numerator': 0.02941438928246498, 'denominator': 0.04278140142560005}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5781707763671875, 'dpo_reward_mean_target': 1.7305679321289062, 'standard deviation': 3.0, 'reward_a1': tensor([1.2296], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9928324818611145, 'numerator': 0.13113918900489807, 'denominator': 0.132085919380188}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.63519287109375, 'dpo_reward_mean_target': 4.210441589355469, 'standard deviation': 3.0, 'reward_a1': tensor([3.4612], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9709286093711853, 'numerator': 0.12889796495437622, 'denominator': 0.13275741040706635}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0634613037109375, 'dpo_reward_mean_target': 0.08331489562988281, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0635], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9988037943840027, 'numerator': 0.1328216791152954, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3459129333496094, 'dpo_reward_mean_target': -0.3459129333496094, 'standard deviation': 3.0, 'reward_a1': tensor([3.0726], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.06947514414787292, 'denominator': 0.06947514414787292}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.4208297729492188, 'dpo_reward_mean_target': 0.8987579345703125, 'standard deviation': 3.0, 'reward_a1': tensor([4.1990], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.1565988063812256, 'numerator': 0.07261080294847488, 'denominator': 0.023002861067652702}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -2.564727783203125, 'dpo_reward_mean_target': 2.626800537109375, 'standard deviation': 3.0, 'reward_a1': tensor([12.1845], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1108.282470703125, 'numerator': 0.0008312833961099386, 'denominator': 7.500645438085485e-07}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.010839462280273438, 'dpo_reward_mean_target': 0.3180809020996094, 'standard deviation': 3.0, 'reward_a1': tensor([0.9056], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0278631448745728, 'numerator': 0.13045506179332733, 'denominator': 0.1269187033176422}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6660385131835938, 'dpo_reward_mean_target': 1.3446884155273438, 'standard deviation': 3.0, 'reward_a1': tensor([6.3865], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.5004544258117676, 'numerator': 0.03239443898200989, 'denominator': 0.021589752286672592}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.016374588012695312, 'dpo_reward_mean_target': 0.016374588012695312, 'standard deviation': 3.0, 'reward_a1': tensor([0.0168], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09063339233398438, 'dpo_reward_mean_target': 0.09063339233398438, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0169], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.132895365357399, 'denominator': 0.132895365357399}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.04177093505859375, 'dpo_reward_mean_target': 0.9283599853515625, 'standard deviation': 3.0, 'reward_a1': tensor([8.3403], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.3425424098968506, 'numerator': 0.006284935865551233, 'denominator': 0.002682955004274845}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.26825714111328125, 'dpo_reward_mean_target': 1.4888916015625, 'standard deviation': 3.0, 'reward_a1': tensor([27.6859], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 37.93071365356445, 'numerator': 3.677640040327086e-18, 'denominator': 9.695677996592406e-20}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14953994750976562, 'dpo_reward_mean_target': 0.0980224609375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1649], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9961810111999512, 'numerator': 0.13247118890285492, 'denominator': 0.13297903537750244}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.9569549560546875, 'dpo_reward_mean_target': 0.03224945068359375, 'standard deviation': 3.0, 'reward_a1': tensor([5.6400], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.9556430578231812, 'numerator': 0.023176908493041992, 'denominator': 0.011851297691464424}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7309913635253906, 'dpo_reward_mean_target': 0.399139404296875, 'standard deviation': 3.0, 'reward_a1': tensor([1.8429], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.2869126796722412, 'numerator': 0.11843983829021454, 'denominator': 0.09203408658504486}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0637931823730469, 'dpo_reward_mean_target': 1.0637931823730469, 'standard deviation': 3.0, 'reward_a1': tensor([1.7660], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12938763201236725, 'denominator': 0.12938763201236725}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2885150909423828, 'dpo_reward_mean_target': -0.017133712768554688, 'standard deviation': 3.0, 'reward_a1': tensor([1.2374], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9632753133773804, 'numerator': 0.1218467727303505, 'denominator': 0.12649215757846832}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6061630249023438, 'dpo_reward_mean_target': 1.0306854248046875, 'standard deviation': 3.0, 'reward_a1': tensor([3.4940], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.8164114952087402, 'numerator': 0.09492514282464981, 'denominator': 0.05225971341133118}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09551620483398438, 'dpo_reward_mean_target': -0.09551620483398438, 'standard deviation': 3.0, 'reward_a1': tensor([0.0477], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13282938301563263, 'denominator': 0.13282938301563263}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.083587646484375, 'dpo_reward_mean_target': 10.772857666015625, 'standard deviation': 3.0, 'reward_a1': tensor([31.3279], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 751177433088.0, 'numerator': 8.505818445414537e-12, 'denominator': 1.132331495556874e-23}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.09839820861816406, 'dpo_reward_mean_target': -0.09839820861816406, 'standard deviation': 3.0, 'reward_a1': tensor([0.3451], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1315353661775589, 'denominator': 0.1315353661775589}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.02709197998046875, 'dpo_reward_mean_target': -0.02709197998046875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0271], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.09318733215332031, 'dpo_reward_mean_target': 0.09318733215332031, 'standard deviation': 3.0, 'reward_a1': tensor([-0.9799], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12474019080400467, 'denominator': 0.12474019080400467}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10291481018066406, 'dpo_reward_mean_target': 0.10291481018066406, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1050], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13266192376613617, 'denominator': 0.13266192376613617}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3091907501220703, 'dpo_reward_mean_target': -0.3792896270751953, 'standard deviation': 3.0, 'reward_a1': tensor([4.2495], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9648526310920715, 'numerator': 0.04044253006577492, 'denominator': 0.041915759444236755}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.14229393005371094, 'dpo_reward_mean_target': -0.14229393005371094, 'standard deviation': 3.0, 'reward_a1': tensor([1.6749], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11069132387638092, 'denominator': 0.11069132387638092}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.857177734375, 'dpo_reward_mean_target': 4.957733154296875, 'standard deviation': 3.0, 'reward_a1': tensor([11.9524], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 6.538110733032227, 'numerator': 0.008776996284723282, 'denominator': 0.0013424361823126674}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.057094573974609375, 'dpo_reward_mean_target': 1.7918052673339844, 'standard deviation': 3.0, 'reward_a1': tensor([7.7700], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.7413554191589355, 'numerator': 0.01826016791164875, 'denominator': 0.004880629014223814}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.584991455078125, 'dpo_reward_mean_target': 0.17733001708984375, 'standard deviation': 3.0, 'reward_a1': tensor([4.8273], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.531345248222351, 'numerator': 0.04000423476099968, 'denominator': 0.026123588904738426}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.683624267578125, 'dpo_reward_mean_target': 2.765533447265625, 'standard deviation': 3.0, 'reward_a1': tensor([5.1025], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.021877646446228, 'numerator': 0.09817881882190704, 'denominator': 0.09607688337564468}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.263031005859375, 'dpo_reward_mean_target': 0.2809562683105469, 'standard deviation': 3.0, 'reward_a1': tensor([5.8604], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0111924409866333, 'numerator': 0.02358897402882576, 'denominator': 0.02332787774503231}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.6870040893554688, 'dpo_reward_mean_target': 1.6739501953125, 'standard deviation': 3.0, 'reward_a1': tensor([2.3607], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.138167381286621, 'numerator': 0.12954217195510864, 'denominator': 0.11381645500659943}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.7266006469726562, 'dpo_reward_mean_target': 2.6006393432617188, 'standard deviation': 3.0, 'reward_a1': tensor([2.1402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9977235794067383, 'numerator': 0.13142338395118713, 'denominator': 0.13172324001789093}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0023860931396484375, 'dpo_reward_mean_target': 0.5850429534912109, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0904], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9754399061203003, 'numerator': 0.129652738571167, 'denominator': 0.13291719555854797}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.04788970947265625, 'dpo_reward_mean_target': 2.8964309692382812, 'standard deviation': 3.0, 'reward_a1': tensor([6.9694], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 5.6968207359313965, 'numerator': 0.052909307181835175, 'denominator': 0.009287514723837376}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5128936767578125, 'dpo_reward_mean_target': 0.9918575286865234, 'standard deviation': 3.0, 'reward_a1': tensor([0.0191], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0740094184875488, 'numerator': 0.12617036700248718, 'denominator': 0.11747603118419647}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.038116455078125, 'dpo_reward_mean_target': 2.5787124633789062, 'standard deviation': 3.0, 'reward_a1': tensor([7.7249], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.4886215627193451, 'numerator': 0.030535412952303886, 'denominator': 0.0624929703772068}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.9853439331054688, 'dpo_reward_mean_target': 0.46244049072265625, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4210], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0687780380249023, 'numerator': 0.12733855843544006, 'denominator': 0.11914406716823578}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.18932342529296875, 'dpo_reward_mean_target': 0.07842636108398438, 'standard deviation': 3.0, 'reward_a1': tensor([1.2223], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.038743495941162, 'numerator': 0.12365780770778656, 'denominator': 0.11904557049274445}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2940845489501953, 'dpo_reward_mean_target': -0.26361846923828125, 'standard deviation': 3.0, 'reward_a1': tensor([-1.1658], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9970023036003113, 'numerator': 0.12710203230381012, 'denominator': 0.12748418748378754}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3095111846923828, 'dpo_reward_mean_target': 0.052768707275390625, 'standard deviation': 3.0, 'reward_a1': tensor([2.2033], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.098405361175537, 'numerator': 0.10284889489412308, 'denominator': 0.09363473206758499}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.0045566558837890625, 'dpo_reward_mean_target': 0.47028160095214844, 'standard deviation': 3.0, 'reward_a1': tensor([1.1461], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0481456518173218, 'numerator': 0.12964878976345062, 'denominator': 0.12369348108768463}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.20186614990234375, 'dpo_reward_mean_target': 0.36490440368652344, 'standard deviation': 3.0, 'reward_a1': tensor([1.9236], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1230005025863647, 'numerator': 0.11619043350219727, 'denominator': 0.103464275598526}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.5884246826171875, 'dpo_reward_mean_target': 0.36267852783203125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2112], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017375111579895, 'numerator': 0.13056977093219757, 'denominator': 0.12833985686302185}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3009166717529297, 'dpo_reward_mean_target': 0.24123191833496094, 'standard deviation': 3.0, 'reward_a1': tensor([0.4311], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0281565189361572, 'numerator': 0.1327146589756012, 'denominator': 0.12908020615577698}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.82794189453125, 'dpo_reward_mean_target': 2.542022705078125, 'standard deviation': 3.0, 'reward_a1': tensor([15.5263], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 13.958961486816406, 'numerator': 1.1379375791875646e-05, 'denominator': 8.152021564455936e-07}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.06234550476074219, 'dpo_reward_mean_target': -0.06638526916503906, 'standard deviation': 3.0, 'reward_a1': tensor([3.4236], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9984357357025146, 'numerator': 0.06759650260210037, 'denominator': 0.06770240515470505}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.18056678771972656, 'dpo_reward_mean_target': 0.18056678771972656, 'standard deviation': 3.0, 'reward_a1': tensor([0.6478], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1313774585723877, 'denominator': 0.1313774585723877}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.37686920166015625, 'dpo_reward_mean_target': 1.3882980346679688, 'standard deviation': 3.0, 'reward_a1': tensor([4.1795], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 2.055554151535034, 'numerator': 0.08626042306423187, 'denominator': 0.04196455702185631}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2473602294921875, 'dpo_reward_mean_target': -0.2473602294921875, 'standard deviation': 3.0, 'reward_a1': tensor([0.1156], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13201120495796204, 'denominator': 0.13201120495796204}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.6366539001464844, 'dpo_reward_mean_target': -0.4301738739013672, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2203], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0072098970413208, 'numerator': 0.13265566527843475, 'denominator': 0.13170607388019562}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.4505615234375, 'dpo_reward_mean_target': 1.741455078125, 'standard deviation': 3.0, 'reward_a1': tensor([4.2909], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.5812983512878418, 'numerator': 0.09267718344926834, 'denominator': 0.0586082860827446}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11460113525390625, 'dpo_reward_mean_target': 0.6393470764160156, 'standard deviation': 3.0, 'reward_a1': tensor([0.4750], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.017972707748413, 'numerator': 0.13278140127658844, 'denominator': 0.13043709099292755}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10649681091308594, 'dpo_reward_mean_target': 0.10649681091308594, 'standard deviation': 3.0, 'reward_a1': tensor([0.6433], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13086844980716705, 'denominator': 0.13086844980716705}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.40486907958984375, 'dpo_reward_mean_target': 0.5541954040527344, 'standard deviation': 3.0, 'reward_a1': tensor([2.3148], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0309184789657593, 'numerator': 0.11194407194852829, 'denominator': 0.10858673602342606}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7503852844238281, 'dpo_reward_mean_target': 2.3406295776367188, 'standard deviation': 3.0, 'reward_a1': tensor([3.2117], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3423216342926025, 'numerator': 0.1274920254945755, 'denominator': 0.09497874230146408}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.44139862060546875, 'dpo_reward_mean_target': 0.44139862060546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.6207], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12490199506282806, 'denominator': 0.12490199506282806}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.49114990234375, 'dpo_reward_mean_target': 0.1924896240234375, 'standard deviation': 3.0, 'reward_a1': tensor([13.9006], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.15193098783493042, 'numerator': 3.889712843374582e-06, 'denominator': 2.560183929745108e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5918092727661133, 'dpo_reward_mean_target': -0.5918092727661133, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0601], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13090869784355164, 'denominator': 0.13090869784355164}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2803001403808594, 'dpo_reward_mean_target': 0.2224445343017578, 'standard deviation': 3.0, 'reward_a1': tensor([3.1552], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1946666240692139, 'numerator': 0.08246438950300217, 'denominator': 0.06902711093425751}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7832508087158203, 'dpo_reward_mean_target': -0.5490913391113281, 'standard deviation': 3.0, 'reward_a1': tensor([1.9151], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0694655179977417, 'numerator': 0.09490294009447098, 'denominator': 0.08873866498470306}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2568016052246094, 'dpo_reward_mean_target': -0.2568016052246094, 'standard deviation': 3.0, 'reward_a1': tensor([2.7440], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.08063486963510513, 'denominator': 0.08063486963510513}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10013389587402344, 'dpo_reward_mean_target': 0.10013389587402344, 'standard deviation': 3.0, 'reward_a1': tensor([0.3410], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13255290687084198, 'denominator': 0.13255290687084198}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4848155975341797, 'dpo_reward_mean_target': 1.1270637512207031, 'standard deviation': 3.0, 'reward_a1': tensor([1.8023], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.3037868738174438, 'numerator': 0.12965475022792816, 'denominator': 0.09944473952054977}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0967254638671875, 'dpo_reward_mean_target': 0.3020782470703125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.5569], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.971198320388794, 'numerator': 0.1276405155658722, 'denominator': 0.13142579793930054}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7931327819824219, 'dpo_reward_mean_target': 1.1740570068359375, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2770], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9028626680374146, 'numerator': 0.11829982697963715, 'denominator': 0.131027489900589}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.376007080078125, 'dpo_reward_mean_target': -0.376007080078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.4473], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12806671857833862, 'denominator': 0.12806671857833862}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13027191162109375, 'dpo_reward_mean_target': -0.13027191162109375, 'standard deviation': 3.0, 'reward_a1': tensor([0.0386], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1327703595161438, 'denominator': 0.1327703595161438}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.1557903289794922, 'dpo_reward_mean_target': -0.3914527893066406, 'standard deviation': 3.0, 'reward_a1': tensor([4.5267], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.568534016609192, 'numerator': 0.03468778356909752, 'denominator': 0.02211477980017662}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.568756103515625, 'dpo_reward_mean_target': -0.568756103515625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1302688866853714, 'denominator': 0.1302688866853714}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6962127685546875, 'dpo_reward_mean_target': 2.1417922973632812, 'standard deviation': 3.0, 'reward_a1': tensor([9.5700], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.460516095161438, 'numerator': 0.006201408803462982, 'denominator': 0.004246039316058159}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.1074752807617188, 'dpo_reward_mean_target': 1.468963623046875, 'standard deviation': 3.0, 'reward_a1': tensor([10.4402], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4442524909973145, 'numerator': 0.0015203433576971292, 'denominator': 0.0010526853147894144}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.10878372192382812, 'dpo_reward_mean_target': 0.533172607421875, 'standard deviation': 3.0, 'reward_a1': tensor([1.4295], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0907113552093506, 'numerator': 0.12717562913894653, 'denominator': 0.11659879237413406}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.22254180908203125, 'dpo_reward_mean_target': -0.5919647216796875, 'standard deviation': 3.0, 'reward_a1': tensor([5.8275], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.7742043137550354, 'numerator': 0.013473895378410816, 'denominator': 0.017403539270162582}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.071380615234375, 'dpo_reward_mean_target': 3.468292236328125, 'standard deviation': 3.0, 'reward_a1': tensor([15.1239], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 154.51437377929688, 'numerator': 7.014459697529674e-05, 'denominator': 4.539680844573013e-07}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.2416553497314453, 'dpo_reward_mean_target': 0.3687705993652344, 'standard deviation': 3.0, 'reward_a1': tensor([-0.7095], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9857712388038635, 'numerator': 0.12466318905353546, 'denominator': 0.12646259367465973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3034019470214844, 'dpo_reward_mean_target': -0.3034019470214844, 'standard deviation': 3.0, 'reward_a1': tensor([0.4283], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1290832757949829, 'denominator': 0.1290832757949829}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5906600952148438, 'dpo_reward_mean_target': -0.5906600952148438, 'standard deviation': 3.0, 'reward_a1': tensor([4.4057], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.033225852996110916, 'denominator': 0.033225852996110916}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.08575439453125, 'dpo_reward_mean_target': 1.8025779724121094, 'standard deviation': 3.0, 'reward_a1': tensor([4.5098], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.276556372642517, 'numerator': 0.08850377798080444, 'denominator': 0.06933009624481201}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6424942016601562, 'dpo_reward_mean_target': 1.4688835144042969, 'standard deviation': 3.0, 'reward_a1': tensor([3.6920], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9596288800239563, 'numerator': 0.10105311870574951, 'denominator': 0.10530437529087067}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.01605224609375, 'dpo_reward_mean_target': 3.462432861328125, 'standard deviation': 3.0, 'reward_a1': tensor([18.6409], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 691.370849609375, 'numerator': 3.6739905340255063e-07, 'denominator': 5.314066409134455e-10}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7205696105957031, 'dpo_reward_mean_target': 0.7205696105957031, 'standard deviation': 3.0, 'reward_a1': tensor([0.0526], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.12972469627857208, 'denominator': 0.12972469627857208}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0075531005859375, 'dpo_reward_mean_target': 1.2405624389648438, 'standard deviation': 3.0, 'reward_a1': tensor([0.4379], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9755361676216125, 'numerator': 0.12830525636672974, 'denominator': 0.13152280449867249}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.0560531616210938, 'dpo_reward_mean_target': 2.6396026611328125, 'standard deviation': 3.0, 'reward_a1': tensor([8.0529], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4476062059402466, 'numerator': 0.026107054203748703, 'denominator': 0.018034638836979866}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.9135112762451172, 'dpo_reward_mean_target': 2.63897705078125, 'standard deviation': 3.0, 'reward_a1': tensor([0.2564], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.787141740322113, 'numerator': 0.09701094031333923, 'denominator': 0.12324456125497818}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.7103099822998047, 'dpo_reward_mean_target': 0.7103099822998047, 'standard deviation': 3.0, 'reward_a1': tensor([0.0667], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1299550086259842, 'denominator': 0.1299550086259842}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.15093994140625, 'dpo_reward_mean_target': 0.3704414367675781, 'standard deviation': 3.0, 'reward_a1': tensor([1.1798], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0639503002166748, 'numerator': 0.12822844088077545, 'denominator': 0.12052108347415924}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.1131362915039062, 'dpo_reward_mean_target': -0.917633056640625, 'standard deviation': 3.0, 'reward_a1': tensor([0.0318], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0230071544647217, 'numerator': 0.12648575007915497, 'denominator': 0.12364111840724945}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2579383850097656, 'dpo_reward_mean_target': 0.7892131805419922, 'standard deviation': 3.0, 'reward_a1': tensor([0.6537], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.019450306892395, 'numerator': 0.13284516334533691, 'denominator': 0.13031058013439178}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 3.4922027587890625, 'dpo_reward_mean_target': 4.81689453125, 'standard deviation': 3.0, 'reward_a1': tensor([0.8029], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6105905771255493, 'numerator': 0.05432940274477005, 'denominator': 0.08897844702005386}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.6770782470703125, 'dpo_reward_mean_target': 1.4204788208007812, 'standard deviation': 3.0, 'reward_a1': tensor([1.5464], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0726591348648071, 'numerator': 0.1328636258840561, 'denominator': 0.12386379390954971}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.545318603515625, 'dpo_reward_mean_target': 2.0236663818359375, 'standard deviation': 3.0, 'reward_a1': tensor([2.5591], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0420283079147339, 'numerator': 0.13087968528270721, 'denominator': 0.1256008893251419}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.9315032958984375, 'dpo_reward_mean_target': 0.01282501220703125, 'standard deviation': 3.0, 'reward_a1': tensor([3.0996], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.4526933431625366, 'numerator': 0.0783255323767662, 'denominator': 0.05391745641827583}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.30861473083496094, 'dpo_reward_mean_target': 2.0429859161376953, 'standard deviation': 3.0, 'reward_a1': tensor([-1.4390], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.6041783094406128, 'numerator': 0.0678059533238411, 'denominator': 0.1122283786535263}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.6835403442382812, 'dpo_reward_mean_target': 6.6398162841796875, 'standard deviation': 3.0, 'reward_a1': tensor([7.4483], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 6.109951972961426, 'numerator': 0.12823867797851562, 'denominator': 0.02098849229514599}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.16419219970703125, 'dpo_reward_mean_target': -0.16419219970703125, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1539], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13297998905181885, 'denominator': 0.13297998905181885}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -1.0690078735351562, 'dpo_reward_mean_target': -0.3657493591308594, 'standard deviation': 3.0, 'reward_a1': tensor([0.7158], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1184972524642944, 'numerator': 0.12461405992507935, 'denominator': 0.11141203343868256}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.3432445526123047, 'dpo_reward_mean_target': 1.9300308227539062, 'standard deviation': 3.0, 'reward_a1': tensor([1.4159], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0504618883132935, 'numerator': 0.13104191422462463, 'denominator': 0.12474694103002548}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.8786697387695312, 'dpo_reward_mean_target': 2.1971969604492188, 'standard deviation': 3.0, 'reward_a1': tensor([13.0529], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 5.403113842010498, 'numerator': 0.00019074913871008903, 'denominator': 3.5303557524457574e-05}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.03467369079589844, 'dpo_reward_mean_target': -0.03467369079589844, 'standard deviation': 3.0, 'reward_a1': tensor([0.1812], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.1326369047164917, 'denominator': 0.1326369047164917}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.17769432067871094, 'dpo_reward_mean_target': -0.4458904266357422, 'standard deviation': 3.0, 'reward_a1': tensor([0.2066], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9846695065498352, 'numerator': 0.1298718899488449, 'denominator': 0.13189388811588287}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.1557025909423828, 'dpo_reward_mean_target': -0.1557025909423828, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1557], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2658767700195312, 'dpo_reward_mean_target': 1.2658767700195312, 'standard deviation': 3.0, 'reward_a1': tensor([3.7205], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.09515276551246643, 'denominator': 0.09515276551246643}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.13040542602539062, 'dpo_reward_mean_target': -0.13040542602539062, 'standard deviation': 3.0, 'reward_a1': tensor([-0.1304], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 4.1185455322265625, 'dpo_reward_mean_target': 4.6542510986328125, 'standard deviation': 3.0, 'reward_a1': tensor([23.0982], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 3.045858860015869, 'numerator': 8.24365853357989e-10, 'denominator': 2.7065136243287213e-10}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.0023632049560546875, 'dpo_reward_mean_target': -0.0023632049560546875, 'standard deviation': 3.0, 'reward_a1': tensor([-0.0024], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13298074901103973, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.3834991455078125, 'dpo_reward_mean_target': 2.7294387817382812, 'standard deviation': 3.0, 'reward_a1': tensor([15.2729], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 7.217503070831299, 'numerator': 2.126085018971935e-05, 'denominator': 2.9457348773576086e-06}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.7596969604492188, 'dpo_reward_mean_target': -0.7596969604492188, 'standard deviation': 3.0, 'reward_a1': tensor([-0.2045], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13072267174720764, 'denominator': 0.13072267174720764}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.10426712036132812, 'dpo_reward_mean_target': 1.3546276092529297, 'standard deviation': 3.0, 'reward_a1': tensor([1.3143], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.084647536277771, 'numerator': 0.1329687535762787, 'denominator': 0.12259166687726974}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.34687042236328125, 'dpo_reward_mean_target': 4.139991760253906, 'standard deviation': 3.0, 'reward_a1': tensor([-0.8771], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.26842477917671204, 'numerator': 0.03284461423754692, 'denominator': 0.12236059457063675}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.2647628784179688, 'dpo_reward_mean_target': 1.9266548156738281, 'standard deviation': 3.0, 'reward_a1': tensor([2.4827], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0674068927764893, 'numerator': 0.13071608543395996, 'denominator': 0.12246134132146835}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.2016010284423828, 'dpo_reward_mean_target': 1.6806907653808594, 'standard deviation': 3.0, 'reward_a1': tensor([1.5415], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.182618260383606, 'numerator': 0.13283777236938477, 'denominator': 0.11232514679431915}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.62530517578125, 'dpo_reward_mean_target': 1.3363075256347656, 'standard deviation': 3.0, 'reward_a1': tensor([3.2878], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1999218463897705, 'numerator': 0.10762166231870651, 'denominator': 0.08969055861234665}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.08915138244628906, 'dpo_reward_mean_target': -0.2723121643066406, 'standard deviation': 3.0, 'reward_a1': tensor([0.1545], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9932008385658264, 'numerator': 0.1316417157649994, 'denominator': 0.13254289329051971}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.0848541259765625, 'dpo_reward_mean_target': 1.4436492919921875, 'standard deviation': 3.0, 'reward_a1': tensor([5.2662], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1729724407196045, 'numerator': 0.05905347689986229, 'denominator': 0.050345152616500854}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.4972209930419922, 'dpo_reward_mean_target': -0.06068992614746094, 'standard deviation': 3.0, 'reward_a1': tensor([-0.4972], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9894692897796631, 'numerator': 0.13158036768436432, 'denominator': 0.13298074901103973}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5273895263671875, 'dpo_reward_mean_target': 0.5816650390625, 'standard deviation': 3.0, 'reward_a1': tensor([4.6753], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.7732034921646118, 'numerator': 0.05241549015045166, 'denominator': 0.029559772461652756}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.6182785034179688, 'dpo_reward_mean_target': 2.6878662109375, 'standard deviation': 3.0, 'reward_a1': tensor([4.1287], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0114750862121582, 'numerator': 0.11849488317966461, 'denominator': 0.11715057492256165}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.3706817626953125, 'dpo_reward_mean_target': 0.04357147216796875, 'standard deviation': 3.0, 'reward_a1': tensor([0.0039], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0077379941940308, 'numerator': 0.13296914100646973, 'denominator': 0.13194812834262848}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5288028717041016, 'dpo_reward_mean_target': -0.5288028717041016, 'standard deviation': 3.0, 'reward_a1': tensor([1.0200], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.11638917773962021, 'denominator': 0.11638917773962021}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 2.4024658203125, 'dpo_reward_mean_target': 1.5089111328125, 'standard deviation': 3.0, 'reward_a1': tensor([11.3673], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.3928128778934479, 'numerator': 0.0006010181969031692, 'denominator': 0.0015300369122996926}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 1.5738067626953125, 'dpo_reward_mean_target': 2.1391143798828125, 'standard deviation': 3.0, 'reward_a1': tensor([12.4965], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.9509620666503906, 'numerator': 0.00034315729863010347, 'denominator': 0.00017589132767170668}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.5824661254882812, 'dpo_reward_mean_target': 2.8287200927734375, 'standard deviation': 3.0, 'reward_a1': tensor([0.8465], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 0.9004606008529663, 'numerator': 0.10690225660800934, 'denominator': 0.11871952563524246}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': 0.850799560546875, 'dpo_reward_mean_target': 2.2128677368164062, 'standard deviation': 3.0, 'reward_a1': tensor([2.6472], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.1838871240615845, 'numerator': 0.13159427046775818, 'denominator': 0.11115440726280212}


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'dpo_reward_mean_ref': -0.11063957214355469, 'dpo_reward_mean_target': -0.11063957214355469, 'standard deviation': 3.0, 'reward_a1': tensor([0.4306], device='cuda:0', grad_fn=<SubBackward0>)}
{'ratio': 1.0, 'numerator': 0.13083387911319733, 'denominator': 0.13083387911319733}


KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x731388154150>> (for post_run_cell), with arguments args (<ExecutionResult object at 731564f06d90, execution_count=2 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 731564f07690, raw_cell="seed = 1234
ref_policy_model, ref_policy_tokenizer.." transformed_cell="seed = 1234
ref_policy_model, ref_policy_tokenizer.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22413130305f72756e706f64227d/workspace/Self_play_DRPO/self_play_drpo_code/drpo_normal_dist.ipynb#W3sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
# Preview at most 1000 rows of `drpo_train`. The range is capped at the dataset
# length because `select` raises IndexError for out-of-range indices, which a
# hard-coded range(1000) would trigger on a smaller (e.g. debug-sized) dataset.
drpo_train.select(range(min(1000, len(drpo_train))))

Dataset({
    features: ['prompt', 'a1', 'a2', 'rank'],
    num_rows: 1000
})