In [None]:
import os
from tqdm import tqdm

import torch

from datasets import load_dataset, Dataset

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

from trl import OpenAIPairwiseJudge

# Environment variables
# -------------------------------------------------------------------------------------------------

# The OpenAI key is taken from the process environment and is never written into the notebook:
# export OPENAI_API_KEY before starting the kernel. Assigning an empty string here would
# overwrite a key that is already exported, so the variable is only checked, not set.
if not os.environ.get("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; the OpenAI judge cells will fail.")

### Devices

In [2]:
# Visible devices
# -------------------------------------------------------------------------------------------------
VISIBLE_DEVICES = "4"
# -------------------------------------------------------------------------------------------------

# Number GPUs by PCI bus id and restrict CUDA to the device(s) selected above.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": str(VISIBLE_DEVICES),
    }
)

### Models and dataset

In [3]:
# Base model
# -------------------------------------------------------------------------------------------------
BASE_MODEL_PATH = "mistral-community/Mistral-7B-v0.2"
# -------------------------------------------------------------------------------------------------

# Fine-tuned model
# -------------------------------------------------------------------------------------------------
FT_MODEL_PATH = "RLHF-And-Friends/TLDR-Mistral-7B-SFT-PPO"
# -------------------------------------------------------------------------------------------------
# Last path component, i.e. the repository name without its namespace. `rsplit` also copes
# with ids that carry no namespace prefix, where indexing `split('/')[1]` would raise.
FT_MODEL_NAME = FT_MODEL_PATH.rsplit('/', 1)[-1]

# Dataset
# -------------------------------------------------------------------------------------------------
DATASET_PATH = "trl-lib/tldr"
DATASET_SPLIT = "test"
PROMPT_FIELD = "prompt"
SIZE = 100
# -------------------------------------------------------------------------------------------------

# Dataset repository name without its namespace (same rule as FT_MODEL_NAME).
DATASET_NAME = DATASET_PATH.rsplit('/', 1)[-1]


### System prompt for evaluation

In [4]:
# Judge prompt template; it is handed to OpenAIPairwiseJudge as `system_prompt` further down.
# `{prompt}`, `{response0}` and `{response1}` are the placeholders. The doubled braces
# (`{{` / `}}`) read as literal braces under `str.format`, which suggests the judge fills the
# template that way — TODO confirm against the TRL judge implementation.
SYSTEM_PROMPT = '''I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.

## Instruction

{{
    "instruction": """{prompt}""",
}}

## Model Outputs

Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.

{{
    {{
        "model_identifier": "0",
        "output": """{response0}"""
    }},
    {{
        "model_identifier": "1",
        "output": """{response1}"""
    }}
}}

## Task

Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
'''

### Load dataset

In [5]:
# Evaluation data: the first SIZE rows of the configured split.
test_dataset = load_dataset(
    path=DATASET_PATH,
    split=DATASET_SPLIT,
).select(indices=range(SIZE))

### Inference

In [6]:
def get_responses(
    prompts: list[str],
    model_path: str,
    batch_size: int = 8,
    max_new_tokens: int = 512,
    is_chat_model: bool = False,
) -> list[str]:
    """Generate one response per prompt with the causal LM stored at `model_path`.

    Args:
        prompts: Input texts; one generation is produced for each of them.
        model_path: Hub id or local path passed to `from_pretrained` for both
            the model and its tokenizer.
        batch_size: Number of prompts handed to the pipeline per call; it is
            also used as the pipeline's own batch size.
        max_new_tokens: Generation budget per prompt.
        is_chat_model: When True, every prompt is wrapped into a single-turn
            user message and the content of the last returned message is
            taken as the response.

    Returns:
        The generated texts, in the order of `prompts`. For non-chat models
        this is the pipeline's `generated_text` unchanged; `return_full_text`
        is left at the pipeline default, under which that text begins with
        the prompt itself.
    """
    # Nothing to generate: do not load a multi-gigabyte checkpoint for an empty request.
    if not prompts:
        return []

    # `device_map` must be applied while the weights are loaded; passing it to `pipeline`
    # alongside an already instantiated model does not re-dispatch that model.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map='auto',
    )
    # Decoder-only models continue from the last position of every row, so a batch has to be
    # padded on the left; right padding would make shorter prompts continue from pad tokens.
    tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left')
    # Fall back to EOS only when the tokenizer ships without a dedicated pad token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    text_generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        max_new_tokens=max_new_tokens,
    )

    if is_chat_model:
        inputs = [[{'role': "user", 'content': prompt}] for prompt in prompts]
    else:
        inputs = prompts

    # Feed the pipeline in explicit chunks so the progress bar advances once per batch.
    responses = []
    for idx in tqdm(
        range(0, len(inputs), batch_size), desc=f'{model_path} inference'
    ):
        batch = inputs[idx:idx+batch_size]
        responses.extend(text_generator(batch))

    if is_chat_model:
        # For chat inputs `generated_text` is the message list; the last message is the reply.
        text_responses = [
            response[0]['generated_text'][-1]['content'] for response in responses
        ]
    else:
        text_responses = [
            response[0]['generated_text'] for response in responses
        ]

    return text_responses

In [None]:
# Materialise the prompt column as a plain Python list.
prompts = [*test_dataset[PROMPT_FIELD]]

# Same prompts, same batch size, two models: base first, then the fine-tuned one.
base_completions = get_responses(
    prompts=prompts, model_path=BASE_MODEL_PATH, batch_size=32
)
ft_completions = get_responses(
    prompts=prompts, model_path=FT_MODEL_PATH, batch_size=32
)

### Create a dataset with both models' responses and upload it to the Hugging Face Hub

In [None]:
# One row per prompt, holding the generations of both models, stored as a "test" split.
responses_dataset = Dataset.from_dict(
    mapping={
        "prompt": prompts,
        "base_completion": base_completions,
        "ft_completions": ft_completions,
    },
    split="test",
)

# Publish under a repository name derived from the dataset and fine-tuned model names.
responses_dataset.push_to_hub(
    repo_id=f"RLHF-And-Friends/{DATASET_NAME}-{FT_MODEL_NAME}-completions"
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: c432e44e-8bf1-493a-bb6e-2871a3c57acd)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/d8/89/d8899635e6b0ecf790d84186b3087dbf6191476f20c277d9bb2fc74dcd9a945d/33a3bb6aa3a02b2fdef6e3b11301c800ee2f56ab2e9bd1cb271d8f3a333800cb?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20250223%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250223T141040Z&X-Amz-Expires=900&X-Amz-Signature=92518831139a90183c8a356b6095ace185fa3170011cac66dcdfd513dc29a5a4&X-Amz-SignedHeaders=host&x-amz-storage-class=INTELLIGENT_TIERING&x-id=PutObject
Retrying in 1s [Retry 1/5].


CommitInfo(commit_url='https://huggingface.co/datasets/RLHF-And-Friends/tldr-TLDR-Mistral-7B-Base-PPO-completions/commit/8299326b43a35715881eeed544eaa0d0ff21848a', commit_message='Upload dataset', commit_description='', oid='8299326b43a35715881eeed544eaa0d0ff21848a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/RLHF-And-Friends/tldr-TLDR-Mistral-7B-Base-PPO-completions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='RLHF-And-Friends/tldr-TLDR-Mistral-7B-Base-PPO-completions'), pr_revision=None, pr_num=None)

### Load dataset with responses and prepare to judge

In [None]:
# Number of stored prompt/response rows that are sent to the judge.
NUM_SAMPLES_TO_CHOOSE = 100

# Fetch the published responses and keep the first NUM_SAMPLES_TO_CHOOSE rows of "test".
responses_dataset = load_dataset(
    path=f"RLHF-And-Friends/{DATASET_NAME}-{FT_MODEL_NAME}-completions"
)["test"].select(indices=range(NUM_SAMPLES_TO_CHOOSE))

NameError: name 'load_dataset' is not defined

In [7]:
prompts = [*responses_dataset['prompt']]

# One [base, fine-tuned] pair per prompt: index 0 is the base model, index 1 the fine-tuned one.
completions = [
    list(pair)
    for pair in zip(
        responses_dataset['base_completion'],
        responses_dataset['ft_completions'],
    )
]

### Judge with OpenAI API

In [None]:
# Pairwise judge backed by gpt-4o-mini, using the custom evaluation prompt.
gpt_judge = OpenAIPairwiseJudge(model="gpt-4o-mini", system_prompt=SYSTEM_PROMPT)

# One verdict per prompt; the completion pairs are passed in their given order (no shuffling).
gpt_judgements = gpt_judge.judge(
    prompts=prompts,
    completions=completions,
    shuffle_order=False,
)

In [9]:
# The judge reports comparisons it could not score (failed or malformed API reply) as -1.
# Only clean 0/1 verdicts may enter the average; a summed -1 would silently lower the result.
valid_judgements = [verdict for verdict in gpt_judgements if verdict in (0, 1)]

# Share of scored comparisons in which completion 1 was preferred; NaN when none was scored,
# instead of a ZeroDivisionError.
gpt_winrate = (
    sum(valid_judgements) / len(valid_judgements)
    if valid_judgements
    else float("nan")
)

In [None]:
# Report the final win rate computed from the GPT verdicts.
print("GPT-judged winrate: {}".format(gpt_winrate))