In [1]:
import gc
import os
from getpass import getpass

import torch
from tqdm import tqdm

from datasets import load_dataset, Dataset

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

from trl import HfPairwiseJudge, OpenAIPairwiseJudge

# Environment variables
# -------------------------------------------------------------------------------------------------

# SECURITY: credentials must never be committed to a notebook. The API key that used to be
# hard-coded here has to be treated as leaked and revoked in the OpenAI dashboard.
# The key is taken from the environment; when it is absent the user is prompted for it
# (the input is not echoed and is never stored in the notebook file).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

### Devices

In [2]:
# GPU selection
# -------------------------------------------------------------------------------------------------
# Value exported as CUDA_VISIBLE_DEVICES below
VISIBLE_DEVICES = "3"
# -------------------------------------------------------------------------------------------------

os.environ.update(
    {
        # Order GPUs by PCI bus ID so the index above refers to a stable physical device
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        # Restrict the process to the selected device(s)
        "CUDA_VISIBLE_DEVICES": str(VISIBLE_DEVICES),
    }
)

### Models and dataset

In [3]:
# Base model
# -------------------------------------------------------------------------------------------------
BASE_MODEL_PATH = "mistralai/Mistral-7B-Instruct-v0.2"
# -------------------------------------------------------------------------------------------------

# Fine-tuned model
# -------------------------------------------------------------------------------------------------
FT_MODEL_PATH = "RLHF-And-Friends/Mistral-7B-LoRA-MeanRef-0.8-U13-0-1"
# -------------------------------------------------------------------------------------------------

# Dataset
# -------------------------------------------------------------------------------------------------
DATASET_PATH = "trl-internal-testing/sentiment-trl-style"
DATASET_SPLIT = "test"
PROMPT_FIELD = "prompt"
# -------------------------------------------------------------------------------------------------

# Repository name without the owner prefix. Taking the last path component (instead of
# index 1) also works for dataset ids that have no "owner/" namespace.
DATASET_NAME = DATASET_PATH.rsplit('/', 1)[-1]


### System prompt for evaluation

In [4]:
# Prompt template given to the pairwise judge.
# Placeholders: {prompt} (the instruction), {response0} and {response1} (the two candidate
# outputs, labelled with model identifiers "0" and "1").
# The doubled braces ({{ and }}) are presumably escapes for str.format-style substitution, so
# they would reach the judge as literal braces — TODO confirm against the judge implementation.
# The judge is instructed to reply with a single identifier character and nothing else.
SYSTEM_PROMPT = '''I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.

## Instruction

{{
    "instruction": """{prompt}""",
}}

## Model Outputs

Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.

{{
    {{
        "model_identifier": "0",
        "output": """{response0}"""
    }},
    {{
        "model_identifier": "1",
        "output": """{response1}"""
    }}
}}

## Task

Evaluate the models on the basis of the quality and relevance of their results, and select the model that generated the best result. Reply with the identifier of the best model. Our evaluation will only take into account the first character of your answer, so make sure it contains only one of the identifiers and nothing else (no quotation marks, no spaces, no new lines, ...).
'''

### Load dataset

In [5]:
# Evaluation prompts: the configured split of the configured dataset
test_dataset = load_dataset(
    path=DATASET_PATH,
    split=DATASET_SPLIT,
)

### Inference

In [6]:
def get_responses(
    prompts: list[str], 
    model_path: str, 
    batch_size: int = 8,
    max_new_tokens: int = 512
) -> list[str]:
    """Generate one chat completion per prompt with the model at `model_path`.

    Every prompt is wrapped as a single-turn user message and sent through a
    `text-generation` pipeline in slices of `batch_size` chats.

    Args:
        prompts: User messages to answer.
        model_path: Identifier or path handed to `from_pretrained` for both the
            model and the tokenizer.
        batch_size: Number of chats passed to the pipeline per call (also used
            as the pipeline's own batch size).
        max_new_tokens: Generation budget forwarded to the pipeline.

    Returns:
        The content of the last message of each generated chat, in the same
        order as `prompts`. An empty `prompts` list yields an empty list.
    """
    # `device_map` is given at load time: a pipeline that receives an already
    # instantiated model does not re-dispatch it, so passing `device_map` to
    # `pipeline(...)` leaves the placement to library defaults.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    # Decoder-only models continue from the last token of each row, so batched
    # generation needs the padding on the left regardless of the saved config.
    tokenizer.padding_side = "left"

    text_generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        batch_size=batch_size,
        max_new_tokens=max_new_tokens,
    )

    chats = [[{'role': "user", 'content': prompt}] for prompt in prompts]
    generations = []
    try:
        for start in tqdm(
            range(0, len(chats), batch_size), desc=f'{model_path} inference'
        ):
            generations.extend(text_generator(chats[start:start + batch_size]))
    finally:
        # This function is called once per model; drop the weights before the
        # next model is loaded so two models never have to fit at the same time.
        del text_generator, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Each pipeline result is a list with one candidate; its `generated_text`
    # is the full chat, whose last message is the model's reply.
    return [
        generation[0]['generated_text'][-1]['content'] for generation in generations
    ]


In [None]:
# The same prompts are answered by both models so the completions can be compared pairwise
prompts = [prompt for prompt in test_dataset[PROMPT_FIELD]]

base_completions = get_responses(
    prompts, model_path=BASE_MODEL_PATH, batch_size=32
)
ft_completions = get_responses(
    prompts, model_path=FT_MODEL_PATH, batch_size=32
)

### Create a dataset with the models' responses and push it to the Hugging Face Hub

In [None]:
# Column names are part of the published schema and are read back by these exact keys
# (note the plural 'ft_completions' next to the singular 'base_completion').
completion_columns = {
    'prompt': prompts,
    'base_completion': base_completions,
    'ft_completions': ft_completions,
}
responses_dataset = Dataset.from_dict(completion_columns, split="test")

# Publish the completions so the judging step can be run without repeating inference
completions_repo_id = f"RLHF-And-Friends/{DATASET_NAME}-completions"
responses_dataset.push_to_hub(completions_repo_id)

### Load dataset with responses and prepare to judge

In [6]:
# Upper bound on the number of prompt/completion rows sent to the judge
NUM_SAMPLES_TO_CHOOSE = 100

# Request the judged split directly instead of loading every split and indexing into it
full_responses_dataset = load_dataset(
    f"RLHF-And-Friends/{DATASET_NAME}-completions", split="test"
)

# Keep the first rows only; the count is clamped to the dataset size because
# `select` fails on indices past the end when fewer rows are available.
responses_dataset = full_responses_dataset.select(
    range(min(NUM_SAMPLES_TO_CHOOSE, len(full_responses_dataset)))
)

In [7]:
prompts = [prompt for prompt in responses_dataset['prompt']]

# One [base, fine-tuned] pair per prompt: index 0 is the base model's completion,
# index 1 is the fine-tuned model's completion.
completions = list(
    map(
        list,
        zip(responses_dataset['base_completion'], responses_dataset['ft_completions']),
    )
)

### Judge with OpenAI API

In [None]:
# Pairwise judge backed by the OpenAI API, using the custom prompt template
gpt_judge = OpenAIPairwiseJudge(model="gpt-4o-mini", system_prompt=SYSTEM_PROMPT)

# NOTE(review): with shuffle_order=False the base completion is always presented as
# model "0" and the fine-tuned one as model "1", so any position preference of the
# judge goes into the result — confirm this is intended.
gpt_judgements = gpt_judge.judge(
    prompts=prompts,
    completions=completions,
    shuffle_order=False,
)

In [9]:
def compute_winrate(judgements: list[int]) -> float:
    """Return the share of valid judgements that picked completion index 1.

    Args:
        judgements: One entry per prompt; 0 or 1 is the index of the preferred
            completion. Any other value (the judge reports -1 when it could not
            produce a verdict) is excluded instead of being summed into the rate.

    Returns:
        Win rate in [0, 1], or NaN when there is no valid judgement at all.
    """
    valid = [judgement for judgement in judgements if judgement in (0, 1)]
    if not valid:
        return float("nan")
    return sum(valid) / len(valid)


gpt_winrate = compute_winrate(gpt_judgements)

In [None]:
# Report the win rate computed from the GPT judgements
print(f"GPT-judged winrate: {gpt_winrate}")