## Building a BATTLE Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a pandas DataFrame

In [386]:
# Seed conversation: a single user turn with empty content. It is used as both
# the first candidate message and the first "best" message.
initial_message = [{"role": "user", "content": ""}]
# How the candidate list for the next round is chosen after a round of battles:
#   "dense"    - rebuild the candidates from the responses recorded in `dataset`
#   "only-new" - use only the responses produced during the round that just ran
battles_generation = "dense"      # methods: dense, only-new
# Role setting consulted when candidate messages are generated.
roles = "system-user"             # methods: system-user, system, user

# Optional setup, left commented out: fetch Git LFS objects for the evals checkout.
#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [387]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
#client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [388]:
# Paths. Assuming this notebook is in examples/

# Root of the evals package, one level above the notebook's working directory.
evals_path = os.path.join(os.getcwd(), "..", "evals")

# Registry entry that describes the eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file consumed by the eval; make sure its directory exists.
data_path = os.path.join(evals_path, "registry", "data", "battles", "samples.jsonl")
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Record file written by the eval run; make sure its directory exists.
json_logs_path = os.path.join(os.getcwd(), "logs", "logs")
os.makedirs(os.path.dirname(json_logs_path), exist_ok=True)

# Directory where the notebook stores its own JSON-lines dumps.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)

In [389]:
# Registry yaml
#
# Two entries are written to the registry file: the base eval name "battles",
# which points at the concrete version id, and that version's class and args.

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

with open(registry_path, "w") as f:
    yaml.dump(registry_yaml, f)

In [390]:
# Data structure

# One record per battle, as returned by battle().
dataset = []
# Messages to battle in the current round; starts with just the seed message.
candidate_messages = [initial_message]
# Candidates collected for the following round (filled in "only-new" mode).
next_candidate_messages = []
# Responses of the candidates that won their battle.
best_responses = []
# Winning messages so far; the last entry is the one new candidates battle against.
best_messages = [initial_message]

In [391]:
def battle(best_message, candidate_message):

    dataset = [{"input1": candidate_message, "input2": best_message}]

    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    df.to_json(os.path.join(df_path, "events"), lines=True, orient="records")

    instruction1 = candidate_message[0]["content"]
    instruction2 = best_message[0]["content"]

    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response1.replace("\\'", "'").replace("\\n", "\n")

    sampled = events_df["data"].iloc[-2]["sampled"][0]

    choice = events_df["data"].iloc[-1]["choice"]

    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': df.to_dict()}
    return data

In [392]:
def switch_system_and_user(message):
    """Return a copy of a one-turn message with its role flipped.

    A "system" turn becomes a "user" turn; any other role becomes "system".
    Only the first element of ``message`` is considered, and its content is
    carried over unchanged.

    Args:
        message: Chat message list whose first element is a dict with "role"
            and "content" keys.

    Returns:
        A new message in the same shape as the input: a list holding a single
        ``{"role": ..., "content": ...}`` dict, so it can be used directly as
        a candidate message.
    """
    first_turn = message[0]
    flipped_role = "user" if first_turn["role"] == "system" else "system"
    # Return a flat message list; wrapping it in another list would make it
    # unusable wherever a message is indexed as message[0]["role"].
    return [{"role": flipped_role, "content": first_turn["content"]}]

In [393]:
def list_candidate_messages(dataset, role_mode=None):
    """Build candidate messages from every distinct response recorded so far.

    The empty prompt always comes first, followed by each distinct
    ``Response1`` value in order of first appearance. Each content is emitted
    as a user message, a system message, or both, depending on the role mode.

    Args:
        dataset: List of battle records (dicts with a "Response1" key). May be
            empty, in which case only the empty prompt is produced.
        role_mode: "system-user", "system" or "user". Defaults to the
            notebook-level ``roles`` setting when omitted.

    Returns:
        List of messages, each a one-element list holding a
        ``{"role": ..., "content": ...}`` dict.
    """
    if role_mode is None:
        role_mode = roles  # fall back to the notebook-level setting

    # list.insert() mutates in place and returns None, so the list is built
    # explicitly instead of chaining the call.
    contents = [""]
    if dataset:
        distinct_responses = pd.DataFrame(dataset)["Response1"].unique().tolist()
        contents.extend(text for text in distinct_responses if text != "")

    messages = []
    for content in contents:
        # Membership tests: `x == "a" or "b"` is always truthy.
        if role_mode in ("system-user", "user"):
            messages.append([{"role": "user", "content": content}])
        if role_mode in ("system-user", "system"):
            messages.append([{"role": "system", "content": content}])
    return messages

In [None]:
# Run one round: every candidate battles the most recent best message.
for candidate_message in candidate_messages:
    data = battle(best_messages[-1], candidate_message)
    # The candidate is side 1 of the battle, so "Yes" means its response was
    # judged better than the current best one.
    if data["Choice"] == "Yes":
        best_messages.append(candidate_message)
        best_responses.append(data["Response1"]) 
    dataset.append(data)
    # Persist all records after every battle so partial progress is kept on disk.
    pd.DataFrame(dataset).to_json(os.path.join(df_path, "dataset"), lines=True, orient="records")

    if battles_generation == "only-new":
        # Queue the candidate's response as a new candidate with the same role,
        # skipping messages that are already queued.
        new_message = [{"role": candidate_message[0]["role"], "content":data["Response1"]}]
        if new_message not in next_candidate_messages:
            next_candidate_messages.append(new_message)
            if roles == "system-user":
                # Also queue the role-switched variant of the new message.
                next_candidate_messages.append(switch_system_and_user(new_message))

# Choose the candidates for the next round according to the generation mode.
if battles_generation == "only-new":
    candidate_messages = next_candidate_messages
    next_candidate_messages = []
if battles_generation == "dense":
    candidate_messages = list_candidate_messages(dataset)

[2024-08-07 00:28:16,760] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-07 00:28:17,623] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-07 00:28:17,625] [oaieval.py:215] [1;35mRun started: 240806222817YUGAB6S5[0m
[2024-08-07 00:28:17,627] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-07 00:28:17,654] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-07 00:28:17,654] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-07 00:28:17,654] [eval.py:36] Evaluating 1 samples
[2024-08-07 00:28:17,675] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.57s/it]
[2024-08-07 00:28:21,253] [oaieval.py:275] Found 3/3 sampling events with usage data
[2024-08-07 00:28:21,253] [oaieval.py:283] 

In [395]:
dataset[-1]

IndexError: list index out of range

In [396]:
candidate_messages

[[{'role': 'user', 'content': ''}]]

In [None]:
#completion = client.chat.completions.create(
    #messages = [{"role":"system", "content":"Hello! How can I assist you today?"}],
    #model = "gpt-3.5-turbo",
    #temperature = 0,
    #seed = 20220722
#)

In [None]:
dataset

[{'Instruction1': '',
  'Response1': 'Hello! How can I assist you today?',
  'Instruction2': '',
  'Response2': 'Hello! How can I assist you today?',
  'Sampled': '1. Both responses are identical in content and structure.\n2. There is no distinguishable difference between the two responses.\n3. Both responses effectively address the instruction given.\n\nNo\n\nNo',
  'Choice': 'No',
  'Data': {'spec': {0: {'completion_fns': ['gpt-3.5-turbo'],
     'eval_name': 'battles.test.v1',
     'base_eval': 'battles',
     'split': 'test',
     'run_config': {'completion_fns': ['gpt-3.5-turbo'],
      'eval_spec': {'cls': 'evals.elsuite.modelgraded.classify:ModelBasedClassify',
       'registry_path': '/Users/janvotava/Desktop/evals/evals/registry',
       'args': {'eval_type': 'cot_classify',
        'modelgraded_spec': 'battle',
        'samples_jsonl': 'battles/samples.jsonl'},
       'key': 'battles.test.v1',
       'group': 'battles'},
      'seed': 20220722,
      'max_samples': None,
     