## Building a BATTLE Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a Pandas DataFrame

In [169]:
# Seed conversation: a single user turn whose content is the empty string.
initial_message = [dict(role="user", content="")]
# Optional git-lfs fetch of the evals checkout (left disabled):
#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [170]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
#client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [171]:
# Paths. Assuming this notebook is in examples/
# Every location is derived from the current working directory.

working_dir = os.getcwd()
evals_path = os.path.join(working_dir, "..", "evals")

# Registry entry (yaml) describing the eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file consumed by the eval; its parent directory is created up front.
data_dir = os.path.join(evals_path, "registry", "data", "battles")
os.makedirs(data_dir, exist_ok=True)
data_path = os.path.join(data_dir, "samples.jsonl")

# Record log location: the file "logs" inside the directory "logs".
logs_dir = os.path.join(working_dir, "logs")
os.makedirs(logs_dir, exist_ok=True)
json_logs_path = os.path.join(logs_dir, "logs")

# Directory receiving the JSON-lines dumps of the parsed event log.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)

In [172]:
# Registry yaml
# Two entries: the "battles" alias pointing at the versioned eval id, and the
# versioned eval itself with its class and arguments.

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

# Overwrites the registry file at registry_path with the entries above.
with open(registry_path, "w") as registry_file:
    yaml.dump(registry_yaml, registry_file)

In [173]:
# Data structure
# Mutable state filled in by the battle loop; everything starts from the seed message.

dataset = []                              # one result dict per battle
best_messages = [initial_message]         # winners so far, latest last
best_responses = []                       # responses of candidates that won
candidate_messages = [initial_message]    # messages battled in the current round
next_candidate_messages = []              # messages collected for a following round

In [174]:
def battle(best_message, candidate_message):

    dataset = [{"input1": candidate_message, "input2": best_message}]

    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    df.to_json(os.path.join(df_path, "events"), lines=True, orient="records")

    instruction1 = candidate_message[0]["content"]
    instruction2 = best_message[0]["content"]

    battle_prompt_content = events_df["data"].iloc[-2]["prompt"][0]["content"]
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response1.replace("\\'", "'").replace("\\n", "\n")

    sampled = events_df["data"].iloc[-2]["sampled"][0]

    choice = events_df["data"].iloc[-1]["choice"]

    df.to_json(os.path.join(df_path, "data_full"), lines=True, orient="records")
    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice}
    return data

In [175]:
def switch_system_and_user(message):
    """Return a one-element message list with the first message's role flipped.

    A first message with role "system" becomes "user"; any other role becomes
    "system". The content is carried over unchanged, the input is not
    modified, and only the first message is considered.
    """
    first = message[0]
    flipped_role = "user" if first["role"] == "system" else "system"
    return [{"role": flipped_role, "content": first["content"]}]

In [176]:
# Battle every candidate of this round against the most recent best message.
for candidate_message in candidate_messages:
    data = battle(best_messages[-1], candidate_message)

    # A "Yes" choice promotes the candidate (side 1 of the battle) to best.
    if data["Choice"] == "Yes":
        best_messages.append(candidate_message)
        best_responses.append(data["Response1"])
    dataset.append(data)

    # The candidate's response becomes a new message with the candidate's role;
    # it is queued once, together with its role-swapped variant.
    new_message = [dict(role=candidate_message[0]["role"], content=data["Response1"])]
    if new_message in next_candidate_messages:
        continue
    next_candidate_messages.append(new_message)
    next_candidate_messages.append(switch_system_and_user(new_message))

[2024-08-06 09:09:38,060] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-06 09:09:39,791] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-06 09:09:39,798] [oaieval.py:215] [1;35mRun started: 240806070939J34H77MS[0m
[2024-08-06 09:09:39,802] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-06 09:09:39,849] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-06 09:09:39,849] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-06 09:09:39,850] [eval.py:36] Evaluating 1 samples
[2024-08-06 09:09:39,867] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.11s/it]
[2024-08-06 09:09:42,984] [oaieval.py:275] Found 3/3 sampling events with usage data
[2024-08-06 09:09:42,985] [oaieval.py:283] 

In [177]:
next_candidate_messages

[[{'role': 'user', 'content': 'Hello! How can I assist you today?'}],
 [{'role': 'system', 'content': 'Hello! How can I assist you today?'}]]

In [178]:
data

{'Instruction1': '',
 'Response1': 'Hello! How can I assist you today?',
 'Instruction2': '',
 'Response2': 'Hello! How can I assist you today?',
 'Sampled': '1. Both responses are identical in content and structure.\n2. There is no distinguishable difference between the two responses.\n3. Both responses effectively address the instruction given.\n\nNo\n\nNo',
 'Choice': 'No'}