## Building a BATTLE Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a Pandas DataFrame

In [15]:
# Run configuration read by the cells below.
# new: True starts from an empty dataset seeded with initial_message;
#      False resumes from the dataset previously saved at dataset_path.
new = True
# Seed message used as the first "best" message when new is True.
initial_message = [{"role": "user", "content": ""}]
# Strategy the battle loop uses to pick the next round of candidate messages.
battles_generation = "dense"      # methods: dense, only-new
# Chat roles for which candidate messages are generated.
roles = "system-user"             # methods: system-user, system, user

#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [16]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#import datetime
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
#client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [17]:
# Paths. Assuming this notebook is in examples/
# Every location is derived from the current working directory, and the
# directories the notebook writes into are created up front.

evals_path = os.path.join(os.getcwd(), "..", "evals")

# Registry entry describing the "battles" eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file consumed by the eval; its parent directory must exist.
data_path = os.path.join(evals_path, "registry", "data", "battles", "samples.jsonl")
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Record file written by oaieval, located at <cwd>/logs/logs.
json_logs_path = os.path.join(os.getcwd(), "logs", "logs")
os.makedirs(os.path.dirname(json_logs_path), exist_ok=True)

# Directory and file holding the accumulated battle dataset.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)
dataset_path = os.path.join(df_path, "dataset")

In [18]:
# Registry yaml
# Two entries are written: the "battles" alias pointing at a concrete eval id,
# and the definition of that id (a model-graded classification eval reading
# battles/samples.jsonl and graded with the "battle" spec).

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

# Overwrite the registry file on every run so it always matches the mapping above.
with open(registry_path, "w") as f:
    yaml.dump(registry_yaml, f)

In [19]:
# Initial data
# `dataset` is always a plain list of battle records (dicts) and `best_message`
# is always a chat message list ([{"role": ..., "content": ...}]), because the
# battle loop appends to the former and indexes into the latter.
if new:
    dataset = []
    best_message = initial_message
else:
    with open(dataset_path, "r") as f:
        # Convert the loaded frame back to a list of records so the loop can
        # keep calling dataset.append(...) on it.
        dataset = pd.read_json(f, lines=True).to_dict("records")
    if not dataset:
        # Nothing was saved yet: fall back to the seed message.
        best_message = initial_message
    else:
        last_battle = dataset[-1]
        # "No" means the first response did not win, so the second one is kept.
        winner_key = "Response2" if last_battle["Choice"] == "No" else "Response1"
        # The role is not stored in the dataset; reuse the seed message's role.
        # NOTE(review): confirm this is the intended role when resuming.
        best_message = [
            {"role": initial_message[0]["role"], "content": last_battle[winner_key]}
        ]
candidate_messages = [best_message]
next_candidate_messages = []
generation_distance = 0


In [20]:
def battle(best_message, candidate_message):

    dataset = [{"input1": candidate_message, "input2": best_message}]

    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    #current_time = datetime.datetime.now()
    #formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    #df.to_json(os.path.join(df_path, formatted_time), lines=True, orient="records")

    instruction1 = candidate_message[0]["content"]
    instruction2 = best_message[0]["content"]

    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response2.replace("\\'", "'").replace("\\n", "\n")
    #print(f"response1: {response1}")
    #print(f"response2: {response2}")

    sampled = df["data"].iloc[-2]["sampled"][0]

    choice = df["data"].iloc[-1]["choice"]

    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': df.to_dict()}
    return data

In [21]:
def switch_system_and_user(message):
    """Return a one-message chat with the first message's role swapped.

    A "system" message becomes a "user" message; any other role becomes
    "system". Only the first message of ``message`` is considered.

    Args:
        message: chat message list, ``[{"role": ..., "content": ...}, ...]``.

    Returns:
        A new message list ``[{"role": <swapped>, "content": <same content>}]``
        with the same shape as every other candidate message, so it can be
        passed straight to ``battle``.
    """
    first = message[0]
    swapped_role = "user" if first["role"] == "system" else "system"
    # Return a flat message list; wrapping it in another list would make
    # message[0]["content"] fail for consumers.
    return [{"role": swapped_role, "content": first["content"]}]

In [22]:
def find_parents(content, dataset):
    """Return the distinct instructions whose recorded response equals ``content``.

    Args:
        content: response text to look up in the ``Response1`` column.
        dataset: battle records (list of dicts or DataFrame) with the columns
            ``Instruction1`` and ``Response1``.

    Returns:
        Array of unique ``Instruction1`` values, in order of first appearance;
        empty when the dataset has no records or nothing matches.
    """
    print(f"finding parents {content}")
    records = pd.DataFrame(dataset)
    if records.empty:
        # An empty dataset yields a frame without columns; avoid the KeyError.
        return pd.Series([], dtype=object).unique()
    matching = records[records["Response1"] == content]
    return matching["Instruction1"].unique()

In [23]:
def find_children(content, dataset):
    """Return the distinct responses recorded for the instruction ``content``.

    Args:
        content: instruction text to look up in the ``Instruction1`` column.
        dataset: battle records (list of dicts or DataFrame) with the columns
            ``Instruction1`` and ``Response1``.

    Returns:
        Array of unique ``Response1`` values, in order of first appearance;
        empty when the dataset has no records or nothing matches.
    """
    print(f"finding children {content}")
    records = pd.DataFrame(dataset)
    if records.empty:
        # An empty dataset yields a frame without columns; avoid the KeyError.
        return pd.Series([], dtype=object).unique()
    matching = records[records["Instruction1"] == content]
    return matching["Response1"].unique()

In [24]:
def list_candidate_messages(dataset, best_message, generation_distance, role_mode=None):
    """List candidate messages within a number of generations of the best one.

    Starting from the content of ``best_message``, the dataset is walked
    ``generation_distance`` levels upwards (via ``find_parents``) and downwards
    (via ``find_children``). Every distinct content found is turned into one
    message per selected role.

    Args:
        dataset: battle records accepted by ``find_parents``/``find_children``.
        best_message: chat message list; only the first message's content is used.
        generation_distance: number of parent/child levels to explore; 0 keeps
            only the best message's own content.
        role_mode: "system-user", "system" or "user". When None (default) the
            notebook-level ``roles`` setting is used.

    Returns:
        List of single-message chats. For each content, in order of first
        discovery, the "user" variant precedes the "system" variant.
    """
    mode = roles if role_mode is None else role_mode

    best_content = best_message[0]["content"]
    last_up = [best_content]
    last_down = [best_content]
    list_of_contents = [best_content]

    for _ in range(generation_distance):
        # Start each level from scratch so earlier levels are not expanded again.
        next_up = []
        next_down = []
        for content in last_up:
            next_up.extend(find_parents(content, dataset))
        for content in last_down:
            next_down.extend(find_children(content, dataset))
        list_of_contents.extend(next_up)
        list_of_contents.extend(next_down)
        last_up, last_down = next_up, next_down

    # Drop duplicates while keeping the order of first appearance.
    unique_contents = list(dict.fromkeys(list_of_contents))

    messages = []
    for content in unique_contents:
        # Membership tests: `mode == "system-user" or "user"` would always be true.
        if mode in ("system-user", "user"):
            messages.append([{"role": "user", "content": content}])
        if mode in ("system-user", "system"):
            messages.append([{"role": "system", "content": content}])
    return messages

In [62]:
# Preview the candidate messages generated around the current best message.
list_candidate_messages(dataset, best_message, generation_distance)

finding parents Hello! I'm here to help with any questions you have. What would you like to know?
finding children Hello! I'm here to help with any questions you have. What would you like to know?


[[{'role': 'user',
   'content': "Hello! I'm here to help with any questions you have. What would you like to know?"}],
 [{'role': 'system',
   'content': "Hello! I'm here to help with any questions you have. What would you like to know?"}],
 [{'role': 'user',
   'content': "Hello! I'm just here to chat and answer any questions you may have. How can I help you today?"}],
 [{'role': 'system',
   'content': "Hello! I'm just here to chat and answer any questions you may have. How can I help you today?"}],
 [{'role': 'user',
   'content': "Hello! I'm an AI digital assistant here to help you with any questions you may have. How can I assist you today?"}],
 [{'role': 'system',
   'content': "Hello! I'm an AI digital assistant here to help you with any questions you may have. How can I assist you today?"}],
 [{'role': 'user', 'content': 'Hello! How can I assist you today?'}],
 [{'role': 'system', 'content': 'Hello! How can I assist you today?'}]]

In [59]:
# Number of parent/child levels the next dense candidate listing will explore.
generation_distance

1

In [49]:
# One round: battle every candidate against the current best message, then
# prepare the candidate list for the next round.
for candidate_message in candidate_messages:
    data = battle(best_message, candidate_message)
    dataset.append(data)
    # Persist the whole dataset after every battle (same file as dataset_path).
    pd.DataFrame(dataset).to_json(os.path.join(df_path, "dataset"), lines=True, orient="records")
    # The candidate is the first response in the battle, so "Yes" means it won.
    if data["Choice"] == "Yes":
        best_message = candidate_message
        if battles_generation == "dense":
            # A new best message restarts the neighbourhood search from distance 0.
            generation_distance = 0
    print(candidate_message)

    if battles_generation == "only-new":
        # The candidate's own response becomes a candidate for the next round.
        new_message = [{"role": candidate_message[0]["role"], "content":data["Response1"]}]
        if new_message not in next_candidate_messages:
            next_candidate_messages.append(new_message)
            if roles == "system-user":
                # Also queue the same content under the opposite role.
                next_candidate_messages.append(switch_system_and_user(new_message))

# Swap in the candidates collected during this round.
if battles_generation == "only-new":
    candidate_messages = next_candidate_messages
    next_candidate_messages = []
# Rebuild the candidates from the dataset and widen the search for the next round.
if battles_generation == "dense":
    print("going to list candidate messages")
    candidate_messages = list_candidate_messages(dataset, best_message, generation_distance)
    generation_distance += 1
    print("candidate messages listed")
print("all done")

[2024-08-07 23:00:00,781] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-07 23:00:01,794] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-07 23:00:01,797] [oaieval.py:215] [1;35mRun started: 240807210001HVA63USL[0m
[2024-08-07 23:00:01,800] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-07 23:00:01,870] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-07 23:00:01,871] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-07 23:00:01,871] [eval.py:36] Evaluating 1 samples
[2024-08-07 23:00:01,906] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.70s/it]
[2024-08-07 23:00:05,617] [oaieval.py:275] Found 3/3 sampling events with usage data
[2024-08-07 23:00:05,617] [oaieval.py:283] 

In [57]:
# Candidates queued for the next round of battles.
candidate_messages

[[{'role': 'user',
   'content': "Hello! I'm here to help with any questions you have. What would you like to know?"}],
 [{'role': 'system',
   'content': "Hello! I'm here to help with any questions you have. What would you like to know?"}]]

In [58]:
# Message currently considered the best.
best_message

[{'role': 'user',
  'content': "Hello! I'm here to help with any questions you have. What would you like to know?"}]

In [51]:
# Response recorded for the candidate (first) message in every battle so far.
pd.DataFrame(dataset)["Response1"].to_list()

['Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 "Hello! I'm just here to chat and answer any questions you may have. How can I help you today?",
 'Hello! How can I assist you today?',
 "Hello! I'm here to help with any questions or tasks you may have. Just let me know how I can assist you.",
 'Hello! How can I assist you today?',
 "Hello! I'm here to help with any questions or tasks you may have. Just let me know how I can assist you.",
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 "Hello! I'm here to assist you with any questions or concerns you may have. How can I help you today?",
 "Hello! I'm here to help with any questions you have. What would you like to know?",
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 "Hello! I'm here to help

In [56]:
# Inspect one full battle record (row label 16) as a flat list of its fields.
pd.DataFrame(dataset).loc[16].to_list()

["Hello! I'm here to help with any questions or tasks you may have. Just let me know how I can assist you.",
 'Hello! How can I assist you today?',
 'Hello! How can I assist you today?',
 "Hello! I'm here to help with any questions or tasks you may have. Just let me know how I can assist you.",
 '1. Both responses start with a friendly greeting, which is good.\n2. Response 1 directly asks how the speaker can assist, which is clear and to the point.\n3. Response 2 first states that the speaker is there to help, then asks how they can assist, which may be slightly redundant.\n\nNo\n\nNo',
 'No',
 {'spec': {0: {'completion_fns': ['gpt-3.5-turbo'],
    'eval_name': 'battles.test.v1',
    'base_eval': 'battles',
    'split': 'test',
    'run_config': {'completion_fns': ['gpt-3.5-turbo'],
     'eval_spec': {'cls': 'evals.elsuite.modelgraded.classify:ModelBasedClassify',
      'registry_path': '/Users/janvotava/Desktop/evals/evals/registry',
      'args': {'eval_type': 'cot_classify',
       

In [31]:
#completion = client.chat.completions.create(
    #messages = [{"role":"system", "content":"Hello! How can I assist you today?"}],
    #model = "gpt-3.5-turbo",
    #temperature = 0,
    #seed = 20220722
#)