## Building a BATTLE Eval

In [798]:
# Run configuration for the battle loop further down.
# True starts from an empty dataset; False resumes from the dataset file written by an earlier run.
new = True
# Seed conversation used as the first "best" message when starting fresh.
initial_message = [{"role": "user", "content": ""}]
# How the next round of candidates is chosen after each pass over the current ones.
battles_generation = "dense"      # methods: dense, only-new
# Which chat roles the generated candidate messages are given.
roles = "user"             # methods: system-user, system, user
# When True, winning instruction/response pairs are prepended to later candidates as few-shot examples.
few_shot_prompts = True

#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [799]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#import datetime
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
# Shared API client; used below to fetch the response to the initial message.
# NOTE(review): no key is passed explicitly, so it presumably comes from the
# environment (cf. the commented-out OPENAI_API_KEY line above) — confirm.
client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [800]:
# Filesystem layout. The notebook is expected to run from examples/, so the
# evals package sits one level above the working directory.
working_dir = os.getcwd()
evals_path = os.path.join(working_dir, "..", "evals")

# Registry entry that declares the eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file holding the single battle to run; its folder is created on demand.
battles_data_dir = os.path.join(evals_path, "registry", "data", "battles")
os.makedirs(battles_data_dir, exist_ok=True)
data_path = os.path.join(battles_data_dir, "samples.jsonl")

# Record file of the eval run: a file named "logs" inside ./logs.
logs_dir = os.path.join(working_dir, "logs")
os.makedirs(logs_dir, exist_ok=True)
json_logs_path = os.path.join(logs_dir, "logs")

# Accumulated battle history.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)
dataset_path = os.path.join(df_path, "dataset")

In [801]:
# Registry yaml: declare the "battles" eval and the versioned spec it points to.

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

# Rewritten on every run so the registry file always mirrors the dict above.
with open(registry_path, "w") as registry_file:
    yaml.dump(registry_yaml, registry_file)

In [802]:
# Initial data
# Few-shot examples collected from winning battles. Defined before the branch
# so the battle loop can prepend it on a resumed run as well; it used to be
# created only when starting fresh, which raised NameError with new = False.
few_shot_prompt = []
if new:
    # Fresh start: empty history, and the model's answer to the seed message
    # becomes the first response to beat.
    dataset = pd.DataFrame(
        (),
        columns=['Instruction1', 'Response1', 'Instruction2', 'Response2', 'Sampled',
                 'Choice', 'Data'],
    )
    best_message = initial_message
    best_response = client.chat.completions.create(
        messages=initial_message,
        model="gpt-3.5-turbo",
    ).choices[0].message.content
else:
    # Resume: rebuild the current best from the last recorded battle.
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True)
    last_battle = dataset.iloc[-1]
    if last_battle["Choice"] == "No":
        # The candidate (side 1) lost, so the incumbent (side 2) is still best.
        best_content = last_battle["Instruction2"]
        best_response = last_battle["Response2"]
    else:
        best_content = last_battle["Instruction1"]
        best_response = last_battle["Response1"]
    # The dataset stores only the instruction text, so the message is rebuilt
    # with the "user" role.
    best_message = [{"role": "user", "content": best_content}]

# The first round has a single candidate: the current best itself.
candidate_messages = [best_message]
next_candidate_messages = []
generation_distance = 0


In [803]:
# Sanity check: the first queued candidate message.
candidate_messages[0]

[{'role': 'user', 'content': ''}]

In [804]:
# Sanity check: every candidate queued for the first round.
candidate_messages

[[{'role': 'user', 'content': ''}]]

In [805]:
def battle(best_message, best_response, candidate_message):
    """Run one model-graded battle between a candidate and the current best.

    Writes a single-sample file to ``data_path``, runs the ``battles`` eval
    through the ``oaieval`` command line, then parses the record file at
    ``json_logs_path``.

    Args:
        best_message: Chat messages of the incumbent; written as ``input2``.
        best_response: The incumbent's completion; written as ``completion2``.
        candidate_message: Chat messages of the challenger; written as ``input1``.

    Returns:
        A one-row DataFrame with the columns Instruction1, Response1,
        Instruction2, Response2, Sampled, Choice and Data. Data holds the
        whole parsed record file as a dict.
    """

    dataset = [{"input1": candidate_message, "input2": best_message, "completion2":best_response}]

    # The samples file is overwritten each time, so every eval run sees exactly one battle.
    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    # IPython shell escape; the record path is relative to the working directory
    # and is expected to be the same file as json_logs_path.
    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    #current_time = datetime.datetime.now()
    #formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    #df.to_json(os.path.join(df_path, formatted_time), lines=True, orient="records")

    # The instruction is the last entry of each message list.
    instruction1 = candidate_message[-1]["content"]
    instruction2 = best_message[-1]["content"]

    # NOTE(review): assumes the second-to-last record is the grader's sampling
    # event and the last record carries the final choice — confirm against the
    # oaieval record format.
    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    # Both responses are cut out of the grader prompt using its section markers,
    # then the escaped quotes and newlines are turned back into real characters.
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response2.replace("\\'", "'").replace("\\n", "\n")
    print(f"Response1: {response1}")
    #print(f"response2: {response2}")

    # First sampled text of the grader event.
    sampled = df["data"].iloc[-2]["sampled"][0]

    choice = df["data"].iloc[-1]["choice"]

    # Data starts as a placeholder and is filled through .at so the whole dict
    # is stored in a single cell.
    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': {}}
    data = pd.DataFrame([data])
    data.at[0, "Data"] = df.to_dict()
    return data

In [806]:
def switch_system_and_user(message):
    """Return a one-entry message with the role of the first entry flipped.

    A leading "system" entry becomes "user"; any other role becomes "system".
    Only the first entry of ``message`` is read.

    Args:
        message: Non-empty list of chat dicts with "role" and "content" keys.

    Returns:
        A flat message list ``[{"role": ..., "content": ...}]``, the same shape
        as every other candidate message. The result used to be wrapped in an
        extra list, which broke code indexing it as ``message[-1]["content"]``.

    Raises:
        IndexError: If ``message`` is empty.
    """
    first_entry = message[0]
    flipped_role = "user" if first_entry["role"] == "system" else "system"
    return [{"role": flipped_role, "content": first_entry["content"]}]

In [807]:
def find_parents(content, dataset):
    """Return the distinct instructions whose recorded response equals ``content``.

    Args:
        content: Text to look up in the Response1 column.
        dataset: Battle history; anything ``pd.DataFrame`` accepts that yields
            Instruction1 and Response1 columns.

    Returns:
        Array of unique Instruction1 values, in order of first appearance.
    """
    battles = pd.DataFrame(dataset)
    produced_content = battles["Response1"] == content
    return battles.loc[produced_content, "Instruction1"].unique()

In [808]:
def find_children(content, dataset):
    """Return the distinct responses recorded for the instruction ``content``.

    Args:
        content: Text to look up in the Instruction1 column.
        dataset: Battle history; anything ``pd.DataFrame`` accepts that yields
            Instruction1 and Response1 columns.

    Returns:
        Array of unique Response1 values, in order of first appearance.
    """
    battles = pd.DataFrame(dataset)
    asked_content = battles["Instruction1"] == content
    return battles.loc[asked_content, "Response1"].unique()

In [809]:
def list_candidate_messages(dataset, best_message, generation_distance, roles):
    """Build candidate messages from contents near the current best.

    Starting at the content of the last entry of ``best_message``, walks the
    instruction/response links recorded in ``dataset`` (parents and children)
    for ``generation_distance`` levels and wraps every distinct content found
    in a single-entry chat message.

    Args:
        dataset: Battle history passed through to ``find_parents`` and
            ``find_children``.
        best_message: Chat messages of the current best; only the last entry's
            content is used.
        generation_distance: Number of link levels to expand; 0 yields only
            the best content itself.
        roles: "user", "system" or "system-user" (both, user first).

    Returns:
        List of messages, each ``[{"role": ..., "content": ...}]``, ordered by
        first discovery of the content.

    Raises:
        ValueError: If ``roles`` is not one of the three supported values.
    """
    if roles not in ("system-user", "system", "user"):
        raise ValueError(f"Unsupported roles value: {roles!r}")

    best_content = best_message[-1]["content"]
    contents = [best_content]   # distinct contents, in discovery order
    seen = {best_content}
    frontier = [best_content]

    for _ in range(generation_distance):
        # Only contents discovered on the previous level are expanded. The old
        # code kept one ever-growing list and re-expanded every earlier level,
        # which blew up the work without changing the resulting set or order.
        discovered = []
        for content in frontier:
            neighbours = [*find_parents(content, dataset), *find_children(content, dataset)]
            for neighbour in neighbours:
                if neighbour not in seen:
                    seen.add(neighbour)
                    discovered.append(neighbour)
        if not discovered:
            break
        contents.extend(discovered)
        frontier = discovered

    # `roles == "system-user" or "user"` was always true, so user messages were
    # emitted even for roles == "system"; membership tests fix that.
    wants_user = roles in ("system-user", "user")
    wants_system = roles in ("system-user", "system")

    messages = []
    for content in contents:
        if wants_user:
            messages.append([{"role": "user", "content": content}])
        if wants_system:
            print("Creating message without user role.")
            messages.append([{"role": "system", "content": content}])
    return messages

In [826]:
# One round of battles: every queued candidate challenges the current best.
# Re-run this cell to play further rounds; it updates best_message,
# best_response, dataset, few_shot_prompt and candidate_messages.
for candidate_message in candidate_messages:
    if few_shot_prompts:
        # Builds a new list, so few_shot_prompt itself is not modified here.
        candidate_message = few_shot_prompt + candidate_message
    data = battle(best_message, best_response, candidate_message)
    dataset = pd.concat([dataset, data], ignore_index=True)
    # Saved after every battle so an interrupted run can be resumed.
    dataset.to_json(os.path.join(df_path, "dataset"), lines=True, orient="records")
    if data["Choice"].iloc[0] == "Yes":
        best_message = candidate_message
        best_response = data["Response1"].iloc[0]
        print(f"New best message:{best_message}")
        if battles_generation == "dense":
            # Restart the neighbourhood search around the new best.
            generation_distance = 0
        if few_shot_prompts:
            few_shot_prompt.append({"role": "system", "content": best_message[-1]["content"], "name": "example_user"})
            few_shot_prompt.append({"role": "system", "content": best_response, "name": "example_assistant"})

    # The instruction under test is the LAST entry; with few-shot prompts the
    # leading entries are examples, so index 0 would report the wrong content
    # and role. Single quotes inside the f-string keep this valid before
    # Python 3.12 as well.
    instruction_entry = candidate_message[-1]
    print(f"Instruction1: {instruction_entry['content']}")

    if battles_generation == "only-new":
        # The candidate's response becomes a candidate instruction next round.
        new_message = [{"role": instruction_entry["role"], "content": data["Response1"].iloc[0]}]
        if new_message not in next_candidate_messages:
            next_candidate_messages.append(new_message)
            if roles == "system-user":
                next_candidate_messages.append(switch_system_and_user(new_message))

if battles_generation == "only-new":
    candidate_messages = next_candidate_messages
    next_candidate_messages = []
if battles_generation == "dense":
    # Widen the search by one level each round until a new best resets it.
    candidate_messages = list_candidate_messages(dataset, best_message, generation_distance, roles)
    generation_distance += 1
print(f"all done, generation distance: {generation_distance}, number of candidate messages: {len(candidate_messages)}")

[2024-08-09 20:53:26,947] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-09 20:53:27,731] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-09 20:53:27,733] [oaieval.py:215] [1;35mRun started: 240809185327ICHJ7BOR[0m
[2024-08-09 20:53:27,734] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-09 20:53:27,762] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-09 20:53:27,762] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-09 20:53:27,762] [eval.py:36] Evaluating 1 samples
[2024-08-09 20:53:27,783] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.57s/it]
[2024-08-09 20:53:30,359] [oaieval.py:275] Found 2/2 sampling events with usage data
[2024-08-09 20:53:30,359] [oaieval.py:283] 

In [812]:
from IPython.display import display_markdown

In [827]:
# Render each field of the most recent winning battle as markdown
# (everything except the bulky Data column).
winning_battles = dataset[dataset["Choice"] == "Yes"]
if winning_battles.empty:
    # Before any candidate has won, .iloc[-1] would raise IndexError.
    print("No winning battles recorded yet.")
else:
    for entry in winning_battles.iloc[-1].drop("Data"):
        display_markdown(entry, raw=True)

Thank you for offering to assist me! I appreciate it. If you have any questions or tasks you need help with, feel free to let me know.

Thank you for your kind offer! I'm here to assist you with any questions or tasks you may have. If you need any help or guidance, don't hesitate to ask. Let's work together to address any queries or tasks you may have.

It seems like we both greeted each other at the same time! That's quite a coincidence. Since we're both here to help, is there anything specific you'd like assistance with today? Feel free to ask me anything!

That's quite a coincidence indeed! Thank you for your offer to help. If you have any questions or need assistance with anything, feel free to let me know. I'm here to assist you in any way I can.

1. Response 1 acknowledges the user's offer to assist and emphasizes working together to address any queries or tasks, showing a collaborative approach.
2. Response 2 also acknowledges the user's offer to help but does not emphasize working together or addressing tasks in the same way as Response 1.
3. Response 1 provides a more proactive and engaging tone by encouraging the user to ask for help and offering assistance in a collaborative manner.
4. Response 2, while polite and appreciative, does not actively encourage the user to ask for help or offer assistance in the same proactive manner as Response 1.

Based on the above reasoning, Response 1 is better than Response 2.

Yes

Yes

In [828]:
# Most recent battles (at most the last 20 rows).
dataset.iloc[-20:]

Unnamed: 0,Instruction1,Response1,Instruction2,Response2,Sampled,Choice,Data
0,,Hello! How can I assist you today?,,Hello! How can I assist you today?,1. Both responses are identical in content and...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
1,,Hello! How can I assist you today?,,Hello! How can I assist you today?,1. Both responses are identical in content and...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
2,,Hello! How can I assist you today?,,Hello! How can I assist you today?,1. Both responses are identical in content and...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
3,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,,Hello! How can I assist you today?,1. Response 1 provides a more detailed and pro...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
4,Hello! How can I assist you today?,It seems like there was a bit of confusion the...,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,1. Both responses start with a friendly greeti...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
5,Hello! How can I assist you today?,Hello! It seems like there was a mix-up in our...,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,1. Both responses start with a friendly greeti...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
6,,Hello! How can I assist you today? Feel free t...,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,1. The first response includes an additional p...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
7,Hello! I'm here to help with any questions or ...,It seems like we both greeted each other at th...,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,1. Response 1 acknowledges the coincidence of ...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
8,It seems like there was a bit of confusion the...,Thank you for offering to assist me! I appreci...,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,1. Response 1 acknowledges the initial confusi...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
9,Hello! How can I assist you today?,It seems like there was a misunderstanding. I'...,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,1. Both responses start with a greeting and ex...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...


In [829]:
# Candidates queued for the next round.
candidate_messages

[[{'role': 'user',
   'content': 'Thank you for offering to assist me! I appreciate it. If you have any questions or tasks you need help with, feel free to let me know.'}]]