## Building a BATTLE Eval

In [334]:
# Start from an empty battle history and a freshly generated response (True),
# or resume from the dataset previously saved at dataset_path (False).
new = True
# Message used as the first "best" message when starting fresh.
initial_message = [{"role": "user", "content": ""}]
# How the next round of candidate messages is built after a pass over the current ones.
battles_generation = "dense"      # methods: dense, only-new
# Which chat roles candidate messages are created with.
roles = "system-user"             # methods: system-user, system, user

#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [335]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#import datetime
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
# Shared OpenAI client, used to generate the response to the initial message.
# NOTE(review): no key is passed explicitly; presumably credentials are taken
# from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [336]:
# Paths. Assuming this notebook is in examples/

evals_path = os.path.join(os.getcwd(), "..", "evals")

# Registry entry describing the eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file fed to the eval; make sure its directory exists.
data_path = os.path.join(evals_path, "registry", "data", "battles", "samples.jsonl")
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Record file written by the eval run, inside ./logs next to the notebook.
json_logs_path = os.path.join(os.getcwd(), "logs", "logs")
os.makedirs(os.path.dirname(json_logs_path), exist_ok=True)

# Directory and file where the accumulated battle history is stored.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)
dataset_path = os.path.join(df_path, "dataset")

In [337]:
# Registry yaml

# Two entries: the "battles" alias pointing at a concrete eval id, and the
# definition of that eval (class, samples file, grading spec).
registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

# Overwrite the registry file with the definition above.
with open(registry_path, "w") as f:
    yaml.dump(registry_yaml, f)

In [338]:
# Initial data
if new:
    # Fresh run: empty battle history, and the initial message (with a newly
    # generated response) is the reigning best.
    dataset = pd.DataFrame(
        (),
        columns=[
            'Instruction1', 'Response1',
            'Instruction2', 'Response2',
            'Sampled', 'Choice', 'Data',
        ],
    )
    best_message = initial_message
    initial_completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=initial_message,
    )
    best_response = initial_completion.choices[0].message.content
else:
    # Resume: reload the saved history and take the winner of the last battle.
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True)
    last_battle = dataset.iloc[-1]
    # Choice "No" means side 2 kept its title; anything else means side 1 won.
    winning_side = "2" if last_battle["Choice"] == "No" else "1"
    best_content = last_battle["Instruction" + winning_side]
    best_response = last_battle["Response" + winning_side]
    best_message = [{"role": "user", "content": best_content}]

# The first round only battles the current best message.
candidate_messages = [best_message]
next_candidate_messages = []
generation_distance = 0


In [339]:
def battle(best_message, best_response, candidate_message):
    """Run one battle eval between a candidate message and the current best.

    Writes a single-sample JSONL file to ``data_path`` (candidate as
    ``input1``, best message as ``input2`` with ``best_response`` as
    ``completion2``), runs the ``battles`` eval through the ``oaieval`` shell
    command, then reads the records back from ``json_logs_path``.

    Args:
        best_message: Chat message list of the current best; its first
            entry's "content" becomes ``Instruction2``.
        best_response: Stored response to ``best_message``, written to the
            sample as ``completion2``.
        candidate_message: Chat message list of the challenger; its first
            entry's "content" becomes ``Instruction1``.

    Returns:
        A one-row DataFrame with columns Instruction1, Response1,
        Instruction2, Response2, Sampled, Choice and Data, where Data holds
        the whole parsed log as a dict.

    Side effects:
        Overwrites ``data_path``, runs a shell command, and prints Response1.
    """

    dataset = [{"input1": candidate_message, "input2": best_message, "completion2":best_response}]

    # Overwrite the samples file with this single battle.
    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    # IPython shell escape: only works inside a notebook/IPython session.
    # NOTE(review): the record path is relative to the working directory and
    # is assumed to be the same file as json_logs_path — confirm.
    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    # Reload the run's records (one JSON object per line); `df` is reused for the log.
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    #current_time = datetime.datetime.now()
    #formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    #df.to_json(os.path.join(df_path, formatted_time), lines=True, orient="records")

    instruction1 = candidate_message[0]["content"]
    instruction2 = best_message[0]["content"]

    # Both responses are recovered by slicing the prompt text of the
    # second-to-last record between fixed section markers.
    # NOTE(review): presumably that record is the grader's sampling event and
    # the last record carries the verdict — confirm against the log format.
    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    # Undo escaped quotes/newlines that appear literally in the prompt text.
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response2.replace("\\'", "'").replace("\\n", "\n")
    print(f"Response1: {response1}")
    #print(f"response2: {response2}")

    # First sampled text of the second-to-last record.
    sampled = df["data"].iloc[-2]["sampled"][0]

    # Verdict from the last record; callers compare it with "Yes"/"No".
    choice = df["data"].iloc[-1]["choice"]

    # 'Data' starts as a placeholder and is filled via .at so the whole log
    # dict is stored in a single cell.
    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': {}}
    data = pd.DataFrame([data])
    data.at[0, "Data"] = df.to_dict()
    return data

In [340]:
def switch_system_and_user(message):
    """Return a copy of a one-entry chat message with its role flipped.

    A "system" message becomes a "user" message; any other role becomes
    "system". The content is carried over unchanged.

    Args:
        message: Chat message list; only the first entry is read and it must
            have "role" and "content" keys.

    Returns:
        A new message list ``[{"role": ..., "content": ...}]`` — the same
        shape as the input, so it can be used directly as a candidate message.
    """
    first_entry = message[0]
    flipped_role = "user" if first_entry["role"] == "system" else "system"
    # Return the message itself, not a list wrapping it: callers index the
    # result as message[0]["content"].
    return [{"role": flipped_role, "content": first_entry["content"]}]

In [341]:
def find_parents(content, dataset):
    """Return the distinct instructions whose recorded response equals ``content``.

    Args:
        content: Text to look up in the "Response1" column.
        dataset: Battle history (anything ``pd.DataFrame`` accepts) with
            "Instruction1" and "Response1" columns.

    Returns:
        Unique "Instruction1" values of the matching rows, in order of first
        appearance.
    """
    print (f"finding parents {content}")
    battles = pd.DataFrame(dataset)
    produced_content = battles["Response1"] == content
    return battles.loc[produced_content, "Instruction1"].unique()

In [342]:
def find_children(content, dataset):
    """Return the distinct responses recorded for the instruction ``content``.

    Args:
        content: Text to look up in the "Instruction1" column.
        dataset: Battle history (anything ``pd.DataFrame`` accepts) with
            "Instruction1" and "Response1" columns.

    Returns:
        Unique "Response1" values of the matching rows, in order of first
        appearance.
    """
    print (f"finding children {content}")
    battles = pd.DataFrame(dataset)
    asked_with_content = battles["Instruction1"] == content
    return battles.loc[asked_with_content, "Response1"].unique()

In [343]:
def list_candidate_messages(dataset, best_message, generation_distance, roles_mode=None):
    """List candidate messages within a given distance of the best message.

    Starting from the content of ``best_message``, repeatedly collects the
    parents (``find_parents``) and children (``find_children``) of the
    contents found at the previous level, ``generation_distance`` times. Each
    distinct content is then turned into one chat message per enabled role.

    Args:
        dataset: Battle history passed through to ``find_parents`` and
            ``find_children``.
        best_message: Chat message list; the first entry's "content" is the
            starting point.
        generation_distance: Number of parent/child expansion levels; 0
            yields only the best message's own content.
        roles_mode: "system-user", "user" or "system". Defaults to the
            notebook-level ``roles`` setting when omitted.

    Returns:
        A list of one-entry chat message lists, ordered by first discovery of
        their content, with the "user" variant before the "system" variant.

    Raises:
        ValueError: If the roles setting is not one of the supported values.
    """
    if roles_mode is None:
        # Fall back to the notebook-level configuration.
        roles_mode = roles

    roles_by_mode = {
        "system-user": ("user", "system"),
        "user": ("user",),
        "system": ("system",),
    }
    if roles_mode not in roles_by_mode:
        raise ValueError(
            f"unsupported roles setting {roles_mode!r}; expected one of {sorted(roles_by_mode)}"
        )

    best_content = best_message[0]["content"]
    list_of_contents = [best_content]
    seen = {best_content}
    frontier = [best_content]

    for level in range(generation_distance):
        print (f"starting level {level}, generation distance {generation_distance}")
        discovered = []
        for content in frontier:
            discovered.extend(find_parents(content, dataset))
            discovered.extend(find_children(content, dataset))

        print ("behind for loops")
        # Only contents not seen before are expanded at the next level; the
        # neighbours of older contents were already collected earlier, so
        # re-expanding them would only repeat work.
        frontier = []
        for content in discovered:
            if content not in seen:
                seen.add(content)
                frontier.append(content)
                list_of_contents.append(content)
        print(f"endind level {level}")

    return [
        [{"role": role, "content": content}]
        for content in list_of_contents
        for role in roles_by_mode[roles_mode]
    ]

In [419]:
# One round: battle every current candidate against the reigning best, then
# build the candidate list for the next round.
for candidate_message in candidate_messages:
    data = battle(best_message, best_response, candidate_message)
    dataset = pd.concat([dataset, data],ignore_index=True)
    # Save after every battle to the same file the resume branch
    # (new = False) reads from.
    dataset.to_json(dataset_path, lines=True, orient="records")
    if data["Choice"].iloc[0] == "Yes":
        # The candidate won: it becomes the new best.
        best_message = candidate_message
        best_response = data["Response1"].iloc[0]
        print(f"New best message:{best_message}")
        if battles_generation == "dense":
            # Restart the neighbourhood search around the new best.
            generation_distance = 0
    # Single quotes inside the f-string: reusing the enclosing double quote
    # is a SyntaxError on Python versions before 3.12.
    print(f"Instruction1: {candidate_message[0]['content']}")

    if battles_generation == "only-new":
        # The candidate's response becomes a candidate for the next round.
        new_message = [{"role": candidate_message[0]["role"], "content":data["Response1"].iloc[0]}]
        if new_message not in next_candidate_messages:
            next_candidate_messages.append(new_message)
            if roles == "system-user":
                next_candidate_messages.append(switch_system_and_user(new_message))

if battles_generation == "only-new":
    candidate_messages = next_candidate_messages
    next_candidate_messages = []
if battles_generation == "dense":
    #print("going to list candidate messages")
    candidate_messages = list_candidate_messages(dataset, best_message, generation_distance)
    # Widen the search by one level for the next round.
    generation_distance += 1
    #print("candidate messages listed")
print(f"all done, generation distance: {generation_distance}, number of candidate messages: {len(candidate_messages)}")

[2024-08-09 10:53:16,649] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-09 10:53:18,629] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-09 10:53:18,635] [oaieval.py:215] [1;35mRun started: 240809085318JOWMY25E[0m
[2024-08-09 10:53:18,637] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-09 10:53:18,675] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-09 10:53:18,675] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-09 10:53:18,676] [eval.py:36] Evaluating 1 samples
[2024-08-09 10:53:18,702] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.43s/it]
[2024-08-09 10:53:21,140] [oaieval.py:275] Found 2/2 sampling events with usage data
[2024-08-09 10:53:21,140] [oaieval.py:283] 

In [413]:
# Show the 20 most recent battles.
dataset.tail(20)

Unnamed: 0,Instruction1,Response1,Instruction2,Response2,Sampled,Choice,Data
125,Thank you for providing your order number. How...,"I'm sorry, I don't have an order number. I was...",I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,1. Response 1 acknowledges the lack of an orde...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
126,Thank you for providing your order number. How...,"I'm sorry, but I am unable to assist with spec...",I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,1. Response 1 states that they are unable to a...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
127,I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,1. Both responses start with the same positive...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
128,I have a question about my recent order. Can y...,Of course! I'd be happy to help you with any q...,I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,"1. Both responses start with ""Of course! I'd b...",No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
129,I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,1. Both responses start with the same positive...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
130,I have a question about my recent order. Can y...,Of course! I'd be happy to help you with your ...,I have a question about my recent order. Can y...,Of course! I'd be happy to help. Please provid...,1. Response 1 acknowledges the request for hel...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
131,Hello! I'm here to assist you with any questio...,I have a question about my recent order. Can y...,I have a question about my recent order. Can y...,Of course! I'd be happy to help you with your ...,1. Response 1 directly asks for help with a re...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
132,Hello! I'm here to assist you with any questio...,Hello! How can I assist you today?,Hello! I'm here to assist you with any questio...,I have a question about my recent order. Can y...,1. Response 1 directly mirrors the language us...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
133,Of course! I'd be happy to help. Please provid...,If you have any questions or concerns about yo...,Hello! I'm here to assist you with any questio...,I have a question about my recent order. Can y...,1. Response 1 in Instruction 1 asks for specif...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
134,Of course! I'd be happy to help. Please provid...,"Sure, I can help you with that. Please provide...",Of course! I'd be happy to help. Please provid...,If you have any questions or concerns about yo...,1. Both responses acknowledge the request for ...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...


In [347]:
from IPython.display import display_markdown

In [420]:
# Render each field of the most recent winning battle as markdown, leaving out
# the raw "Data" log.
last_win = dataset[dataset["Choice"]=="Yes"].iloc[-1]
for field_value in last_win.drop("Data"):
    display_markdown(field_value, raw=True)

I have a question about my recent order. Can you help me with that?

Of course! I'd be happy to help you with any questions you have about your recent order. Please provide me with your order number or any other relevant details so I can assist you more effectively.

Of course! I'd be happy to help. Please provide me with your order number and any specific details about your question so I can assist you further.

Thank you for providing your order number. How can I assist you with your order today?

1. Response 1 acknowledges the customer's question and expresses willingness to help.
2. Response 1 asks for relevant details to assist the customer more effectively.
3. Response 2 also acknowledges the customer's request for help.
4. Response 2 asks for the order number and specific details about the question.

Based on the steps above, Response 1 is better than Response 2 as it not only expresses willingness to help but also asks for relevant details upfront to assist the customer more effectively.

Yes

Yes

In [415]:
# Inspect the candidate messages queued for the next round.
candidate_messages

[[{'role': 'user',
   'content': "Of course! I'd be happy to help. Please provide me with your order number and any specific details about your question so I can assist you further."}],
 [{'role': 'system',
   'content': "Of course! I'd be happy to help. Please provide me with your order number and any specific details about your question so I can assist you further."}]]