## Building a BATTLE Eval

In [421]:
# True: start from an empty dataset; False: resume from the dataset saved on disk.
new = True
# Conversation that seeds the very first battle of a fresh run.
initial_message = [dict(role="user", content="")]
# How the next round of candidates is chosen -- methods: dense, only-new
battles_generation = "dense"
# Roles in which each candidate is tried -- methods: system-user, system, user
roles = "system-user"

#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [422]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#import datetime
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
# Shared API client, used to fetch the completion for the seed message.
# NOTE(review): constructed without arguments, so credentials are presumably
# taken from the environment (e.g. OPENAI_API_KEY) -- confirm.
client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [423]:
# Paths. Assuming this notebook is in examples/ -- everything below is
# resolved from the current working directory.

evals_path = os.path.join(os.getcwd(), "..", "evals")

# Registry entry describing the eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Sample file read by the eval; its folder is created on demand.
data_dir = os.path.join(evals_path, "registry", "data", "battles")
os.makedirs(data_dir, exist_ok=True)
data_path = os.path.join(data_dir, "samples.jsonl")

# Record file of an eval run: a file named "logs" inside ./logs.
logs_dir = os.path.join(os.getcwd(), "logs")
os.makedirs(logs_dir, exist_ok=True)
json_logs_path = os.path.join(logs_dir, "logs")

# Accumulated battle results.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)
dataset_path = os.path.join(df_path, "dataset")

In [424]:
# Registry yaml: the "battles" eval points at version "battles.test.v1",
# which is a model-graded classification over the generated samples file.

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

# Overwrites any registry file already present at registry_path.
with open(registry_path, "w") as f:
    yaml.dump(registry_yaml, f)

In [425]:
# Initial data: resume from the saved dataset, or seed a brand-new run.
if not new:
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True)
    last_battle = dataset.iloc[-1]
    # "No" means side 1 (the candidate) did not win, so side 2 is still the best.
    winning_side = "2" if last_battle["Choice"] == "No" else "1"
    best_content = last_battle["Instruction" + winning_side]
    best_response = last_battle["Response" + winning_side]
    # NOTE(review): the dataset has no role column, so a resumed best message
    # is always rebuilt with the "user" role -- confirm that is intended.
    best_message = [{"role": "user", "content": best_content}]
else:
    result_columns = ['Instruction1', 'Response1', 'Instruction2', 'Response2',
                      'Sampled', 'Choice', 'Data']
    dataset = pd.DataFrame((), columns=result_columns)
    best_message = initial_message
    # The reply to the seed message is the first "best response" to beat.
    seed_completion = client.chat.completions.create(
        messages=initial_message,
        model="gpt-3.5-turbo",
    )
    best_response = seed_completion.choices[0].message.content

# The first round pits the best message against itself.
candidate_messages = [best_message]
next_candidate_messages = []
generation_distance = 0


In [426]:
def battle(best_message, best_response, candidate_message):
    """Run one model-graded battle between a candidate and the current best message.

    Writes a single-sample JSONL file to ``data_path``, runs the ``battles``
    eval through the ``oaieval`` command line, then reads the run's record
    file (``json_logs_path``) back to recover both responses, the grader's
    sampled text and its choice.

    Args:
        best_message: Current best conversation (list of message dicts). Stored
            in the sample as ``input2``; the content of its first message is
            reported as ``Instruction2``.
        best_response: Completion belonging to ``best_message``; stored in the
            sample as ``completion2``.
        candidate_message: Challenger conversation. Stored as ``input1``; the
            content of its first message is reported as ``Instruction1``.

    Returns:
        A one-row DataFrame with the columns ``Instruction1``, ``Response1``,
        ``Instruction2``, ``Response2``, ``Sampled``, ``Choice`` and ``Data``,
        where ``Data`` holds the whole parsed log as a dict.

    Side effects:
        Overwrites ``data_path``, runs a shell command that records to
        ``logs/logs`` relative to the working directory, and prints the
        candidate's response.
    """

    dataset = [{"input1": candidate_message, "input2": best_message, "completion2":best_response}]

    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    # IPython shell escape: this line only works inside a notebook/IPython session.
    # NOTE(review): the relative record path must resolve to json_logs_path,
    # i.e. the working directory must not have changed since the paths cell ran.
    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    #current_time = datetime.datetime.now()
    #formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    #df.to_json(os.path.join(df_path, formatted_time), lines=True, orient="records")

    instruction1 = candidate_message[0]["content"]
    instruction2 = best_message[0]["content"]

    # NOTE(review): assumes the second-to-last log record is the grader's
    # sampling event and the last record carries its choice -- confirm against
    # the eval's log format.
    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    # Both responses are cut out of the grader prompt using its section markers.
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    # Turn the literal two-character sequences \' and \n back into ' and a newline.
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response2.replace("\\'", "'").replace("\\n", "\n")
    print(f"Response1: {response1}")
    #print(f"response2: {response2}")

    sampled = df["data"].iloc[-2]["sampled"][0]

    choice = df["data"].iloc[-1]["choice"]

    # 'Data' starts as an empty placeholder and is filled in after the frame is
    # built, so the nested log dict ends up in a single cell.
    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': {}}
    data = pd.DataFrame([data])
    data.at[0, "Data"] = df.to_dict()
    return data

In [427]:
def switch_system_and_user(message):
    """Return a one-message conversation with the role of the first message flipped.

    A ``system`` message becomes a ``user`` message; any other role becomes
    ``system``. Only the first message of ``message`` is considered and the
    input is not modified.

    Args:
        message: Conversation as a list of ``{"role": ..., "content": ...}``
            dicts; must contain at least one message.

    Returns:
        A new conversation ``[{"role": flipped_role, "content": same_content}]``.
        It has the same shape as the input (a flat list of message dicts), so
        it can be queued directly as a candidate message.
    """
    first = message[0]
    flipped_role = "user" if first["role"] == "system" else "system"
    # Return the conversation itself, not a list wrapping it: callers index
    # candidates as candidate[0]["content"].
    return [{"role": flipped_role, "content": first["content"]}]

In [437]:
def find_parents(content, dataset):
    """Return the distinct instructions whose recorded response equals ``content``.

    ``dataset`` is anything ``pd.DataFrame`` accepts that provides the
    ``Instruction1`` and ``Response1`` columns. The result is the array
    produced by ``Series.unique()``.
    """
    frame = pd.DataFrame(dataset)
    produced_content = frame["Response1"] == content
    return frame.loc[produced_content, "Instruction1"].unique()

In [438]:
def find_children(content, dataset):
    """Return the distinct responses recorded for the instruction ``content``.

    ``dataset`` is anything ``pd.DataFrame`` accepts that provides the
    ``Instruction1`` and ``Response1`` columns. The result is the array
    produced by ``Series.unique()``.
    """
    frame = pd.DataFrame(dataset)
    asked_content = frame["Instruction1"] == content
    return frame.loc[asked_content, "Response1"].unique()

In [430]:
def list_candidate_messages(dataset, best_message, generation_distance, role_mode=None):
    """List the candidate messages within ``generation_distance`` steps of the best one.

    Starting from the content of ``best_message``, each step adds the parents
    (``find_parents``) and children (``find_children``) of the contents found
    in the previous step. Every distinct content, in order of first
    appearance, is then turned into one candidate per requested role.

    Args:
        dataset: Battle results with ``Instruction1`` / ``Response1`` columns.
        best_message: Current best conversation; only the content of its first
            message is used.
        generation_distance: Number of parent/child expansion steps; ``0``
            yields candidates for the best content only.
        role_mode: ``"system-user"``, ``"user"`` or ``"system"``. Defaults to
            the notebook-level ``roles`` setting.

    Returns:
        A list of one-message conversations ``[{"role": ..., "content": ...}]``.
        For ``"system-user"`` each content yields the user candidate followed
        by the system candidate.

    Raises:
        ValueError: If ``role_mode`` is not one of the three supported values.
    """
    if role_mode is None:
        role_mode = roles
    if role_mode not in ("system-user", "user", "system"):
        raise ValueError(f"unknown roles setting: {role_mode!r}")

    best_content = best_message[0]["content"]
    # Dict used as an insertion-ordered set of every content seen so far.
    seen = {best_content: None}
    frontier = [best_content]

    for i in range(generation_distance):
        print (f"starting level {i}, generation distance {generation_distance}")
        discovered = []
        for content in frontier:
            discovered.extend(find_parents(content, dataset))
            discovered.extend(find_children(content, dataset))

        print (f"behind for loops")
        # Only expand contents that are new: relatives of anything already
        # seen were collected at an earlier level, so re-visiting them would
        # only add duplicates.
        frontier = []
        for content in discovered:
            if content not in seen:
                seen[content] = None
                frontier.append(content)
        print(f"endind level {i}")

    messages = []
    for content in seen:
        # Membership tests, not `roles == "a" or "b"`, which is always truthy.
        if role_mode in ("system-user", "user"):
            messages.append([{"role": "user", "content": content}])
        if role_mode in ("system-user", "system"):
            messages.append([{"role": "system", "content": content}])
    return messages

In [477]:
# One generation of battles: every queued candidate challenges the current best
# message. The cell ends by preparing `candidate_messages` for its next run.
for candidate_message in candidate_messages:
    data = battle(best_message, best_response, candidate_message)
    dataset = pd.concat([dataset, data], ignore_index=True)
    # Saved after every battle, to the same file the resume branch (new = False) reads.
    dataset.to_json(dataset_path, lines=True, orient="records")
    if data["Choice"].iloc[0] == "Yes":
        # The candidate won: it becomes the message to beat.
        best_message = candidate_message
        best_response = data["Response1"].iloc[0]
        print(f"New best message:{best_message}")
        if battles_generation == "dense":
            # Restart the neighbourhood search around the new best message.
            generation_distance = 0
    # Single quotes inside the replacement field: reusing the outer double
    # quotes is a SyntaxError before Python 3.12.
    print(f"Instruction1: {candidate_message[0]['content']}")

    if battles_generation == "only-new":
        # The candidate's own response becomes a candidate for the next generation.
        new_message = [{"role": candidate_message[0]["role"], "content": data["Response1"].iloc[0]}]
        if new_message not in next_candidate_messages:
            next_candidate_messages.append(new_message)
            if roles == "system-user":
                next_candidate_messages.append(switch_system_and_user(new_message))

if battles_generation == "only-new":
    candidate_messages = next_candidate_messages
    next_candidate_messages = []
elif battles_generation == "dense":
    # Listed with the current distance first; the distance grows only afterwards.
    candidate_messages = list_candidate_messages(dataset, best_message, generation_distance)
    generation_distance += 1
print(f"all done, generation distance: {generation_distance}, number of candidate messages: {len(candidate_messages)}")

[2024-08-09 11:32:52,819] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-09 11:32:53,630] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-09 11:32:53,632] [oaieval.py:215] [1;35mRun started: 240809093253EUBPYK5O[0m
[2024-08-09 11:32:53,633] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-09 11:32:53,660] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-09 11:32:53,661] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-09 11:32:53,661] [eval.py:36] Evaluating 1 samples
[2024-08-09 11:32:53,682] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.30s/it]
[2024-08-09 11:32:55,993] [oaieval.py:275] Found 2/2 sampling events with usage data
[2024-08-09 11:32:55,994] [oaieval.py:283] 

In [473]:
# Show the 20 most recent battles.
dataset.tail(20)

Unnamed: 0,Instruction1,Response1,Instruction2,Response2,Sampled,Choice,Data
87,Some possible topics you can ask me about incl...,\n\nIs there a specific technology trend that ...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Both responses provide a list of topics tha...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
88,Some possible topics you can ask me about incl...,What are some technology trends that you think...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Response 1 includes a wider range of topics...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
89,Feel free to ask me anything you're curious ab...,What is your favorite thing about being an AI ...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Response 1 directly asks about the AI assis...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
90,Feel free to ask me anything you're curious ab...,What is your favorite topic to discuss?,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,"1. Response 1 is more direct and to the point,...",No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
91,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Both responses provide a list of possible t...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
92,I can help with a variety of topics such as ge...,Feel free to ask me anything you're curious ab...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Response 1 is more concise and straightforw...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
93,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Both responses provide a list of possible t...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
94,I can help with a variety of topics such as ge...,Feel free to ask me anything you're curious ab...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Response 1 is more concise and straightforw...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
95,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Response 1 includes a wider range of topics...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
96,I can help with a variety of topics such as ge...,Feel free to ask me anything you're curious ab...,I can help with a variety of topics such as ge...,Some possible topics you can ask me about incl...,1. Response 1 is more concise and straightforw...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...


In [433]:
from IPython.display import display_markdown

In [474]:
# Render every field of the most recent battle the candidate won ("Yes"),
# leaving out the raw log stored in the Data column.
won_battles = dataset.loc[dataset["Choice"] == "Yes"]
latest_win = won_battles.iloc[-1].drop("Data")
for entry in latest_win:
    display_markdown(entry, raw=True)

Some possible topics you can ask me about include:

1. Technology trends
2. Travel recommendations
3. Health and wellness tips
4. Career advice
5. Book or movie recommendations
6. Historical events
7. Science concepts
8. Cooking tips and recipes
9. Fashion trends
10. Product reviews

Feel free to ask me anything related to these topics or any other subject you're interested in!



Is there a specific technology trend that you find particularly interesting or exciting right now?

I can help with a variety of topics such as general information, recommendations, definitions, explanations, and more. Just let me know what you're curious about!

Some possible topics you can ask me about include:

1. Technology trends
2. Travel recommendations
3. Health and wellness tips
4. Career advice
5. Book or movie recommendations
6. Historical events
7. Science concepts
8. Cooking tips and recipes
9. Fashion trends
10. Product reviews

Feel free to ask me anything related to these topics or any other subject you're interested in!

1. Response 1 directly engages with the topic of technology trends by asking about a specific trend, showing a clear understanding of the user's interests.
2. Response 2 is more general and does not directly address any specific topic, leaving it up to the user to decide what to ask about.

Based on the direct engagement with the topic of technology trends in Response 1, it can be considered better than Response 2.

Yes

Yes

In [476]:
# Candidates queued by the battle loop for its next run.
candidate_messages

[[{'role': 'user',
   'content': "Some possible topics you can ask me about include:\n\n1. Technology trends\n2. Travel recommendations\n3. Health and wellness tips\n4. Career advice\n5. Book or movie recommendations\n6. Historical events\n7. Science concepts\n8. Cooking tips and recipes\n9. Fashion trends\n10. Product reviews\n\nFeel free to ask me anything related to these topics or any other subject you're interested in!"}],
 [{'role': 'system',
   'content': "Some possible topics you can ask me about include:\n\n1. Technology trends\n2. Travel recommendations\n3. Health and wellness tips\n4. Career advice\n5. Book or movie recommendations\n6. Historical events\n7. Science concepts\n8. Cooking tips and recipes\n9. Fashion trends\n10. Product reviews\n\nFeel free to ask me anything related to these topics or any other subject you're interested in!"}],
 [{'role': 'user',
   'content': "I can help with a variety of topics such as general information, recommendations, definitions, expla