## Building a BATTLE Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a pandas DataFrame

In [143]:
# Run configuration for the notebook.
# new: True starts with an empty dataset and asks the model for a first response;
#      False resumes from the dataset previously saved at dataset_path.
new = True
# Conversation used as the very first "best" message when new is True.
initial_message = [{"role": "user", "content": ""}]
# battles_generation: how the next round of candidates is chosen.
#   "dense"    - candidates are built by list_candidate_messages() around the best message
#   "only-new" - the responses produced in this round become the next candidates
battles_generation = "dense"      # methods: dense, only-new
# roles: intended to select which chat role(s) candidate messages are sent with.
roles = "system-user"             # methods: system-user, system, user

# One-off repository setup (kept disabled):
#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [144]:
import os
from openai import OpenAI
import pandas as pd
import yaml
#import datetime
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas


In [145]:
# Filesystem layout. Everything is derived from the current working directory,
# which is assumed to be the examples/ folder next to the evals package.

evals_path = os.path.join(os.getcwd(), "..", "evals")

# Registry entry describing the eval.
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file consumed by the eval; its folder is created on demand.
data_path = os.path.join(evals_path, "registry", "data", "battles", "samples.jsonl")
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Record file written by oaieval, kept in a local logs/ folder.
json_logs_path = os.path.join(os.getcwd(), "logs", "logs")
os.makedirs(os.path.dirname(json_logs_path), exist_ok=True)

# Folder and file where the accumulated battle dataset is stored.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)
dataset_path = os.path.join(df_path, "dataset")

In [146]:
# Registry definition for the "battles" eval, written to registry_path as YAML.
# The top-level "battles" entry points at the concrete eval "battles.test.v1",
# which is configured as a model-graded classification over battles/samples.jsonl.

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

with open(registry_path, "w") as registry_file:
    yaml.dump(registry_yaml, registry_file)

In [147]:
# Initial data
#
# Establish the starting state of the search:
#   dataset        - list of battle records (one dict per battle), appended to by the main loop
#   best_message   - current best message, as a chat message list [{"role", "content"}]
#   best_response  - the model response belonging to best_message
if new:
    # Fresh run: nothing recorded yet, so the configured initial message is the
    # best one by default and its response is sampled from the model once.
    dataset = []
    best_message = initial_message
    best_response = client.chat.completions.create(
        messages=initial_message,
        model="gpt-3.5-turbo",
    ).choices[0].message.content
else:
    # Resume: reload the saved battles as a list of dicts so that the main loop
    # can keep calling dataset.append(...) on it (a DataFrame would not work there).
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True).to_dict("records")
    last_battle = dataset[-1]
    # "No" means the candidate (side 1) lost, so side 2 is still the best;
    # any other choice means side 1 won the last battle.
    if last_battle["Choice"] == "No":
        best_content = last_battle["Instruction2"]
        best_response = last_battle["Response2"]
    else:
        best_content = last_battle["Instruction1"]
        best_response = last_battle["Response1"]
    # The rest of the notebook indexes best_message[0]["content"], so the stored
    # text has to be wrapped back into a chat message.
    # NOTE(review): the role is not persisted in the dataset; the role of
    # initial_message is used as a stand-in - confirm this is acceptable.
    best_message = [{"role": initial_message[0]["role"], "content": best_content}]
candidate_messages = [best_message]
next_candidate_messages = []
generation_distance = 0


In [148]:
def battle(best_message, best_response, candidate_message):
    """Run one graded battle between a candidate message and the current best.

    Writes a single-sample file to ``data_path``, runs the ``battles`` eval via
    the ``oaieval`` command line tool, then reads the recorded log back and
    extracts both responses and the grader's verdict from it.

    Args:
        best_message: chat message list of the current best; stored as ``input2``.
        best_response: response already obtained for ``best_message``; stored
            as ``completion2``.
        candidate_message: chat message list of the challenger; stored as ``input1``.

    Returns:
        dict with the keys ``Instruction1``, ``Response1``, ``Instruction2``,
        ``Response2``, ``Sampled``, ``Choice`` and ``Data`` (the full log as a dict).
    """

    # One-sample dataset: the candidate is side 1, the current best is side 2.
    dataset = [{"input1": candidate_message, "input2": best_message, "completion2":best_response}]

    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    # IPython shell magic; the record path is relative to the working directory.
    # NOTE(review): this is expected to be the same file as json_logs_path, which
    # holds only while the working directory is the one json_logs_path was built from.
    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    # Reload the recorded log; from here on `df` holds the log, not the samples.
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    #current_time = datetime.datetime.now()
    #formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    #df.to_json(os.path.join(df_path, formatted_time), lines=True, orient="records")

    instruction1 = candidate_message[0]["content"]
    instruction2 = best_message[0]["content"]

    # The responses are cut out of the prompt text stored in the second-to-last
    # log record, using the section markers that surround them.
    # NOTE(review): presumably that record is the grader's sampling event and the
    # last record is the final verdict - verify against the evals log format.
    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    # Turn backslash-escaped quotes and literal "\n" sequences back into real characters.
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response2.replace("\\'", "'").replace("\\n", "\n")
    print(f"Response1: {response1}")
    #print(f"response2: {response2}")

    # Text sampled in the second-to-last record (first sample only).
    sampled = df["data"].iloc[-2]["sampled"][0]

    # Verdict taken from the last record; the main loop compares it with "Yes".
    choice = df["data"].iloc[-1]["choice"]

    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': df.to_dict()}
    return data

In [149]:
def switch_system_and_user(message):
    """Return a copy of a one-turn chat message with its role flipped.

    A "system" message becomes a "user" message; any other role becomes
    "system". The content is carried over unchanged and the input is not
    modified.

    Args:
        message: chat message list whose first entry is a dict with the keys
            "role" and "content".

    Returns:
        A new chat message list ``[{"role": ..., "content": ...}]`` - the same
        shape as the input, so it can be queued directly as a candidate message.
    """
    flipped_role = "user" if message[0]["role"] == "system" else "system"
    # Build the message list directly; wrapping it in a further list would give
    # callers a list of messages instead of a single message.
    return [{"role": flipped_role, "content": message[0]["content"]}]

In [150]:
def find_parents(content, dataset):
    """Return the distinct instructions whose recorded response equals ``content``.

    ``dataset`` is anything ``pd.DataFrame`` accepts that yields the columns
    "Instruction1" and "Response1". The result is whatever ``Series.unique()``
    returns for the matching "Instruction1" values.
    """
    print (f"finding parents {content}")
    battles = pd.DataFrame(dataset)
    produced_content = battles["Response1"] == content
    return battles.loc[produced_content, "Instruction1"].unique()

In [151]:
def find_children(content, dataset):
    """Return the distinct responses recorded for the instruction ``content``.

    ``dataset`` is anything ``pd.DataFrame`` accepts that yields the columns
    "Instruction1" and "Response1". The result is whatever ``Series.unique()``
    returns for the matching "Response1" values.
    """
    print (f"finding children {content}")
    battles = pd.DataFrame(dataset)
    asked_with_content = battles["Instruction1"] == content
    return battles.loc[asked_with_content, "Response1"].unique()

In [152]:
def list_candidate_messages(dataset, best_message, generation_distance, roles_mode=None):
    """Build the candidate messages located around the current best message.

    Starting from the content of ``best_message``, the battle history is walked
    ``generation_distance`` levels in both directions: "up" to the instructions
    that produced a content (``find_parents``) and "down" to the responses a
    content produced (``find_children``). Every distinct content found, the
    best one included, is turned into chat messages with the selected role(s).

    Args:
        dataset: battle records accepted by ``find_parents`` / ``find_children``.
        best_message: chat message list; only ``best_message[0]["content"]`` is read.
        generation_distance: number of levels to walk; 0 yields only the best content.
        roles_mode: "user", "system" or "system-user". When omitted, the
            notebook-level ``roles`` setting is used.

    Returns:
        List of chat message lists. For each content, the "user" variant comes
        first and the "system" variant second (when both are selected).

    Raises:
        ValueError: if the role selection is not one of the supported values.
    """
    # Only touch the global when no explicit mode was passed in.
    mode = roles if roles_mode is None else roles_mode
    if mode not in ("system-user", "system", "user"):
        raise ValueError(f"unsupported roles setting: {mode!r}")

    best_content = best_message[0]["content"]
    last_up = [best_content]
    last_down = [best_content]
    list_of_contents = [best_content]

    for i in range(generation_distance):
        print (f"starting level {i}, generation distance {generation_distance}")
        # Start each level with empty frontiers so that contents of earlier
        # levels are not searched again on every further level.
        next_up = []
        next_down = []
        for content in last_up:
            next_up.extend(find_parents(content, dataset))
        for content in last_down:
            next_down.extend(find_children(content, dataset))
        print (f"behind for loops")
        list_of_contents.extend(next_up)
        list_of_contents.extend(next_down)
        last_up = next_up
        last_down = next_down
        print(f"endind level {i}")

    # Drop duplicates while keeping the order of first appearance.
    list_of_contents = list(dict.fromkeys(list_of_contents))

    messages = []
    for content in list_of_contents:
        # Membership tests: `x == "a" or "b"` would always be truthy.
        if mode in ("system-user", "user"):
            messages.append([{"role": "user", "content": content}])
        if mode in ("system-user", "system"):
            messages.append([{"role": "system", "content": content}])
    return messages

In [216]:
# One round of battles: every queued candidate fights the current best message,
# then the candidate queue for the next round is rebuilt.
for candidate_message in candidate_messages:
    data = battle(best_message, best_response, candidate_message)
    dataset.append(data)
    # Save after every battle so an interrupted run can be resumed from dataset_path.
    pd.DataFrame(dataset).to_json(dataset_path, lines=True, orient="records")
    if data["Choice"] == "Yes":
        # The candidate won: it becomes the new best, together with its response.
        best_message = candidate_message
        best_response = data["Response1"]
        print(f"New best message:{best_message}")
        if battles_generation == "dense":
            # Restart the neighbourhood search around the new best.
            generation_distance = 0
    # Single quotes inside the f-string keep this valid on Python versions before 3.12.
    print(f"Instruction1: {candidate_message[0]['content']}")

    if battles_generation == "only-new":
        # The response just produced becomes a candidate for the next round,
        # sent with the same role as the message that produced it.
        new_message = [{"role": candidate_message[0]["role"], "content": data["Response1"]}]
        if new_message not in next_candidate_messages:
            next_candidate_messages.append(new_message)
            if roles == "system-user":
                # Also queue the same content under the opposite role.
                next_candidate_messages.append(switch_system_and_user(new_message))

if battles_generation == "only-new":
    candidate_messages = next_candidate_messages
    next_candidate_messages = []
if battles_generation == "dense":
    print("going to list candidate messages")
    candidate_messages = list_candidate_messages(dataset, best_message, generation_distance)
    # Widen the search by one level for the following round.
    generation_distance += 1
    print("candidate messages listed")
print(f"all done, generation distance {generation_distance}")

[2024-08-08 10:06:39,972] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-08 10:06:42,132] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-08 10:06:42,135] [oaieval.py:215] [1;35mRun started: 240808080642OF46P6EA[0m
[2024-08-08 10:06:42,137] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-08 10:06:42,290] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-08 10:06:42,291] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-08 10:06:42,291] [eval.py:36] Evaluating 1 samples
[2024-08-08 10:06:42,413] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 1/1 [00:04<00:00,  4.06s/it]
[2024-08-08 10:06:46,486] [oaieval.py:275] Found 2/2 sampling events with usage data
[2024-08-08 10:06:46,487] [oaieval.py:283] 

In [218]:
# Show the ten most recent battle records.
pd.DataFrame(dataset).tail(10)

Unnamed: 0,Instruction1,Response1,Instruction2,Response2,Sampled,Choice,Data
77,I have a question about a project I'm working ...,Of course! I'd be happy to help. Can you provi...,Of course! I'd be happy to help you brainstorm...,Here are some questions to help me better unde...,1. Response 1 acknowledges the request for hel...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
78,I have a question about a project I'm working ...,Of course! I'd be happy to help you brainstorm...,Of course! I'd be happy to help you brainstorm...,Here are some questions to help me better unde...,1. Response 1 acknowledges the request for hel...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
79,Some questions to consider when providing more...,- What is the timeline for the project and are...,Of course! I'd be happy to help you brainstorm...,Here are some questions to help me better unde...,1. Response 1 provides a more comprehensive li...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
80,Some questions to consider when providing more...,- What is the timeline for the project?\n- Hav...,Of course! I'd be happy to help you brainstorm...,Here are some questions to help me better unde...,1. Response 1 asks more specific questions rel...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
81,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,Of course! I'd be happy to help you brainstorm...,Here are some questions to help me better unde...,1. Response 1 provides more detailed informati...,Yes,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
82,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,1. Both responses address all the questions in...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
83,Some questions to consider when providing more...,- What is the timeline or deadline for the pro...,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,1. The first response addresses important aspe...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
84,Some questions to consider when providing more...,Thank you for providing more context! Based on...,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,1. Response 1 provides a more detailed and str...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
85,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,1. Response 1 provides more specific details a...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...
86,Here are some questions to help me better unde...,Thank you for providing those questions. Here ...,Here are some questions to help me better unde...,1. The topic of my project is the impact of so...,1. Response 1 provides more detailed answers t...,No,{'spec': {0: {'completion_fns': ['gpt-3.5-turb...


In [226]:
# Show every field of the third-to-last battle record as a plain list.
pd.DataFrame(dataset).iloc[-3].tolist()

["Some questions to consider when providing more details about your project could include:\n\n- What is the goal or objective of the project?\n- What is the target audience or market for the project?\n- What resources do you currently have available for the project?\n- What specific challenges or obstacles are you facing with the project?\n- What specific areas do you need help or guidance with (e.g. brainstorming ideas, creating a plan, executing the project, etc.)?\n\nFeel free to provide as much information as you can, and I'll do my best to assist you with your project!",
 "Thank you for providing more context! Based on the information you've shared, it seems like you are working on a project related to creating a digital platform for a non-profit organization. Here are some questions to help you further clarify your project:\n\n1. What is the main goal or objective of the digital platform you are creating for the non-profit organization? Is it to increase awareness, raise funds, e

In [223]:
# Show the candidate responses of the ten most recent battles.
pd.DataFrame(dataset)["Response1"].tail(10).tolist()

["Of course! I'd be happy to help. Can you provide some more details about your project and what specific aspects you're looking for guidance on? This will help me tailor my suggestions to better suit your needs.",
 "Of course! I'd be happy to help you brainstorm ideas or provide guidance on your project. Can you please provide me with some more details about the project and what specific help you are looking for? This will help me better understand the scope of the project and provide you with more tailored assistance.",
 "- What is the timeline for the project and are there any deadlines that need to be met?\n- What is the budget for the project and are there any financial constraints that need to be considered?\n- Are there any key stakeholders or team members involved in the project that need to be consulted or informed?\n- What are the key deliverables or outcomes that need to be achieved by the end of the project?\n- Are there any specific metrics or KPIs that will be used to mea

In [227]:
# Inspect the candidate messages queued for the next round of battles.
candidate_messages

[[{'role': 'user',
   'content': "Here are some questions to help me better understand your project:\n\n1. What is the topic or subject of your project?\n2. What is the goal or objective of your project?\n3. What stage are you currently at in the project (e.g., planning, research, writing, etc.)?\n4. Are there any specific challenges or areas where you are seeking guidance or ideas?\n5. What is the timeline or deadline for the project?\n6. Are there any specific requirements or constraints that you need to consider for the project?\n7. Do you have any specific preferences or ideas that you would like to incorporate into the project?\n\nFeel free to provide as much detail as you can, and I'll do my best to assist you with your project."}],
 [{'role': 'system',
   'content': "Here are some questions to help me better understand your project:\n\n1. What is the topic or subject of your project?\n2. What is the goal or objective of your project?\n3. What stage are you currently at in the pr

In [228]:
# Inspect the message currently considered the best.
best_message

[{'role': 'user',
  'content': "Here are some questions to help me better understand your project:\n\n1. What is the topic or subject of your project?\n2. What is the goal or objective of your project?\n3. What stage are you currently at in the project (e.g., planning, research, writing, etc.)?\n4. Are there any specific challenges or areas where you are seeking guidance or ideas?\n5. What is the timeline or deadline for the project?\n6. Are there any specific requirements or constraints that you need to consider for the project?\n7. Do you have any specific preferences or ideas that you would like to incorporate into the project?\n\nFeel free to provide as much detail as you can, and I'll do my best to assist you with your project."}]

In [229]:
# Inspect the response that belongs to the current best message.
best_response

"1. The topic of my project is the impact of social media on mental health.\n2. The goal of my project is to explore how social media usage can affect individuals' mental well-being and to provide recommendations for promoting a healthier relationship with social media.\n3. I am currently in the research phase of the project, gathering data and literature on the topic.\n4. I am seeking guidance on how to structure my findings and analysis in a coherent and impactful way. I am also looking for ideas on potential interventions or strategies to mitigate the negative effects of social media on mental health.\n5. The deadline for the project is in two months.\n6. One constraint I need to consider is the limited scope of the project, as I need to focus on specific aspects of social media and mental health due to time constraints.\n7. I would like to incorporate real-life examples and case studies into the project to make it more relatable and engaging for the audience."