## Building a BATTLE Eval

In [1]:
# Name of the JSON-lines file (inside df_path) that stores the battle dataset.
file_name = "dataset"
# Model name passed to client.chat.completions.create for every completion request.
model = "gpt-3.5-turbo"
# Who judges a battle: "ai" runs the model-graded loop below; with "user" the
# verdict is typed in by hand in the "choice" cell.
choices = "user"                     # options: user, ai
# How many earlier winning battles are prepended as few-shot examples (0 = none).
num_few_shot = 0
# Conversation used to seed a brand-new dataset.
initial_message = [{"role": "user", "content": ""}]
# Role(s) given to the candidate messages built by list_candidate_messages.
roles = "user"                      # options: system-user, system, user
# Strategy used to produce the next candidate messages.
battles_generation = "complete-chat"        # options: dense, only-new, complete-chat

#!cd evals
#!git lfs fetch --all
#!git lfs pull
import os
from openai import OpenAI
import pandas as pd
import yaml
from IPython.display import display_markdown
import copy

client = OpenAI()
#import datetime
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")

# Install Evals if you haven't already
# %pip install -e ../.
# pip install --upgrade openai
# %pip install pandas

# Filesystem layout. The notebook is assumed to be run from examples/.

evals_path = os.path.join(os.getcwd(), "..", "evals")
registry_path = os.path.join(evals_path, "registry", "evals", "battles.yaml")

# Samples file read by the eval; its parent directory is created on demand.
data_path = os.path.join(evals_path, "registry", "data", "battles", "samples.jsonl")
os.makedirs(os.path.dirname(data_path), exist_ok=True)

# Record file produced by the eval run: <cwd>/logs/logs.
json_logs_path = os.path.join(os.getcwd(), "logs", "logs")
os.makedirs(os.path.dirname(json_logs_path), exist_ok=True)

# Directory holding the saved battle dataset, and the dataset file itself.
df_path = os.path.join(evals_path, "evallogs", "df")
os.makedirs(df_path, exist_ok=True)
dataset_path = os.path.join(df_path, file_name)
# Registry yaml: declare the "battles" eval and the concrete version it points to,
# then write the whole mapping to registry_path.

registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}
with open(registry_path, "w") as f:
    yaml.dump(registry_yaml, f)
def best(dataset, memory_limit=False):
    """Return ``(best_message, best_response)`` for the current state of ``dataset``.

    The response is always read from the last row: ``Response1`` when that row's
    ``Choice`` is ``"Yes"``, otherwise ``Response2``.

    The message depends on ``memory_limit``:
    * truthy  -> a single user message built from the matching ``Instruction``
      column of the last row;
    * falsy   -> the ``"Full message"`` stored in ``Data`` of the last row whose
      ``Choice`` is ``"Yes"``, falling back to row 0 when no such row exists.
    """
    latest = dataset.iloc[-1]
    slot = "1" if latest["Choice"] == "Yes" else "2"
    best_content = latest["Instruction" + slot]
    best_response = latest["Response" + slot]

    if memory_limit:
        best_message = [{"role": "user", "content": best_content}]
    else:
        winners = dataset[dataset["Choice"]=="Yes"]
        try:
            best_message = winners.iloc[-1]["Data"]["Full message"]
        except IndexError:
            # No winning row yet: use the conversation stored with the first row.
            best_message = dataset.iloc[0]["Data"]["Full message"]
    return best_message,best_response

In [158]:
# Load the saved battle dataset, or create a fresh one seeded with
# `initial_message` when no dataset file exists yet.

try:
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True)

# Or create new data file

except FileNotFoundError:
    completion = client.chat.completions.create(
        messages = initial_message,
        model = model,
        )
    dataset = pd.DataFrame(
        [
            ["", "", "", "", "", "", {}],
            ["", completion.choices[0].message.content, "", "", "", "", {}]
            ],
        columns=['Instruction1', 'Response1', 'Instruction2', 'Response2', 'Sampled', 'Choice', 'Data']
    )
    dataset.at[0, "Data"] = {"Completion":completion.to_dict(),"Full message":initial_message}

# Derive the current best pair in BOTH branches. Previously the names were only
# bound when the file already existed, so the "ai" loop raised NameError on a
# freshly created dataset. Using best() keeps the result identical to what a
# later reload of the saved file would produce.
best_message, best_response = best(dataset)

candidate_messages = [initial_message]
next_candidate_messages = []
generation_distance = 0
message_in_candidate_messages = 0
# Save to the same path the dataset is loaded from (was a hard-coded "dataset"
# file name, which diverged from dataset_path whenever file_name was changed).
dataset.to_json(dataset_path, lines=True, orient="records")


In [7]:
def battle(best_message, best_response, candidate_message):
    """Run one model-graded battle between a candidate message and the current best.

    Writes a single sample (``input1`` = candidate, ``input2`` = best message,
    ``completion2`` = best response) to ``data_path``, runs the ``battles`` eval
    through the ``oaieval`` command line, then parses the record file at
    ``json_logs_path``.

    Returns a one-row DataFrame with the columns ``Instruction1``, ``Response1``,
    ``Instruction2``, ``Response2``, ``Sampled``, ``Choice`` and ``Data``; ``Data``
    holds the whole parsed log as a dict.
    """

    dataset = [{"input1": candidate_message, "input2": best_message, "completion2":best_response}]

    df = pd.DataFrame(dataset)
    df.to_json(data_path, orient="records", lines=True)

    # NOTE(review): the model name and record path are hard-coded here instead of
    # using the `model` / `json_logs_path` settings - confirm this is intended.
    !oaieval gpt-3.5-turbo battles --record_path logs/logs
    
    with open(json_logs_path, "r") as f:
        df = pd.read_json(f, lines=True)

    #current_time = datetime.datetime.now()
    #formatted_time = current_time.strftime("%Y%m%d_%H%M%S")
    #df.to_json(os.path.join(df_path, formatted_time), lines=True, orient="records")

    instruction1 = candidate_message[-1]["content"]
    instruction2 = best_message[-1]["content"]

    # The grading prompt is read from the second-to-last log record; both
    # responses are recovered by splitting on the section headers of that prompt.
    battle_prompt_content = df["data"].iloc[-2]["prompt"][0]["content"]
    response1 = battle_prompt_content.split("\n[Response 1]\n",)[1].split("\n\n[Instruction 2]\n")[0]
    # Undo the escaping of quotes and newlines found in the logged prompt text.
    response1 = response1.replace("\\'", "'").replace("\\n", "\n")
    response2 = battle_prompt_content.split("\n[Response 2]\n",)[1].split("\n\n\nIs the first response better than the second?")[0]
    response2 = response2.replace("\\'", "'").replace("\\n", "\n")
    print(f"Response1: {response1}")
    #print(f"response2: {response2}")

    # First sampled text of the second-to-last record.
    sampled = df["data"].iloc[-2]["sampled"][0]

    # The verdict is read from the last record.
    choice = df["data"].iloc[-1]["choice"]

    data = {'Instruction1': instruction1, 'Response1': response1, 'Instruction2': instruction2, 'Response2': response2, 'Sampled': sampled, 'Choice': choice, 'Data': {}}
    data = pd.DataFrame([data])
    # The dict is assigned after construction, into the single cell of the row.
    data.at[0, "Data"] = df.to_dict()
    return data
def switch_system_and_user(message):
    """Return a one-entry message whose role is the opposite of ``message[0]``.

    A leading ``"system"`` entry becomes ``"user"``; any other role becomes
    ``"system"``. Only the first entry of ``message`` is used; its content is
    copied unchanged.

    The result is a flat list of role/content dicts, i.e. the same shape as every
    other message in this notebook. (It used to be wrapped in an extra list,
    ``[[{...}]]``, which broke callers that index ``message[-1]["content"]``.)

    Raises IndexError when ``message`` is empty.
    """
    first = message[0]
    swapped_role = "user" if first["role"] == "system" else "system"
    return [{"role": swapped_role, "content": first["content"]}]
def list_candidate_messages(dataset,generation_distance,roles="user"):
    """Build candidate messages from contents near the current best message.

    Starting from the content of the last entry of the best message, the search
    expands ``generation_distance`` times through ``find_parents`` and
    ``find_children``. The collected contents are de-duplicated (first occurrence
    wins) and turned into single-entry messages.

    roles:
        ``"user"``        -> one user message per content,
        ``"system"``      -> one system message per content,
        ``"system-user"`` -> both, user first.

    Returns a list of messages, each a list holding one role/content dict.
    """
    best_message = best(dataset)[0]
    best_content = best_message[-1]["content"]
    frontier = [best_content]
    list_of_contents = [best_content]

    for _ in range(generation_distance):
        # Start every level with an empty list: re-expanding the contents of all
        # earlier levels only produced duplicates that were dropped again below,
        # while the amount of work grew with every level.
        neighbours = []
        for content in frontier:
            neighbours.extend(find_parents(content, dataset))
            neighbours.extend(find_children(content, dataset))
        list_of_contents.extend(neighbours)
        frontier = neighbours

    list_of_contents = pd.array(list_of_contents).unique().tolist()

    # `roles == "system-user" or "user"` was always true (a non-empty string is
    # truthy), so user messages were emitted even for roles="system".
    with_user = roles in ("system-user", "user")
    with_system = roles in ("system-user", "system")

    messages = []
    for content in list_of_contents:
        if with_user:
            messages.append([{"role":"user","content":content}])
        if with_system:
            print("Creating message without user role.")
            messages.append([{"role":"system","content":content}])

    return messages
def make_x_shot_prompt(dataset,message,num_few_shot):
    """Prepend the most recent winning battles to ``message`` as few-shot examples.

    The last ``num_few_shot`` rows whose ``Choice`` is ``"Yes"`` each contribute
    two system entries: ``Instruction1`` named ``example_user`` and ``Response1``
    named ``example_assistant``.

    A ``num_few_shot`` of 0 (or less) adds no examples. Previously 0 selected
    *every* winning row, because ``iloc[-0:]`` is the same slice as ``iloc[0:]``.

    Returns a new list; ``message`` itself is not modified.
    """
    if num_few_shot <= 0:
        return list(message)

    winners = dataset[dataset["Choice"]=="Yes"].iloc[-num_few_shot:]
    few_shot_prompt = []
    for _, key_battle in winners.iterrows():
        few_shot_prompt.append({"role": "system", "content": key_battle["Instruction1"], "name": "example_user"})
        few_shot_prompt.append({"role": "system", "content": key_battle["Response1"], "name": "example_assistant"})
    return few_shot_prompt + message
def transform_roles(chat,assistant_transform_type = "user-assistant"):
    """Return a deep copy of ``chat`` with its roles relabelled.

    With the default ``"user-assistant"`` mode the entries are walked from the
    end towards the start and alternately labelled ``"user"``, ``"assistant"``,
    ``"user"``, ... (the last entry becomes ``"user"``); the walk stops at the
    first ``"system"`` entry it meets.

    With any other value, every ``"assistant"`` role is replaced by that value
    and all other entries are left as they are. ``chat`` is never modified.
    """
    relabelled = copy.deepcopy(chat)

    if assistant_transform_type != "user-assistant":
        for entry in relabelled:
            if entry["role"] == "assistant":
                entry["role"] = assistant_transform_type
        return relabelled

    for offset, entry in enumerate(reversed(relabelled), start=1):
        if entry["role"] == "system":
            break
        entry["role"] = "user" if offset % 2 == 1 else "assistant"
    return relabelled
def user_choice_prompt(candidate_message,n,best_message,best_response):
    """Request a completion for ``candidate_message`` and display a comparison prompt.

    The rendered prompt places the candidate instruction and its fresh response
    next to the last entry of ``best_message`` and ``best_response``, and asks
    whether the first response is better.

    Returns a 4-tuple. The first item is a one-row DataFrame describing the
    battle (``Choice`` left empty; ``Data`` holds the completion, the full
    candidate message and ``n`` as ``"Parent"``). The remaining three items are
    the notebook-level ``candidate_messages``, ``message_in_candidate_messages``
    and ``generation_distance`` passed through unchanged.
    """
    instruction1 = candidate_message[-1]["content"]
    prompt = (
        "You are comparing two responses to the following two instructions."
        + "\n\n[Instruction 1]\n"
        + instruction1
        + "\n\n[Response 1]\n"
    )

    completion = client.chat.completions.create(
        messages = candidate_message,
        model = model,
        )
    response1 = completion.choices[0].message.content
    prompt = prompt + response1 + "\n\n[Instruction 2]\n"

    best_content = best_message[-1]["content"]
    prompt = (
        prompt
        + best_content
        + "\n\n[Response 2]\n"
        + best_response
        + "\n\n\nIs the first response better than the second? You must provide one answer based on your subjective view."
    )

    record = {
        'Instruction1': instruction1,
        'Response1': response1,
        'Instruction2': best_content,
        'Response2': best_response,
        'Sampled': "",
        'Choice': "",
        'Data': {},
    }
    data = pd.DataFrame([record])
    # The nested dict is written into the cell after the frame is built.
    data.at[0, "Data"] = {"Completion":completion.to_dict(),"Full message":candidate_message,"Parent":n}

    display_markdown(prompt, raw=True)
    return data,candidate_messages,message_in_candidate_messages,generation_distance
def user_choice_prompt_old(candidate_messages,message_in_candidate_messages,dataset, generation_distance, roles, battles_generation,num_few_shot):
    """Older variant of the comparison prompt that also picks the candidate message.

    The candidate is selected according to ``battles_generation``:
    * ``"dense"``: take the next entry of ``candidate_messages``; once the list
      is used up, rebuild it with ``list_candidate_messages`` and increase
      ``generation_distance``.
    * ``"complete-chat"``: extend the best message with the best response and
      relabel the roles with ``transform_roles``.

    NOTE(review): for any other ``battles_generation`` value (e.g. "only-new")
    ``candidate_message`` is never assigned, so the function fails further down.

    Returns ``(data, candidate_messages, message_in_candidate_messages,
    generation_distance)``, where ``data`` is a one-row DataFrame describing the
    battle with an empty ``Choice``.
    """

    prompt = "You are comparing two responses to the following two instructions."

    prompt += "\n\n[Instruction 1]\n"
    if battles_generation == "dense":
        if message_in_candidate_messages < len(candidate_messages):
            # Still unused candidates left at the current distance.
            candidate_message = candidate_messages[message_in_candidate_messages]
            message_in_candidate_messages += 1
        else:
            # All candidates used: list new ones, then widen the distance for next time.
            candidate_messages = list_candidate_messages(dataset, generation_distance, roles)
            generation_distance += 1
            candidate_message = candidate_messages[0]
            message_in_candidate_messages = 1
            print(f"candidate messages listed, generation distance: {generation_distance}, number of candidate messages: {len(candidate_messages)}")
    if battles_generation == "complete-chat":
        chat = best(dataset)[0] + [{'role': 'assistant', 'content': best(dataset)[1]}]
        candidate_message = transform_roles(chat)
    if num_few_shot > 0:
        candidate_message = make_x_shot_prompt(dataset,candidate_message,num_few_shot)
    prompt += candidate_message[-1]["content"]

    prompt += "\n\n[Response 1]\n"
    completion = client.chat.completions.create(
        messages = candidate_message,
        model = model,
        )
    response1 = completion.choices[0].message.content
    prompt += response1
    
    prompt += "\n\n[Instruction 2]\n"
    # Content of the last entry of the current best message.
    best_content = best(dataset)[0][-1]["content"]
    prompt += best_content

    prompt += "\n\n[Response 2]\n"
    prompt += best(dataset)[1]

    prompt += "\n\n\nIs the first response better than the second? You must provide one answer based on your subjective view."

    data = {'Instruction1': candidate_message[-1]["content"], 'Response1': response1, 'Instruction2': best_content, 'Response2': best(dataset)[1], 'Sampled': "", 'Choice': "", 'Data': {}}
    data = pd.DataFrame([data])
    # Unlike user_choice_prompt, no "Parent" entry is stored here.
    data.at[0, "Data"] = {"Completion":completion.to_dict(),"Full message":candidate_message}

    display_markdown(prompt, raw=True)
    return data,candidate_messages,message_in_candidate_messages,generation_distance

In [None]:
# Model-graded round: every candidate message battles the current best one.
# Each result is appended to `dataset` and saved right away; a "Yes" verdict
# promotes the candidate to the new best message.
if choices == "ai":
    for candidate_message in candidate_messages:
        if num_few_shot > 0:
            candidate_message = make_x_shot_prompt(dataset,candidate_message,num_few_shot)
        data = battle(best_message, best_response, candidate_message)
        dataset = pd.concat([dataset, data],ignore_index=True)
        # Save to the path the dataset is loaded from (was a hard-coded file name).
        dataset.to_json(dataset_path, lines=True, orient="records")
        if data["Choice"].iloc[0] == "Yes":
            best_message = candidate_message
            best_response = data["Response1"].iloc[0]
            print(f"New best message:{best_message}")
            if battles_generation == "dense":
                generation_distance = 0

        # Single quotes inside the replacement field: reusing the outer double
        # quotes is a syntax error on Python versions before 3.12.
        print(f"Instruction1: {candidate_message[-1]['content']}")

        if battles_generation == "only-new":
            # The response just received becomes a candidate for the next round.
            new_message = [{"role": candidate_message[0]["role"], "content":data["Response1"].iloc[0]}]
            if new_message not in next_candidate_messages:
                next_candidate_messages.append(new_message)
                if roles == "system-user":
                    next_candidate_messages.append(switch_system_and_user(new_message))

    if battles_generation == "only-new":
        candidate_messages = next_candidate_messages
        next_candidate_messages = []
    if battles_generation == "dense":
        # list_candidate_messages takes (dataset, generation_distance, roles);
        # the extra best_message argument that used to be passed here made the
        # call fail with a TypeError.
        candidate_messages = list_candidate_messages(dataset, generation_distance, roles)
        generation_distance += 1
    print(f"all done, generation distance: {generation_distance}, number of candidate messages: {len(candidate_messages)}")
"""if choices == "user":
    data,candidate_messages,message_in_candidate_messages,generation_distance = user_choice_prompt_old(
        candidate_messages,
        message_in_candidate_messages,
        dataset,
        generation_distance,
        roles,
        battles_generation,
        num_few_shot)"""

In [153]:
def continue_chat(n, dataset_path):
    """Extend the conversation stored in row ``n`` by one more turn.

    Reads the dataset from ``dataset_path``, appends row ``n``'s ``Response1`` to
    its stored full message, relabels the roles with ``transform_roles`` and
    passes the result to ``user_choice_prompt`` with ``n`` as the parent.

    Returns the one-row battle DataFrame produced by ``user_choice_prompt``.
    """
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True)

    row = dataset.iloc[n]
    last_response = row["Response1"]
    chat = row["Data"]["Full message"] + [{'role': 'assistant', 'content': last_response}]
    candidate_message = transform_roles(chat)

    # The same conversation is passed as both the candidate and the reference.
    result = user_choice_prompt(candidate_message,n,candidate_message,last_response)
    return result[0]
def improve_chat(n, dataset_path):
    """Request another response for the conversation stored in row ``n``.

    Reads the dataset from ``dataset_path`` and re-submits row ``n``'s full
    message through ``user_choice_prompt``, using the row's existing
    ``Response1`` as the reference response and the row's own ``"Parent"`` as the
    parent of the new entry.

    Returns the one-row battle DataFrame produced by ``user_choice_prompt``.
    """
    with open(dataset_path, "r") as f:
        dataset = pd.read_json(f, lines=True)

    row = dataset.iloc[n]
    stored = row["Data"]
    candidate_message = stored["Full message"]
    last_response = row["Response1"]
    parent = stored["Parent"]

    result = user_choice_prompt(candidate_message,parent,candidate_message,last_response)
    return result[0]

In [389]:
# Show the "Siblings" count of the `distance` rows that follow row 0.
# NOTE(review): extend_dataset is defined in a cell further down, so that cell
# has to be run before this one.
distance = 5
ed = extend_dataset()["Siblings"]
ed[1:1+distance]

1    6
2    5
3    4
4    3
5    2
Name: Siblings, dtype: int64

In [402]:
# Render the index and the response of rows 1-6.
display_responses([1,2,3,4,5,6])

1

Hello! How can I assist you today?

2

Hello! I'm here to help you with any questions you might have or provide information on a variety of topics. Feel free to ask me anything!

3

Thank you for offering your help! Is there anything specific you would like to know more about or discuss today?

4

Thank you for asking! I would be happy to talk about various topics, such as technology, science, history, language learning, travel, and more. If you have any specific questions or interests, feel free to share them!

5

That sounds like a wide range of interesting topics to explore! If you have any questions or if you would like to delve deeper into any of these subjects, please let me know. I'm here to provide information and engage in discussions with you.

6

Thank you for your willingness to engage in discussions! Let's start with a broad question: Do you have any insights into current technology trends or developments that you find particularly fascinating or noteworthy?

In [350]:
# Manual correction: set the verdict of row 24 to "No". This only changes the
# in-memory frame; this cell does not write the dataset back to disk.
dataset.loc[24,"Choice"] = "No"

In [395]:
# Continue the conversation stored in row n; the comparison prompt is displayed
# below and the resulting one-row frame is kept in `data` for the next cell.
n = 25
data = continue_chat(n,dataset_path)

You are comparing two responses to the following two instructions.

[Instruction 1]
That's great to hear! I am knowledgeable about a wide range of topics, so feel free to ask me anything related to technology, science, history, language learning, travel, or any other subject that interests you. I'm here to help and provide information on any topic you choose. Let me know how I can assist you today!

[Response 1]
Thank you for your willingness to help! I'd like to know more about the latest advancements in artificial intelligence and how they are shaping various industries. Can you provide me with some insights on this topic?

[Instruction 2]
That's great to hear! I am knowledgeable about a wide range of topics, so feel free to ask me anything related to technology, science, history, language learning, travel, or any other subject that interests you. I'm here to help and provide information on any topic you choose. Let me know how I can assist you today!

[Response 2]
That's great to hear! I am knowledgeable about a wide range of topics, so feel free to ask me anything related to technology, science, history, language learning, travel, or any other subject that interests you. I'm here to help and provide information on any topic you choose. Let me know how I can assist you today!


Is the first response better than the second? You must provide one answer based on your subjective view.

In [394]:
# Record the verdict for the battle held in `data`, append it to the dataset
# and save. Set `choice` by hand before running this cell.
choice = ""
#choice = "Yes"
data.at[0, "Choice"] = choice
print(f"Your answer:{choice}")
if choice == "Yes":
    print("New best message and response!")
    if battles_generation == "dense":
        generation_distance = 0
dataset = pd.concat([dataset, data],ignore_index=True)
# Save to the path the dataset is loaded from (was a hard-coded "dataset" file
# name, which diverged from dataset_path whenever file_name was changed).
dataset.to_json(dataset_path, lines=True, orient="records")
#data,candidate_messages,message_in_candidate_messages,generation_distance = user_choice_prompt_old(candidate_messages,message_in_candidate_messages,dataset,generation_distance,roles,battles_generation,num_few_shot)

Your answer:


In [383]:
# Show the five most recent battles.
dataset.tail(5)

Unnamed: 0,Instruction1,Response1,Instruction2,Response2,Sampled,Choice,Data
45,Hello! I'm here to help you with any questions...,Thank you for your offer to help! I appreciate...,Hello! I'm here to help you with any questions...,Thank you for offering your help! Is there any...,,No,{'Completion': {'id': 'chatcmpl-9wEzqZk2JGX17u...
46,,Hello! How can I assist you today?,,Hello! How can I assist you today?,,No,{'Completion': {'id': 'chatcmpl-9wF69NC6iJYRi5...
47,Hello! How can I assist you today?,Hello! I'm here to help with any questions or ...,Hello! How can I assist you today?,Hello! I'm here to help you with any questions...,,No,{'Completion': {'id': 'chatcmpl-9wF7nE0AWrcOnS...
48,Hello! I'm here to help you with any questions...,Thank you for offering your help! I appreciate...,Hello! I'm here to help you with any questions...,Thank you for offering your help! Is there any...,,No,{'Completion': {'id': 'chatcmpl-9wF8KERBcLnHns...
49,Thank you for offering your help! Is there any...,Thank you for asking! I am an AI assistant and...,Thank you for offering your help! Is there any...,Thank you for asking! I would be happy to talk...,,No,{'Completion': {'id': 'chatcmpl-9wF9YZaPrtJE0H...


In [384]:
def get_best_chat(dataset=dataset):
    """Follow the winning responses from row 0 downwards.

    Starting at row 0, repeatedly move to the highest-numbered child of the
    current row whose ``Choice`` is ``"Yes"``, until the current row has no such
    child. Returns the list of visited row indices, beginning with 0.

    Note: the default ``dataset`` is the frame that existed when this cell was
    run; pass the current frame explicitly after the dataset has been extended.
    """
    winners = set(dataset[dataset["Choice"] == "Yes"].index)
    current = 0
    best_chat = [current]
    while True:
        # Forward `dataset`, so an explicitly passed frame is actually used
        # (children() used to fall back to its own default frame).
        level_up = winners.intersection(children(current, dataset))
        if not level_up:
            break
        current = max(level_up)
        best_chat.append(current)
    return best_chat
    

In [385]:
def msg_to_content(msg):
    """Return the ``"content"`` value of every entry of ``msg``, in order."""
    return [entry["content"] for entry in msg]
def best_msg_level_up(dataset, msg, best_chat):
    """Extend ``best_chat`` with the best follow-up rows of ``msg``, level by level.

    At each level the rows are kept whose ``Choice`` is not ``"No"``, whose
    stored full message has exactly one entry more than the current level, and
    for which ``is_parent`` accepts the pair; when several rows match, the last
    one is taken. That row is appended to ``best_chat`` and the search continues
    from its full message until no row matches.

    Returns the extended ``best_chat`` DataFrame.
    """
    level = 0 if best_chat.empty else len(msg)
    filtered_dataset = dataset[
        (dataset["Choice"] != "No") &
        (dataset["Data"].apply(lambda x: len(x["Full message"])) == level + 1) &
        (dataset["Data"].apply(lambda x: is_parent(x["Full message"], msg, dataset)))
        ]
    if len(filtered_dataset) > 1:
        filtered_dataset = filtered_dataset.iloc[[-1]]
        print(filtered_dataset)
    if filtered_dataset.empty:
        return best_chat
    best_chat = pd.concat([best_chat, filtered_dataset],ignore_index=True)
    msg = filtered_dataset.iloc[0]["Data"]["Full message"]
    # Return the result of the deeper levels. pd.concat builds a new frame, so
    # discarding the recursive result (as before) lost every level after the first.
    return best_msg_level_up(dataset, msg, best_chat)
    

In [386]:
def parent(n,dataset=dataset):
    """Return the ``"Parent"`` recorded in the ``Data`` of row ``n``; ``None`` for row 0."""
    return None if n == 0 else dataset.iloc[n]["Data"]["Parent"]
def children(n, dataset=dataset):
    """Return the index labels of all rows whose parent is ``n``.

    ``n`` may be ``None``, in which case row 0 (whose parent is ``None``) matches.
    """
    # Forward `dataset` to parent(): previously an explicitly passed frame was
    # only used for the iteration, while the lookups went to the default frame.
    return [x for x in dataset.index if parent(x, dataset) == n]
def ancestors(n,dataset=dataset):
    """Return the chain of parents of row ``n``, ordered from the root down.

    Row ``n`` itself is not included; for ``n == 0`` the result is empty.
    """
    chain = []
    while n > 0:
        # One lookup per step, on the frame that was passed in (parent() was
        # called twice per step and always on its default frame).
        n = parent(n, dataset)
        chain.append(n)
    chain.reverse()
    return chain
def descendants(n,dataset=dataset):
    """Return every row below row ``n`` in the parent/child tree.

    The direct children come first, followed by the descendants of each child in
    turn (each child's own children before its deeper descendants).
    """
    def descendants_recursion(n,l):
        # Children are looked up once per node and on the frame passed to
        # descendants() (the lookups used to go to children()'s default frame).
        kids = children(n, dataset)
        l.extend(kids)
        for child in kids:
            descendants_recursion(child,l)
        return l
    return descendants_recursion(n,[])
def best_in_level(level,dataset=dataset):
    """Return the rows at depth ``level`` whose ``Choice`` is not ``"No"``.

    The depth of a row is the number of its ancestors, so row 0 is at level 0.
    """
    best_responses = set(dataset[(dataset["Choice"] != "No")].index)
    # Forward `dataset` so the depth is computed on the same frame that was
    # filtered (ancestors() used to fall back to its default frame).
    return [n for n in best_responses if len(ancestors(n, dataset)) == level]
    
    #(dataset["Data"].apply(lambda x: len(x["Full message"])) == level + 1) &
    #(dataset.apply(lambda x: is_parent(msg,x["Data"]["Full message"],x["Response1"])))
def display_responses(numbers,dataset=dataset):
    """Render, for each row label in ``numbers``, the label followed by the value
    of the row's second column (``Response1`` in the battle dataset)."""
    selected = dataset.filter(numbers,axis=0)
    for label, record in selected.iterrows():
        display_markdown(label,raw=True)
        display_markdown(record.iloc[1],raw=True)
def extend_dataset(dataset=dataset):
    """Return a copy of ``dataset`` with three tree-related columns added.

    * ``Parent``   - the ``"Parent"`` entry of ``Data`` as a nullable integer
      (values that are not numeric become missing),
    * ``Children`` - the number of rows whose parent is this row,
    * ``Siblings`` - the number of rows sharing this row's parent, the row
      itself included.

    ``dataset`` itself is not modified.
    """
    extended_dataset = dataset.copy()
    extended_dataset["Parent"] = pd.to_numeric(extended_dataset["Data"].apply(lambda x: x["Parent"]), errors='coerce').astype(pd.Int64Dtype())
    # Forward `dataset` to the helpers so the counts describe the frame that was
    # passed in (they used to be computed on the helpers' default frame).
    extended_dataset["Children"] = extended_dataset.apply(lambda x: len(children(x.name, dataset)), axis=1)
    extended_dataset["Siblings"] = extended_dataset.apply(lambda x: len(children(parent(x.name, dataset), dataset)), axis=1)
    return extended_dataset
def best_chats(dataset=dataset):
    """Return every maximal chain of rows that ends in a non-rejected row.

    A row qualifies as a chain end when its ``Choice`` is not ``"No"`` and it is
    not an ancestor of another such row. Each chain lists the ancestors of its
    end row from the root down, followed by the end row itself. The index of
    every chain end is printed as it is processed.
    """
    chats = []
    best_responses = set(dataset[(dataset["Choice"] != "No")].index)
    ancestor_responses = set()
    for n in best_responses:
        # Forward `dataset` so the chains are built on the frame that was
        # filtered (ancestors() used to fall back to its default frame).
        ancestor_responses = ancestor_responses.union(ancestors(n, dataset))
    best_chats_ends = best_responses.difference(ancestor_responses)
    for chat_end in best_chats_ends:
        print(chat_end)
        chat = ancestors(chat_end, dataset)
        chat.append(chat_end)
        chats.append(chat)
    return chats
    
                                
    #(dataset["Data"].apply(lambda x: len(x["Full message"])) == level + 1) &
    #(dataset.apply(lambda x: is_parent(msg,x["Data"]["Full message"],x["Response1"])))


In [387]:
# Show each response next to the number of rows that share its parent.
ed = extend_dataset()
#ed = ed.filter(best_chats()[1],axis=0)
ed[["Response1","Siblings"]]

Unnamed: 0,Response1,Siblings
0,,1
1,Hello! How can I assist you today?,6
2,Hello! I'm here to help you with any questions...,5
3,Thank you for offering your help! Is there any...,4
4,Thank you for asking! I would be happy to talk...,3
5,That sounds like a wide range of interesting t...,2
6,Thank you for your willingness to engage in di...,2
7,Certainly! One of the most notable technology ...,2
8,Those are indeed fascinating developments! Art...,2
9,I find all of these technology trends to be in...,2


In [388]:
# Render the index and the second column (the response) of every winning battle.
test = dataset[dataset["Choice"]=="Yes"]#.iloc[0]
for row in test.iterrows():
    label, record = row
    display_markdown(label, raw=True)
    display_markdown(record.iloc[1], raw=True)
