## Building a BATTLE Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a Pandas DataFrame

In [9]:
# Seed conversation: a single user turn whose content is left empty.
initial_message = [
    dict(role="user", content=""),
]

In [8]:
#!cd evals
#!git lfs fetch --all
#!git lfs pull

In [10]:
import os
from openai import OpenAI
import pandas as pd
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
# Create the API client with default settings; no key is passed explicitly here.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
# %pip install --upgrade openai

# %pip install pandas


In [11]:
# Assuming this notebook is in examples/
# Resolve the registry root relative to the current working directory, then make
# sure the folder that will hold the battle samples exists.
notebook_dir = os.getcwd()
registry_path = os.path.join(notebook_dir, os.pardir, "evals", "registry")
data_path = os.path.join(registry_path, "data", "battles")
os.makedirs(name=data_path, exist_ok=True)

In [12]:
# Seed message used to build the candidate dataset. This binds the same list
# object as `initial_message` (an alias, not a copy).
top_message = initial_message
# NOTE(review): `data_all` is not used by any cell visible in this notebook — confirm it is still needed.
data_all = []

In [33]:
# Build the initial candidates dataset

def switch_system_and_user(message):
    """Return a copy of ``message`` with the "system" and "user" roles swapped.

    Args:
        message: A list of chat-message dicts, each carrying at least a
            "role" key (and normally a "content" key).

    Returns:
        A new list of new dicts in the same order as the input. Messages whose
        role is "system" become "user" and vice versa; messages with any other
        role are copied unchanged. An empty input yields an empty list. The
        input list and its dicts are never modified.
    """
    swapped_role = {"system": "user", "user": "system"}
    # Every turn is converted (not just the first one) so multi-message prompts
    # keep all of their content.
    return [
        {**turn, "role": swapped_role.get(turn["role"], turn["role"])}
        for turn in message
    ]

def initial_candidates_dataset(top_message):
    """Build the two starting battle samples for ``top_message``.

    The first sample pits the message against itself; the second pits it
    against the result of ``switch_system_and_user(top_message)``.

    Returns:
        A list of two dicts, each with "input1" and "input2" keys.
    """
    opponents = (top_message, switch_system_and_user(top_message))
    return [{"input1": top_message, "input2": rival} for rival in opponents]

# Build the two-sample candidate dataset from the seed message; the bare name on
# the last line makes the notebook display it as the cell output.
dataset = initial_candidates_dataset(top_message)
dataset


[{'input1': [{'role': 'user', 'content': ''}],
  'input2': [{'role': 'user', 'content': ''}]},
 {'input1': [{'role': 'user', 'content': ''}],
  'input2': [{'role': 'system', 'content': ''}]}]

In [37]:
# %pip install pyyaml
import yaml

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "..", "evals", "registry")

# Write the candidate pairs as one JSON record per line under data/battles/.
samples_dir = os.path.join(registry_path, "data", "battles")
os.makedirs(samples_dir, exist_ok=True)
samples_path = os.path.join(samples_dir, "samples.jsonl")
df = pd.DataFrame(dataset)
df.to_json(samples_path, orient="records", lines=True)

# Registry entries: the "battles" entry names the versioned eval id, and the
# versioned entry carries the eval class plus the arguments passed to it.
registry_yaml = {
    "battles": {
        "id": "battles.test.v1",
        "metrics": ["accuracy"],
    },
    "battles.test.v1": {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": "battles/samples.jsonl",
            "eval_type": "cot_classify",
            "modelgraded_spec": "battle",
        },
    },
}

# Save the registry definition next to the other eval YAML files.
yaml_path = os.path.join(registry_path, "evals", "battles.yaml")
with open(yaml_path, "w") as f:
    yaml.dump(registry_yaml, f)

In [38]:
# This runs the eval and writes a JSONL file recording the samples and logs to /tmp/evallogs
!oaieval gpt-3.5-turbo battles --max_samples 20

[2024-08-04 21:11:25,097] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/evals
[2024-08-04 21:11:26,232] [registry.py:271] Loading registry from /Users/janvotava/.evals/evals
[2024-08-04 21:11:26,234] [oaieval.py:215] [1;35mRun started: 240804191126OA7ZASN7[0m
[2024-08-04 21:11:26,237] [registry.py:271] Loading registry from /Users/janvotava/Desktop/evals/evals/registry/modelgraded
[2024-08-04 21:11:26,269] [registry.py:271] Loading registry from /Users/janvotava/.evals/modelgraded
[2024-08-04 21:11:26,269] [data.py:94] Fetching /Users/janvotava/Desktop/evals/evals/registry/data/battles/samples.jsonl
[2024-08-04 21:11:26,270] [eval.py:36] Evaluating 2 samples
[2024-08-04 21:11:26,297] [eval.py:144] Running in threaded mode with 10 threads!
100%|█████████████████████████████████████████████| 2/2 [00:02<00:00,  1.48s/it]
[2024-08-04 21:11:29,268] [oaieval.py:275] Found 6/6 sampling events with usage data
[2024-08-04 21:11:29,268] [oaieval.py:283] 

In [None]:
# How to process the log events generated by oaieval
#
# `log_name` may be the exact file name of a run log inside /tmp/evallogs. If no
# file with exactly that name exists, it is treated as a substring and the most
# recently modified log whose name contains it is loaded instead.
# NOTE(review): run logs appear to be named "<run_id>_<model>_<eval>.jsonl" —
# confirm against your oaieval version.

log_name = "battle"
log_dir = "/tmp/evallogs"
events = f"{log_dir}/{log_name}"

if not os.path.isfile(events):
    # Fall back to the newest log file whose name contains `log_name`.
    log_files = os.listdir(log_dir) if os.path.isdir(log_dir) else []
    candidates = [
        os.path.join(log_dir, file_name)
        for file_name in log_files
        if log_name in file_name and os.path.isfile(os.path.join(log_dir, file_name))
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No oaieval log matching {log_name!r} found in {log_dir}; "
            "run the eval first or set log_name to the log file's name."
        )
    events = max(candidates, key=os.path.getmtime)

# Each line of the log is one JSON event.
with open(events, "r") as f:
    events_df = pd.read_json(f, lines=True)

In [None]:
import matplotlib.pyplot as plt

# Plot, for each threshold, how many samples score at or above it.
# NOTE(review): `matches_df` is not created by any cell visible in this notebook,
# and a model-graded "battles" run presumably has no 'sacrebleu_sentence_score'
# column — build `matches_df` from `events_df` (or swap in the relevant score
# column) before running this cell.
scores = matches_df['sacrebleu_sentence_score']

# define the threshold scores as a range from the minimum to the maximum score, in increments of 5
thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)

# count the number of scores at or above each threshold
above_counts = [sum(1 for score in scores if score >= threshold) for threshold in thresholds]

# plot the counts as a step function
plt.step(thresholds, above_counts, label='number of samples at or above threshold')

# set the x and y labels (the comparison is inclusive, so the label says >=)
plt.xlabel('sacrebleu threshold')
plt.ylabel('number of samples w/ score >= threshold')

# the `label=` above is only rendered once a legend is drawn
plt.legend()

# show the plot
plt.show()

In [None]:
# Inspect samples: flatten the payload of every "sampling" event and print the
# prompt next to what the model sampled for it.
sampling_events = events_df[events_df.type == "sampling"]
sampling_df = pd.json_normalize(sampling_events.data)
for _, row in sampling_df.iterrows():
    print(f"Prompt: {row.prompt}")
    print(f"Sampled: {row.sampled}")
    print("-" * 25)