## Building a BATTLE Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a Pandas DataFrame

In [1]:
# Seed prompt: a chat-format message list holding one user message with empty content.
initial_message = [{"role": "user", "content": ""}]

In [17]:
!cd /Users/janvotava/evals
!git lfs fetch --all
!git lfs pull

fetch: 560 objects found, done.                                                 
fetch: Fetching all references...


In [14]:
# Print the current working directory (later cells build paths relative to it).
!pwd

/Users/janvotava/evals/examples


In [3]:
import os
from openai import OpenAI
import pandas as pd
#from dotenv import load_dotenv

#load_dotenv()  # Load environment variables from .env file

#api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI()

# Install Evals if you haven't already
# %pip install -e ../.
%pip install --upgrade openai

# %pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [4]:
# The notebook is assumed to run from examples/, so the registry is found by
# stepping up one directory and then into evals/registry.
registry_path = os.path.join(
    os.path.join(os.getcwd(), ".."),
    "evals",
    "registry",
)
# Battle data lives under <registry>/data/battles.
data_path = os.path.join(os.path.join(registry_path, "data"), "battles")
# Create the directory together with any missing parents; an already-existing
# directory is not an error.
os.makedirs(name=data_path, exist_ok=True)

In [5]:
# Alias, not a copy: top_message refers to the same list object as initial_message.
top_message = initial_message
# Starts empty; none of the cells shown in this notebook add to it.
data_all = []

In [6]:
# Build the initial candidates dataset

def switch_system_and_user(message):
    """Return a copy of a chat message list with system and user roles swapped.

    Every entry whose role is "system" becomes "user" and vice versa; entries
    with any other role keep it. Only the "role" and "content" keys are carried
    over into the result. The input list and its dicts are not modified.

    Args:
        message: list of chat messages, each a dict with "role" and "content".
            May be empty.

    Returns:
        A new list of {"role", "content"} dicts, one per input message, in the
        same order. An empty input yields an empty list.

    Raises:
        KeyError: if an entry lacks "role" or "content".
    """
    swapped_role = {"system": "user", "user": "system"}
    return [
        {
            # Roles outside the system/user pair pass through unchanged.
            "role": swapped_role.get(entry["role"], entry["role"]),
            "content": entry["content"],
        }
        for entry in message
    ]

def initial_candidates_dataset(top_message):
    """Build the two starting candidate pairs for a battle.

    The first pair sets the given message list against itself; the second sets
    it against the result of switch_system_and_user() applied to it.

    Args:
        top_message: chat message list used as "instruction1" in both pairs.

    Returns:
        A two-element list of dicts with keys "instruction1" and "instruction2".
        The given list object itself (not a copy) is stored wherever
        top_message appears.
    """
    identical_pair = {"instruction1": top_message, "instruction2": top_message}
    role_swapped_pair = {
        "instruction1": top_message,
        "instruction2": switch_system_and_user(top_message),
    }
    return [identical_pair, role_swapped_pair]

# Build the candidate pairs from the seed message; the bare name on the last
# line makes the notebook display the result.
dataset = initial_candidates_dataset(top_message)
dataset


[{'instruction1': [{'role': 'user', 'content': ''}],
  'instruction2': [{'role': 'user', 'content': ''}]},
 {'instruction1': [{'role': 'user', 'content': ''}],
  'instruction2': [{'role': 'system', 'content': ''}]}]

In [7]:
# %pip install pyyaml  (the PyPI package that provides the `yaml` module)
import yaml

# Assuming this notebook is in examples/
registry_path = os.path.join(os.getcwd(), "..", "evals", "registry")
output_path = os.path.join(registry_path, "data", "battles")
os.makedirs(output_path, exist_ok=True)
# NOTE(review): no cell shown in this notebook writes `dataset` to this file,
# and it sits under data/dataset rather than the data/battles directory created
# above - confirm the intended location and that the file exists before
# running the eval.
samples_path = os.path.join(registry_path, "data", "dataset", "samples.jsonl")

# One definition of the versioned eval id, so the "battles" alias and the spec
# entry it points to cannot drift apart.
eval_id = "battles.test.v1"

registry_yaml = {
    # Alias entry: maps the short name to the versioned id and names its metrics.
    "battles": {
        "id": eval_id,
        "metrics": ["accuracy"],
    },
    # Spec entry: the eval class to run and the arguments passed to it.
    eval_id: {
        "class": "evals.elsuite.modelgraded.classify:ModelBasedClassify",
        "args": {
            "samples_jsonl": samples_path,
            "modelgraded_spec": "battle",
        },
    },
}

evals_dir = os.path.join(registry_path, "evals")
os.makedirs(evals_dir, exist_ok=True)
with open(os.path.join(evals_dir, "battle.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)

In [18]:
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
!oaieval gpt-3.5-turbo test-match

Traceback (most recent call last):
  File "/opt/anaconda3/bin/oaieval", line 5, in <module>
    from evals.cli.oaieval import main
  File "/Users/janvotava/evals/evals/__init__.py", line 5, in <module>
    from .completion_fns.openai import OpenAIChatCompletionFn as OpenAIChatCompletionFn
  File "/Users/janvotava/evals/evals/completion_fns/openai.py", line 13, in <module>
    from evals.utils.api_utils import (
  File "/Users/janvotava/evals/evals/utils/api_utils.py", line 17, in <module>
    openai.error.ServiceUnavailableError,
    ^^^^^^^^^^^^
AttributeError: module 'openai' has no attribute 'error'


In [None]:
# How to process the log events generated by oaieval

log_name = "battle"
events = f"/tmp/evallogs/{log_name}"

# The log is parsed as JSON Lines (one JSON object per line). Decode it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open(events, "r", encoding="utf-8") as f:
    events_df = pd.read_json(f, lines=True)

In [None]:
import matplotlib.pyplot as plt

# Per-sample score column to plot.
score_column = "sacrebleu_sentence_score"

# Build the frame this cell plots from the loaded event log: keep the "match"
# events and expand each event's `data` payload into columns.
# NOTE(review): assumes the eval emits "match" events that carry this score
# (presumably only translation-style evals do) - confirm for the battle eval.
match_events = events_df[events_df.type == "match"].reset_index(drop=True)
matches_df = match_events.join(pd.json_normalize(match_events.data))

if score_column in matches_df.columns:
    # Drop missing scores so the min/max below are well defined.
    scores = matches_df[score_column].dropna()
else:
    scores = pd.Series(dtype=float)

if scores.empty:
    # Nothing to plot; say so instead of failing on an empty min()/max().
    print(f"No '{score_column}' values found in the log; nothing to plot.")
else:
    # define the threshold scores as a range from the minimum to the maximum score, in increments of 5
    thresholds = range(int(scores.min()), int(scores.max()) + 5, 5)

    # count the number of scores at or above each threshold
    above_counts = [int((scores >= threshold).sum()) for threshold in thresholds]

    # plot the counts as a step function
    plt.step(thresholds, above_counts, label='number of samples at or above threshold')

    # set the x and y labels (the comparison is >=, so the label says so)
    plt.xlabel('sacrebleu threshold')
    plt.ylabel('number of samples w/ score >= threshold')

    # show the plot
    plt.show()

In [None]:
# Inspect samples: print the prompt and the sampled completion of every
# "sampling" event, one event after another.
sampling_payloads = pd.json_normalize(events_df[events_df.type == "sampling"].data)
for row_index, sample in sampling_payloads.iterrows():
    print(f"Prompt: {sample.prompt}")
    print(f"Sampled: {sample.sampled}")
    # Visual separator between consecutive samples.
    print("-" * 25)