In [1]:
import os
os.environ['OPENAI_API_KEY'] = "XXXX"

import os
os.chdir(os.path.dirname(globals()['_dh'][0]))

%load_ext autoreload
%autoreload 2

In [27]:
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models

from typing import List, Union


def create_chat_example(sys_msg: str, user_prompt: str, subject: dict, examples: Union[List[List[str]], None] = None):
    """
    Assemble the chat message list for a single sample.

    Layout follows the few-shot recommendation in
    https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting:
    the system message first, then every demonstration as a pair of named
    system messages, and finally the real user turn.

    Args:
        sys_msg: Content of the leading system message.
        user_prompt: Template for the user turn. Doubled braces are collapsed
            to single ones, so ``{{field}}`` and ``{field}`` are both filled
            from ``subject``.
        subject: Mapping that supplies the template fields.
        examples: Optional demonstrations; item 0 of each entry is the example
            user text and item 1 the example assistant text.

    Returns:
        List of message dicts ready to be sent as a chat conversation.
    """
    # Collapse doubled braces so the template can go through str.format.
    template = user_prompt.replace("{{", "{").replace("}}", "}")

    chat = [{"role": "system", "content": sys_msg}]
    for shot in examples or []:
        chat.append({"role": "system", "content": shot[0], "name": "example_user"})
        chat.append({"role": "system", "content": shot[1], "name": "example_assistant"})

    chat.append({"role": "user", "content": template.format(**subject)})
    return chat

In [33]:
import os
import yaml
from datasets import load_dataset

def convert_df_to_json_temp(task_name: str, few_shot: bool = False) -> str:
    """
    Export one LegalBench task as an evals samples file and register it.

    Loads the ``test`` split of ``nguha/legalbench`` for ``task_name``, renders
    every row through the task's ``base_prompt.txt`` into chat messages, writes
    ``input``/``ideal`` records to ``samples.jsonl`` under the registry data
    directory, and stores the matching ``legalbench-<task_name>`` entries in the
    registry YAML. All paths are resolved against the current working directory.

    Args:
        task_name: LegalBench configuration name; also used for the output
            directory and the eval id.
        few_shot: Accepted for backward compatibility only. NOTE(review): this
            flag previously built a column that was never written out, so it
            has no effect on the generated files; few-shot export is still TODO.

    Returns:
        Path of the written ``samples.jsonl`` file.

    Raises:
        ValueError: If the prompt file has no blank line separating the system
            prompt from the user prompt, or if the existing registry YAML is
            not a mapping.
    """
    base_dir = os.getcwd()
    registry_path = os.path.join(base_dir, "evals/registry")
    dataset_path = "nguha/legalbench"
    dataset_name = dataset_path.split("/")[-1]
    task_directory = os.path.join(registry_path, "data", dataset_name, task_name)
    os.makedirs(task_directory, exist_ok=True)

    dataset = load_dataset(dataset_path, task_name, num_proc=8)
    df = dataset["test"].to_pandas()

    prompt_path = os.path.join(base_dir, f"prompts/{dataset_name}/tasks/{task_name}/base_prompt.txt")
    with open(prompt_path, "r", encoding="utf-8") as file:
        prompt = file.read()
    if "\n\n" not in prompt:
        raise ValueError(
            f"{prompt_path} must contain a blank line separating the system prompt from the user prompt"
        )
    # Everything before the last blank line is the system prompt; the final
    # paragraph is the per-sample user template.
    system_prompt, user_prompt = prompt.rsplit("\n\n", 1)

    # Create test prompts and ideal completions
    df["input"] = df.apply(
        lambda row: create_chat_example(sys_msg=system_prompt, user_prompt=user_prompt, subject=row),
        axis=1,
    )
    df["ideal"] = df["answer"]
    samples_path = os.path.join(task_directory, "samples.jsonl")
    df[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")

    eval_id = f"{dataset_name}-{task_name}"
    versioned_id = f"{eval_id}.test.v1"
    registry_yaml = {
        eval_id: {
            "id": versioned_id,
            "metrics": ["accuracy"],
        },
        versioned_id: {
            "class": "evals.elsuite.basic.includes:Includes",
            "args": {
                "samples_jsonl": samples_path,
            },
        },
    }

    registry_dir = os.path.join(registry_path, "evals")
    os.makedirs(registry_dir, exist_ok=True)
    registry_file = os.path.join(registry_dir, f"{dataset_name}.yaml")

    # Merge into the existing registry instead of appending: appending on every
    # run duplicated the same top-level keys in the YAML file.
    merged = {}
    if os.path.exists(registry_file):
        with open(registry_file, "r", encoding="utf-8") as f:
            merged = yaml.safe_load(f) or {}
        if not isinstance(merged, dict):
            raise ValueError(f"{registry_file} does not contain a YAML mapping")
    merged.update(registry_yaml)

    with open(registry_file, "w", encoding="utf-8") as f:
        yaml.dump(merged, f)

    return samples_path

In [37]:
# Export the "canada_tax_court_outcomes" LegalBench task and register its eval.
convert_df_to_json_temp(task_name="canada_tax_court_outcomes")

Setting num_proc from 8 back to 1 for the train split to disable multiprocessing as it only contains one shard.
Generating train split: 100%|██████████| 6/6 [00:00<00:00, 15.71 examples/s]
Setting num_proc from 8 back to 1 for the test split to disable multiprocessing as it only contains one shard.
Generating test split: 100%|██████████| 244/244 [00:00<00:00, 808.06 examples/s]


In [42]:
# Smoke-test the exported eval with gpt-3.5-turbo: at most 5 samples, ignore_case=True
# passed as an extra eval parameter, events recorded under ./logs.
# NOTE(review): the output saved with this cell is the CLI usage/help text rather
# than eval results — re-run and confirm the command is accepted as written.
!python -m evals.cli.oaieval gpt-3.5-turbo legalbench-canada_tax_court_outcomes --max_samples 5 --extra_eval_params ignore_case=True --record_path ./logs/legalbench-canada_tax_court_outcomes

usage: oaieval.py [-h] [--extra_eval_params EXTRA_EVAL_PARAMS]
                  [--completion_args COMPLETION_ARGS]
                  [--max_samples MAX_SAMPLES] [--cache | --no-cache]
                  [--visible | --no-visible] [--seed SEED] [--user USER]
                  [--record_path RECORD_PATH] [--log_to_file LOG_TO_FILE]
                  [--registry_path REGISTRY_PATH] [--debug | --no-debug]
                  [--local-run | --no-local-run] [--http-run | --no-http-run]
                  [--http-run-url HTTP_RUN_URL]
                  [--http-batch-size HTTP_BATCH_SIZE]
                  [--http-fail-percent-threshold HTTP_FAIL_PERCENT_THRESHOLD]
                  [--dry-run | --no-dry-run]
                  [--dry-run-logging | --no-dry-run-logging]
                  completion_fn eval

Run evals through the API

positional arguments:
  completion_fn         One or more CompletionFn URLs, separated by commas
                        (,). A CompletionFn can either be the name o

In [40]:
# Run the legalbench-consumer_contracts_qa eval with gpt-4o-mini.
# NOTE(review): this needs the consumer_contracts_qa samples/registry entry, but the
# visible cells only export canada_tax_court_outcomes — confirm that
# convert_df_to_json_temp("consumer_contracts_qa") is run before this cell.
!python -m evals.cli.oaieval gpt-4o-mini  legalbench-consumer_contracts_qa

[2024-09-29 18:37:08,717] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 18:37:09,850] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 18:37:09,854] [oaieval.py:215] [1;35mRun started: 240929103709HDK2HJVG[0m
[2024-09-29 18:37:09,990] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/data/legalbench/consumer_contracts_qa/samples.jsonl
[2024-09-29 18:37:09,998] [eval.py:36] Evaluating 396 samples
[2024-09-29 18:37:10,010] [eval.py:144] Running in threaded mode with 10 threads!
 23%|█████████▌                                | 90/396 [00:09<00:27, 11.02it/s][2024-09-29 18:37:19,931] [record.py:360] Logged 181 rows of events to /tmp/evallogs/240929103709HDK2HJVG_gpt-4o-mini_legalbench-consumer_contracts_qa.jsonl: insert_time=29.885ms
 47%|███████████████████▏                     | 185/396 [00:19<00:20, 10.12it/s][2024-09-29 18:37:30,028] [record.py:360] L

In [41]:
# Run the legalbench-consumer_contracts_qa eval with o1-mini.
# NOTE(review): depends on the consumer_contracts_qa export, which no visible cell
# produces — confirm it is generated before this cell runs.
!python -m evals.cli.oaieval o1-mini  legalbench-consumer_contracts_qa

[2024-09-29 18:37:56,120] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 18:37:57,453] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 18:37:57,457] [oaieval.py:215] [1;35mRun started: 240929103757P563FIPD[0m
[2024-09-29 18:37:57,617] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/data/legalbench/consumer_contracts_qa/samples.jsonl
[2024-09-29 18:37:57,629] [eval.py:36] Evaluating 396 samples
[2024-09-29 18:37:57,645] [eval.py:144] Running in threaded mode with 10 threads!
 12%|█████▏                                    | 49/396 [00:18<02:06,  2.74it/s][2024-09-29 18:38:16,372] [record.py:360] Logged 100 rows of events to /tmp/evallogs/240929103757P563FIPD_o1-mini_legalbench-consumer_contracts_qa.jsonl: insert_time=16.414ms
 25%|██████████▌                               | 99/396 [00:34<01:43,  2.88it/s][2024-09-29 18:38:32,402] [record.py:360] Logge

In [42]:
# Run the legalbench-consumer_contracts_qa eval with o1-preview.
# NOTE(review): depends on the consumer_contracts_qa export, which no visible cell
# produces — confirm it is generated before this cell runs.
!python -m evals.cli.oaieval o1-preview  legalbench-consumer_contracts_qa

[2024-09-29 18:40:17,121] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 18:40:17,985] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 18:40:17,986] [oaieval.py:215] [1;35mRun started: 240929104017NSNMDUP2[0m
[2024-09-29 18:40:18,081] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/data/legalbench/consumer_contracts_qa/samples.jsonl
[2024-09-29 18:40:18,088] [eval.py:36] Evaluating 396 samples
[2024-09-29 18:40:18,097] [eval.py:144] Running in threaded mode with 10 threads!
 12%|█████▏                                    | 49/396 [00:31<03:13,  1.80it/s][2024-09-29 18:40:51,904] [record.py:360] Logged 100 rows of events to /tmp/evallogs/240929104017NSNMDUP2_o1-preview_legalbench-consumer_contracts_qa.jsonl: insert_time=16.233ms
 25%|██████████▍                               | 98/396 [01:03<03:42,  1.34it/s][2024-09-29 18:41:22,245] [record.py:360] Lo