## Building an MMLU Eval

This notebook shows how to:
- Build and run an eval
- Load the results into a pandas DataFrame

We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it generates a completion with the chosen model for each prompt, checks whether the completion matches the true answer, and then logs the result.

In [1]:
# Install, and download MMLU if you haven't already
%pip install -e ../.

!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar
!tar -xf data.tar
data_path = "data"

Obtaining file:///Users/guanxiaoyao/qa/legal/research/evals
  Installing build dependencies ... [?25ldone
[?25h  Checking if build backend supports build_editable ... [?25ldone
[?25h  Getting requirements to build editable ... [?25ldone
[?25h  Preparing editable metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: evals
  Building editable for evals (pyproject.toml) ... [?25ldone
[?25h  Created wheel for evals: filename=evals-3.0.1.post1-0.editable-py3-none-any.whl size=8270 sha256=123f97ff7f4dd4ccfd30493b495cf81db1ec665f49f222ef9faa7664bcf495f3
  Stored in directory: /private/var/folders/h1/3v_qx3d534l7lq6zbgjfn88h0000gn/T/pip-ephem-wheel-cache-epi977jd/wheels/43/25/cf/bf1dc16feab6d55644c61671d650fe2c1e88ebbfc9685eb72c
Successfully built evals
Installing collected packages: evals
  Attempting uninstall: evals
    Found existing installation: evals 3.0.1.post1
    Uninstalling evals-3.0.1.post1:
      Successfully uninstalled evals-3.0.1.post1
Succes

In [2]:
import pandas as pd
import os

# The registry is resolved relative to the current working directory, which is
# assumed to be examples/ (where this notebook lives).
notebook_dir = os.getcwd()
registry_path = os.path.join(notebook_dir, "../evals/registry")

In [3]:
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models

# Answer letters, paired positionally with each question's answer options.
choices = ["A", "B", "C", "D"]
# System message template; the placeholder is filled with the subject name.
sys_msg = "The following are multiple choice questions (with answers) about {}."


def create_chat_prompt(sys_msg, question, answers, subject):
    """
    Build the chat-format prompt for one multiple-choice question.

    Args:
        sys_msg: Template with one `{}` placeholder that receives `subject`.
        question: The question text.
        answers: Answer options, paired in order with the letters in `choices`.
        subject: Subject name inserted into the system message.

    Returns:
        A two-message list: the formatted system message, then a user message
        holding the question, one "<letter>. <answer>" line per option, and a
        trailing "Answer:" cue.
    """
    lettered_options = []
    for letter, text in zip(choices, answers):
        lettered_options.append(f"{letter}. {text}")
    options_block = "\n".join(lettered_options)
    user_prompt = f"{question}\n{options_block}\nAnswer:"

    system_message = {"role": "system", "content": sys_msg.format(subject)}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]

def create_chat_example(question, answers, correct_answer):
    """
    Build one few-shot demonstration as a pair of named system messages.

    Uses the recommended few-shot format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting

    Args:
        question: The question text.
        answers: Answer options, paired in order with the letters in `choices`.
        correct_answer: The content of the example assistant turn.

    Returns:
        Two system messages: one named "example_user" carrying the formatted
        question, and one named "example_assistant" carrying `correct_answer`.
    """
    lettered_options = [f"{letter}. {text}" for letter, text in zip(choices, answers)]
    options_block = "\n".join(lettered_options)
    example_question = f"{question}\n{options_block}\nAnswer:"

    example_user = {"role": "system", "content": example_question, "name": "example_user"}
    example_assistant = {"role": "system", "content": correct_answer, "name": "example_assistant"}
    return [example_user, example_assistant]

In [4]:
import yaml
# Each "<subject>_test.csv" file in the test split is one MMLU subject.
# endswith (rather than a substring test) ignores stray files such as
# "<subject>_test.csv.bak" that merely contain the suffix.
test_suffix = "_test.csv"
subjects = sorted(
    f[: -len(test_suffix)]
    for f in os.listdir(os.path.join(data_path, "test"))
    if f.endswith(test_suffix)
)

# The CSVs have no header row, so column names are supplied explicitly.
# dtype=str and keep_default_na=False keep every cell exactly as written:
# with the defaults, pandas turns answer texts such as "None" or "N/A" into NaN
# and retypes all-numeric columns (e.g. "5" alongside "0.5" becomes "5.0"),
# either of which would corrupt the generated prompts.
csv_columns = ("Question", "A", "B", "C", "D", "Answer")
read_csv_kwargs = dict(names=csv_columns, dtype=str, keep_default_na=False)
answer_columns = ["A", "B", "C", "D"]

# Maps registry entry name -> entry; written to a single YAML file at the end.
registry_yaml = {}

for subject in subjects:
    subject_path = os.path.join(registry_path, "data", "mmlu", subject)
    os.makedirs(subject_path, exist_ok=True)

    # Create few-shot prompts from the dev split, one JSON line per example
    # under the "sample" key.
    dev_df = pd.read_csv(os.path.join(data_path, "dev", subject + "_dev.csv"), **read_csv_kwargs)
    dev_df["sample"] = dev_df.apply(
        lambda x: create_chat_example(x["Question"], x[answer_columns], x["Answer"]), axis=1
    )
    few_shot_path = os.path.join(subject_path, "few_shot.jsonl")
    dev_df[["sample"]].to_json(few_shot_path, lines=True, orient="records")

    # Create test prompts ("input") and ideal completions ("ideal").
    test_df = pd.read_csv(os.path.join(data_path, "test", subject + "_test.csv"), **read_csv_kwargs)
    test_df["input"] = test_df.apply(
        lambda x: create_chat_prompt(sys_msg, x["Question"], x[answer_columns], subject), axis=1
    )
    test_df["ideal"] = test_df.Answer
    samples_path = os.path.join(subject_path, "samples.jsonl")
    test_df[["input", "ideal"]].to_json(samples_path, lines=True, orient="records")

    eval_id = f"match_mmlu_{subject}"

    # Two entries per subject: the bare eval name pointing at a versioned id,
    # and the versioned entry holding the Eval class and its arguments.
    registry_yaml[eval_id] = {
        "id": f"{eval_id}.test.v1",
        "metrics": ["accuracy"]
    }
    registry_yaml[f"{eval_id}.test.v1"] = {
        "class": "evals.elsuite.basic.match:Match",
        "args": {
            "samples_jsonl": samples_path,
            "few_shot_jsonl": few_shot_path,
            "num_few_shot": 4,
        }
    }

with open(os.path.join(registry_path, "evals", "mmlu.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)

In [9]:
import os
os.environ['OPENAI_API_KEY'] = "XXXX"

import os
os.chdir(os.path.dirname(globals()['_dh'][0]))

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# Run the match_mmlu_professional_law eval with gpt-3.5-turbo as the completion model.
# --max_samples 25 caps the run at 25 samples to keep it quick.
!python -m evals.cli.oaieval gpt-3.5-turbo  match_mmlu_professional_law --max_samples 25

[2024-09-29 14:45:07,680] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 14:45:08,527] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 14:45:08,529] [oaieval.py:215] [1;35mRun started: 240929064508P3HXFQZ2[0m
[2024-09-29 14:45:08,619] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/few_shot.jsonl
[2024-09-29 14:45:08,620] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/samples.jsonl
[2024-09-29 14:45:08,633] [eval.py:36] Evaluating 25 samples
[2024-09-29 14:45:08,643] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 25/25 [00:03<00:00,  6.59it/s]
[2024-09-29 14:45:12,466] [oaieval.py:302] Found 25/25 sampling events with usage data
[{'completion_tokens': 1, 'prompt_tokens': 1320, 'tota

In [24]:
# Same eval and 25-sample cap, this time with gpt-4o-mini as the completion model.
!python -m evals.cli.oaieval gpt-4o-mini  match_mmlu_professional_law --max_samples 25

[2024-09-29 14:46:07,823] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 14:46:08,666] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 14:46:08,667] [oaieval.py:215] [1;35mRun started: 240929064608RLKDYN6R[0m
[2024-09-29 14:46:08,757] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/few_shot.jsonl
[2024-09-29 14:46:08,758] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/samples.jsonl
[2024-09-29 14:46:08,775] [eval.py:36] Evaluating 25 samples
[2024-09-29 14:46:08,784] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 25/25 [00:03<00:00,  6.69it/s]
[2024-09-29 14:46:12,551] [oaieval.py:302] Found 25/25 sampling events with usage data
[{'completion_tokens': 1, 'prompt_tokens': 1353, 'tota

In [33]:
# Run the match_mmlu_professional_law eval with o1-mini, capped at 25 samples.
# The run writes a JSONL file recording its samples and logs to /tmp/evallogs.
!python -m evals.cli.oaieval o1-mini  match_mmlu_professional_law --max_samples 25

[2024-09-29 15:07:18,708] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 15:07:19,796] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 15:07:19,799] [oaieval.py:215] [1;35mRun started: 240929070719L2MVKL5D[0m
[2024-09-29 15:07:19,959] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/few_shot.jsonl
[2024-09-29 15:07:19,960] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/samples.jsonl
[2024-09-29 15:07:19,981] [eval.py:36] Evaluating 25 samples
[2024-09-29 15:07:20,000] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 25/25 [00:23<00:00,  1.06it/s]
[2024-09-29 15:07:43,625] [oaieval.py:302] Found 25/25 sampling events with usage data
[{'completion_tokens': 539, 'prompt_tokens': 1529, 'to

In [34]:
# Run the match_mmlu_professional_law eval with o1-preview, capped at 25 samples.
# The run writes a JSONL file recording its samples and logs to /tmp/evallogs.
!python -m evals.cli.oaieval o1-preview  match_mmlu_professional_law --max_samples 25

[2024-09-29 15:07:45,833] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 15:07:46,880] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 15:07:46,883] [oaieval.py:215] [1;35mRun started: 240929070746JDYWPKT7[0m
[2024-09-29 15:07:47,011] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/few_shot.jsonl
[2024-09-29 15:07:47,011] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/samples.jsonl
[2024-09-29 15:07:47,030] [eval.py:36] Evaluating 25 samples
[2024-09-29 15:07:47,040] [eval.py:144] Running in threaded mode with 10 threads!
100%|███████████████████████████████████████████| 25/25 [00:41<00:00,  1.67s/it]
[2024-09-29 15:08:28,867] [oaieval.py:302] Found 25/25 sampling events with usage data
[{'completion_tokens': 523, 'prompt_tokens': 1224, 'to

In [35]:
# Run o1-preview without --max_samples, so the whole professional_law sample file
# is evaluated (1534 samples in the recorded run). Expect this to take much
# longer than the 25-sample runs.
# NOTE(review): --cache presumably enables caching of completions -- confirm
# against the oaieval CLI help.
!python -m evals.cli.oaieval o1-preview  match_mmlu_professional_law --cache

[2024-09-29 15:10:13,136] [registry.py:276] Loading registry from /Users/guanxiaoyao/qa/legal/research/evals/evals/registry/evals
[2024-09-29 15:10:14,184] [registry.py:276] Loading registry from /Users/guanxiaoyao/.evals/evals
[2024-09-29 15:10:14,186] [oaieval.py:215] [1;35mRun started: 240929071014HILLC5OK[0m
[2024-09-29 15:10:14,420] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/few_shot.jsonl
[2024-09-29 15:10:14,421] [data.py:94] Fetching /Users/guanxiaoyao/qa/legal/research/evals/examples/../evals/registry/data/mmlu/professional_law/samples.jsonl
[2024-09-29 15:10:14,440] [eval.py:36] Evaluating 1534 samples
[2024-09-29 15:10:14,453] [eval.py:144] Running in threaded mode with 10 threads!
  3%|█▎                                       | 49/1534 [01:12<24:27,  1.01it/s][2024-09-29 15:11:29,665] [record.py:360] Logged 100 rows of events to /tmp/evallogs/240929071014HILLC5OK_o1-preview_match_mmlu_professional

In [None]:
# How to process the log events generated by oaieval
log_dir = "/tmp/evallogs"

# Set log_name to a file name inside log_dir to analyse a specific run.
# When left as None, the most recently modified log in log_dir is used, so the
# cell runs as-is instead of failing on an unfilled placeholder path.
log_name = None

if log_name is None:
    log_paths = [os.path.join(log_dir, name) for name in os.listdir(log_dir)]
    log_paths = [path for path in log_paths if os.path.isfile(path)]
    if not log_paths:
        raise FileNotFoundError(f"No eval logs found in {log_dir}; run oaieval first.")
    events = max(log_paths, key=os.path.getmtime)
else:
    events = os.path.join(log_dir, log_name)

# Each line of the log is one JSON record.
with open(events, "r", encoding="utf-8") as f:
    events_df = pd.read_json(f, lines=True)

# Keep only "match" events and expand their nested `data` payload into columns,
# which is where the `correct` column plotted below comes from.
matches_df = events_df[events_df.type == "match"].reset_index(drop=True)
matches_df = matches_df.join(pd.json_normalize(matches_df.data))

# The trailing semicolon suppresses the Axes repr; the figure still renders.
matches_df.correct.value_counts().plot.bar(title="Correctness of generated answers", xlabel="Correctness", ylabel="Count");

In [None]:
# Inspect samples: print the prompt and the sampled completion of every
# "sampling" event, separated by a divider line.
sampling_events = events_df[events_df.type == "sampling"]
sampling_df = pd.json_normalize(sampling_events.data)
divider = "-" * 25

for _, row in sampling_df.iterrows():
    print(f"Prompt: {row.prompt}")
    print(f"Sampled: {row.sampled}")
    print(divider)