Skip to content

Commit

Permalink
First steps to running suites
Browse files Browse the repository at this point in the history
This is a barely functional wrapper for running "test suites", which are
just a list of preconfigured tasks. You can specify prompt and model.
This needs more testing and UI cleanup.
  • Loading branch information
polm committed Nov 6, 2023
1 parent 96b590b commit c6b7bcb
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 0 deletions.
33 changes: 33 additions & 0 deletions lm_eval/suites/ja8.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# This is the standard eight-task eval suite.

[tasks.mgsm]
version = 1.0
fewshot = 5

[tasks.xwinograd_ja]
# This task has no version, so the `version` key is omitted.
fewshot = 0

[tasks.xlsum_ja]
version = 1.0
fewshot = 1

[tasks.jaqket_v2]
version = 0.1
fewshot = 1

[tasks.marc_ja]
version = 1.1
fewshot = 3

[tasks.jnli]
version = 1.1
fewshot = 3

[tasks.jcommonsenseqa]
version = 1.1
fewshot = 3

[tasks.jsquad]
version = 1.1
fewshot = 2
155 changes: 155 additions & 0 deletions scripts/run_suite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env python
# Run a suite of tests

import argparse
import configparser
from dataclasses import dataclass
from typing import Optional
import os
from pathlib import Path

from lm_eval import tasks, evaluator

# Directory that contains this script (symlinks resolved).
FILE_PATH = Path(os.path.realpath(__file__)).parent
# Suite config files are looked up relative to this script's directory.
SUITE_DIR = FILE_PATH.joinpath("../lm_eval/suites")

# Human-readable prompt names mapped to their prompt version codes.
# TODO move this into lm_eval
PROMPT_CODES = {
    "user": "0.0",
    "jgpt": "0.1",
    "fintan": "0.2",
    "fintan2": "0.2.1",
    "ja-alpaca": "0.3",
    "rinna-sft": "0.4",
    "rinna-bilingual": "0.5",
    "llama2": "0.6",
}


@dataclass
class TaskSpec:
    """Specification of a task in an eval suite.

    A suite is a list of these specs, plus a prompt.
    """

    # The real arguments have to be massaged into messy strings and parallel
    # lists, but this is a more reasonable structure - we can handle conversion
    # separately.

    # Task name.
    name: str
    # Number of few-shot examples to use for this task.
    fewshot: int
    # Task version string; None when the task has no version. Defaults to
    # None so that unversioned tasks can be built without spelling it out.
    version: Optional[str] = None


def build_eval_args(specs: list[TaskSpec], prompt: str) -> tuple[list[str], list[int]]:
    """Convert list of TaskSpecs into args for simple_evaluate.

    Args:
        specs: The tasks that make up the suite.
        prompt: A prompt name; must be a key of PROMPT_CODES.

    Returns:
        A pair of parallel lists ``(task_names, fewshot_counts)``. A task with
        a version is named ``<name>-<version>-<prompt code>``; a task without
        a version keeps its bare name.

    Raises:
        KeyError: If ``prompt`` is not a known prompt name.
    """
    try:
        prompt_code = PROMPT_CODES[prompt]
    except KeyError:
        # Still a KeyError, but tell the user which names are valid.
        known = ", ".join(sorted(PROMPT_CODES))
        raise KeyError(
            f"unknown prompt {prompt!r}; expected one of: {known}"
        ) from None

    # Not called `tasks`, to avoid shadowing the imported lm_eval.tasks module.
    task_names = []
    fewshot_counts = []
    for spec in specs:
        task_name = spec.name
        if spec.version is not None:
            # Only versioned tasks get the version and prompt code suffix.
            task_name = f"{spec.name}-{spec.version}-{prompt_code}"

        task_names.append(task_name)
        fewshot_counts.append(spec.fewshot)

    return (task_names, fewshot_counts)


def load_suite(name: str) -> "list[TaskSpec]":
    """Read in configuration for a test suite.

    A suite will have a config file named something like `my_suite.conf`. For
    each task in the file, a version, fewshot config, and any other details
    will be specified.

    Example entry:

        [tasks.mgsm]
        version = 1.0
        fewshot = 5

    Args:
        name: Suite name; the file read is ``SUITE_DIR / (name + ".conf")``.

    Returns:
        One TaskSpec per ``[tasks.<task name>]`` section, in file order.
        Sections whose name does not start with ``tasks.`` are ignored.

    Raises:
        FileNotFoundError: If the suite config file does not exist.
        KeyError: If a task section has no ``fewshot`` entry.
        ValueError: If a ``fewshot`` value is not an integer.
    """
    path = SUITE_DIR / (name + ".conf")
    conf = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open, which would turn
    # a mistyped suite name into an empty suite. Opening the file ourselves
    # makes a missing config fail loudly instead.
    with open(path, encoding="utf-8") as handle:
        conf.read_file(handle)

    specs = []
    for section_name, section in conf.items():
        if not section_name.startswith("tasks."):
            continue

        specs.append(
            TaskSpec(
                name=section_name.split(".", 1)[1],
                # `version` is optional; tasks without one get None.
                version=section.get("version", None),
                fewshot=int(section["fewshot"]),
            )
        )
    return specs


def run_suite(
    model_args, suite, prompt, *, model_type="hf-causal", output=None, verbose=False
):
    """Run every task of a suite against one model.

    Args:
        model_args: Model argument string passed through to simple_evaluate,
            e.g. ``pretrained=hoge/piyo,tokenizer=...,asdf=...``.
        suite: Name of the suite config to load (see load_suite).
        prompt: Prompt name (a key of PROMPT_CODES).
        model_type: Passed to simple_evaluate as ``model``.
        output: Optional path; when given, the results are written there as
            JSON.
        verbose: Passed through to simple_evaluate.

    Returns:
        Whatever evaluator.simple_evaluate returns.
    """
    # Local import so this function does not depend on the file's import block.
    import json

    # Confusing detail: in the "simple evaluate", "model" is the HF model type,
    # which is almost always hf-causal or hf-causal-experimental. `model_args`
    # looks like this:
    #
    #     pretrained=hoge/piyo,tokenizer=...,asdf=...

    # device never changes in practice
    device = "cuda"

    print("suite", suite)
    specs = load_suite(suite)
    print(specs)
    # Not called `tasks`, to avoid shadowing the imported lm_eval.tasks module.
    task_names, num_fewshot = build_eval_args(specs, prompt)
    print(task_names)
    print(num_fewshot)
    print(model_args)

    # Keep the results: previously they were discarded and `output` was unused.
    results = evaluator.simple_evaluate(
        model=model_type,
        model_args=model_args,
        tasks=task_names,
        num_fewshot=num_fewshot,
        device=device,
        verbose=verbose,
    )

    if output:
        # NOTE(review): assumes the value returned by simple_evaluate is
        # JSON-serializable - confirm against lm_eval.
        with open(output, "w", encoding="utf-8") as handle:
            json.dump(results, handle, indent=2, ensure_ascii=False)

    return results


def main():
    """Parse command-line arguments and run the requested suite.

    The positional ``model`` argument becomes ``pretrained=<model>`` in the
    model argument string; anything given via ``--model_args`` is appended
    after a comma.
    """
    parser = argparse.ArgumentParser(
        prog="run_suite.py", description="Run a test suite with a model"
    )
    parser.add_argument("model", help="Model path (or HF spec)")
    parser.add_argument("suite", help="Test suite to run")
    parser.add_argument("prompt", help="Prompt to use")
    parser.add_argument("-m", "--model_args", help="Additional model arguments")
    parser.add_argument(
        "-t", "--model_type", default="hf-causal-experimental", help="Model type"
    )
    parser.add_argument("-o", "--output", help="Output file")
    parser.add_argument("-v", "--verbose", action="store_true")

    args = parser.parse_args()

    margs = f"pretrained={args.model}"
    if args.model_args:
        # Append to the existing string: rebuilding it from args.model would
        # drop the "pretrained=" prefix.
        margs = f"{margs},{args.model_args}"

    run_suite(
        margs,
        args.suite,
        args.prompt,
        model_type=args.model_type,
        output=args.output,
        verbose=args.verbose,
    )


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit c6b7bcb

Please sign in to comment.