In [1]:
import yaml
import os

from gen_answer import AnswerGenerator
from gen_judgment import Judge
from show_result import Result

In [2]:
# ---- Models -----------------------------------------------------------------
# Leave a value as None to keep whatever the default configs specify.

# Judge model, as named in the API config, e.g. "gpt-4-1106-preview".
LLM_JUDGE = None
# Model under test, as named in the API config, e.g. "gemma-2-9b-it".
LLM_MODEL = None

# ---- Answer-generation setup --------------------------------------------------
# Setup name; needed when the setup is not the default one, e.g. "temp-0.2_tokens-4k".
GEN_ANSWER_CONFIG_NAME = None
# Generation parameters to override; can be omitted if the named setup already
# exists, e.g. {"temperature": 0.2, "max_tokens": 4096}.
OVERRIDE_GEN_PARAMS = None

# ---- Judgment setup -----------------------------------------------------------
# Setup name; needed when the setup is not the default one, e.g. "without_pairwise".
JUDGE_CONFIG_NAME = None
# Judge parameters to override; can be omitted if the named setup already
# exists, e.g. {"pairwise": False}.
OVERRIDE_JUDGE_PARAMS = None

In [3]:
# Locations of the default configs and of the benchmark data directory.
default_gen_config_path = "default_config/gen_answer_config.yaml"
default_judge_config_path = "default_config/judge_config.yaml"
api_config_path = "default_config/api_config.yaml"
data_path = "data"


def _load_yaml(path):
    """Parse the YAML file at ``path`` and return the resulting Python object."""
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default text encoding (which is not UTF-8 on every system).
    with open(path, 'r', encoding="utf-8") as f:
        return yaml.safe_load(f)


# Working copies of the configs; later cells modify gen_config and judge_config
# in place, so re-run this cell to get back to the pristine defaults.
gen_config = _load_yaml(default_gen_config_path)
judge_config = _load_yaml(default_judge_config_path)
api_config = _load_yaml(api_config_path)

## Generation

In [4]:
def equal_configs(dict1, dict2):
    """Tell whether two configs match once their "model_list" entry is ignored.

    Args:
        dict1: first config mapping.
        dict2: second config mapping.

    Returns:
        True when every key other than "model_list" is present in both
        configs with an equal value, False otherwise. Neither input is modified.
    """
    def _without_model_list(config):
        # Work on a shallow copy so the caller's dict keeps its model list.
        trimmed = dict(config.items())
        trimmed.pop("model_list", None)
        return trimmed

    return _without_model_list(dict1) == _without_model_list(dict2)


def config_name_provided(new_config, default_config_path, name):
    """Check that a non-default config comes with a setup name.

    Args:
        new_config: the config about to be used.
        default_config_path: path of the YAML file holding the default config.
        name: the user-supplied setup name (may be None or empty).

    Returns:
        False only when ``new_config`` differs from the default config
        (ignoring "model_list") and ``name`` is falsy; True otherwise.
    """
    with open(default_config_path, 'r') as f:
        default_config = yaml.safe_load(f)

    # The comparison is always evaluated first; a name only matters when the
    # configs differ.
    return equal_configs(new_config, default_config) or bool(name)


def update_config(new_config, config_file, new_model):
    """Create a setup's config file, or register ``new_model`` in an existing one.

    If ``config_file`` does not exist yet, ``new_config`` is written to it
    (parent directories are created as needed). If it already exists, its
    settings must match ``new_config`` (ignoring "model_list"); ``new_model``
    is then added to the stored model list and the file is rewritten.

    Args:
        new_config: the config the caller intends to use for this setup.
        config_file: path of the setup's YAML config file.
        new_model: model name to register, or a falsy value to register none.

    Raises:
        AssertionError: if the existing file holds different settings.
    """
    if os.path.isfile(config_file):
        with open(config_file, 'r') as f:
            config = yaml.safe_load(f)
        assert equal_configs(config, new_config), (
            f"{config_file} already exists with different settings; "
            "choose another config name for this setup"
        )

        # Tolerate a stored config whose model list is missing or empty.
        model_list = config.get("model_list") or []
        if new_model and (new_model not in model_list):
            config["model_list"] = model_list + [new_model]
            # Write the extended list back; without this the new model only
            # exists in this local dict and is lost when the function returns.
            with open(config_file, "w") as f:
                yaml.dump(config, f, default_flow_style=False, sort_keys=False)
    else:
        parent_dir = os.path.dirname(config_file)
        if parent_dir:  # a bare file name has no directory to create
            os.makedirs(parent_dir, exist_ok=True)
        with open(config_file, "w") as f:
            yaml.dump(new_config, f, default_flow_style=False, sort_keys=False)




# The model under test must be a name known to the API config.
if LLM_MODEL:
    assert isinstance(LLM_MODEL, str), "LLM_MODEL must be a model name string"
    assert LLM_MODEL in api_config, f"{LLM_MODEL!r} is not defined in {api_config_path}"

# Apply the overrides BEFORE the name check: checking first would always
# compare the default config with itself, so an unnamed override would pass
# and be stored under the "default" setup.
if OVERRIDE_GEN_PARAMS:
    gen_config.update(OVERRIDE_GEN_PARAMS)

assert config_name_provided(
    new_config=gen_config,
    default_config_path=default_gen_config_path,
    name=GEN_ANSWER_CONFIG_NAME,
), "GEN_ANSWER_CONFIG_NAME is required when the generation setup differs from the default"
gen_config_name = GEN_ANSWER_CONFIG_NAME or "default"

# Guard the append so re-running this cell does not add the model twice.
if LLM_MODEL and LLM_MODEL not in gen_config["model_list"]:
    gen_config["model_list"].append(LLM_MODEL)

gen_config_file = os.path.join(data_path, gen_config["bench_name"], "model_answer", gen_config_name, "config", "gen_answer_config.yaml")

update_config(new_config=gen_config, config_file=gen_config_file, new_model=LLM_MODEL)

In [5]:
# Directory of the selected generation setup: <data>/<bench>/model_answer/<setup name>.
generation_path_parts = (
    data_path,
    gen_config["bench_name"],
    "model_answer",
    gen_config_name,
)
generation_path = os.path.join(*generation_path_parts)

# Build the generator for that setup, print its description, then run it.
generator = AnswerGenerator(generation_path, api_config_path)
print(generator)
generator.generate()


{'name': 'config of answer generation for arena-hard-v0.1', 'bench_name': 'arena-hard-v0.1', 'temperature': 0.0, 'max_tokens': 2048, 'num_choices': 1, 'model_list': ['gpt-3.5-turbo-0125', 'gpt-4-1106-preview', 'gpt-4o-mini', 'gpt-3.5-turbo-1106']}

Output to data/arena-hard-v0.1/model_answer/default/gpt-3.5-turbo-0125.jsonl
500 number of existing answers


0it [00:00, ?it/s]


Output to data/arena-hard-v0.1/model_answer/default/gpt-4-1106-preview.jsonl
500 number of existing answers


0it [00:00, ?it/s]


Output to data/arena-hard-v0.1/model_answer/default/gpt-4o-mini.jsonl
500 number of existing answers


0it [00:00, ?it/s]


Output to data/arena-hard-v0.1/model_answer/default/gpt-3.5-turbo-1106.jsonl
500 number of existing answers


0it [00:00, ?it/s]


## Judgment

In [6]:
# The judge model must be a name known to the API config.
if LLM_JUDGE:
    assert isinstance(LLM_JUDGE, str), "LLM_JUDGE must be a model name string"
    assert LLM_JUDGE in api_config, f"{LLM_JUDGE!r} is not defined in {api_config_path}"

# Apply the overrides BEFORE the name check: checking first would always
# compare the default config with itself, so an unnamed override would pass
# and be stored under the "default" setup.
if OVERRIDE_JUDGE_PARAMS:
    judge_config.update(OVERRIDE_JUDGE_PARAMS)

assert config_name_provided(
    new_config=judge_config,
    default_config_path=default_judge_config_path,
    name=JUDGE_CONFIG_NAME,
), "JUDGE_CONFIG_NAME is required when the judge setup differs from the default"
assert "judge_model" in judge_config, "judge config must define 'judge_model'"

# The judge model is set after the name check because it is already part of
# the setup directory name below.
# NOTE(review): this mutates judge_config, so re-running this cell with a
# non-default LLM_JUDGE and no JUDGE_CONFIG_NAME trips the check above;
# reload the configs first.
if LLM_JUDGE:
    judge_config["judge_model"] = LLM_JUDGE

judge_config_name = judge_config["judge_model"] + '_' + (JUDGE_CONFIG_NAME or "default")

# Guard the append so re-running this cell does not add the model twice.
if LLM_MODEL and LLM_MODEL not in judge_config["model_list"]:
    judge_config["model_list"].append(LLM_MODEL)

judge_config_file = os.path.join(data_path, judge_config["bench_name"], "model_judgment", judge_config_name, "config", "judge_config.yaml")

update_config(new_config=judge_config, config_file=judge_config_file, new_model=LLM_MODEL)

In [7]:
# Directory of the selected judgment setup: <data>/<bench>/model_judgment/<judge>_<setup name>.
judgment_path_parts = (
    data_path,
    judge_config["bench_name"],
    "model_judgment",
    judge_config_name,
)
judgment_path = os.path.join(*judgment_path_parts)

# Build the judge for that setup on top of the generated answers, print its
# description, then run it.
judge = Judge(judgment_path, generation_path, api_config_path)
print(judge)
judge.judge()


judge model: gpt-4-1106-preview, baseline: True, baseline model: gpt-3.5-turbo-0125,reference: False, reference models: None, temperature: 0, max tokens: 4096, pairwise: True

500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgments
500 number of existing judgment

0it [00:00, ?it/s]


## Result

In [10]:
# Build the result view from the generation and judgment setup directories
# prepared in the cells above.
result = Result(generation_path, judgment_path)
# Print the object's text description (the recorded output shows its settings).
print(result)
# Per the recorded output, this turns judgments into battles and runs a
# bootstrap; what is finally displayed is defined in show_result - TODO confirm.
result.show()


{'bench_name': 'arena-hard-v0.1', 'judge_name': 'gpt-4-1106-preview', 'baseline': 'gpt-3.5-turbo-0125', 'load_battles': False, 'load_bootstrap': False, 'show_elo': False, 'length_control': False, 'weight': 3, 'num_rounds': 100, 'output': False, 'first_game_only': False}

Turning judgment results into battles...


100%|██████████| 49/49 [01:21<00:00,  1.67s/it]
bootstrap: 100%|██████████| 100/100 [00:06<00:00, 16.50it/s]
