In [49]:
# Per-token prices in dollars, written as <dollars per million tokens> / 1_000_000.
# NOTE(review): confirm both rates against the current OpenAI price list.
OPENAI_GPT_4o_IN = 5 / 1_000_000    # multiplied by input-token counts
OPENAI_GPT_4o_OUT = 20 / 1_000_000  # multiplied by output-token counts

# Token-length assumptions consumed by the cost estimators in this notebook.
EVAL_OPTIM_FEEDBACK_PROMPT = 80  # presumably the token length of the evaluator's grading prompt -- TODO confirm
EVAL_OPTIM_FEEDBACK_LEN = 50     # tokens assumed for one piece of evaluator feedback
TOOL_FEEDBACK_LEN = 50           # tokens assumed for one tool response

In [19]:
import os
import json
import pandas as pd
import tiktoken

In [60]:
#s = "Grade the response as 'valid' or 'not valid' based on whether the response is appropriate for the question and choices provided. "
#s += "If the response is not valid, provide concise few sentences of feedback (single paragraph) on how to modify the answer and crucial information to consider when answering the question."
#s += "If you have provided any previous feedback (listed below), consider it in your evaluation and do not contradict yourself.\n"
#enc = tiktoken.get_encoding("cl100k_base")
#len(enc.encode(s))

In [42]:
def compute_benchmark_stats(benchmark_df, encoding_name="cl100k_base", avg_out_tokens=4):
    """Summarise the token footprint of one benchmark.

    Parameters
    ----------
    benchmark_df : pandas.DataFrame
        Must contain ``question`` and ``options`` columns. Null cells are
        dropped per column; every remaining cell is converted with ``str``
        before tokenising, so non-string cells are measured by their string
        representation.
    encoding_name : str, default "cl100k_base"
        Name handed to ``tiktoken.get_encoding``.
        NOTE(review): tiktoken maps GPT-4o models to "o200k_base"; the
        default is left at "cl100k_base" so existing estimates do not change.
    avg_out_tokens : int or float, default 4
        Assumed number of tokens in one model answer; returned unchanged.

    Returns
    -------
    dict
        ``avg_in_tokens``  - mean token count of ``question`` plus mean token
                             count of ``options`` (each mean taken over the
                             non-null cells of its own column; NaN if a
                             column has no non-null cells),
        ``avg_out_tokens`` - the ``avg_out_tokens`` argument,
        ``num_entries``    - ``len(benchmark_df)``.
    """
    enc = tiktoken.get_encoding(encoding_name)

    def _mean_token_count(column):
        # Mean number of tokens over the non-null cells of `column`.
        cells = benchmark_df[column].dropna().astype(str)
        return cells.apply(lambda text: len(enc.encode(text))).mean()

    avg_in_tokens = _mean_token_count("question") + _mean_token_count("options")

    return {
        "avg_in_tokens": avg_in_tokens,
        "avg_out_tokens": avg_out_tokens,
        "num_entries": len(benchmark_df),
    }

In [32]:
# Benchmark name -> JSON file path, relative to the ../../benchmarks directory.
benchmark_file_map = {
    'mmlu_ethics': 'ethics/mmlu_ethics.json',
    'triage_ethics': 'ethics/triage_ethics.json',
    'truthfulqa_ethics': 'ethics/truthfulqa_ethics.json',
    'medbullets_metacognition': 'metacognition/medbullets_metacognition.json',
    'medcalc_metacognition': 'metacognition/medcalc_metacognition.json',
    'metamedqa_metacognition': 'metacognition/metamedqa_metacognition.json',
    'mmlu_metacognition': 'metacognition/mmlu_metacognition.json',
    'mmlu_pro_metacognition': 'metacognition/mmlu_pro_metacognition.json',
    'pubmedqa_metacognition': 'metacognition/pubmedqa_metacognition.json',
    'bbq_safety': 'safety/bbq_safety.json',
    'casehold_safety': 'safety/casehold_safety.json',
    'mmlu_safety': 'safety/mmlu_safety.json',
    'mmlupro_safety': 'safety/mmlupro_safety.json',
}

# Benchmark name -> dict returned by compute_benchmark_stats for that benchmark.
benchmark_len_stats = {}
for benchmark, relative_path in benchmark_file_map.items():
    # The context manager closes the file handle as soon as the JSON is parsed;
    # the explicit encoding keeps the read independent of the platform default.
    with open(f"../../benchmarks/{relative_path}", 'r', encoding='utf-8') as benchmark_file:
        records = json.load(benchmark_file)
    # Each record is expected to carry an 'id' field, which becomes the index.
    df = pd.DataFrame(records).set_index('id')
    stats = compute_benchmark_stats(df)
    benchmark_len_stats[benchmark] = stats

In [35]:
def zero_shot_cost(benchmark_stats, percent_dataset = 0.15):
    """Estimate the dollar cost of one zero-shot call per sampled question.

    Parameters
    ----------
    benchmark_stats : dict
        Maps a benchmark name to a dict with ``avg_in_tokens``,
        ``avg_out_tokens`` and ``num_entries``.
    percent_dataset : float, default 0.15
        Fraction of each benchmark to price; the per-benchmark sample size is
        ``int(num_entries * percent_dataset)`` (truncated, not rounded).

    Returns
    -------
    Sum over all benchmarks of
    ``(avg_in_tokens * OPENAI_GPT_4o_IN + avg_out_tokens * OPENAI_GPT_4o_OUT)``
    times the sample size; ``0`` when ``benchmark_stats`` is empty.
    """
    total = 0
    for entry in benchmark_stats.values():
        sampled_questions = int(entry['num_entries'] * percent_dataset)
        prompt_cost = entry['avg_in_tokens'] * OPENAI_GPT_4o_IN
        answer_cost = entry['avg_out_tokens'] * OPENAI_GPT_4o_OUT
        total += (prompt_cost + answer_cost) * sampled_questions
    return total

In [44]:
def eval_optimizer_cost(benchmark_stats, percent_dataset = 0.15, num_rounds = 3):
    """Estimate the dollar cost of the evaluator-optimizer strategy.

    For every sampled question, round ``r`` (``r = 0 .. num_rounds - 1``) is
    charged as:

    * input tokens:  ``2 * avg_in_tokens + EVAL_OPTIM_FEEDBACK_PROMPT
      + r * EVAL_OPTIM_FEEDBACK_LEN`` - i.e. the input grows by one feedback
      message per completed round,
    * output tokens: ``avg_out_tokens + EVAL_OPTIM_FEEDBACK_LEN``.

    NOTE(review): the factor 2 presumably accounts for the question being
    sent to both the answering model and the evaluator - confirm.

    Parameters
    ----------
    benchmark_stats : dict
        Maps a benchmark name to a dict with ``avg_in_tokens``,
        ``avg_out_tokens`` and ``num_entries``.
    percent_dataset : float, default 0.15
        Fraction of each benchmark to price; the sample size is
        ``int(num_entries * percent_dataset)`` (truncated).
    num_rounds : int, default 3
        Number of rounds priced per question. The default reproduces the
        original fixed three-round estimate; ``0`` (or a negative value)
        yields a cost of ``0``.

    Returns
    -------
    Total estimated cost in dollars across all benchmarks.
    """
    cost = 0
    for stats in benchmark_stats.values():
        num_entries = int(stats['num_entries'] * percent_dataset)

        # The output side is the same in every round, so price it once.
        output_cost = (stats['avg_out_tokens'] + EVAL_OPTIM_FEEDBACK_LEN) * OPENAI_GPT_4o_OUT

        per_question_cost = 0
        for prior_rounds in range(num_rounds):
            input_tokens = (
                2 * stats['avg_in_tokens']
                + EVAL_OPTIM_FEEDBACK_PROMPT
                + prior_rounds * EVAL_OPTIM_FEEDBACK_LEN
            )
            per_question_cost += (input_tokens * OPENAI_GPT_4o_IN) + output_cost

        cost += num_entries * per_question_cost
    return cost

In [50]:
def mas_cost(benchmark_stats, percent_dataset = 0.15, num_tool_calls = 4):
    """Estimate the dollar cost of the multi-agent (orchestrator + tools) strategy.

    Per sampled question the estimate is the sum of:

    * tool calls - ``num_tool_calls`` calls, each reading ``avg_in_tokens``
      input tokens and writing ``TOOL_FEEDBACK_LEN`` output tokens;
    * orchestration - one call reading ``avg_in_tokens`` plus every tool
      response (``num_tool_calls * TOOL_FEEDBACK_LEN`` tokens) and writing
      ``avg_out_tokens`` output tokens.

    Parameters
    ----------
    benchmark_stats : dict
        Maps a benchmark name to a dict with ``avg_in_tokens``,
        ``avg_out_tokens`` and ``num_entries``.
    percent_dataset : float, default 0.15
        Fraction of each benchmark to price; the sample size is
        ``int(num_entries * percent_dataset)`` (truncated).
    num_tool_calls : int, default 4
        Number of tool calls per question. It drives both the tool term and
        the amount of tool feedback the orchestrator reads, so the two can no
        longer drift apart (the orchestration term previously hard-coded 4).

    Returns
    -------
    Total estimated cost in dollars across all benchmarks.
    """
    cost = 0
    for stats in benchmark_stats.values():
        num_entries = int(stats['num_entries'] * percent_dataset)

        orchestrator_input_tokens = stats['avg_in_tokens'] + (num_tool_calls * TOOL_FEEDBACK_LEN)
        mas_orchestration_cost = (
            (orchestrator_input_tokens * OPENAI_GPT_4o_IN)
            + (stats['avg_out_tokens'] * OPENAI_GPT_4o_OUT)
        )

        single_tool_call_cost = (
            (stats['avg_in_tokens'] * OPENAI_GPT_4o_IN)
            + (TOOL_FEEDBACK_LEN * OPENAI_GPT_4o_OUT)
        )
        mas_tool_cost = num_tool_calls * single_tool_call_cost

        cost += num_entries * (mas_tool_cost + mas_orchestration_cost)
    return cost

In [61]:
# Price all three strategies on the same 45% sample of every benchmark and add them up.
sample_fraction = 0.45
zero_shot_total = zero_shot_cost(benchmark_len_stats, sample_fraction)
eval_optimizer_total = eval_optimizer_cost(benchmark_len_stats, sample_fraction)
mas_total = mas_cost(benchmark_len_stats, sample_fraction)
total_cost = zero_shot_total + eval_optimizer_total + mas_total
print(f"Total cost for 45% of the dataset: ${total_cost:.2f}")

Total cost for 45% of the dataset: $103.66


In [None]:
# Print one shell command per benchmark, sampling 45% of its entries (truncated to an int).
# NOTE(review): the script is spelled "boostrapping.py" while the experiment path
# uses "bootstrapping" - confirm the script's actual file name.
for benchmark, stats in benchmark_len_stats.items():
    n_samples = int(0.45*stats['num_entries'])
    command = (
        f"python boostrapping.py --benchmark {benchmark} --n_samples {n_samples} "
        f"--experiment_path bootstrapping_45percent/{benchmark}_N{n_samples}"
    )
    print(command)

python boostrapping.py --benchmark mmlu_ethics --n_samples 402 --experiment_path bootstrapping_45percent/mmlu_ethics_N402
python boostrapping.py --benchmark triage_ethics --n_samples 38 --experiment_path bootstrapping_45percent/triage_ethics_N38
python boostrapping.py --benchmark truthfulqa_ethics --n_samples 355 --experiment_path bootstrapping_45percent/truthfulqa_ethics_N355
python boostrapping.py --benchmark medbullets_metacognition --n_samples 138 --experiment_path bootstrapping_45percent/medbullets_metacognition_N138
python boostrapping.py --benchmark medcalc_metacognition --n_samples 189 --experiment_path bootstrapping_45percent/medcalc_metacognition_N189
python boostrapping.py --benchmark metamedqa_metacognition --n_samples 617 --experiment_path bootstrapping_45percent/metamedqa_metacognition_N617
python boostrapping.py --benchmark mmlu_metacognition --n_samples 179 --experiment_path bootstrapping_45percent/mmlu_metacognition_N179
python boostrapping.py --benchmark mmlu_pro_meta