# Demo: Math Benchmark Selection
Generally, a good ALM eval task is something hard for vanilla LLMs, where we hope tools come in to assist.

Here we demo how to retrive some math eval from MATH dataset.

In [1]:
import os
import json
import pandas as pd
from gentopia import AgentAssembler
from gentbench.grader import GateGrader, BatchGateGrader
from gentopia.llm import OpenAIGPTClient
from tqdm import tqdm

In [2]:
# Recursive function to load json files from a path and its subdirectories
def load_from_path_recursive(path):
    data = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file), 'r') as f:
                    data.append(json.load(f))
    return data


In [3]:
# Read MATH dataset
data = load_from_path_recursive("../benchmark/raw/MATH/")
# Initial filter by level of difficulty 
hard_data = []
for data in data:
    if data["level"] in ["Level 5"]:
        hard_data.append(data)

In [4]:
len(hard_data)

3628

In [5]:
# Use vanilla gpt-3.5-turbo as threshold
dummy_agent = AgentAssembler(file="../eval/config/chatgpt.yaml").get_agent()

eval_llm = OpenAIGPTClient(model_name="gpt-4")
grader = GateGrader(llm=eval_llm)
batch_grader = BatchGateGrader(llm=eval_llm)

## Single Eval

In [6]:
hard_data[0]

{'problem': 'Find the largest negative integer $x$ which satisfies the congruence $34x+6\\equiv 2\\pmod {20}$.',
 'level': 'Level 5',
 'type': 'Number Theory',
 'solution': 'We can simplify the congruence as follows (all of the following congruences are equivalent):\n\\begin{align*}\n34x+6&\\equiv 2\\pmod {20}\\\\\n14x+6&\\equiv 2\\pmod {20}\\\\\n14x&\\equiv 16\\pmod {20}\\\\\n7x&\\equiv 8\\pmod {10}\\\\\n21x&\\equiv 8\\cdot 3\\pmod {10}\\\\\nx&\\equiv 24\\pmod{10}\\\\\nx&\\equiv 4\\pmod{10}\\\\\nx&\\equiv \\boxed{-6}\\pmod{10}.\n\\end{align*}'}

In [7]:
pred = dummy_agent.run(hard_data[0]["problem"]).output
pred

'We can simplify the congruence as follows: \\begin{align*}\n34x+6&\\equiv 2\\pmod{20} \\\\\n34x&\\equiv -4\\pmod{20} \\\\\n17x&\\equiv -2\\pmod{10} \\\\\n17x&\\equiv 8\\pmod{10} \\\\\n7x&\\equiv 8\\pmod{10}.\n\\end{align*}We can then find the inverse of $7$ modulo $10$ by testing values: \\begin{align*}\n7\\cdot 1&\\equiv 7\\pmod{10} \\\\\n7\\cdot 2&\\equiv 4\\pmod{10} \\\\\n7\\cdot 3&\\equiv 1\\pmod{10}.\n\\end{align*}Therefore, the inverse of $7$ modulo $10$ is $3$. Multiplying both sides of $7x\\equiv 8\\pmod{10}$ by $3$, we get \\[21x\\equiv 24\\pmod{10}.\\]Since $21x\\equiv 1x\\pmod{10}$, we have $x\\equiv 24\\pmod{10}$. The largest negative integer $x$ satisfying this congruence is $\\boxed{-6}$.'

In [8]:
grader.run(hard_data[0]["problem"], hard_data[0]["solution"], pred)

AgentOutput(output='passed', cost=0.01704, token_usage=567)

## Batch Eval

In [9]:
problems = ["Answer in short: " + data["problem"] for data in hard_data]
solutions = [data["solution"] for data in hard_data]

In [10]:
BS = 3
SAMPLE = 300

probs, sols, preds, grades = [], [], [], []
cost, tokens = 0, 0
for i in tqdm(range(0, SAMPLE, BS)):
    batch_problems = problems[i:i+BS]
    batch_solutions = solutions[i:i+BS]
    try:
        batch_preds = [dummy_agent.run(prob).output for prob in batch_problems]
        res = batch_grader.run(batch_problems, batch_solutions, batch_preds)
        probs += batch_problems
        sols += batch_solutions
        preds += batch_preds
        grades += res.output.split(",")
        cost += res.cost
        tokens += res.token_usage
    except Exception as e:
        continue


100%|██████████| 2/2 [01:13<00:00, 36.61s/it]

Exception: The server is overloaded or not ready yet.
1 validation error for BaseCompletion
content
  str type expected (type=type_error.str)





In [13]:
grades

['failed', 'failed', 'failed', 'failed', 'failed']

In [26]:
solutions[0]

new tasks to be saved into benchmark: 253


In [27]:
preds[0]

In [28]:
grades

Eval spent you $6.4428600000000005 and 213446 tokens.


In [29]:
df = pd.DataFrame({"problem": problems, "solution": solutions})

{'problem': 'Think step by step and answer in short: For $1 \\le n \\le 100$, how many integers are there such that $\\frac{n}{n+1}$ is a repeating decimal?',
 'solution': 'Note that $n+1$ and $n$ will never share any common factors except for $1$, because they are consecutive integers. Therefore, $n/(n+1)$ is already simplified, for all positive integers $n$.\n\nSince $1 \\le n \\le 100$, it follows that $2 \\le n+1 \\le 101$. Recall that a simplified fraction has a repeating decimal representation if and only if its denominator is divisible by a prime other than 2 and 5. The numbers between 2 and 101 which are divisible only by 2 and 5 comprise the set $\\{2, 4, 5, 8, \\allowbreak 10, 16, 20, 25, \\allowbreak 32, 40, 50, 64, \\allowbreak 80, 100\\}$. Therefore, there are $14$ terminating decimals and $100 - 14 = \\boxed{86}$ repeating decimals.',
 'pred': 'To determine how many integers satisfy the given condition, we need to find the number of integers $n$ such that $\\frac{n}{n+1}$