In [1]:
import json

import numpy as np
from datasets import load_dataset
from math_agent import (
    MathAgent,
    MathEnvironment,
    extract_result_value,
    solve_task,
)
from termcolor import colored
from tqdm import tqdm

from tapeagents.llms import LLAMA

In [4]:
# Single environment instance; eval() below passes it to every solve_task call.
env = MathEnvironment()


def eval(tested_agent, test_set) -> float:
    """Run ``tested_agent`` on every sample of ``test_set`` and return its accuracy.

    Each sample is first passed through ``extract_result_value`` and then
    solved with ``solve_task`` against the module-level ``env``. The value of
    ``tape.metadata.result["solved"]`` is converted with ``int()`` and the
    mean of those flags is the accuracy.

    NOTE: this function shadows Python's built-in ``eval`` in the notebook
    namespace.

    Args:
        tested_agent: The agent handed to ``solve_task``.
        test_set: Iterable of samples to evaluate.

    Returns:
        The mean of the collected solved flags as a Python float.

    Raises:
        Exception: Any error raised while solving a task is printed in red
            and re-raised, so evaluation stops at the first failure.
    """
    test_solved = []
    for i, sample in enumerate(tqdm(test_set)):
        sample = extract_result_value(sample)
        try:
            tape = solve_task(tested_agent, env, sample)
            test_solved.append(int(tape.metadata.result["solved"]))
        except Exception as e:
            # f-string so the actual error text is shown rather than a literal "{e}".
            print(colored(f"Failed to solve task: {e}", "red"))
            # The exception propagates, so nothing is recorded for this sample.
            raise
        # Report the running accuracy every 10 samples (skipping the first one).
        if i % 10 == 0 and i > 0:
            print(f"{i}: Current accuracy: {np.mean(test_solved):.3f}")
    acc = np.mean(test_solved).item()
    return acc

In [5]:
# Test subset: materialize the GSM8K test split, shuffle it with a fixed
# NumPy seed, and keep the first 200 samples.
test_dataset = load_dataset("openai/gsm8k", "main", split="test")
test_samples = list(test_dataset)
np.random.seed(42)
np.random.shuffle(test_samples)  # type: ignore
test_set = test_samples[:200]

# "Validation" subset: same procedure, but drawn from the *train* split.
# The seed is reset first so this shuffle does not depend on the one above.
dataset = load_dataset("openai/gsm8k", "main", split="train")
val_samples = list(dataset)
np.random.seed(42)
np.random.shuffle(val_samples)  # type: ignore
val_set = val_samples[:200]

## Evaluation

In [5]:
# Prerequisite: the base model must be served locally, e.g.
#   vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct
# NOTE(review): this agent runs at temperature 0.1, while the tuned agent
# further down uses 0.0 — confirm the difference is intentional.
untuned_llm = LLAMA(
    base_url="http://localhost:8000",
    model_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    tokenizer_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    parameters={"temperature": 0.1},
    use_cache=False,
)
untuned_agent = MathAgent(llms={"default": untuned_llm})

## Untuned model accuracy

In [None]:
# Accuracy of the untuned agent on val_set, which is sampled from the GSM8K
# *train* split (hence the "train" label in the message).
val_acc = eval(untuned_agent, val_set)
print(f"Untuned on train {val_acc:.3f}")

In [None]:
# Accuracy of the untuned agent on test_set (sampled from the GSM8K test split).
acc = eval(untuned_agent, test_set)
print(f"Untuned on test {acc:.3f}")

In [19]:
# Persist the untuned baseline; val_acc is the train-split subset, acc the test subset.
with open("results.json", "w") as f:
    json.dump({"untuned": {"train": val_acc, "test": acc}}, f)

## Tuned model accuracy

In [6]:
# Prerequisite: the fine-tuned checkpoint must be served locally, e.g.
#   vllm serve gsm8k/tune1/intermediate/1000/
# The tokenizer is still loaded under the base Llama 3.1 8B Instruct name.
tuned_llm = LLAMA(
    base_url="http://localhost:8000",
    model_name="gsm8k/tune1/intermediate/1000/",
    tokenizer_name="meta-llama/Meta-Llama-3.1-8B-Instruct",
    parameters={"temperature": 0.0},
    use_cache=False,
)
tuned_agent = MathAgent(llms={"default": tuned_llm})

In [None]:
# NOTE: despite its name, tuned_val_acc is measured on test_set. The untuned
# cells use the opposite convention (val_acc -> val_set, acc -> test_set).
tuned_val_acc = eval(tuned_agent, test_set)
print(f"Tuned on test {tuned_val_acc:.3f}")

In [None]:
# NOTE: tuned_acc is measured on val_set (the train-split subset), whereas the
# untuned `acc` is measured on test_set — the names do not line up.
tuned_acc = eval(tuned_agent, val_set)
print(f"Tuned on train {tuned_acc:.3f}")

In [None]:
# Overwrite results.json with both the untuned and tuned accuracies.
# tuned_acc was measured on val_set (train split), tuned_val_acc on test_set.
with open("results.json", "w") as f:
    json.dump(
        {
            "untuned": {"train": val_acc, "test": acc},
            "tuned": {"train": tuned_acc, "test": tuned_val_acc},
        },
        f,
    )