In [2]:
import os

import numpy as np
from datasets import load_dataset
from math_agent import (
    MathAgent,
    MathEnvironment,
    extract_result_value,
    save_tape,
    solve_task,
)
from termcolor import colored
from tqdm import tqdm

from tapeagents.llms import LLAMA

## Prepare the tasks, environment, LLM, and agent for the training split

In [None]:
# Examine a single sample from the GSM8K training split to see its structure.
dataset = load_dataset(path="openai/gsm8k", name="main", split="train")
print(dataset[0])

In [6]:
# Load the GSM8K training split and materialize it as a plain Python list so it
# can be shuffled in place.
dataset = load_dataset(path="openai/gsm8k", name="main", split="train")
samples = list(dataset)

# Fixed seed: the shuffled order determines which sample becomes task 0, 1, ...
np.random.seed(42)
np.random.shuffle(samples)  # type: ignore

# LLM served through the Together API; response caching is turned off.
llm = LLAMA(
    base_url="https://api.together.xyz",
    model_name="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    tokenizer_name="meta-llama/Meta-Llama-3.1-70B-Instruct",
    parameters={"temperature": 0.2},
    use_cache=False,
)

# The agent uses the LLM above as its "default" model.
agent = MathAgent(llms={"default": llm})
env = MathEnvironment()

## Solve tasks

In [None]:
# Experiment configuration: output directory and number of attempts per task.
exp_path = "gsm8k/tuning/llama31_70b_train"
attempts = 1

# One JSON tape file is written per (task, attempt). An attempt whose file
# already exists is skipped, so an interrupted run can be resumed.
tapes_dir = os.path.join(exp_path, "tapes")
os.makedirs(tapes_dir, exist_ok=True)
# NOTE(review): presumably tapeagents reads this variable to locate its LLM-call
# database; confirm that it is read after this point and not at import time.
os.environ["TAPEAGENTS_SQLITE_DB"] = os.path.join(exp_path, "llm_calls.sqlite")

# One 0/1 entry per attempt executed in THIS run. Attempts skipped because their
# tape file already exists are not included, so after a resume the reported
# accuracy only covers the newly executed attempts.
solved = []
for i, sample in enumerate(tqdm(samples)):
    sample = extract_result_value(sample)
    for j in range(attempts):
        tape_file = os.path.join(tapes_dir, f"task{i}_attempt{j+1}.json")
        if os.path.exists(tape_file):
            # The tape is saved whether or not the answer was correct, so an
            # existing file means "already attempted", not necessarily "solved".
            print(f"Task {i} attempt {j+1} already solved, skipping")
            continue
        try:
            tape = solve_task(agent, env, sample, tape_file)
            solved.append(int(tape.metadata.result["solved"]))
            save_tape(tape_file, tape)
        except Exception as e:
            # Best effort: a failed attempt is counted as unsolved and the run continues.
            print(colored(f"Failed to solve task, attempt {j+1}: {e}", "red"))
            solved.append(0)
    # Progress report every 10 tasks (skipping the very first one).
    if i % 10 == 0 and i > 0:
        # `solved` is empty when every attempt so far was skipped; np.mean([])
        # would emit a RuntimeWarning and print "nan", so report "n/a" instead.
        accuracy = f"{np.mean(solved):.3f}" if solved else "n/a"
        print(f"{i}: Current accuracy: {accuracy}, prompt tokens used: {agent.llm.token_count}")