In [1]:
import json
import os
import dspy

from dotenv import load_dotenv

load_dotenv("../keys.env")

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
def load_math_as_examples(path="../datasets/MATH", split="train"):
    with open(os.path.join(path, split, "dataset_numeric.json")) as f:
        data = json.load(f)

    examples = []

    for question, answer in zip(data["question"], data["extracted_answers"]):
        example = dspy.Example(question=question, answer=answer).with_inputs("question")
        examples.append(example)

    return examples

In [12]:
HOST = "localhost"
# HOST = "babel-15-20"
PORT = 8000
# API_KEY = "EMPTY"
# API_BASE = f"http://{HOST}:{PORT}/v1"
# MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
MODEL_NAME = "gpt-4o-mini"
# MODEL_NAME = "llm"


OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]

# lm = dspy.LM(
#     "openai/" + MODEL_NAME,
#     api_key=OPENAI_API_KEY,
#     # model_type="chat",
#     cache=False,
# )
# dspy.configure(lm=lm)


# Use Claude 3.5 Sonnet model
MODEL_NAME = "claude-3-sonnet-20240229"

lm = dspy.LM(
    "anthropic/" + MODEL_NAME,
    api_key=ANTHROPIC_API_KEY,
    cache=True,
)
dspy.configure(lm=lm)


In [13]:
dataset = load_math_as_examples(split="test")
len(dataset)

3203

In [14]:
class MathSignature(dspy.Signature):
    """Answer the math question."""

    question = dspy.InputField()
    answer = dspy.OutputField(
        desc="The final answer in number format. Ignore any latex that exists in the question, and determine the numerical answer step by step. Reason logically internally and arrive at a final answer. If you get an expression, simplify it and return a numerical answer. If your answer is a fraction, simplify it and return a decimal rounded to 3 places. Your final answer should be a number in the last line."
    )


# class GenerateSignature(dspy.Signature):
#     """Create similar math questions and answers as the ones in the input"""

#     question = dspy.InputField()
#     answer = dspy.InputField()
#     generated_examples: list[dict[str, str]] = dspy.OutputField(
#         desc="1 generated similar question and answer"
#     )


# class MathManyShotSignature(dspy.Signature):
#     """Answer the math question."""

#     examples: list[dict[str, str]] = dspy.InputField(
#         desc="List of examples with question and answer"
#     )
#     # examples: str = dspy.InputField(desc="List of examples with question and answer")
#     question = dspy.InputField()
#     answer = dspy.OutputField(
#         desc="The final answer in latex format. Do not include the \\boxed{} symbol."
#     )

In [15]:
# augment = dspy.Predict(GenerateSignature, n=10)

dataset[0]

Example({'question': 'When counting from $3$ to $201$, $53$ is the $51^\\mathrm{st}$ number counted. When counting backwards from $201$ to $3$, $53$ is the $n^\\mathrm{th}$ number counted. What is $n$?', 'answer': '149'}) (input_keys={'question'})

In [19]:
predict = dspy.ChainOfThought(MathSignature)
correct = 0
total = 0

for example in dataset[:20]:
    try:
        response = predict(**example)
        pred_answer = float(response.answer)
        true_answer = float(example["answer"])

        if pred_answer == true_answer:
            correct += 1
        total += 1

        print(f"Example {total}:")
        print(f"Predicted: {pred_answer}")
        print(f"True: {true_answer}")
        print(f"Match: {pred_answer == true_answer}\n")
    except Exception as e:
        print(f"Error processing example {total}\n")
        total += 1
        continue

print(f"Final accuracy: {correct/total:.2%} ({correct}/{total})")

Example 1:
Predicted: 149.0
True: 149.0
Match: True

Error processing example 1

Example 3:
Predicted: 512.0
True: 512.0
Match: True

Example 4:
Predicted: 4.0
True: 6.0
Match: False

Example 5:
Predicted: 27.0
True: 23.0
Match: False

Example 6:
Predicted: 560.0
True: 560.0
Match: True

Example 7:
Predicted: 772.0
True: 772.0
Match: True

Example 8:
Predicted: 1428.0
True: 6732.0
Match: False

Example 9:
Predicted: 1728.0
True: 200.0
Match: False

Example 10:
Predicted: 66.0
True: 28.0
Match: False

Example 11:
Predicted: 720.0
True: 540.0
Match: False

Example 12:
Predicted: 165.0
True: 165.0
Match: True

Example 13:
Predicted: 0.076
True: 0.076
Match: True

Example 14:
Predicted: 4.0
True: 97.0
Match: False

Example 15:
Predicted: 1.0
True: 1.0
Match: True

Example 16:
Predicted: 28.0
True: 21.0
Match: False

Example 17:
Predicted: 4.0
True: 28.0
Match: False

Example 18:
Predicted: 105.0
True: 105.0
Match: True

Example 19:
Predicted: 8601.0
True: 1201.0
Match: False

Example 20:
P