In [1]:
import json
import os

import dspy

In [2]:
def load_math_as_examples(path="../datasets/MATH", split="train"):
    """Load one split of the MATH dataset as a list of ``dspy.Example`` objects.

    Reads ``<path>/<split>/dataset.json``, which must contain two parallel
    lists under the keys ``"question"`` and ``"extracted_answers"``.

    Args:
        path: Directory that holds one sub-directory per split.
        split: Name of the split sub-directory to read (e.g. ``"train"``
            or ``"test"``).

    Returns:
        A list of ``dspy.Example`` objects with ``question`` and ``answer``
        fields, where ``question`` is marked as the only input field.

    Raises:
        FileNotFoundError: If the dataset file does not exist.
        KeyError: If either expected key is missing from the JSON document.
        ValueError: If the question and answer lists differ in length, which
            would otherwise silently drop (and possibly misalign) examples.
    """
    dataset_file = os.path.join(path, split, "dataset.json")

    # JSON is UTF-8 by specification; be explicit so decoding does not depend
    # on the platform's locale encoding.
    with open(dataset_file, encoding="utf-8") as f:
        data = json.load(f)

    questions = data["question"]
    answers = data["extracted_answers"]

    # zip() stops at the shorter list, so a mismatch would go unnoticed.
    if len(questions) != len(answers):
        raise ValueError(
            f"{dataset_file}: {len(questions)} questions but "
            f"{len(answers)} extracted answers"
        )

    return [
        dspy.Example(question=question, answer=answer).with_inputs("question")
        for question, answer in zip(questions, answers)
    ]

In [4]:
# Where the model server is reachable. The API base below is built from these.
HOST = "localhost"
# HOST = "babel-15-20"
PORT = 8000
# Placeholder key; presumably the local server does not check it — confirm.
API_KEY = "EMPTY"
API_BASE = f"http://{HOST}:{PORT}/v1"

# Model identifier; it is prefixed with "openai/" when handed to dspy.LM.
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
# MODEL_NAME = "llm"

# Build the language-model client (caching disabled) and make it the default
# LM for every dspy module created afterwards.
lm = dspy.LM(
    f"openai/{MODEL_NAME}",
    api_base=API_BASE,
    api_key=API_KEY,
    model_type="chat",
    cache=False,
)
dspy.configure(lm=lm)

In [12]:
# Signature for a single math problem: one `question` input, one `answer` output.
# NOTE: the docstring below is runtime text, not just documentation. The prompt
# captured by `dspy.inspect_history()` in this notebook shows it being sent to
# the model as the objective, so editing it changes what the model is asked.
class MathSignature(dspy.Signature):
    """Answer the math question."""

    # The problem statement handed to the model.
    question = dspy.InputField()
    # `desc` is shown to the model next to the field name in the system message.
    # NOTE(review): the inspect_history output saved in this notebook shows a
    # different description ("Do not include the $ or \boxed{} symbols."), so
    # that output appears to come from an earlier version of this cell.
    answer = dspy.OutputField(desc="The final answer in latex format. Do not include the \\boxed{} symbol.")

In [13]:
class BaselineCoT(dspy.Module):
    """Baseline program: a single chain-of-thought call over ``MathSignature``."""

    def __init__(self):
        # Let dspy.Module initialise its own state before sub-modules are
        # attached; skipping this can leave base-class attributes unset.
        super().__init__()
        self.cot = dspy.ChainOfThought(MathSignature)

    def forward(self, example=None, **kwargs):
        """Run the chain-of-thought predictor on one problem.

        Two calling conventions are accepted:

        * ``forward(example)`` — the original form; the input fields are
          taken from ``example.inputs()``.
        * ``forward(question=...)`` — input fields passed as keyword
          arguments, the convention used by callers such as ``dspy.Evaluate``.

        If both are given, keyword arguments override fields of ``example``.

        Args:
            example: Optional object exposing ``inputs()`` (e.g. a
                ``dspy.Example``).
            **kwargs: Input fields of ``MathSignature`` passed directly.

        Returns:
            Whatever ``self.cot`` returns for the given inputs.
        """
        if example is not None:
            inputs = {**example.inputs(), **kwargs}
        else:
            inputs = kwargs
        return self.cot(**inputs)

In [14]:
# Load the "test" split from the default dataset path.
dataset = load_math_as_examples(split="test")
# Show how many examples were loaded.
len(dataset)

5000

In [15]:
# Program under evaluation: the bare ChainOfThought predictor over MathSignature
# (the BaselineCoT wrapper class defined earlier is not used here).
baseline = dspy.ChainOfThought(MathSignature)

In [16]:
# Evaluation harness over the loaded examples: 8 worker threads, with a
# progress bar displayed while it runs.
evaluator = dspy.evaluate.Evaluate(
    devset=dataset,
    num_threads=8,
    display_progress=True,
)

In [17]:
# Score the baseline with the exact-match answer metric. `return_outputs=True`
# makes the call return the per-example outputs alongside the aggregate score.
score, outputs = evaluator(
    baseline,
    metric=dspy.evaluate.metrics.answer_exact_match,
    return_outputs=True,
)
score

Average Metric: 7.00 / 119 (5.9%):   2%|▏         | 119/5000 [02:29<51:51,  1.57it/s]  



KeyboardInterrupt: 

In [None]:
# Print the most recent LM interaction (system message, user message, response)
# to check how the signature was rendered into a prompt.
dspy.inspect_history()





[34m[2024-11-20T16:52:01.307106][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str)

Your output fields are:
1. `reasoning` (str)
2. `answer` (str): The final answer in latex format. Do not include the $ or \boxed{} symbols.

All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## reasoning ## ]]
{reasoning}

[[ ## answer ## ]]
{answer}

[[ ## completed ## ]]

In adhering to this structure, your objective is: 
        Answer the math question.


[31mUser message:[0m

[[ ## question ## ]]
How many subsets of the set of divisors of $72$ contain only composite numbers? For example, $\{8,9\}$ and $\{4,8,12\}$ are two such sets. Include the empty set in your count.

Respond with the corresponding output fields, starting with the field `[[ ## reasoning ## ]]`, then `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.


[31mResponse:[0m

[32m[[ ## rea