In [1]:
import os
import sys
import logging
from dotenv import load_dotenv

import dspy
from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
from dspy.evaluate import Evaluate
import numpy as np

# Put the parent of the current working directory on sys.path before the
# project import below; presumably the mcts_llm package lives there (verify
# against the repository layout). This must run before the import.
sys.path.append(os.path.dirname(os.getcwd()))
from mcts_llm.mctsr import MCTSr, ZeroShotCoT, MultipleTurnSelfRefine, Policy

# Load environment variables from a .env file, if one is present.
load_dotenv()

# Root logger stays at WARNING; only the "mcts-llm" logger is raised to INFO.
logging.basicConfig(level=logging.WARNING)
logging.getLogger("mcts-llm").setLevel(logging.INFO)

# Seed NumPy's global RNG; the np.random.shuffle calls on the train/test
# example lists later in the notebook draw from it.
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Settings for the local Ollama chat model, collected in one place so they are
# easy to scan and tweak.
# NOTE(review): num_ctx and max_tokens are both 1024. If num_ctx bounds the
# prompt plus the completion, long prompts may be truncated -- confirm against
# the Ollama / dspy.OllamaLocal documentation.
lm_options = {
    "model": "qwen2.5:7b-instruct",
    "model_type": "chat",
    "temperature": 1.0,
    "max_tokens": 1024,
    "num_ctx": 1024,
    "timeout_s": 600,
    "cache": False,
}
ollama = dspy.OllamaLocal(**lm_options)

# Register the client as the LM in dspy's global settings.
dspy.settings.configure(lm=ollama, experimental=True)

In [3]:
# Instantiate the GSM8K dataset wrapper; its .train and .test splits are
# re-keyed into dspy.Example lists in the following cells.
gsm8k = GSM8K()

100%|██████████| 7473/7473 [00:00<00:00, 49215.28it/s]
100%|██████████| 1319/1319 [00:00<00:00, 54553.66it/s]


In [4]:
# Re-key every training record so the question is exposed as `problem`, the
# only field marked as an input; `gold_reasoning` and `answer` are carried along.
gsm8k_trainset = [
    dspy.Example(
        problem=record["question"],
        gold_reasoning=record["gold_reasoning"],
        answer=record["answer"],
    ).with_inputs("problem")
    for record in gsm8k.train
]

# In-place shuffle using NumPy's global RNG.
np.random.shuffle(gsm8k_trainset)

# Preview the first ten shuffled examples.
gsm8k_trainset[:10]

[Example({'problem': 'A bowl of fruit holds 18 peaches. Four of the peaches are ripe and two more ripen every day, but on the third day three are eaten. How many more ripe peaches than unripe peaches are in the bowl after five days?', 'gold_reasoning': 'In 5 days, 2 * 5 = <<2*5=10>>10 peaches will ripen. With the 4 that were already ripe, there will be 14 peaches that have ripened. Three were eaten, so there will be 14 - 3 = <<14-3=11>>11 ripe peaches left. There are 18 - 14 = <<18-14=4>>4 unripe peaches left. Therefore, there will be 11 - 4 = <<11-4=7>>7 more ripe peaches than unripe peaches in the bowl after five days.', 'answer': '7'}) (input_keys={'problem'}),
 Example({'problem': 'John needs to replace his shoes so he decides to buy a $150 pair of Nikes and a $120 pair of work boots.  Tax is 10%.  How much did he pay for everything?', 'gold_reasoning': 'The shoes cost $150 + $120 = $<<150+120=270>>270 The tax was $270 * .1 = $<<270*.1=27>>27 So the total cost was $270 + $27 = $<<2

In [5]:
# Re-key every test record so the question is exposed as `problem`, the only
# field marked as an input; `gold_reasoning` and `answer` are carried along.
gsm8k_testset = [
    dspy.Example(
        problem=record["question"],
        gold_reasoning=record["gold_reasoning"],
        answer=record["answer"],
    ).with_inputs("problem")
    for record in gsm8k.test
]

# In-place shuffle using NumPy's global RNG.
np.random.shuffle(gsm8k_testset)

# Preview the first ten shuffled examples.
gsm8k_testset[:10]

[Example({'problem': "Great Grandma Jones has three children.  And each of her children has three children of their own, who are Great Grandma Jones' grandchildren.  And each of these grandchildren has three babies of their own, who are Great Grandma Jones' great grand-babies.  If all of the family show up at the family reunion, how many great grand-babies will be there for Great Grandma Jones to kiss?", 'gold_reasoning': 'Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies', 'answer': '27'}) (input_keys={'problem'}),
 Example({'problem': 'Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial weight?', 'gold_reasoning': "Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the final weight gives an initial weight of 70 + 30 = <

In [6]:
# Shared evaluation harness: scores a program with gsm8k_metric on the first
# 20 examples of the shuffled test set and displays the first 10 result rows.
# os.cpu_count() is documented to return None when the CPU count cannot be
# determined, so fall back to one worker thread instead of passing None.
evaluate = Evaluate(
    devset=gsm8k_testset[:20],
    metric=gsm8k_metric,
    num_threads=os.cpu_count() or 1,
    display_progress=True,
    display_table=10,
)

In [7]:
# Score the ZeroShotCoT program (default arguments) with the shared `evaluate` harness.
evaluate(ZeroShotCoT())

Average Metric: 18 / 20  (90.0): 100%|██████████| 20/20 [02:12<00:00,  6.64s/it]


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,There will be 27 great grand-babies at the family reunion.,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,Mark's initial weight was 100 pounds.,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,Peter has to exercise for 78 minutes on Tuesday to reach his goal.,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn $170 after selling all his unused land.,✔️ [True]
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,26,✔️ [True]
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,The number of pizza slices left is 18.,✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms have a combined total of 700 acres.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,The total amount of money Charlie will make for the day is $440.,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,25 OLED TVs were sold.,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,The number of flowers in Arianna's garden that have bloomed is 160.,✔️ [True]


90.0

In [8]:
# Score MultipleTurnSelfRefine configured with num_turns=1 on the same examples.
evaluate(MultipleTurnSelfRefine(num_turns=1))

Average Metric: 16 / 20  (80.0): 100%|██████████| 20/20 [06:39<00:00, 19.98s/it]


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,There will be 81 great grand-babies at the family reunion for Great Grandma Jones to kiss.,
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,Mark's initial weight was 100 pounds.,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,Peter has to exercise for 78 minutes on Tuesday to reach his goal.,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn a total of $140 after selling all his unused land.,
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,22,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,The number of slices of pizza left is 18.,✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms together have 700 acres.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,Charlie will make a total of $440 for the day.,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,Samwell's store was able to sell 25 OLED TVs.,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,160,✔️ [True]


80.0

In [9]:
# Score MCTSr with samples_per_node=5; no policy is passed, so MCTSr's default applies.
# NOTE(review): default_uct_score=1e8 presumably makes unscored nodes look maximally
# attractive to the UCT selection -- confirm against the MCTSr implementation.
evaluate(MCTSr(samples_per_node=5, default_uct_score=1e8))

Average Metric: 17 / 20  (85.0): 100%|██████████| 20/20 [56:55<00:00, 170.77s/it] 


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,There will be 27 great grand-babies at the family reunion for Great Grandma Jones to kiss.,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,Mark's initial weight was 100 pounds.,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,Peter has to exercise for 78 minutes on Tuesday to reach his goal.,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn $170 after selling all his unused land.,✔️ [True]
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,15 guests did not get a second hotdog.,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,There are 18 slices of pizza left.,✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms have a total of 700 acres together.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,The total amount of money Charlie will make for the day is $440.,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,25,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,160,✔️ [True]


85.0

In [10]:
# Same MCTSr settings as the previous cell (samples_per_node=5, default_uct_score=1e8),
# but with the selection policy switched to Policy.IMPORTANCE_SAMPLING.
evaluate(MCTSr(policy=Policy.IMPORTANCE_SAMPLING, samples_per_node=5, default_uct_score=1e8))

Average Metric: 14 / 20  (70.0): 100%|██████████| 20/20 [56:16<00:00, 168.81s/it] 


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,27,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,100,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,78,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn $140 after selling all his unused land.,
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,There will be \( 22 \) guests who did not get a second hotdog.,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,\(18\),✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms together have 700 acres.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,$440,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,25,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,160 flowers have bloomed.,✔️ [True]


70.0

In [11]:
# Score MCTSr with max_rollouts=8; every other MCTSr argument is left at its default.
evaluate(MCTSr(max_rollouts=8))

Average Metric: 15 / 20  (75.0): 100%|██████████| 20/20 [1:24:10<00:00, 252.53s/it]


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,27,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,Mark's initial weight was 100 pounds.,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,Peter has to exercise for 78 minutes on Tuesday to reach his goal.,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn $170 after selling all his unused land.,✔️ [True]
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,4,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,There are 18 slices of pizza left.,✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,"Together, the two farms have 700 acres.",✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,$440,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,Samwell's store was able to sell 25 OLED TVs.,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,160,✔️ [True]


75.0

In [12]:
# Score MCTSr with max_rollouts=8 and the Policy.IMPORTANCE_SAMPLING policy.
evaluate(MCTSr(max_rollouts=8, policy=Policy.IMPORTANCE_SAMPLING))

Average Metric: 13 / 20  (65.0): 100%|██████████| 20/20 [1:23:27<00:00, 250.39s/it]


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,There will be 27 great-grandbabies at the family reunion.,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,"Therefore, Mark's initial weight was \(100\) pounds.",✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,I don't know.,
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn a total of $170 after selling all his unused land.,✔️ [True]
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,I don't know.,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,I don't know.,
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms have a total of 700 acres.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,$440,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,They were able to sell 25 OLED TVs.,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,160 flowers in Arianna's garden have bloomed.,✔️ [True]


65.0

In [13]:
# Score MCTSr with max_rollouts=16; every other MCTSr argument is left at its default.
# Long-running: the recorded run below took close to three hours for 20 examples.
evaluate(MCTSr(max_rollouts=16))

Average Metric: 17 / 20  (85.0): 100%|██████████| 20/20 [2:52:12<00:00, 516.64s/it]  


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,27,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,Mark's initial weight was 100 pounds.,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,Peter has to exercise for 78 minutes on Tuesday to reach his goal.,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn $170 after selling all his unused land.,✔️ [True]
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,4 guests did not get a second hotdog.,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,There are 18 slices of pizza left.,✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms together have 700 acres.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,$440,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,25,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,The number of bloomed flowers is 160.,✔️ [True]


85.0

In [14]:
# Score MCTSr with max_rollouts=16 and the Policy.IMPORTANCE_SAMPLING policy.
# Long-running: the recorded run below took close to three hours for 20 examples.
evaluate(MCTSr(max_rollouts=16, policy=Policy.IMPORTANCE_SAMPLING))

Average Metric: 13 / 20  (65.0): 100%|██████████| 20/20 [2:51:25<00:00, 514.27s/it]  


Unnamed: 0,problem,gold_reasoning,example_answer,pred_answer,gsm8k_metric
0,"Great Grandma Jones has three children. And each of her children has three children of their own, who are Great Grandma Jones' grandchildren. And each...","Three children each have three children, for a total of 3*3=<<3*3=9>>9 grandchildren. 9 grandchildren each have 3 babies, for a total of 9*3=<<9*3=27>>27 great grand-babies",27,27,✔️ [True]
1,"Mark was unwell for 3 months, during which he lost 10 pounds per month. If his final weight was 70 pounds, what was his initial...","Since Mark was losing 10 pounds each month, in 3 months he lost 10 * 3 = <<10*3=30>>30 pounds. Adding the weight lost to the...",100,Mark's initial weight was 100 pounds.,✔️ [True]
2,"On Tuesday, Peter wants to exercise for twice the amount of time he did on Monday and Sunday combined. On Sunday he exercised for 23...",On Sunday and Monday he exercised a total of 39 minutes because 23 + 16 = <<23+16=39>>39 On Tuesday he has to exercise for 78...,78,Peter has to exercise for 78 minutes on Tuesday to reach his goal.,✔️ [True]
3,"Abraham owns 80 square meters of unused land. He sold half of the land for $50, and after a month, he sold another 1/4 of...","Abraham sold 1/2 x 80= <<1/2*80=40>>40 square meters of his unused land. After a month, he sold 1/4 x 40 = <<1/4*40=10>>10 square meters of...",170,Abraham will be able to earn $140 after selling all his unused land.,
4,"A food caterer was told to prepare gourmet hot dogs for 36 guests. While most people would only eat one hotdog, he prepared enough for...","The food caterer prepared an additional 36/2 = <<36/2=18>>18 hot dogs. Altogether, he prepared 36 + 18 = <<36+18=54>>54 hotdogs. If 40 people wanted seconds,...",26,18 guests did not get a second hotdog.,
5,Bryce and four of his friends each ordered their own pizzas after football practice. Each pizza had 12 slices. Bryce and two friends ate 2/3...,"Bryce and his 4 friends each ordered a pizza with 12 slices, so there were 5 pizzas x 12 slices per pizza = <<5*12=60>>60 slices...",18,18,✔️ [True]
6,"Farmer Brown's farm is 200 acres, and Farmer Smith's farm is 100 acres more than twice that. How many acres do the two farms have,...",Farmer Smith has 2*200+100=<<2*200+100=500>>500 acres. The total is 200+500=<<200+500=700>>700.,700,The two farms have a total of 700 acres together.,✔️ [True]
7,Charlie is a tennis pro. He spends most of the day teaching others lessons on how to improve their game. His standard fee is $80...,"4 1-hour lessons at his standard rate cost 4*$80=$<<4*80=320>>320. Reducing the rate by 25% for veterans means he takes $80/4=$<<80/4=20>>20 off his standard fee. Thus,...",440,The total amount of money Charlie will make for the day is \(440\) dollars.,✔️ [True]
8,"Samwell owns an appliances store. For this week, one-fourth of their sales are smart TVs, one-eighth are analog TVs, and the rest are OLED TVs....","Samwell was able to sell 1/4 x 40 = <<1/4*40=10>>10 Smart TVs. He was able to sell 1/8 x 40 = <<1/8*40=5>>5 analog TVs. So,...",25,They were able to sell 25 OLED TVs.,✔️ [True]
9,"Arianna plants a garden that has 10 rows of flowers with 20 flowers in each row. Currently, only 4/5 of the planted flowers have bloomed....","Arianna planted 10 x 20 = <<10*20=200>>200 flowers in her garden. Out of the 200, 200 x 4/5 = <<200*4/5=160>>160 flowers that have currently bloomed.",160,160 flowers have bloomed.,✔️ [True]


65.0