In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# pandas and tqdm are imported below, so they are listed explicitly as well.
%pip install -q transformers datasets pydantic replicate pandas tqdm

In [2]:
import os
import re
import time
import random
import replicate
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from pydantic import BaseModel
from typing import List, Literal, Union
from collections import Counter

# Take the Replicate token from the environment instead of pasting a secret into
# the notebook. Export REPLICATE_API_TOKEN before starting the kernel.
# The previous code unconditionally wrote "" into os.environ, which wiped out a
# token that had already been exported.
REPLICATE_API_TOKEN = os.environ.get('REPLICATE_API_TOKEN', "")
if not REPLICATE_API_TOKEN:
    print("Warning: REPLICATE_API_TOKEN is not set; Replicate API calls are expected to fail.")
# Keep the variable defined in the environment (possibly empty), as before.
os.environ['REPLICATE_API_TOKEN'] = REPLICATE_API_TOKEN

In [38]:
# Load the "main" configuration of GSM8K and keep a handle on its test split,
# which is the split the generation loop iterates over.
gsm8k_dataset = load_dataset(path="gsm8k", name="main")
test_dataset = gsm8k_dataset["test"]

In [22]:
def clean_and_normalize_answer(answer):
    """Normalize free-text answers for comparison.

    Lower-cases the text, removes every character that is not a word character,
    whitespace, ``$`` or ``.``, collapses whitespace runs to a single space and
    trims the ends.

    Args:
        answer: The raw answer string.

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    answer = answer.lower()
    answer = re.sub(r'[^\w\s\$.]', '', answer)
    answer = re.sub(r'\s+', ' ', answer)
    # Trim last: removing trailing punctuation (e.g. "42 !") can expose
    # whitespace that an up-front strip() would have missed.
    return answer.strip()

def extract_last_numeric_value(text):
    """Return the last number that appears in ``text`` as a string.

    Recognizes integers, decimals and comma-grouped thousands ("1,250"); the
    grouping commas are removed from the result so it can be passed to
    ``float``. A leading ``-``/``+`` is kept only when it is not glued to a
    preceding word character, so the minus in "10-4" is read as an operator
    rather than a sign.

    Args:
        text: Text to scan. ``None`` and empty strings are accepted.

    Returns:
        The last numeric token as a string, or ``None`` when there is none.
    """
    if not text:
        return None
    matches = re.findall(
        r'(?:(?<!\w)[-+])?(?:\d{1,3}(?:,\d{3}(?!\d))+(?:\.\d+)?|\d*\.\d+|\d+)',
        text,
    )
    if not matches:
        return None
    # Without this, "1,250" used to be split into "1" and "250".
    return matches[-1].replace(',', '')

def extract_numeric(text):
    """Extract the number that follows the first "Final Numeric Answer:" marker.

    Tolerates optional ``*``/``(`` decoration, an optional ``$`` and an optional
    minus sign before the number, and comma-grouped thousands. Grouping commas
    are removed so the result can be passed to ``float``.

    Args:
        text: The model response. ``None`` and empty strings are accepted.

    Returns:
        The numeric string, or ``None`` when the marker or a number after it is
        missing.
    """
    if not text:
        return None
    match = re.search(
        r"Final Numeric Answer:\s*[\*\(]*\s*\$?\s*"
        r"(-?(?:\d+(?:,\d{3}(?!\d))*(?:\.\d+)?|\.\d+))",
        text,
    )
    if match is None:
        return None
    return match.group(1).replace(",", "")

def join_tokens(response):
    """Collapse a model response into a single string.

    Args:
        response: A string, ``None``, or an iterable of output chunks (list,
            tuple, generator, ...).

    Returns:
        ``""`` for ``None``; strings, bytes, mappings and non-iterable values
        unchanged; otherwise the chunks concatenated in order.
    """
    if response is None:
        return ""
    if isinstance(response, (str, bytes, dict)):
        return response
    try:
        chunks = iter(response)
    except TypeError:
        # Not iterable: nothing to join, hand it back untouched.
        return response
    # Handles lazily streamed chunks as well as lists.
    return ''.join(str(chunk) for chunk in chunks)

In [5]:
class GSM8KDataRow(BaseModel):
    """One evaluation record: a question answered under one prompting strategy."""
    question: str  # the problem text given to the model
    generated_answer: str  # answer extracted from the model output ("" when nothing was extracted)
    correct_answer: str  # reference answer extracted from the dataset's answer text
    strategy: Literal["zero-shot", "few-shot", "cot", "sc-cot"]  # prompting strategy used for this row
    priority: int  # integer priority attached to the strategy; the ranking code treats lower as better

class GSM8KDataset(BaseModel):
    """Container for every generated evaluation row."""
    data: List[GSM8KDataRow]  # rows in the order they were generated

In [26]:
def zero_shot_prompt(question):
    """Build the zero-shot prompt: formatting instructions plus the question.

    The indentation and blank lines inside the template are part of the text
    sent to the model, so they are kept verbatim.

    Args:
        question: The problem text to insert after ``Q:``.

    Returns:
        The complete prompt string.
    """
    return f"""
    Solve the following question and provide the final answer in the format: 'Final Numeric Answer: (numeric value)'.
    Do not include any commas or delimiters in the final numeric answer. If the answer is 50,000, you must output this as 50000.
    Q: {question} \n
    A:
    Final Numeric Answer:
    \n\n
    """

def few_shot_prompt(question):
    """Build the few-shot prompt: instructions, three worked examples, then the question.

    The indentation and blank lines inside the template are part of the text
    sent to the model, so they are kept verbatim.

    Args:
        question: The problem text to insert after the examples.

    Returns:
        The complete prompt string.
    """
    # Example 1 previously read "Sample Q: Q: Roger ..."; the duplicated label is removed.
    return f"""
    Solve the following question and provide the final answer in the format: 'Final Numeric Answer: (numeric value)'. Here are some few shot examples to help generation:
    Do not include any commas or delimiters in the final numeric answer. If the answer is 50,000, you must output this as 50000.
    -- BEGIN OF FEW SHOT EXAMPLES --
    1.
    ```
    Sample Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
    Sample A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.
    Sample Final Numeric Answer: 11
    \n\n
    ```
    2.
    ```
    Sample Q: Julie, Micah, and Mitchell sold 32 glasses of lemonade at their lemonade stand. Julie sold 14 glasses and the boys sold an equal number of glasses. How many more glasses did Julie sell than Micah?
    Sample A: Micah and Mitchell sold 32 - 14 = <<32-14=18>>18 glasses. They each sold 18/2 = <<18/2=9>>9 glasses. Julie sold 14 - 9 = <<14-9=5>>5 glasses more than Micah
    Sample Final Numeric Answer: 5
    \n\n
    ```
    3.
    ```
    Sample Q: Two sports coaches went shopping together. The baseball coach bought 9 new baseballs for $3 each. The basketball coach bought 8 new basketballs for $14 each. How much more did the basketball coach spend than the baseball coach?
    Sample A: The cost of the baseballs is 9 × $3 = $<<9*3=27>>27. The cost of the basketballs is 8 × $14 = $<<8*14=112>>112. Basketballs cost $112 − $27 = $85 more.
    Sample Final Numeric Answer: 85
    \n\n
    ```
    -- END OF FEW SHOT EXAMPLES --
    Q: {question} \n
    A:
    Final Numeric Answer:
    \n\n
    """

def cot_prompt(question):
    """Build the chain-of-thought prompt: step-by-step instructions, three
    worked examples with explicit reasoning, then the question.

    The indentation and blank lines inside the template are part of the text
    sent to the model, so they are kept verbatim.

    Args:
        question: The problem text to insert after the examples.

    Returns:
        The complete prompt string.
    """
    return f"""
    Solve the following question and provide the final answer in the format: 'Final Numeric Answer: (numeric value)'. Let's think step by step, ensure you use logical reasoning steps to arrive at your solution. Here are some few shot examples to help generation:
    Do not include any commas or delimiters in the final numeric answer. If the answer is 50,000, you must output this as 50000.
    -- BEGIN OF FEW SHOT EXAMPLES --
    1.
    ```
    Sample Q: Charisma works for 8 hours every day.  She has a timer to remind her to get up and walk for 5 minutes every hour she’s at work.  After 5 days at the office, how many minutes has she walked?
    Sample A: Let's break it down step by step. Charisma walks for 5 minutes every hour, and she works for 8 hours a day. So, she walks for: 5 minutes/hour × 8 hours/day = 40 minutes/day. She works for 5 days, so she walks for: 40 minutes/day × 5 days = 200 minutes
    Sample Final Numeric Answer: 200
    \n\n
    ```
    2.
    ```
    Sample Q: Julie, Micah, and Mitchell sold 32 glasses of lemonade at their lemonade stand. Julie sold 14 glasses and the boys sold an equal number of glasses. How many more glasses did Julie sell than Micah?
    Sample A: Let's break it down step by step! We know that Julie sold 14 glasses of lemonade. The total number of glasses sold is 32. The boys, Micah and Mitchell, sold an equal number of glasses. So, they sold a total of: 32 - 14 = 18 glasses. Since they sold an equal number of glasses, we divide 18 by 2 to find out how many glasses each boy sold: 18 ÷ 2 = 9. Now, we can find out how many more glasses Julie sold than Micah: Julie sold 14 glasses, and Micah sold 9 glasses. To find the difference, we subtract: 14 - 9 = 5. So, Julie sold 5 more glasses than Micah.
    Sample Final Numeric Answer: 5
    \n\n
    ```
    3.
    ```
    Sample Q: Two sports coaches went shopping together. The baseball coach bought 9 new baseballs for $3 each. The basketball coach bought 8 new basketballs for $14 each. How much more did the basketball coach spend than the baseball coach?
    Sample A: Let's break it down step by step. The baseball coach bought 9 new baseballs for $3 each, so the total cost is: 9 x $3 = $27. The basketball coach bought 8 new basketballs for $14 each, so the total cost is: 8 x $14 = $112. Now, let's find out how much more the basketball coach spent: $112 (basketball coach) - $27 (baseball coach) = $85.
    Sample Final Numeric Answer: 85
    \n\n
    ```
    -- END OF FEW SHOT EXAMPLES --
    Q: {question} \n
    A:
    Final Numeric Answer:
    \n\n
    """

def sc_prompt(question):
    """Build the prompt used for the "sc-cot" (self-consistency) strategy.

    The self-consistency prompt text is the same as the chain-of-thought
    prompt, so this delegates to ``cot_prompt`` instead of carrying a second
    copy of the template.

    Args:
        question: The problem text to insert after the examples.

    Returns:
        The complete prompt string.
    """
    return cot_prompt(question)

def get_prompt(strategy, question):
    """Return the prompt for ``question`` under the named prompting strategy.

    Args:
        strategy: One of 'zero-shot', 'few-shot', 'cot' or 'sc-cot'.
        question: The problem text to embed in the prompt.

    Returns:
        The prompt string produced by the matching builder.

    Raises:
        ValueError: If ``strategy`` is not one of the known names.
    """
    # The lambdas defer the call, so only the matching builder is evaluated.
    builders = (
        ('zero-shot', lambda: zero_shot_prompt(question)),
        ('few-shot', lambda: few_shot_prompt(question)),
        ('cot', lambda: cot_prompt(question)),
        ('sc-cot', lambda: sc_prompt(question)),
    )
    for label, build in builders:
        if strategy == label:
            return build()
    raise ValueError('Invalid')

In [16]:
def generate_llm_response(prompt, model_name, retries=3, max_length=512, temperature=0.7, top_p=0.9):
    """Query a Replicate-hosted model once, retrying on errors.

    Args:
        prompt: The prompt text to send.
        model_name: Model identifier passed to ``replicate.run``.
        retries: Maximum number of attempts.
        max_length: Value sent as the ``max_length`` input.
        temperature: Sampling temperature sent to the model.
        top_p: Nucleus-sampling value sent to the model.

    Returns:
        The response joined into one string, or ``""`` when every attempt
        failed or no attempt was made (``retries <= 0``).
    """
    # NOTE(review): the accepted length key depends on the model's input schema
    # (some models expect "max_tokens"/"max_new_tokens") - confirm for the model in use.
    model_input = {
        "prompt": prompt,
        "max_length": max_length,
        "temperature": temperature,
        "top_p": top_p,
    }
    for attempt in range(retries):
        try:
            response = replicate.run(model_name, input=model_input)
            return join_tokens(response)
        except Exception as e:
            # Deliberately best-effort: one failed sample must not abort the whole run.
            print(f"Error on attempt {attempt+1}/{retries}: {str(e)}")
            if attempt < retries - 1:
                time.sleep(2)
            else:
                print("Max retries reached, skipping this sample.")
    # Always hand back a string; the previous version fell through and
    # returned None when the loop body never ran.
    return ""

In [36]:
def generate_self_consistent_answers(prompt, model_name, num_samples=5, retries=3):
    """Sample the model several times and return the majority answer.

    Each sample is generated with ``generate_llm_response`` (which owns the
    retry logic). The answer is read from the "Final Numeric Answer:" marker
    when present, falling back to the last number in the response, so this
    strategy uses the same output format as the others. Votes are counted on
    the numeric value, so "5" and "5.0" count as the same answer.

    Args:
        prompt: The prompt sent for every sample.
        model_name: Model identifier passed on to ``generate_llm_response``.
        num_samples: Number of independent samples to draw.
        retries: Maximum attempts per sample.

    Returns:
        The most frequent answer, as the string first seen for that value, or
        ``""`` when no sample produced a number. Ties go to the value seen first.
    """
    votes = Counter()
    first_seen = {}
    for _ in range(num_samples):
        response_text = generate_llm_response(prompt, model_name, retries=retries) or ""
        answer = extract_numeric(response_text) or extract_last_numeric_value(response_text)
        if not answer:
            continue
        try:
            key = float(answer)
        except ValueError:
            # Not parseable as a number: vote on the raw text instead.
            key = answer
        votes[key] += 1
        first_seen.setdefault(key, answer)
    if not votes:
        return ""
    winner = votes.most_common(1)[0][0]
    return first_seen[winner]


In [None]:
# Prompting strategies to evaluate and the priority attached to each one.
strategies = ["zero-shot", "few-shot", "cot", "sc-cot"]
priority_map = {"zero-shot": 0, "few-shot": 1, "cot": 2, "sc-cot": 3}

dataset = GSM8KDataset(data=[])

model_version = "meta/meta-llama-3-8b-instruct"

# Every question is answered once per strategy, so this cell issues one model
# call per single-sample strategy plus 7 calls for sc-cot, for each test item.
for item in tqdm(test_dataset, desc="Processing GSM8K Dataset"):
    question = item['question']
    correct_answer_text = item.get('answer')

    # Guard against a missing answer field so one bad record does not stop the run.
    correct_answer = extract_last_numeric_value(correct_answer_text or "")

    for strategy in strategies:
        prompt = get_prompt(strategy, question)

        if strategy == 'sc-cot':
            generated_answer = generate_self_consistent_answers(prompt, model_name=model_version, num_samples=7)
        else:
            response = generate_llm_response(prompt, model_name=model_version)
            # Prefer the requested "Final Numeric Answer:" format, but fall back
            # to the last number in the response. sc-cot already accepts a bare
            # number, so this keeps extraction comparable across strategies.
            generated_answer = extract_numeric(response) or extract_last_numeric_value(response)

        data_row = GSM8KDataRow(
            question=question,
            generated_answer=generated_answer or "",
            correct_answer=correct_answer or "",
            strategy=strategy,
            priority=priority_map[strategy]
        )
        dataset.data.append(data_row)

In [33]:
# Flatten the pydantic rows into plain dicts and persist them as CSV and JSON Lines.
# pydantic v2 renamed .dict() to .model_dump(); use whichever this version provides
# so the cell runs without deprecation warnings on v2 and still works on v1.
data_records = [
    data_row.model_dump() if hasattr(data_row, "model_dump") else data_row.dict()
    for data_row in dataset.data
]
df = pd.DataFrame(data_records)
df.to_csv("gsm8k_generated_dataset.csv", index=False)
df.to_json("gsm8k_generated_dataset.json", orient='records', lines=True)

In [None]:
# Accuracy per strategy: a row counts as correct when the generated and
# reference answers parse to the same float.
accuracy_per_strategy = {}
for strategy in strategies:
    strategy_df = df[df['strategy'] == strategy]
    total = len(strategy_df)
    correct = 0
    for _, row in strategy_df.iterrows():
        try:
            if float(row['generated_answer']) == float(row['correct_answer']):
                correct += 1
        except (TypeError, ValueError):
            # Empty or non-numeric answers are scored as incorrect. Only parse
            # failures are caught, so real errors and interrupts still surface.
            continue
    accuracy = correct / total if total > 0 else 0
    accuracy_per_strategy[strategy] = accuracy * 100
    print(f"Strategy: {strategy}, Accuracy: {accuracy * 100:.2f}%")

# NOTE(review): priorities in priority_map are all distinct, so the accuracy
# component of the sort key only breaks ties and never changes this ordering.
strategy_ranking = sorted(
    strategies,
    key=lambda x: (priority_map[x], -accuracy_per_strategy[x])
)

print("\nStrategy Ranking based on priority (lower is better) and performance:")
for rank, strategy in enumerate(strategy_ranking, 1):
    print(f"{rank}. Strategy: {strategy}, Priority: {priority_map[strategy]}, Accuracy: {accuracy_per_strategy[strategy]:.2f}%")