# Intialize environment

In [1]:
"""
Notebook: 01_generate_and_evaluate.ipynb
Project: LLM Reasoning Diagnostics (Math Edition)
Author: Scott Fortune

Purpose:
- Load benchmark math datasets (MATH & GSM8K)
- Generate model outputs using GPT-style LLMs (e.g., GPT-4, Claude)
- Prepare raw problem-output pairs for diagnostic error classification

Dependencies:
- datasets (Hugging Face)
- openai (for GPT-4 API access)
- pandas (for output formatting and analysis)
- langchain (for LLM integration)

Next steps:
- Filter/normalize problems into unified format
- Prompt LLMs with CoT reasoning prompts
- Save model completions for further analysis
"""

# Install dependencies if needed
try:
    import pandas as pd
    from datasets import load_dataset
    from dotenv import load_dotenv
except ImportError:
    %pip install pandas datasets dotenv --quiet
    import pandas as pd
    from datasets import load_dataset

# Install LangChain and langchain_community if needed
try:
    from langchain.llms import OpenAI
except ImportError:
    %pip install langchain langchain_community openai--quiet
    from langchain.llms import OpenAI

# Setup OpenAI API (configure via env var or notebook cell if running interactively)
import os
import sys
from dotenv import load_dotenv

load_dotenv()

if "OPENAI_API_KEY" in os.environ:
    api_key = os.environ["OPENAI_API_KEY"]
else:
    print("Add OPENAI_API_KEY")  # Replace with your key or use `!export` in shell
    sys.exit(1)

print("✅ Project environment initialized. Ready to load MATH and GSM8K datasets.")


✅ Project environment initialized. Ready to load MATH and GSM8K datasets.


# Load GSM8K and MATH datasets

In [2]:
from datasets import load_dataset

# Load GSM8K dataset (Grade School Math Word Problems)
gsm8k = load_dataset("gsm8k", "main")  # subsets: 'train', 'test'

# Load MATH dataset (High School Competition-Level Problems)
# Note: Requires `datasets` >=2.9.0 to avoid split issues
math = load_dataset("qwedsacf/competition_math")  # subsets: 'train', 'test'

# Preview a few examples from each
print(f"GSM8K train samples: {len(gsm8k['train'])}")
print("GSM8K example:")
print(gsm8k['train'][0])

print(f"\nMATH train samples: {len(math['train'])}")
print("MATH example:")
print(math['train'][0])


GSM8K train samples: 7473
GSM8K example:
{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

MATH train samples: 12500
MATH example:
{'problem': 'Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \\Rightarrow a=-3$. Similarly, $x-5$

# Normalize GSM8K and MATH to unified format

In [3]:
import pandas as pd

# Normalize GSM8K (uses 'question' and 'answer' fields)
gsm8k_data = [
    {
        "source": "gsm8k",
        "id": f"gsm8k_{i}",
        "question": item["question"],
        "ground_truth": item["answer"],
    }
    for i, item in enumerate(gsm8k["train"])
]

# Normalize MATH (uses 'problem' and 'solution' fields)
math_data = [
    {
        "source": "math",
        "id": f"math_{i}",
        "question": item["problem"],
        "ground_truth": item["solution"],
    }
    for i, item in enumerate(math["train"])
]

# Combine and convert to DataFrame
combined_data = gsm8k_data + math_data
df = pd.DataFrame(combined_data)

print(f"✅ Combined dataset with {len(df)} samples")
df.sample(3)


✅ Combined dataset with 19973 samples


Unnamed: 0,source,id,question,ground_truth
5455,gsm8k,gsm8k_5455,"Hank gave his wife, Delphine, a box of 24 choc...",Twice as many chocolates as she ate the first ...
7095,gsm8k,gsm8k_7095,There are 50 oysters on the rocks at La Push P...,The first day he saw 50 oysters + 72 crabs = <...
1530,gsm8k,gsm8k_1530,Miles is going to spend 1/6 of a day reading. ...,Miles will read for 4 hours because 24 x (1/6)...


# Use only 100 samples

In [4]:
df = df.sample(n=100, random_state=42).reset_index(drop=True)
print(f"✅ New dataframe created with {len(df)} rows.")
pd.set_option('display.max_colwidth', None)
df.sample(1)

✅ New dataframe created with 100 rows.


Unnamed: 0,source,id,question,ground_truth
19,math,math_11374,"Sarah bought two t-shirts and one sweatshirt. The t-shirts cost $\$15.22$ each. If Sarah spent a total of $\$67.94,$ how many dollars did the sweatshirt cost? Express your answer as a decimal to the nearest hundredth.","The amount she spent on t-shirts is $$15.22 +15.22 = (15+15)+ (0.22+0.22) = 30 + 0.44 = 30.44$$dollars.\n\nTherefore, Sarah must have spent the remaining $67.94 - 30.44$ dollars on sweatshirts. We can organize the subtraction concisely using columns as follows: \[\n\begin{array}{@{}c@{}c@{}c@{}c@{}c}\n& 6 & 7. & 9 & 4 \\\n- & 3 & 0. & 4 & 4\n\\ \cline{1-5}\n& 3 & 7. & 5 & 0 \\\n\end{array}\n\]Our answer is $\boxed{37.50}$."


# Add Final Answer Column

In [5]:
import re

# Function to extract the numerical answer
def extract_numerical_answer(answer, source):
    if source == "gsm8k":
        # Extract the last number after '####' for gsm8k
        match = re.search(r"####\s*([\d.]+)", answer)
        return match.group(1) if match else None
    elif source == "math":
        # Extract the value inside \boxed{} for math, accounting for nested brackets
        match = re.search(r"\\boxed\{((?:[^\{\}]|\{(?:[^\{\}]|\{[^\{\}]*\})*\})*)\}", answer)
        return match.group(1) if match else None


# Apply the function to create a new column
df["numerical_answer"] = df.apply(lambda row: extract_numerical_answer(row["ground_truth"], row["source"]), axis=1)

print("✅ Added 'numerical_answer' column.")
df.sample(3)



✅ Added 'numerical_answer' column.


Unnamed: 0,source,id,question,ground_truth,numerical_answer
57,math,math_552,A lattice point is a point whose coordinates are both integers. How many lattice points are on the boundary or inside the region bounded by $y=|x|$ and $y=-x^2+6$?,"The graph of the two equations is shown below:\n\n[asy]\nLabel f;\n\nf.p=fontsize(4);\n\nxaxis(-3,3,Ticks(f, 2.0));\n\nyaxis(-1,7,Ticks(f, 2.0));\n\nreal f(real x)\n\n{\n\nreturn abs(x);\n\n}\n\ndraw(graph(f,-3,3), linewidth(1));\nreal g(real x)\n\n{\n\nreturn -x^2+6;\n\n}\n\ndraw(graph(g,-2.5,2.5), linewidth(1));\n[/asy]\n\nWe first find the $x$ values at which the two equations intersect. When $x\ge 0$, $y=|x|=x$. Plugging this into the second equation to eliminate $y$, we get $x=-x^2+6\Rightarrow x^2+x-6=0$. Factoring the left hand side gives $(x+3)(x-2)=0$, so $x=2$ (since we stated the $x$ was non-negative). By symmetry, the $x$ value of the left intersection is $x=-2$. So we just have to consider the integer $x$ values between these two bounds and find all integer $y$ values that make the point $(x,y)$ fall inside the region.\n\nFor $x=-2$, there is 1 point that works: $(-2,2)$. For $x=-1$, the value of $y=|x|$ is $y=1$ and the value of $y=-x^2+6$ is $y=5$, so all $y$ values between 1 and 5 inclusive work, for a total of 5 points. For $x=0$, the value of $y=|x|$ is $y=0$ and the value of $y=-x^2+6$ is $y=6$, so all $y$ values between 0 and 6 inclusive work, for a total of 7 points. By symmetry, when $x=1$, there are 5 points that work, and when $x=2$, there is 1 point that works.\n\nIn total, there are $1+5+7+5+1=\boxed{19}$ lattice points in the region or on the boundary.",19
6,math,math_1599,"A point $(3\sqrt{5},d+3)$ is $3d$ units away from the origin. What is the smallest possible value of $d$?","By the distance formula, the distance between the origin and $(3\sqrt{5},d+3)$ is $\sqrt{(3\sqrt{5})^2+(d+3)^2}$. Setting this equal to $3d$, we have \begin{align*}\n9d^2&=(3\sqrt{5})^2+(d+3)^2\\\n9d^2&=45+d^2+6d+9\\\n8d^2-6d-54&=0\\\n4d^2-3d-27&=0\\\n(4d+9)(d-3)&=0\n\end{align*}Thus, the values of $d$ are $-\frac{9}{4}$ and $3$. We find that $-\frac{9}{4}$ is an extraneous answer (since distance cannot be negative), so our answer is $d=\boxed{3}$.",3
76,math,math_968,"If $x^2+y^2=1$, what is the largest possible value of $|x|+|y|$?","If $(x,y)$ lies on the circle, so does $(x,-y),$ $(-x,-y),$ and $(-x,-y),$ (which all give the same value of $|x| + |y|$), so we can assume that $x \ge 0$ and $y \ge 0.$\n\nThen $|x| + |y| = x + y.$ Squaring, we get\n\[(x + y)^2 = x^2 + 2xy + y^2 = 1 + 2xy.\]Note that $(x - y)^2 \ge 0.$ Expanding, we get $x^2 - 2xy + y^2 \ge 0,$ so $2xy \le x^2 + y^2 = 1.$ Hence,\n\[1 + 2xy \le 2,\]which means $x + y \le \sqrt{2}.$ Equality occurs when $x = y = \frac{1}{\sqrt{2}},$ so the maximum value of $|x| + |y|$ is $\boxed{\sqrt{2}}.$",\sqrt{2}


# Prompt GPT-4o mini

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

# Initialize GPT-4 chat model
llm = ChatOpenAI(
    model="gpt-4o-mini", 
    temperature=0.0,
    max_tokens=1200,
    openai_api_key=api_key)

print(f"Model: {llm.model_name}")

Model: gpt-4o-mini


In [12]:
# Define a math question
math_question = df.sample(1)["question"].values[0]
print(f"Sample Math Question:\n{math_question}\n\n")

# Create a prompt for GPT-4o mini
math_prompt = f"You are a helpful mathematical assistant. Solve the following problem step by step, cleary explaining \
    explaining your reasoning. At the end, provide the final numerical answer on a new line, following the format: \
    \n\n #### <answer>\n\n Question: {math_question} \n\n"

# Generate a response using GPT-4o mini
math_response = llm([HumanMessage(content=math_prompt)])

# Extract and print the response
print(f"GPT-4 Response:\n{math_response.content}\n\n")
print("Ground Truth:\n", df.sample(1)["ground_truth"].values[0])

Sample Math Question:
The Greek army contained two types of soldiers: the upper class and the lower class soldiers. If there were a total of 5 upper class soldiers, and 10 lower class soldiers in a certain part of Athens, and the battle of Thermopylae demands a force of 4 upper class soldiers and 8 lower class soldiers, how many different battalions can be sent?


GPT-4 Response:
To solve the problem, we need to determine how many different combinations of soldiers can be selected to form a battalion that meets the specified requirements of 4 upper class soldiers and 8 lower class soldiers.

1. **Identify the total number of soldiers available**:
   - Upper class soldiers: 5
   - Lower class soldiers: 10

2. **Determine the required number of soldiers for the battalion**:
   - Required upper class soldiers: 4
   - Required lower class soldiers: 8

3. **Check if the requirements can be met**:
   - We have 5 upper class soldiers and need to choose 4. This is possible since 4 ≤ 5.
   - We

In [14]:
# Generate GPT-4o mini responses for each question in the dataframe
from tqdm import tqdm

# Initialize a list to store GPT-4o mini responses
gpt4o_responses = []
gpt4o_final_answers = []

# Iterate over each row in the dataframe
for _, row in tqdm(df.iterrows(), total=len(df)):
    question = row['question']
    prompt = f"You are a helpful mathematical assistant. Solve the following problem step by step, \
        clearly explaining your reasoning. At the end, provide the final numerical answer on a new line, \
        following the format: \n\n #### <answer>\n\n Question: {question} \n\n"
    response = llm([HumanMessage(content=prompt)])
    gpt4o_responses.append(response.content)
    gpt4o_final_answer = response.content.split("####")[-1].strip()  # Extract the final answer
    gpt4o_final_answers.append(gpt4o_final_answer)

# Add the responses as a new column in the dataframe
df['GPT-4o mini response'] = gpt4o_responses
    
# Add the final answers to the dataframe
df['GPT-4o mini final answer'] = gpt4o_final_answers

print("✅ GPT-4o mini responses added to the dataframe.")
df.sample(3)

100%|██████████| 100/100 [18:23<00:00, 11.03s/it]

✅ GPT-4o mini responses added to the dataframe.





Unnamed: 0,source,id,question,ground_truth,numerical_answer,GPT-4o mini response,GPT-4o mini final answer
23,gsm8k,gsm8k_3222,"Nicole collected 400 Pokemon cards. Cindy collected twice as many, and Rex collected half of Nicole and Cindy's combined total. If Rex divided his card equally among himself and his three younger siblings, how many cards does Rex have left?",Cindy has 400 x 2 = <<400*2=800>>800 cards.\nNicole and Cindy have 400 + 800 = <<400+800=1200>>1200 cards.\nRex has 1200/2 = <<1200/2=600>>600 cards.\nRex is left with 600/(3+1=4) = <<600/4=150>>150 cards\n#### 150,150,"To solve the problem, we will break it down step by step.\n\n1. **Determine how many cards Cindy collected:**\n - Nicole collected 400 Pokemon cards.\n - Cindy collected twice as many as Nicole.\n - Therefore, Cindy's total is:\n \[\n \text{Cindy's cards} = 2 \times \text{Nicole's cards} = 2 \times 400 = 800\n \]\n\n2. **Calculate the combined total of Nicole and Cindy's cards:**\n - We add the number of cards Nicole and Cindy collected:\n \[\n \text{Combined total} = \text{Nicole's cards} + \text{Cindy's cards} = 400 + 800 = 1200\n \]\n\n3. **Determine how many cards Rex collected:**\n - Rex collected half of the combined total of Nicole and Cindy's cards.\n - Therefore, Rex's total is:\n \[\n \text{Rex's cards} = \frac{1}{2} \times \text{Combined total} = \frac{1}{2} \times 1200 = 600\n \]\n\n4. **Calculate how many cards Rex has after dividing them among himself and his three siblings:**\n - Rex has 3 younger siblings, which means there are 4 people in total (Rex + 3 siblings).\n - If Rex divides his 600 cards equally among the 4 of them, each person gets:\n \[\n \text{Cards per person} = \frac{\text{Rex's cards}}{4} = \frac{600}{4} = 150\n \]\n\n5. **Determine how many cards Rex has left after the division:**\n - Since Rex has 150 cards after the division, he does not have any cards left for himself. Therefore, the number of cards Rex has left is:\n \[\n \text{Rex's remaining cards} = 0\n \]\n\nThus, the final answer is:\n\n#### 0",0
10,gsm8k,gsm8k_2748,"The price of a book was $400. If the book's price is decreased by 15% and then increased by 40%, what is the final price of the book?","When the price of the book is decreased by 15%, the total price decrease is 15/100*$400=$<<15/100*400=60>>60\nThe new price of the book becomes $400-$60 = $<<400-60=340>>340\nThe price of the book is then increased by 40/100*340 = $<<40/100*340=136>>136\nThe final price of the book is $340+$136 = $<<340+136=476>>476\n#### 476",476,"To solve the problem, we will follow these steps:\n\n1. **Calculate the decrease in price**: The book's original price is $400. We need to find 15% of this price to determine how much the price decreases.\n\n \[\n \text{Decrease} = 15\% \text{ of } 400 = 0.15 \times 400 = 60\n \]\n\n Now, we subtract this decrease from the original price:\n\n \[\n \text{New Price after Decrease} = 400 - 60 = 340\n \]\n\n2. **Calculate the increase in price**: Next, we need to find 40% of the new price ($340) to determine how much the price increases.\n\n \[\n \text{Increase} = 40\% \text{ of } 340 = 0.40 \times 340 = 136\n \]\n\n Now, we add this increase to the new price:\n\n \[\n \text{Final Price} = 340 + 136 = 476\n \]\n\nThus, the final price of the book after the decrease and increase is $476.\n\nThe final answer is:\n\n#### 476",476
3,math,math_7539,Expand $(2x^5 + 3x^2)(x^4 - 4x^2 + 3x - 8)$.,"Using the distributive property, we have \begin{align*}\n&(2x^5 + 3x^2)(x^4 - 4x^2 + 3x - 8) \\\n&\qquad= 2x^5(x^4 - 4x^2 + 3x - 8) + 3x^2(x^4 - 4x^2 + 3x - 8) \\\n&\qquad= 2x^9 - 8x^7 + 6x^6 - 16x^5 + 3x^6 - 12x^4 + 9x^3 - 24x^2 \\\n&\qquad= \boxed{2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2}.\n\end{align*}",2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2,"To expand the expression \((2x^5 + 3x^2)(x^4 - 4x^2 + 3x - 8)\), we will use the distributive property, also known as the FOIL method for binomials, but in this case, we will distribute each term in the first polynomial to each term in the second polynomial.\n\n1. **Distributing \(2x^5\)**:\n - \(2x^5 \cdot x^4 = 2x^{5+4} = 2x^9\)\n - \(2x^5 \cdot (-4x^2) = -8x^{5+2} = -8x^7\)\n - \(2x^5 \cdot 3x = 6x^{5+1} = 6x^6\)\n - \(2x^5 \cdot (-8) = -16x^5\)\n\n So, the contributions from \(2x^5\) are:\n \[\n 2x^9 - 8x^7 + 6x^6 - 16x^5\n \]\n\n2. **Distributing \(3x^2\)**:\n - \(3x^2 \cdot x^4 = 3x^{2+4} = 3x^6\)\n - \(3x^2 \cdot (-4x^2) = -12x^{2+2} = -12x^4\)\n - \(3x^2 \cdot 3x = 9x^{2+1} = 9x^3\)\n - \(3x^2 \cdot (-8) = -24x^2\)\n\n So, the contributions from \(3x^2\) are:\n \[\n 3x^6 - 12x^4 + 9x^3 - 24x^2\n \]\n\n3. **Combining all contributions**:\n Now we combine all the terms we obtained from both distributions:\n \[\n 2x^9 + (-8x^7) + (6x^6 + 3x^6) + (-16x^5) + (-12x^4) + 9x^3 - 24x^2\n \]\n\n This simplifies to:\n \[\n 2x^9 - 8x^7 + (6x^6 + 3x^6) - 16x^5 - 12x^4 + 9x^3 - 24x^2\n \]\n \[\n = 2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2\n \]\n\n4. **Final expression**:\n The final expanded expression is:\n \[\n 2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2\n \]\n\nThus, the final answer is:\n\n#### 2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2",2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2


# Check if answers are correct

In [15]:
# Check if GPT-4o mini final answer matches the ground truth numerical answer
df['is_correct'] = df['GPT-4o mini final answer'] == df['numerical_answer']

# Calculate the total number of matches
total_matches = df['is_correct'].sum()

print(f"Total matches: {total_matches} out of {len(df)}")

Total matches: 62 out of 100


# Save Dataframe with GPT-4o mini resonses

In [16]:
import os

# Create the 'data' folder if it doesn't exist
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Save the dataframe to a CSV file in the 'data' folder
output_file = os.path.join(output_dir, "gpt4o_mini_responses.csv")
df.to_csv(output_file, index=False)

print(f"✅ Dataframe saved to {output_file}")

✅ Dataframe saved to data\gpt4o_mini_responses.csv
