# Load in responses

In [3]:
import pandas as pd

# Read the saved GPT-4o mini responses from disk into a DataFrame
mini_responses_path = "./data/gpt4o_mini_responses.csv"
mini_responses_df = pd.read_csv(filepath_or_buffer=mini_responses_path)

# Preview the top rows of the loaded frame
mini_responses_df.head()

Unnamed: 0,source,id,question,ground_truth,numerical_answer,GPT-4o mini response,GPT-4o mini final answer,is_correct
0,gsm8k,gsm8k_6248,"James has five huskies, two pitbulls and four ...",There are 5+2 = <<5+2=7>>7 huskies and pitbull...,30,"To solve the problem, we will first determine ...",30,True
1,math,math_8134,"If $f(x)$ is a polynomial of degree 3, and $g(...",Let $f(x) = a_3 x^3 + a_2 x^2 + a_1 x + a_0$ a...,5,To determine the degree of the polynomial \(2f...,5,True
2,math,math_11453,"In a convex heptagon, the degree measures of t...",The sum of the angle measures in a polygon wit...,132,"To solve the problem, we first need to find th...",132.57,False
3,math,math_7539,Expand $(2x^5 + 3x^2)(x^4 - 4x^2 + 3x - 8)$.,"Using the distributive property, we have \begi...",2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2,To expand the expression \((2x^5 + 3x^2)(x^4 -...,2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2,True
4,math,math_9099,How many ways are there to put 4 balls in 3 bo...,Without regard to the distinguishability of th...,14,To solve the problem of how many ways there ar...,14,True


In [4]:
# Total of the `is_correct` column, i.e. how many rows are flagged correct
correct_count = mini_responses_df.loc[:, 'is_correct'].sum()
print(f"Number of correct responses: {correct_count}")

Number of correct responses: 62


# Use GPT-4o mini to compare responses

In [10]:
from langchain.chat_models import ChatOpenAI

# Initialize the GPT-4o mini model that judges whether two answers match.
# temperature=0 makes the judge's sampling as deterministic as the API allows,
# so re-running the notebook does not produce labels that drift between runs.
gpt4o_mini = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Function to generate a prompt and get GPT-4o mini's response
def check_answer_with_gpt4o(row):
    """Ask the module-level `gpt4o_mini` model whether a row's two answers match.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'numerical_answer' and 'GPT-4o mini final answer'.

    Returns
    -------
    str
        'True' or 'False' when the model's reply is one of those words, ignoring
        case, surrounding whitespace, quotes, backticks, asterisks and trailing
        punctuation. Any other reply is returned whitespace-stripped and
        otherwise unchanged so that unexpected output stays visible.
    """
    prompt = (
        "Are the following two answers the same?\n\n"
        f"Answer 1: {row['numerical_answer']}\n"
        f"Answer 2: {row['GPT-4o mini final answer']}\n\n"
        "Please respond with 'True' if both answers are the same, or 'False' if they are different."
    )
    response = gpt4o_mini.call_as_llm(prompt)
    cleaned = response.strip()

    # Callers compare the result with the exact string 'True', so replies such as
    # "True.", "'True'" or "true" must be canonicalised or they count as a mismatch.
    verdict = cleaned.strip(" \t\r\n'\"`*.!").lower()
    if verdict == "true":
        return "True"
    if verdict == "false":
        return "False"
    return cleaned

# Run the judge row by row, then attach its verdicts as the 'gpt4o_check' column
gpt4o_verdicts = mini_responses_df.apply(check_answer_with_gpt4o, axis=1)
mini_responses_df['gpt4o_check'] = gpt4o_verdicts

# Preview the frame with the new column
mini_responses_df.head()

Unnamed: 0,source,id,question,ground_truth,numerical_answer,GPT-4o mini response,GPT-4o mini final answer,is_correct,gpt4o_check
0,gsm8k,gsm8k_6248,"James has five huskies, two pitbulls and four ...",There are 5+2 = <<5+2=7>>7 huskies and pitbull...,30,"To solve the problem, we will first determine ...",30,True,True
1,math,math_8134,"If $f(x)$ is a polynomial of degree 3, and $g(...",Let $f(x) = a_3 x^3 + a_2 x^2 + a_1 x + a_0$ a...,5,To determine the degree of the polynomial \(2f...,5,True,True
2,math,math_11453,"In a convex heptagon, the degree measures of t...",The sum of the angle measures in a polygon wit...,132,"To solve the problem, we first need to find th...",132.57,False,False
3,math,math_7539,Expand $(2x^5 + 3x^2)(x^4 - 4x^2 + 3x - 8)$.,"Using the distributive property, we have \begi...",2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2,To expand the expression \((2x^5 + 3x^2)(x^4 -...,2x^9 - 8x^7 + 9x^6 - 16x^5 - 12x^4 + 9x^3 - 24x^2,True,True
4,math,math_9099,How many ways are there to put 4 balls in 3 bo...,Without regard to the distinguishability of th...,14,To solve the problem of how many ways there ar...,14,True,True


# See what GPT-4o mini labeled differently

In [31]:
# Boolean mask of rows whose judge verdict is exactly the string 'True'
gpt4o_says_same = mini_responses_df['gpt4o_check'] == 'True'

# How many rows the GPT-4o mini judge considers correct
correct_count_gpt4o = gpt4o_says_same.sum()
print("Number of correct responses according to GPT-4o mini:", correct_count_gpt4o)

# Rows where `is_correct` and the judge's verdict disagree, in either direction
disagreement_mask = mini_responses_df['is_correct'] != gpt4o_says_same
different_rows = mini_responses_df[disagreement_mask]
print("Number of overlooked rows GPT-4o mini found:", len(different_rows))

comparison_columns = ['numerical_answer', 'GPT-4o mini final answer', 'is_correct', 'gpt4o_check']
print(different_rows[comparison_columns])

Number of correct responses according to GPT-4o mini: 76
Number of overlooked rows GPT-4o mini found: 14
                                     numerical_answer  \
8                                           \sqrt{39}   
9                                        \frac{5}{18}   
14                                             (-2,1)   
18                                        50\sqrt{10}   
25                                                 13   
26                                        \frac{1}{4}   
33  \left( -\infty, -\frac{1}{4} \right) \cup (2,\...   
60                                        -\frac{15}4   
74                                           \sqrt{3}   
76                                           \sqrt{2}   
77                                           (15,-11)   
91                                 \frac{\sqrt{3}}{3}   
97                                        \frac{1}{3}   
99                                        \frac1{512}   

                             GPT-4o min