In [1]:
import pandas as pd
import json

In [2]:
# Only care about the core models for now
models = [
    "gpt-4o", "gpt-4.1", "gpt-4o-mini",
    "gemini/gemini-2.0-flash", "gemini/gemini-2.5-flash-preview-04-17", "gemini/gemma-3-27b-it",
    'Qwen/Qwen2.5-72B-Instruct', 'DeepSeek-Prover-V2-671B', 'DeepSeek-R1', 'DeepSeek-V3', 'meta-llama/Llama-4-Scout-17B-16E-Instruct'
]

# Step 1 - Questions
We start by looking at all of the questions we have 

In [3]:
questions = pd.read_json('questions.json')
len(questions)

17092

# Step 2 - Raw answers
Now, we look at all of the entries we've collected for all of the models

In [5]:
raw_ans = pd.read_excel('almost_full_rechecked_integrals.xlsx')

In [6]:
# raw_ans = raw_ans.drop('Unnamed: 0', axis=1)
# raw_ans = raw_ans[raw_ans.model.apply(lambda m: m in models)]
raw_ans['endpoint'] = raw_ans.model.astype(str) + '_' + raw_ans.code_execution.astype(str)
raw_ans.head()

Unnamed: 0.2,full_answer,tokens_used,final_answer_latex,question_id,model,true_answer,question_text,code_execution,error,Unnamed: 0,endpoint,non_empty,model_answer,model_answer_type,numeric_comparison,numeric_subs_error,Unnamed: 0.1
0,Let's analyze the integral step by step.\n\nFi...,758,-\pi \log(2),12823,gemini/gemini-2.0-flash,-pi*log(2),"Solve the following integral. Assume A,B,F are...",0.0,,57791.0,gemini/gemini-2.0-flash_0.0,1.0,-pi*log(2),deterministic,1.0,,
1,"Okay, I will solve the indefinite integral usi...",533,\frac{61\left(9 - 11\cos^{2}{\left(33x \right)...,15299,gemini/gemini-2.0-flash,61*(9*sec(33*x)**2 - 11)*sec(33*x)**9/3267,Evaluate the indefinite integral. \n\n$\int \...,1.0,,77824.0,gemini/gemini-2.0-flash_1.0,1.0,(549 - 671*cos(33*x)**2)/(3267*cos(33*x)**11),deterministic,0.0,,
2,To evaluate the iterated integral \(\int_0^1dy...,1645,\frac{59 (74 e^{75} + 1)}{5625},16009,Qwen/Qwen2.5-72B-Instruct,(-1 + (1653 + 122248*exp(75))*exp(78750))*exp(...,Evaluate the iterated integral $\int_0^1dy\int...,,,83875.0,Qwen/Qwen2.5-72B-Instruct_nan,1.0,59/5625 + 4366*exp(75)/5625,deterministic,1.0,,
3,Let the given integral be\n$$ I = \int \frac{ ...,2011,\frac{1}{\left(3 \left(\frac{\tan{\left(x \rig...,2152,gemini/gemma-3-27b-it,5*x**(4/5)/4 + 5*atan(x**(1/5)),"Solve the following integral. Assume A,B,F,G a...",,,131281.0,gemini/gemma-3-27b-it_nan,1.0,(3*(tan(x) + tan(x*(G - 1)))/((-tan(x)*tan(x*(...,deterministic,0.0,,
4,Let the integral be $I$. We have\n$$ I = \int ...,1185,\frac{1}{2 \left(3 + x^{3}\right)^{\frac{2}{3}}},2322,gemini/gemini-2.0-flash,5*x**(4/5)/4 + 5*atan(x**(1/5)),"Solve the following integral. Assume A,B,F,G a...",0.0,,133164.0,gemini/gemini-2.0-flash_0.0,1.0,1/(2*(x**3 + 3)**(2/3)),deterministic,0.0,,


In [7]:
stats = raw_ans.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_entries=('question_id','count'),
)
stats

Unnamed: 0_level_0,n_answers,n_entries
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1
DeepSeek-Prover-V2-671B_nan,14623,14624
DeepSeek-R1_nan,15675,15677
DeepSeek-V3_nan,16715,16718
Qwen/Qwen2.5-72B-Instruct_nan,16590,16591
gemini/gemini-2.0-flash_0.0,16847,16847
gemini/gemini-2.0-flash_1.0,16041,16044
gemini/gemini-2.5-flash-preview-04-17_0.0,16848,16850
gemini/gemini-2.5-flash-preview-04-17_1.0,16521,16522
gemini/gemma-3-27b-it_nan,16555,16555
gpt-4.1_0.0,16848,16852


In [30]:
# pretty darn good, but plenty of duplicates. We will the a good solution for each

# Step 3 - Non empty questions

In [8]:
raw_ans['non_empty'] = ~raw_ans.full_answer.isna()

In [9]:
stats = raw_ans.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_entries=('question_id','count'),
    n_non_empty=('non_empty','sum'),
)
stats

Unnamed: 0_level_0,n_answers,n_entries,n_non_empty
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DeepSeek-Prover-V2-671B_nan,14623,14624,14624
DeepSeek-R1_nan,15675,15677,15677
DeepSeek-V3_nan,16715,16718,16718
Qwen/Qwen2.5-72B-Instruct_nan,16590,16591,16591
gemini/gemini-2.0-flash_0.0,16847,16847,16847
gemini/gemini-2.0-flash_1.0,16041,16044,16044
gemini/gemini-2.5-flash-preview-04-17_0.0,16848,16850,16850
gemini/gemini-2.5-flash-preview-04-17_1.0,16521,16522,16522
gemini/gemma-3-27b-it_nan,16555,16555,16555
gpt-4.1_0.0,16848,16852,16852


In [10]:
valid_ans = raw_ans[raw_ans.non_empty].copy()
stats = valid_ans.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_entries=('question_id','count'),
    n_non_empty=('non_empty','sum'),
)
stats

Unnamed: 0_level_0,n_answers,n_entries,n_non_empty
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DeepSeek-Prover-V2-671B_nan,14623,14624,14624
DeepSeek-R1_nan,15675,15677,15677
DeepSeek-V3_nan,16715,16718,16718
Qwen/Qwen2.5-72B-Instruct_nan,16590,16591,16591
gemini/gemini-2.0-flash_0.0,16847,16847,16847
gemini/gemini-2.0-flash_1.0,16041,16044,16044
gemini/gemini-2.5-flash-preview-04-17_0.0,16848,16850,16850
gemini/gemini-2.5-flash-preview-04-17_1.0,16521,16522,16522
gemini/gemma-3-27b-it_nan,16555,16555,16555
gpt-4.1_0.0,16848,16852,16852


In [11]:
stats.reset_index().endpoint.values

array(['DeepSeek-Prover-V2-671B_nan', 'DeepSeek-R1_nan',
       'DeepSeek-V3_nan', 'Qwen/Qwen2.5-72B-Instruct_nan',
       'gemini/gemini-2.0-flash_0.0', 'gemini/gemini-2.0-flash_1.0',
       'gemini/gemini-2.5-flash-preview-04-17_0.0',
       'gemini/gemini-2.5-flash-preview-04-17_1.0',
       'gemini/gemma-3-27b-it_nan', 'gpt-4.1_0.0', 'gpt-4.1_1.0',
       'gpt-4o-mini_0.0', 'gpt-4o-mini_1.0', 'gpt-4o_0.0', 'gpt-4o_1.0',
       'meta-llama/Llama-4-Scout-17B-16E-Instruct_nan'], dtype=object)

# Step 3 - Latex extraction
Performing the calculation again here

In [45]:
from collect_llm_answers import extract_latex_answer
latex_answers = []
extract_success = []
for i, row in valid_ans.iterrows():
    if i % 10000 == 0:
        print(f'Processed {i}/{len(raw_ans)} rows')
    try:
        latex_answers.append(extract_latex_answer(row.full_answer))
        extract_success.append(True)
    except Exception as e:
        latex_answers.append(str(e))
        extract_success.append(False)

Processed 0/249271 rows
Processed 10000/249271 rows
Processed 20000/249271 rows
Processed 30000/249271 rows
Processed 40000/249271 rows
Processed 50000/249271 rows
Processed 60000/249271 rows
Processed 70000/249271 rows
Processed 80000/249271 rows
Processed 90000/249271 rows
Processed 100000/249271 rows
Processed 110000/249271 rows
Processed 130000/249271 rows
Processed 140000/249271 rows
Processed 150000/249271 rows
Processed 160000/249271 rows
Processed 170000/249271 rows
Processed 180000/249271 rows
Processed 190000/249271 rows
Processed 210000/249271 rows
Processed 220000/249271 rows
Processed 230000/249271 rows
Processed 240000/249271 rows
Processed 250000/249271 rows
Processed 260000/249271 rows


In [46]:
valid_ans['final_answer_latex'] = latex_answers
valid_ans['latex_extraction_success'] = extract_success

In [49]:
stats = valid_ans.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_entries=('question_id','count'),
    n_non_empty=('non_empty','sum'),
    n_good_latex=('latex_extraction_success','sum'),
)
stats['% valid ans'] = stats.n_good_latex / stats.n_non_empty
stats

Unnamed: 0_level_0,n_answers,n_entries,n_non_empty,n_good_latex,% valid ans
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DeepSeek-Prover-V2-671B_nan,14772,14772,14772,14353,0.971636
DeepSeek-R1_nan,15879,15879,15879,15872,0.999559
DeepSeek-V3_nan,16920,16920,16920,16917,0.999823
Qwen/Qwen2.5-72B-Instruct_nan,16760,16760,16760,16759,0.99994
gemini/gemini-2.0-flash_0.0,17050,17050,17050,17050,1.0
gemini/gemini-2.0-flash_1.0,16234,16234,16234,15882,0.978317
gemini/gemma-3-27b-it_nan,16738,16738,16738,16738,1.0
gpt-4.1_0.0,17081,17081,17081,17074,0.99959
gpt-4.1_1.0,17081,17081,17081,17081,1.0
gpt-4o-mini_0.0,17081,17081,17081,17081,1.0


In [65]:
stast.n_entries

NameError: name 'stast' is not defined

# Try the same for the answers and see whatsup

In [4]:
df = pd.read_excel('almost_full_rechecked_integrals.xlsx')
df

Unnamed: 0.2,full_answer,tokens_used,final_answer_latex,question_id,model,true_answer,question_text,code_execution,error,Unnamed: 0,endpoint,non_empty,model_answer,model_answer_type,numeric_comparison,numeric_subs_error,Unnamed: 0.1
0,Let's analyze the integral step by step.\n\nFi...,758,-\pi \log(2),12823,gemini/gemini-2.0-flash,-pi*log(2),"Solve the following integral. Assume A,B,F are...",0.0,,57791.0,gemini/gemini-2.0-flash_0.0,1.0,-pi*log(2),deterministic,1.0,,
1,"Okay, I will solve the indefinite integral usi...",533,\frac{61\left(9 - 11\cos^{2}{\left(33x \right)...,15299,gemini/gemini-2.0-flash,61*(9*sec(33*x)**2 - 11)*sec(33*x)**9/3267,Evaluate the indefinite integral. \n\n$\int \...,1.0,,77824.0,gemini/gemini-2.0-flash_1.0,1.0,(549 - 671*cos(33*x)**2)/(3267*cos(33*x)**11),deterministic,0.0,,
2,To evaluate the iterated integral \(\int_0^1dy...,1645,\frac{59 (74 e^{75} + 1)}{5625},16009,Qwen/Qwen2.5-72B-Instruct,(-1 + (1653 + 122248*exp(75))*exp(78750))*exp(...,Evaluate the iterated integral $\int_0^1dy\int...,,,83875.0,Qwen/Qwen2.5-72B-Instruct_nan,1.0,59/5625 + 4366*exp(75)/5625,deterministic,1.0,,
3,Let the given integral be\n$$ I = \int \frac{ ...,2011,\frac{1}{\left(3 \left(\frac{\tan{\left(x \rig...,2152,gemini/gemma-3-27b-it,5*x**(4/5)/4 + 5*atan(x**(1/5)),"Solve the following integral. Assume A,B,F,G a...",,,131281.0,gemini/gemma-3-27b-it_nan,1.0,(3*(tan(x) + tan(x*(G - 1)))/((-tan(x)*tan(x*(...,deterministic,0.0,,
4,Let the integral be $I$. We have\n$$ I = \int ...,1185,\frac{1}{2 \left(3 + x^{3}\right)^{\frac{2}{3}}},2322,gemini/gemini-2.0-flash,5*x**(4/5)/4 + 5*atan(x**(1/5)),"Solve the following integral. Assume A,B,F,G a...",0.0,,133164.0,gemini/gemini-2.0-flash_0.0,1.0,1/(2*(x**3 + 3)**(2/3)),deterministic,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264403,To compute the first 6 nonzero terms of the Ma...,1139,\frac{97\sqrt{2}}{2} + \frac{97\sqrt{2}}{2}x -...,582,DeepSeek-V3,(sqrt(2)*x**5/2 + 5*sqrt(2)*x**4/2 - 10*sqrt(2...,Compute the first 6 nonzero terms of the Macla...,,timeout,168255.0,DeepSeek-V3_nan,1.0,,,,,585.0
264404,"First, let's simplify the given differential e...",1794,2 e,4399,DeepSeek-Prover-V2-671B,4/5,Consider the differential equation \n$ \left(-...,,timeout,155332.0,DeepSeek-Prover-V2-671B_nan,1.0,,,,,591.0
264405,"<think>\nOkay, let's try to tackle this limit ...",4981,e^{\frac{1}{4}},940,DeepSeek-R1,e**(1/4),Evaluate \n$ \lim_{x \to 0^+} \left(\frac{\si...,,timeout,197648.0,DeepSeek-R1_nan,1.0,,,,,607.0
264406,The final answer is:\n$$\n29 x^2 - \frac{53 x^...,186,29 x^2 - \frac{53 x^{7 \cdot 99 + 1}}{7 \cdot ...,16078,gpt-4.1,x*(1 + 7*99)**(-1)*(x*(1 + 7*99)*29 - atan(3*8...,What is the integral of $ 2 \left(29\right) ...,1.0,timeout,,,,,,,,61.0


In [5]:
df.columns

Index(['full_answer', 'tokens_used', 'final_answer_latex', 'question_id',
       'model', 'true_answer', 'question_text', 'code_execution', 'error',
       'Unnamed: 0', 'endpoint', 'non_empty', 'model_answer',
       'model_answer_type', 'numeric_comparison', 'numeric_subs_error',
       'Unnamed: 0.1'],
      dtype='object')

In [6]:
clean_df = df.drop('Unnamed: 0', axis=1) # .drop('Unnamed: 0.1', axis=1).drop('Unnamed: 0.2', axis=1)
clean_df = clean_df[clean_df.model.apply(lambda m: m in models)]
clean_df['endpoint'] = clean_df.model.astype(str) + '_' + clean_df.code_execution.astype(str)
clean_df.head()

Unnamed: 0,full_answer,tokens_used,final_answer_latex,question_id,model,true_answer,question_text,code_execution,error,endpoint,non_empty,model_answer,model_answer_type,numeric_comparison,numeric_subs_error,Unnamed: 0.1
0,Let's analyze the integral step by step.\n\nFi...,758,-\pi \log(2),12823,gemini/gemini-2.0-flash,-pi*log(2),"Solve the following integral. Assume A,B,F are...",0.0,,gemini/gemini-2.0-flash_0.0,1.0,-pi*log(2),deterministic,1.0,,
1,"Okay, I will solve the indefinite integral usi...",533,\frac{61\left(9 - 11\cos^{2}{\left(33x \right)...,15299,gemini/gemini-2.0-flash,61*(9*sec(33*x)**2 - 11)*sec(33*x)**9/3267,Evaluate the indefinite integral. \n\n$\int \...,1.0,,gemini/gemini-2.0-flash_1.0,1.0,(549 - 671*cos(33*x)**2)/(3267*cos(33*x)**11),deterministic,0.0,,
2,To evaluate the iterated integral \(\int_0^1dy...,1645,\frac{59 (74 e^{75} + 1)}{5625},16009,Qwen/Qwen2.5-72B-Instruct,(-1 + (1653 + 122248*exp(75))*exp(78750))*exp(...,Evaluate the iterated integral $\int_0^1dy\int...,,,Qwen/Qwen2.5-72B-Instruct_nan,1.0,59/5625 + 4366*exp(75)/5625,deterministic,1.0,,
3,Let the given integral be\n$$ I = \int \frac{ ...,2011,\frac{1}{\left(3 \left(\frac{\tan{\left(x \rig...,2152,gemini/gemma-3-27b-it,5*x**(4/5)/4 + 5*atan(x**(1/5)),"Solve the following integral. Assume A,B,F,G a...",,,gemini/gemma-3-27b-it_nan,1.0,(3*(tan(x) + tan(x*(G - 1)))/((-tan(x)*tan(x*(...,deterministic,0.0,,
4,Let the integral be $I$. We have\n$$ I = \int ...,1185,\frac{1}{2 \left(3 + x^{3}\right)^{\frac{2}{3}}},2322,gemini/gemini-2.0-flash,5*x**(4/5)/4 + 5*atan(x**(1/5)),"Solve the following integral. Assume A,B,F,G a...",0.0,,gemini/gemini-2.0-flash_0.0,1.0,1/(2*(x**3 + 3)**(2/3)),deterministic,0.0,,


In [7]:
len(clean_df)

264408

In [8]:
clean_df['non_empty'] = ~clean_df.full_answer.isna()
clean_df['latex_extraction_success'] = ~clean_df.final_answer_latex.isna()
clean_df['sympy_extraction_success'] = ~clean_df.model_answer.isna()

In [9]:
stats = clean_df.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_entries=('question_id','count'),
    n_non_empty=('non_empty','sum'),
    n_good_latex=('latex_extraction_success','sum'),
    n_good_sympy=('sympy_extraction_success','sum'),
)
stats

Unnamed: 0_level_0,n_answers,n_entries,n_non_empty,n_good_latex,n_good_sympy
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DeepSeek-Prover-V2-671B_nan,14623,14624,14624,14176,13661
DeepSeek-R1_nan,15675,15677,15677,15657,15649
DeepSeek-V3_nan,16715,16718,16718,16590,16079
Qwen/Qwen2.5-72B-Instruct_nan,16590,16591,16591,16590,16169
gemini/gemini-2.0-flash_0.0,16847,16847,16847,16766,16722
gemini/gemini-2.0-flash_1.0,16041,16044,16044,15682,15319
gemini/gemini-2.5-flash-preview-04-17_0.0,16848,16850,16850,16822,16782
gemini/gemini-2.5-flash-preview-04-17_1.0,16521,16522,16522,14012,13954
gemini/gemma-3-27b-it_nan,16555,16555,16555,16526,16318
gpt-4.1_0.0,16848,16852,16852,16841,16462


In [15]:
succ_comp = clean_df[~clean_df.numeric_comparison.isna()]

In [16]:
stats = succ_comp.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_entries=('question_id','count'),
    n_non_empty=('non_empty','sum'),
    n_good_latex=('latex_extraction_success','sum'),
    n_good_sympy=('sympy_extraction_success','sum'),
)
stats

Unnamed: 0_level_0,n_answers,n_entries,n_non_empty,n_good_latex,n_good_sympy
endpoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DeepSeek-Prover-V2-671B_nan,12908,12908,12908,12884,12660
DeepSeek-R1_nan,14627,14627,14627,14616,14614
DeepSeek-V3_nan,15356,15356,15356,15241,15029
Qwen/Qwen2.5-72B-Instruct_nan,15375,15375,15375,15375,15066
gemini/gemini-2.0-flash_0.0,15668,15668,15668,15592,15564
gemini/gemini-2.0-flash_1.0,14320,14320,14320,14311,14104
gemini/gemini-2.5-flash-preview-04-17_0.0,15715,15715,15715,15703,15685
gemini/gemini-2.5-flash-preview-04-17_1.0,12919,12919,12919,12914,12887
gemini/gemma-3-27b-it_nan,15419,15419,15419,15392,15223
gpt-4.1_0.0,15577,15577,15577,15573,15292


In [17]:
stats = succ_comp.groupby('endpoint').agg(
    n_answers=('question_id','nunique'),
    n_correct=('numeric_comparison', 'sum')
    n_tot=('numeric_comparison', 'sum')
)