In [1]:
# Standard imports
import json
from tqdm import tqdm
import sys
import pandas as pd
import random
import os
from joblib import Parallel, delayed
import importlib
from functools import partial
import csv
import dotenv
dotenv.load_dotenv()

True

In [2]:
# Method imports
sys.path.append('..')

from methods.mathprompterpp import mathprompterpp
from methods.mathprompter import mathprompter
from methods.zeroshotCoT import zeroshotCoT
from utils.method_utils import generating_algebraic_template
from utils.plot_utils import compute_detailed_metrics, plot_individual_metrics
from llm_inference.llm_factory import get_llm_service

import methods.mathprompterpp
import methods.mathprompter
import methods.zeroshotCoT
import utils.method_utils
import llm_inference.llm_factory

importlib.reload(methods.mathprompterpp)
importlib.reload(methods.zeroshotCoT)   
importlib.reload(utils.method_utils)
importlib.reload(llm_inference.llm_factory)


  from .autonotebook import tqdm as notebook_tqdm


<module 'llm_inference.llm_factory' from '/Users/s/Desktop/Academics/Quarter 4/CSE256/Project/FinalCode/llm_inference/llm_factory.py'>

In [3]:
# Initialize parameters
suffix = str(random.randint(1, 1000000))
random.seed(42)
model_service = 'openai'
model_name = 'gpt-4o-mini'
data_size = 85
max_tokens = 200
few_shot = True
consensus_k = 5

In [4]:
# Initialize model and data
SERVICE = get_llm_service(model_service, model_name, temperature=0.0, max_tokens=max_tokens)

In [5]:
# Define the methods you want to evaluate
run_methods = ["MathPrompter", "MathPrompter++", "Zeroshot CoT"]  # Example methods

In [6]:
with open('data/domain.json', 'r') as file:
    data = json.load(file)
data = random.sample(data, k=data_size)  # Take a random sample of 100 rows

suffix += "_domain_" + model_name.split("/")[-1] + "_" + str(data_size) 
suffix += "_few_shot" if few_shot else ""
print(suffix)

# Create folder if not exists
if not os.path.exists('results/' + suffix):
    os.makedirs('results/' + suffix)

484748_domain_gpt-4o-mini_85_few_shot


In [7]:
if "MathPrompter" in run_methods:
    # MathPrompter
    header_mathprompter = ["Question", "Type", "Correct Answer", "Predicted Answer", "Predicted Algebraic Answer", "Predicted Python Answer", "Generated Algebraic Template", "Generated Expression", "Generated Python Code", "Variable Mapping", "Error", "Error Message"]
    df_mathprompter = pd.DataFrame(columns=header_mathprompter)
    csv_file_mathprompter = 'results/' + suffix + '/mathprompter_results.csv'  # CSV file path

if "MathPrompter++" in run_methods:
    # MathPrompter++
    header_mathprompterpp = ["Question", "Type", "Correct Answer", "Predicted Answer", "Predicted Algebraic Answer", "Predicted Python Answer", "Generated Algebraic Template", "Rough Solution", "Generated Expression", "Generated Python Code", "Variable Mapping", "Error", "Error Message"]
    df_mathprompterpp = pd.DataFrame(columns=header_mathprompterpp)
    csv_file_mathprompterpp = 'results/' + suffix + '/mathprompterpp_results.csv'  # CSV file path

if "Zeroshot CoT" in run_methods:
    # Zeroshot CoT
    header_zeroshot_cot = ["Question", "Type", "Correct Answer", "Predicted Answer", "Response", "Error", "Error Message"]
    df_zeroshot_cot = pd.DataFrame(columns=header_zeroshot_cot)
    csv_file_zeroshot_cot = 'results/' + suffix + '/zeroshot_cot_results.csv'  # CSV file path

# Example of saving the DataFrames to CSV files
if "MathPrompter" in run_methods:
    df_mathprompter.to_csv(csv_file_mathprompter, index=False)

if "MathPrompter++" in run_methods:
    df_mathprompterpp.to_csv(csv_file_mathprompterpp, index=False)

if "Zeroshot CoT" in run_methods:
    df_zeroshot_cot.to_csv(csv_file_zeroshot_cot, index=False)


In [8]:
def reset_csv_files():
    if "MathPrompter" in run_methods:
        with open(csv_file_mathprompter, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header_mathprompter)
    if "MathPrompter++" in run_methods:
        with open(csv_file_mathprompterpp, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header_mathprompterpp)
    if "Zeroshot CoT" in run_methods:
        with open(csv_file_zeroshot_cot, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(header_zeroshot_cot)


In [9]:
# Parallelize across different data rows using joblib
def process_question(item, debug = False):
    question = item["Body"] + " " + item["Question"]
    correct_answer = item["Answer"]
    question_type = item["Type"]
    qt, variable_mapping = generating_algebraic_template(question) 
    if "MathPrompter" in run_methods:   
        mathprompter_answer = mathprompter(qt, variable_mapping, SERVICE, k = consensus_k, few_shot=few_shot)

        # Create a dictionary for the new row
        new_row = {
            "Question": question,
            "Type": question_type,
            "Correct Answer": correct_answer,
            "Predicted Answer": mathprompter_answer["Result"],
            "Predicted Algebraic Answer": mathprompter_answer["Expression_Result"],
            "Predicted Python Answer": mathprompter_answer["Code_Result"],
            "Generated Algebraic Template": qt,
            "Generated Expression": mathprompter_answer["Expression"],
            "Generated Python Code": mathprompter_answer["Code"],
            "Variable Mapping": variable_mapping,
            "Error": mathprompter_answer["Error"],
            "Error Message": mathprompter_answer["Error_Message"]
        }
        if debug:
            print("MathPrompter")
            print(mathprompter_answer)
            print(new_row)
        # Append the new row to the DataFrame using pd.concat
        # df_mathprompter = pd.concat([df_mathprompter, pd.DataFrame([new_row])], ignore_index=True)
        # Write to csv
        with open(csv_file_mathprompter, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(new_row.values())
    if "MathPrompter++" in run_methods:
        mathprompterpp_answer = mathprompterpp(question, qt, variable_mapping, SERVICE, k = consensus_k, few_shot=few_shot)
        
        # Create a dictionary for the new row
        new_row = {
            "Question": question,
            "Type": question_type,
            "Correct Answer": correct_answer,
            "Predicted Answer": mathprompterpp_answer["Result"],
            "Predicted Algebraic Answer": mathprompterpp_answer["Expression_Result"],
            "Predicted Python Answer": mathprompterpp_answer["Code_Result"],
            "Generated Algebraic Template": qt,
            "Rough Solution": mathprompterpp_answer["Rough_Solution"],
            "Generated Expression": mathprompterpp_answer["Expression"],
            "Generated Python Code": mathprompterpp_answer["Code"],
            "Variable Mapping": variable_mapping,
            "Error": mathprompterpp_answer["Error"],
            "Error Message": mathprompterpp_answer["Error_Message"]
        }
        if debug:
            print("MathPrompter++")
            print(mathprompterpp_answer)
            print(new_row)
        # Append the new row to the DataFrame using pd.concat
        # df_mathprompterpp = pd.concat([df_mathprompterpp, pd.DataFrame([new_row])], ignore_index=True)
        # Write to csv
        with open(csv_file_mathprompterpp, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(new_row.values())
    if "Zeroshot CoT" in run_methods:
        zeroshot_cot_answer = zeroshotCoT(question, SERVICE, max_tokens)    

        # Create a dictionary for the new row
        new_row = {
            "Question": question,
            "Type": question_type,
            "Correct Answer": correct_answer,
            "Predicted Answer": zeroshot_cot_answer["Answer"],
            "Response": zeroshot_cot_answer["Response"],
            "Error": zeroshot_cot_answer["Error"],
            "Error Message": zeroshot_cot_answer["Error_Message"]
        }
        if debug:
            print("Zeroshot CoT")
            print(zeroshot_cot_answer)
            print(new_row)
        # Append the new row to the DataFrame using pd.concat
        # df_zeroshot_cot = pd.concat([df_zeroshot_cot, pd.DataFrame([new_row])], ignore_index=True)
        # Write to csv
        with open(csv_file_zeroshot_cot, 'a') as f:
            writer = csv.writer(f)
            writer.writerow(new_row.values())

In [10]:
# Test 
reset_csv_files()
# if not few_shot:
run_methods = ["MathPrompter", "MathPrompter++"]
for i in tqdm(range(85)):
    # print(data[i])
    process_question(data[i], debug=False)

 11%|█         | 9/85 [00:37<05:09,  4.07s/it]

An error occurred during the verification process: 'Code_Result'


 15%|█▌        | 13/85 [00:57<05:44,  4.79s/it]

An error occurred during the verification process: int too large to convert to float


 22%|██▏       | 19/85 [01:24<05:13,  4.75s/it]

An error occurred during the verification process: 'Code_Result'


 25%|██▍       | 21/85 [02:16<14:39, 13.74s/it]

An error occurred during the verification process: float() argument must be a string or a real number, not 'tuple'


 26%|██▌       | 22/85 [02:21<11:38, 11.09s/it]

An error occurred during the verification process: float() argument must be a string or a real number, not 'tuple'


 27%|██▋       | 23/85 [02:28<10:04,  9.74s/it]

An error occurred during the verification process: 'Code_Result'


 31%|███       | 26/85 [03:56<28:31, 29.00s/it]

Runtime error during expression evaluation: float division by zero
An error occurred during the verification process: 'Code_Result'
Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 32%|███▏      | 27/85 [04:01<21:03, 21.78s/it]

Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 34%|███▍      | 29/85 [04:11<12:25, 13.32s/it]

An error occurred during the verification process: 'Code_Result'


 47%|████▋     | 40/85 [05:06<03:32,  4.72s/it]

An error occurred during the verification process: 'Code_Result'


 51%|█████     | 43/85 [05:29<04:57,  7.09s/it]

Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 52%|█████▏    | 44/85 [05:33<04:10,  6.11s/it]

Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 62%|██████▏   | 53/85 [06:10<02:17,  4.29s/it]

An error occurred during the verification process: 'Code_Result'


 69%|██████▉   | 59/85 [32:01<2:02:10, 281.93s/it]

An error occurred during the verification process: 'Code_Result'
Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 71%|███████   | 60/85 [32:04<1:22:41, 198.46s/it]

Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 72%|███████▏  | 61/85 [32:08<55:58, 139.93s/it]  

Syntax error in the expression: invalid syntax (<string>, line 1)
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 73%|███████▎  | 62/85 [32:11<37:56, 98.98s/it] 

An error occurred during the verification process: float division by zero


 74%|███████▍  | 63/85 [32:14<25:44, 70.22s/it]

An error occurred during the verification process: 'Code_Result'


 80%|████████  | 68/85 [32:36<04:30, 15.89s/it]

An error occurred during the verification process: 'Code_Result'


 86%|████████▌ | 73/85 [33:02<01:35,  7.97s/it]

Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 87%|████████▋ | 74/85 [33:06<01:13,  6.72s/it]

Runtime error during expression evaluation: 'NoneType' object is not subscriptable
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 89%|████████▉ | 76/85 [33:13<00:46,  5.12s/it]

Syntax error in the expression: invalid syntax (<string>, line 1)
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 93%|█████████▎| 79/85 [33:24<00:26,  4.35s/it]

An error occurred during the verification process: 'Code_Result'


 94%|█████████▍| 80/85 [33:29<00:23,  4.67s/it]

An error occurred during the verification process: 'Code_Result'
Error when executing Python code: solution() got an unexpected keyword argument 'var_A'
An error occurred during the verification process: float() argument must be a string or a real number, not 'NoneType'


 95%|█████████▌| 81/85 [33:33<00:17,  4.37s/it]

An error occurred during the verification process: 'Code_Result'


100%|██████████| 85/85 [33:46<00:00, 23.85s/it]


In [11]:
# Run the parallelized function
return
reset_csv_files()
process_question_partial = partial(process_question, debug=False)
Parallel(n_jobs=2)(delayed(process_question_partial)(item) for item in tqdm(data[:1], desc="Processing Questions"))

SyntaxError: 'return' outside function (1598447969.py, line 2)

In [12]:
process_question(data[0], debug=True)

MathPrompter
defaultdict(<function mathprompter.<locals>.<lambda> at 0x3081bb2e0>, {'Expression': 'var_A + var_B', 'Code': 'def solution(var_A, var_B):\n    return var_A + var_B', 'Result': 91, 'Results': {'Expression': [475, 501, 569, 263, 469], 'Code': [475, 501, 569, 263, 469], 'Test_Input': [{'var_A': 165, 'var_B': 310}, {'var_A': 244, 'var_B': 257}, {'var_A': 78, 'var_B': 491}, {'var_A': 32, 'var_B': 231}, {'var_A': 53, 'var_B': 416}]}, 'Expression_Result': 91.0, 'Code_Result': 91.0, 'Error': False, 'Error_Message': ''})
{'Question': 'Danny collects bottle caps. He lost 66 bottle caps at the park. Now he has 25 bottle caps in his collection. How many bottle caps did danny have at first?', 'Type': 'Addition', 'Correct Answer': 91.0, 'Predicted Answer': 91, 'Predicted Algebraic Answer': 91.0, 'Predicted Python Answer': 91.0, 'Generated Algebraic Template': 'Danny collects bottle caps. He lost var_A bottle caps at the park. Now he has var_B bottle caps in his collection. How many bot