# Importing libraries, datasets, and correct solutions

In [1]:
#Import important libraries
import os
import json
import ast
import re

import torch
import pandas as pd
import numpy as np
import sklearn as sk

from dotenv import load_dotenv
from openai import OpenAI

file_id = "2025-06-09_14_16_30"

load_dotenv()
TOKEN = os.getenv("AStarPrivate")
if TOKEN is None:
    raise RuntimeError("Token parsing failed")

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=TOKEN,
)

models = ["openai/gpt-4.1", "meta-llama/llama-4-scout", "deepseek/deepseek-r1", 
          "microsoft/phi-4", "qwen/qwen3-14b", "anthropic/claude-3.7-sonnet"]
models_free = ["microsoft/phi-4-reasoning-plus:free", "deepseek/deepseek-r1-distill-qwen-32b:free", "meta-llama/llama-3.3-8b-instruct:free",
              "qwen/qwen3-32b:free"]
model_default = "deepseek/deepseek-r1-distill-qwen-32b:free"
model = None

with open(f"./dataset/{file_id}_generated_questions.json", "r") as file:
    dataset = json.load(file)

print(len(dataset.keys()))
print(dataset.keys())

#For printing aesthetics
white_space = " " * 50

32
dict_keys(['P1N100', 'P1AddSub', 'P1MulDiv', 'P1Money', 'P2N1000', 'P2FracW', 'P2FracAS', 'P3N10k', 'P3FracEq', 'P4N100k', 'P4FctorM', 'P44Op', 'P4FracMI', 'P4FracSet', 'P4Deci3d', 'P4DeciAS', 'P4DeciMD', 'P5N1m', 'P5FracDv', 'P5Frac4Op', 'P5Deci4Op', 'P5perctg', 'P5rate', 'P6ratio', 'P6algebr', 'O1NumOps', 'O1RioPro', 'O1RatSpd', 'O1AgbrEF', 'O1EqIneq', 'O2Prob', 'O3NumOps'])


# Step 1: Correct solution generation
Refer to 'solution_generator' file for the code

# Step 2: Evaluate the given solution based on own correct solutions

Evaluate the given solution, using the generated solution above. The generated solution should come from the same model that is evaluating, to maintain fairness

In [2]:
import sys

In [11]:
"""
This code block is used to evaluate the given solution

Input:
dataset (dct) : { (question ID) : {'ID' : question ID,
                                   'Grade' : the grade the question is in,
                                   'KC' : knowledge component of the question,
                                   'Question' : question content
                                   } : Question dataset
given_solution (for custom mode) (dct): { (question ID) : {(solution ID) : {'question' : question content
                                                                            'solution' : {'step (number)' : step content}
                                                                           }
                                                          }
                                        }: Given solution to the question
model_solution (dct): { (question ID) : {'question' : question content,
                                         'kc' : knowledge component of this question,
                                         'solution' : {'step (number)' : step content}
                                        }
                      } : Model solution that the model should base on to evaluate the given solution
## Note that all question ID must match with each other
models : List of large language models to be used
system_prompt_fpath : System prompt text file path
user_prompt_fpath : User prompt text file path
solution_type_list : Types of solution to be evaluated, correct means the evaluator will evaluate correct solutions, incorrect means the evaluator will
evaluate incorrect solutions. This is only used to get the dataset from the correct file and return the evaluation in the right structure, thus would not
affect the performance of the LLM.
solution_type : Type of solution to be evaluated. Choose from solution_type_list.
"""
# dataset, models should be loaded before step 1
solution_type_list = ['correct','wrong','custom']
solution_type = solution_type_list[1]
given_solution = {}

with open('./solution/wrong solution/step 2/combined_error_types.json','r') as file:
    error_types = json.load(file)

evaluating_model_idx = 0
for model_evaluator in models[:1]:
    if model_evaluator == models[0]:
        confirmation = input("Are you sure you want to use chatgpt to generate 240 responses? (Empty answer for no)")
        if not confirmation:
            sys.exit(0)
    evaluating_model_idx = 1
    print(f"Current evaluator model: {model_evaluator} ({evaluating_model_idx}/{len(models)})")

    #Initialize output
    evaluation_dct = {}
    evaluation_text = {}
    errors = {}

    #Replace / to _ so that it would not interfere with file path:
    model_evaluator_path = model_evaluator.replace("/","_")
    system_prompt_fpath = f"./prompt/evaluation/{model_evaluator_path}_system_prompt.txt" 
    user_prompt_fpath = f"./prompt/evaluation/{model_evaluator_path}_user_prompt.txt"
    generated_evaluation_fpath = f'./evaluation/{solution_type} solution/dataset_{file_id}/{model_evaluator_path}_evaluating.json'
    with open(f"./solution/correct solution/dataset_{file_id}/{model_evaluator_path}_solution.json") as file:
        model_solution = json.load(file)
        
    #Load prompt for solution evaluation
    try:
        with open(system_prompt_fpath,"r") as file:
            system_prompt = file.read()
    except Exception as e:
        system_prompt = ""
    try:
        with open(user_prompt_fpath,"r") as file:
            user_prompt = file.read()
    except Exception as e:
        user_prompt = ""
    
    evaluated_model_idx = 0
    for model in models[:1]:
        evaluated_model_idx += 1
        print(f"Currently evaluating model: {model} ({evaluated_model_idx}/{len(models)})", flush = True)
        model_path = model.replace("/","_")

        #Load solutions to evaluate
        if solution_type == 'correct':
            correct_solutions_repo = f"./solution/correct solution/dataset_{file_id}/{model_path}_solution.json"
            with open(correct_solutions_repo) as file:
                loaded_solution = json.load(file)
            for question_id in loaded_solution.keys():
                given_solution[question_id] = {}
                given_solution[question_id]["Correct Solution"] = loaded_solution[question_id]
        elif solution_type == 'wrong':
            wrong_solutions_repo = f"./solution/wrong solution/step 3/dataset_{file_id}/temporary_files/{model_path}_wrong_solution.json"
            with open(wrong_solutions_repo) as file:
                given_solution = json.load(file)
        else:
            pass

        evaluation_dct[model] = {}
        evaluation_text[model] = {}
        errors[model] = {}
        
        # Assuming primary_df is defined elsewhere
        question_idx = 0
        for questionID in list(given_solution.keys()):
            question_idx += 1
            evaluation_dct[model][questionID] = {}
            evaluation_text[model][questionID] = {}
            print(f"\rEvaluating question {questionID} ({question_idx}/{len(given_solution.keys())}){white_space}")

            solution_idx = 0
            for solutionID in given_solution[questionID].keys():
                solution_idx += 1
                print(f"\rEvaluating solution {solutionID} ({solution_idx}/{len(given_solution[questionID].keys())}){white_space}", end = '', flush=True)
                info = {
                    "question" : given_solution[questionID][solutionID]["question"],
                    "model solution" : model_solution.get(questionID, {}).get("solution", None),
                    "given solution" : given_solution[questionID][solutionID]["solution"] if solution_type == 'correct' or solution_type == 'custom'
                                       else given_solution[questionID][solutionID]["wrong_solution"],
                    "kc": dataset[questionID]["Grade"] + ' ' + dataset[questionID]["KC"],
                    "error_types" : error_types[questionID]["type_of_error"],
                    "error_explanation" : error_types[questionID]["error_explanation"]
                }
            
                prompt = [{
                            "role": "user",
                            "content": system_prompt.format(),
                          },
                          {
                              "role" : "user",
                              "content" : user_prompt.format(
                                  question=info["question"],
                                  model_solution=info["model solution"],
                                  given_solution=info["given solution"],
                                  kc=info["kc"],
                                  type_of_error = info["error_types"],
                                  error_explanation = info["error_explanation"]
                              )
                          }]
                
                #Get response from API call
                response = client.chat.completions.create(
                        model=model_evaluator,
                        messages=prompt,
                        temperature=0.9
                    )
                message = response.choices[0].message
                if message is None or message.content is None:
                    print(f"No valid message returned for question: {info["question"]}")
                    continue    
                response_text = message.content.strip()
                response_text_extracted = None
    
                #Extract and convert to dictionary
                match = re.findall(r'```(.*?)```', response_text, re.DOTALL)
                if match:
                    response_text_extracted = match[-1]
                    match = re.findall(r'\{.*\}', response_text_extracted, re.DOTALL)
                else:
                    match = re.findall(r'\{.*\}', response_text, re.DOTALL)
                if match:
                    response_text_extracted = match[0]
                else:
                    print("NO JSON FORMAT OUTPUT FOUND! ERROR!")
                    print(response_text)   
                try:
                    response_dct = ast.literal_eval(response_text_extracted)
                    evaluation_dct[model][questionID][solutionID] = response_dct
                except Exception as e:
                    evaluation_text[model][questionID][solutionID] = response_text
                    print("Found error at", questionID, model, solutionID, "error: ", e)
                    print(response_text)
    
    #Save to file
    os.makedirs(os.path.dirname(generated_evaluation_fpath), exist_ok=True)
    with open(generated_evaluation_fpath, 'w') as file:
        json.dump(evaluation_dct, file, indent=4)

    os.makedirs(os.path.dirname(f'./error/raw_responses/{model_evaluator_path}_evaluating.json'), exist_ok=True)
    with open(f'./error/raw_responses/{model_evaluator_path}_evaluating.json','w') as file:
        json.dump(evaluation_text,file,indent=4)

    if errors:
        with open(f'./error/{model_evaluator_path}_evaluating_{model_path}.json', 'w') as file:
            json.dump(errors, file, indent=4)

Are you sure you want to use chatgpt to generate 240 responses? (Empty answer for no) 0


Current evaluator model: openai/gpt-4.1 (1/6)
Currently evaluating model: openai/gpt-4.1 (1/6)
Evaluating question P6ratio (1/15)                                                  
Evaluating question O3NumOps (2/15)                                                                              
Evaluating question P5Deci4Op (3/15)                                                                             
Evaluating question P5rate (4/15)                                                                                
Evaluating question P2FracW (5/15)                                                                               
Evaluating question P4FctorM (6/15)                                                                              
Evaluating question P3N10k (7/15)                                                                                
Evaluating question P5FracDv (8/15)                                                                              
Evaluating question O1

In [35]:
os.makedirs(os.path.dirname(generated_evaluation_fpath), exist_ok=True)
with open(generated_evaluation_fpath, 'w') as file:
    json.dump(evaluation_dct, file, indent=4)

In [22]:
os.makedirs(os.path.dirname(generated_evaluation_fpath), exist_ok=True)
with open(generated_evaluation_fpath, 'w') as file:
    json.dump(evaluation_dct, file, indent=4)

In [43]:
import requests
import json

response = requests.get(
  url="https://openrouter.ai/api/v1/auth/key",
  headers={
    "Authorization": f"Bearer {TOKEN}"
  }
)

print(json.dumps(response.json(), indent=2))

{
  "data": {
    "label": "sk-or-v1-8ce...e3b",
    "limit": null,
    "usage": 21.1609805,
    "is_provisioning_key": false,
    "limit_remaining": null,
    "is_free_tier": false,
    "rate_limit": {
      "requests": 450,
      "interval": "10s"
    }
  }
}


### 2'. Reinserting erroneous responses

In [40]:
model_evaluator_path = models[4].replace("/","_")
with open(f'./evaluation/{solution_type} solution/dataset_{file_id}/{model_evaluator_path}_evaluating.json', 'r') as file:
    evaluation_dct = json.load(file)
    print(evaluation_dct.keys())
model = models[0]
questionID = 'O1EqIneq'
solutionID = 'Invalid Assumption'
fixed_response = """{
"question" : "A mobile phone usually costs \$ x \$ dollars. During a sale, its price is reduced by 50 dollars and the new price is 320 dollars. Write down an equation to represent this situation and find the value of \$ x \$.",
"correct" : "no",
"wrong_solution" : {"Step 2": "Assume there is an additional service charge of 30 dollars added during the sale.", "Step 3": "So, the new price after reduction and adding the service charge is \$ x - 50 + 30 \$ dollars."},
"type_of_error" : {"Step 2": "Invalid Assumption"},
"error_explanation" : {"Step 2": "The student made an invalid assumption by introducing an additional service charge of 30 dollars, which is not mentioned in the question. This extra information is not part of the original problem and affects the solution unnecessarily."},
"correct_solution" : {"Step 1": "The original price of the mobile phone is \$ x \$. During the sale, the price is reduced by 50 dollars, so the new price is \$ x - 50 \$ dollars.", "Step 2": "We are told that the new price is 320 dollars. Therefore, the equation representing this situation is \$ x - 50 = 320 \$.", "Step 3": "To find the value of \$ x \$, we solve the equation \$ x - 50 = 320 \$. Adding 50 to both sides of the equation gives \$ x = 320 + 50 \$.", "Step 4": "Simplifying the right side of the equation, we get \$ x = 370 \$."}}"""
try:
    response_dct = ast.literal_eval(fixed_response)
    evaluation_dct[model][questionID][solutionID] = response_dct
except Exception as e:
    evaluation_text[model][questionID][solutionID] = response_text
    print("Found error at", questionID, model, solutionID, "error: ", e)
    print(response_text)

dict_keys(['anthropic/claude-3.7-sonnet'])


  fixed_response = """{
  fixed_response = """{


KeyError: 'openai/gpt-4.1'

## 3. High quality dataset extraction code