In [1]:
import os
import base64
from pydantic import BaseModel
from openai import AzureOpenAI
from api_utils import *
import jiter
from extract import get_next_question_id
import re
import json

# Model identifier passed as `model_id` when question_evaluator calls Claude.
claude4_model_id = "apac.anthropic.claude-sonnet-4-20250514-v1:0"
# API client handles. The factory functions are not defined in this notebook
# (presumably they come from `from api_utils import *` — verify).
client = create_client()  # NOTE(review): not referenced by any cell visible in this notebook
openai_client = create_openai_client()  # handed to get_GPT_struct_response in question_evaluator
aws_client = get_aws_client()  # handed to get_claude_response in question_evaluator

# Experiments


In [2]:
# Structure of one evaluation verdict. question_evaluator passes this class to
# get_GPT_struct_response as the expected shape of the GPT reply.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose; a docstring may be carried into the schema generated from the model,
# so confirm that is acceptable before adding one.
class MWP(BaseModel):
    # One integer verdict per evaluation criterion (the reference note at the
    # bottom of the class lists each of them as 0/1).
    answerability: int
    primary_kc_alignment: int
    secondary_kc_alignment: int
    topic_alignment: int
    grade_alignment: int
    real_world_feasibility: int
    synergy: int
    clarity: int
    conciseness_and_relevance: int
    language_quality: int
    content_appropriateness: int
    # Free-text justification accompanying the verdicts.
    explanation: str

    # Reference note only: the bare string below is not the first statement of
    # the class, so it is not the docstring and defines no field.
    # It also lists "solution_correctness", which is NOT a field of this model.
    '''  "answerability": 0/1,
  "primary_kc_alignment": 0/1,
  "secondary_kc_alignment": 0/1,
  "synergy": 0/1,
  "topic_alignment": 0/1,
  "grade_alignment": 0/1,
  "real_world_feasibility": 0/1,
  "clarity": 0/1,
  "conciseness_and_relevance": 0/1,
  "language_quality: 0/1,
  "content_appropriateness": 0/1,
  "solution_correctness": 0/1,'''

In [3]:
# Prompt templates are read once, when this cell runs, and kept in module-level
# variables that question_evaluator formats per question.
# Paths are built with os.path.join so the cell does not depend on Windows
# path separators (the resulting strings are unchanged on Windows).
sys_prompt_path = os.path.join("Prompts", "GPT", "evaluation_system_prompt_v4.1.txt")
with open(sys_prompt_path, "r", encoding="utf-8") as file:
    sys_prompt = file.read()

user_prompt_path = os.path.join("Prompts", "GPT", "evaluation_user_prompt_v4.txt")
with open(user_prompt_path, "r", encoding="utf-8") as file:
    user_prompt_template = file.read()

claude_prompt_path = os.path.join("Prompts", "Claude", "evaluation_prompt_v2.txt")
with open(claude_prompt_path, "r", encoding="utf-8") as file:
    claude_prompt_template = file.read()

# Echo the Claude prompt so the exact evaluation instructions are visible in
# the cell output (printing does not need the file handle to stay open).
print(claude_prompt_template)

def metadata(response_dict=None, **kwargs):
    """Bundle an evaluation with the descriptive fields of its question.

    Args:
        response_dict: The evaluation payload (any object; defaults to None).
        **kwargs: Arbitrary descriptive fields, kept in the order given.

    Returns:
        A dict holding every keyword argument followed by the evaluation
        under the key ``"response_dict"``.
    """
    return {**kwargs, 'response_dict': response_dict}

def _extract_json_object(text):
    """Return the first JSON object found in ``text`` as a dict, or None if there is none."""
    if not isinstance(text, str):
        return None

    candidates = []
    # Preferred form: the JSON object wrapped in a ```json fenced block.
    fenced = re.search(r"```json\s*(\{.*?\})\s*```", text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    # Fallback: the outermost pair of braces, for replies that omit the fence.
    bare = re.search(r"\{.*\}", text, re.DOTALL)
    if bare:
        candidates.append(bare.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def question_evaluator(word_problem, solution, kc1, kc2, topic, grade, qid, model = "GPT"):
    """Evaluate one math word problem with the selected LLM.

    Args:
        word_problem: Problem text. ``None`` or ``""`` skips the LLM call and
            yields an all-zero evaluation.
        solution: Worked solution; not sent to the model, only stored in the
            returned record.
        kc1: Intended primary knowledge component.
        kc2: Intended secondary knowledge component.
        topic: Intended topic.
        grade: Intended grade.
        qid: Question identifier, stored in the returned record.
        model: ``"GPT"`` / ``"GPT4.1"`` for the GPT evaluator, or ``"Claude"`` /
            ``"Claude4"`` for the Claude evaluator. Both spellings are accepted
            because the default and the "unsupported" message use the short
            names while the batch cells pass the versioned ones.

    Returns:
        The dict built by ``metadata`` (evaluation under ``"response_dict"``
        plus the question fields), or ``None`` when the model name is not
        supported or no JSON object can be extracted from the Claude reply.
    """
    if (word_problem is None) or (word_problem == ""):
        # Keys follow the MWP schema ("clarity", "language_quality") so that
        # downstream code sees one consistent set of names. The extra
        # "solution_correctness" entry is kept from the original fallback.
        response_dict = {
            "answerability": 0,
            "primary_kc_alignment": 0,
            "secondary_kc_alignment": 0,
            "topic_alignment": 0,
            "grade_alignment": 0,
            "real_world_feasibility": 0,
            "synergy": 0,
            "clarity": 0,
            "conciseness_and_relevance": 0,
            "language_quality": 0,
            "content_appropriateness": 0,
            "solution_correctness": 0,
            "explanation": "No word problem is found"
        }
    elif model in ("GPT", "GPT4.1"):
        user_prompt = user_prompt_template.format(word_problem = word_problem, kc1=kc1, kc2=kc2, topic=topic, grade=grade)
        # NOTE(review): get_GPT_struct_response is defined outside this notebook;
        # its result is presumably a dict with the MWP fields — confirm.
        response_dict = get_GPT_struct_response(openai_client, sys_prompt, user_prompt, MWP)
    elif model in ("Claude", "Claude4"):
        claude_prompt = claude_prompt_template.format(word_problem = word_problem, kc1 = kc1, kc2 = kc2, topic = topic, grade=grade)
        response_dict = get_claude_response(client = aws_client, model_id = claude4_model_id, sys_prompt = None, user_prompt = claude_prompt, config={})

        # The scores arrive as a JSON object embedded in the text reply.
        extracted_dict = _extract_json_object(response_dict.get("response"))
        if extracted_dict is None:
            # Without scores there is nothing to record; return None so the
            # batch callers skip this question instead of crashing.
            print("No JSON found")
            return None

        # Promote the scores into the response dict and drop the raw text.
        response_dict.update(extracted_dict)
        response_dict.pop("response", None)
    else:
        print("This model is not supported yet. Choose GPT/Claude to continue")
        return None
    return metadata(response_dict=response_dict, Word_problem = word_problem, Solution = solution, Primary_kc = kc1, Secondary_kc = kc2, Topic=topic, Grade=grade, QID=qid)

Role and Objective:
You are a strict educational content evaluator specialized in **EVALUATE THE QUALITY OF MATH WORD PROBBLEMS ** following Singapore math syllabus. 
Your goal is to evaluate a **GIVEN MATH WORD PROBLEM** as strictly as possible based on the given **EVALUATION CRITERIA**. The math word problem is created using an intended Primary Knowledge Component, an intended Secondary Knowledge Component, an intended Topic and an intended Grade. 

Evaluation Criteria: 
1. Answerability: The word problem must be answerable with NO missing information, NO conflicting information and NO illogical relationship. 
2. Primary Knowledge Component Alignment: The word problem MUST be on the intended Primary Knowledge Component, that is, the Primary Knowledge Component is needed for solving the word problem. The Primary Knowledge Component decides the main idea and difficulty level of the word problem and there should be no other knowledge component in the word problem that is harder than it.

In [4]:
# Testing the function on a single hand-picked question
response = question_evaluator(
    word_problem="\nA factory produces \\( 3\\frac{1}{2} \\) metres of fabric each hour. In one day, the factory needs to use \\( \\frac{3}{4} \\) of a metre of fabric to make each bag.\n\n(a) If the factory wants to pack all of today's fabric equally into 5 boxes, how many metres of fabric will each box have?\n\n(b) After packing, the manager realises that \\(1\\frac{1}{4}\\) metres has already been used for samples. How much fabric is left in each box after the samples have been subtracted?\n",
    solution="\n(a) Total fabric produced in one hour is \\( 3\\frac{1}{2} = \\frac{7}{2} \\) metres.\n\nIf this fabric is packed equally into 5 boxes, the amount in each box is:\n\n\\[\n\\frac{7}{2} \\div 5 = \\frac{7}{2} \\times \\frac{1}{5} = \\frac{7}{10}\n\\]\n\nSo, each box has \\(\\frac{7}{10}\\) metres of fabric.\n\n(b) If \\(1\\frac{1}{4}\\) metres is used for samples, subtract this from the amount in each box:\n\nFirst, convert \\(1\\frac{1}{4}\\) to an improper fraction:\n\\[\n1\\frac{1}{4} = \\frac{5}{4}\n\\]\n\nSubtract from \\(\\frac{7}{10}\\) (find a common denominator, which is 20):\n\n\\[\n\\frac{7}{10} = \\frac{14}{20}, \\quad \\frac{5}{4} = \\frac{25}{20}\n\\]\n\\[\n\\frac{14}{20} - \\frac{25}{20} = -\\frac{11}{20}\n\\]\n\nSince \\(-\\frac{11}{20}\\) is negative, it means there is not enough fabric in each box after the samples are taken; each box is short of \\(\\frac{11}{20}\\) metre.\n",
    kc1="FRACTIONS | Division | dividing a proper fraction by a whole number",
    kc2="FRACTIONS | Subtraction | subtracting fractions",
    topic="Manufacturing",
    grade="Primary 6",
    qid="P6-FrDivPN_P5-FrSubMix_GPT4.1_Manufacturing_05",
    # "Claude4" is the spelling question_evaluator's model check compares
    # against; the short name "Claude" would hit the unsupported-model branch.
    model="Claude4",
)
# question_evaluator returns None when it cannot evaluate, so guard the lookup.
print(response.get("response_dict") if response else response)

{'status': 0, 'response_time': 13.63425350189209, 'answerability': 0, 'primary_kc_alignment': 0, 'secondary_kc_alignment': 0, 'synergy': 0, 'topic_alignment': 1, 'grade_alignment': 1, 'real_world_feasibility': 1, 'clarity': 1, 'conciseness_and_relevance': 0, 'language_quality': 1, 'content_appropriateness': 1, 'explanation': 'Answerability: Missing information about factory operating hours per day makes the problem unsolvable. Primary KC Alignment: Problem requires dividing mixed number 3 1/2 by whole number 5, not dividing proper fraction by whole number as intended. Secondary KC Alignment: While fraction subtraction occurs, the synergy is missing. Synergy: Parts (a) and (b) are not connected - part (b) uses fixed subtraction 1 1/4 instead of building on part (a) result. Conciseness: The 3/4 metre per bag information is irrelevant to solving either question.'}


In [4]:
def batch_question_evaluator(all_responses, input_path, output_path, model = "GPT"):
    """Evaluate every question of a flat (non-split) question file.

    Args:
        all_responses: Dict that receives the results; filled in place under
            keys ``"Question <n>"`` where ``n`` is the 1-based position of the
            question in the input file.
        input_path: JSON file mapping question id -> generated question record.
        output_path: JSON file rewritten after every evaluated question, so a
            crash part-way through keeps the results gathered so far.
        model: Evaluator name forwarded to ``question_evaluator``.

    Returns:
        ``all_responses`` with the results added. (Previously nothing was
        returned, so callers that assigned the result ended up with ``None``.)
    """
    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)

    total = len(input_data)
    for question_tracker, (qid, data_dict) in enumerate(input_data.items(), start=1):
        # "\r" rewrites the same output line; the padding clears leftovers
        # from a longer previous question id.
        print(f"\rCurrent question: {qid} ({question_tracker}/{total})", " "*50, end = "")
        word_problem = data_dict.get("response_dict", {}).get("word_problem", "")
        solution = data_dict.get("response_dict", {}).get("solution", "")
        kc1 = data_dict.get("Primary_kc", "")
        kc2 = data_dict.get("Secondary_kc", "")
        topic = data_dict.get("Topic", "")
        grade = data_dict.get("Grade", "")
        response = question_evaluator(word_problem = word_problem, solution = solution, kc1 = kc1, kc2 = kc2, topic = topic, grade = grade, qid = qid, model = model)
        # A falsy response means the evaluator could not handle the question; skip it.
        if response:
            all_responses["Question " + str(question_tracker)] = response
            all_responses = exercise_NA_evaluation(all_responses)
            # Checkpoint everything gathered so far.
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(all_responses, f, indent=2)
    return all_responses

def batch_split_question_evaluator(all_responses, input_path, output_path, model = "GPT"):
    """Evaluate a question file that is split into one question set per human evaluator.

    Args:
        all_responses: Dict that receives the results, filled in place as
            ``{evaluator: {"Question <n>": record}}``.
        input_path: JSON file mapping evaluator name -> {question id -> record}.
        output_path: JSON file rewritten after every evaluated question.
        model: Evaluator name forwarded to ``question_evaluator``.

    Returns:
        ``all_responses`` with the results added.
    """
    with open(input_path, "r", encoding="utf-8") as source:
        input_data = json.load(source)

    evaluator_total = len(input_data)
    for evaluator_tracker, (evaluator, question_set) in enumerate(input_data.items(), start=1):
        all_responses[evaluator] = {}
        question_total = len(question_set)
        for question_tracker, (qid, data_dict) in enumerate(question_set.items(), start=1):
            progress = f"\rCurrent model: {model}, Current corresponding human evaluator: {evaluator} ({evaluator_tracker}/{evaluator_total}) Current question: {qid} ({question_tracker}/{question_total})"
            print(progress + " "*50, end = " ")

            response = question_evaluator(
                word_problem = data_dict.get("response_dict", {}).get("word_problem", ""),
                solution = data_dict.get("response_dict", {}).get("solution", ""),
                kc1 = data_dict.get("Primary_kc", ""),
                kc2 = data_dict.get("Secondary_kc", ""),
                topic = data_dict.get("Topic", ""),
                grade = data_dict.get("Grade", ""),
                qid = qid,
                model = model,
            )
            # Nothing to record when the evaluator produced no result.
            if not response:
                continue

            all_responses[evaluator]["Question " + str(question_tracker)] = response
            all_responses = exercise_NA_evaluation(all_responses)
            # Checkpoint after each question.
            with open(output_path, "w", encoding="utf-8") as sink:
                json.dump(all_responses, sink, indent=2)
    return all_responses

def _apply_na_rules(response):
    """Overwrite the criteria that depend on a failed criterion with 'NA' (in place)."""
    dependent_on_answerability = (
        "primary_kc_alignment",
        "secondary_kc_alignment",
        "topic_alignment",
        "grade_alignment",
        "real_world_feasibility",
        "synergy",
        "clarity",
        "conciseness_and_relevance",
        "language_quality",
        "content_appropriateness",
    )
    # .get() so that a response lacking a score is left alone instead of raising.
    if response.get("answerability") == 0:
        for key in dependent_on_answerability:
            response[key] = 'NA'
    # After the step above these scores may already be 'NA', which is != 0,
    # so the two checks below then do nothing.
    if response.get("primary_kc_alignment") == 0:
        response['grade_alignment'] = 'NA'
        response["synergy"] = 'NA'
    if response.get("secondary_kc_alignment") == 0:
        response["synergy"] = 'NA'


def exercise_NA_evaluation(all_responses):
    """Mark criteria as 'NA' when a criterion they depend on was scored 0.

    Rules applied to every ``response_dict``:
      * ``answerability == 0``: all ten other criteria become 'NA'.
      * ``primary_kc_alignment == 0``: ``grade_alignment`` and ``synergy`` become 'NA'.
      * ``secondary_kc_alignment == 0``: ``synergy`` becomes 'NA'.

    Args:
        all_responses: Either ``{question: record}`` or
            ``{evaluator: {question: record}}``, where each record holds the
            scores under ``"response_dict"``. The layout is recognised per
            entry by the presence of ``"response_dict"``, so empty containers
            and any key order are handled.

    Returns:
        The same ``all_responses`` object, modified in place. A warning is
        printed if any entry does not have the expected layout.
    """
    malformed = False
    for entry in all_responses.values():
        if not isinstance(entry, dict):
            malformed = True
            continue
        # A record carries "response_dict" itself; otherwise the entry is an
        # evaluator's set of records.
        records = [entry] if "response_dict" in entry else list(entry.values())
        for record in records:
            response = record.get("response_dict") if isinstance(record, dict) else None
            if isinstance(response, dict):
                _apply_na_rules(response)
            else:
                malformed = True

    if malformed:
        print("Wrong argument format! Please check again if it is all_responses dictionary generated by the LLM")
    return all_responses

In [7]:
# GPT evaluating common questions in recreation (five independent rounds)
# NOTE(review): input_path is an absolute path on one machine; point it at your
# own copy of the file before running.
for version in range(1, 6):
    print("Evaluation round:", version, "/ 5")
    # The dict is filled in place by batch_question_evaluator, so this cell
    # does not depend on that function's return value.
    all_responses_r_common = {}
    batch_question_evaluator(all_responses = all_responses_r_common,
                             input_path = r"C:\Users\Amin\A-Star-AI-for-education\AStar Internship - Question\Data\Generated_questions\GPT4.1\All\recreation\100_recreation_v8_v1.json",
                             output_path = rf".\Data\Evaluated_questions\GPT4.1\All\recreation\recreation_gpt_v2_common_{version}.json",
                             # "GPT4.1" is the spelling question_evaluator's model check
                             # compares against; "GPT" would hit its unsupported-model branch.
                             model = "GPT4.1"
    )

Evaluation round: 1 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 2 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 3 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 4 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 5 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   

In [10]:
# Claude evaluating common questions in recreation (five independent rounds)
# NOTE(review): input_path is an absolute path on one machine; point it at your
# own copy of the file before running.
for version in range(1, 6):
    print("Evaluation round:", version, "/ 5")
    # The dict is filled in place by batch_question_evaluator, so this cell
    # does not depend on that function's return value.
    all_responses_r_common = {}
    batch_question_evaluator(all_responses = all_responses_r_common,
                             input_path = r"C:\Users\Amin\A-Star-AI-for-education\AStar Internship - Question\Data\Generated_questions\GPT4.1\All\recreation\100_recreation_v8_v1.json",
                             output_path = rf".\Data\Evaluated_questions\Claude4\All\recreation\recreation_gpt_v2_common_{version}.json",
                             # "Claude4" is the spelling question_evaluator's model check
                             # compares against; "Claude" would hit its unsupported-model branch.
                             model = "Claude4"
    )

Evaluation round: 1 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 2 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 3 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 4 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   Evaluation round: 5 / 5
Current question: P6-RoFndRoWN_P1-WNSub2nd_GPT4.1_Recreation_01 (100/100)                                                   

In [None]:
# Evaluating all topics generated by GPT: every evaluator model scores every
# topic's split question file, five rounds each.
models = ["GPT4.1", "Claude4"]
topics = ["recreation", "services", "householdfinance"]
for evaluator_model in models:
    for version in range(1,6):
        print(f"Current round: {version}/5")
        all_responses = {}
        for topic in topics:
            all_responses[topic] = {}
            # Paths are built with os.path.join so they do not depend on
            # Windows separators (the strings are unchanged on Windows).
            all_responses[topic] = batch_split_question_evaluator(
                all_responses = all_responses[topic],
                input_path = os.path.join(".", "Data", "Generated_questions", "GPT4.1", "All", topic, f"{topic}_v8_v1_split.json"),
                output_path = os.path.join(".", "Data", "Evaluated_questions", evaluator_model, "All", topic, f"{topic}_gpt_v2_{version}.json"),
                model = evaluator_model
            )

In [13]:
# Evaluating all topics generated by Claude: every evaluator model scores every
# topic's split question file, five rounds each.
models = ["GPT4.1", "Claude4"]
# NOTE(review): an earlier comment here said only one topic was generated by
# Claude, yet three topics are listed — confirm all three split files exist.
topics = ["recreation","services","householdfinance"]
for evaluator_model in models:
    for version in range(1,6):
        print(f"Current round: {version}/5")
        all_responses = {}
        for topic in topics:
            all_responses[topic] = {}
            # Paths are built with os.path.join so they do not depend on
            # Windows separators (the strings are unchanged on Windows).
            all_responses[topic] = batch_split_question_evaluator(
                all_responses = all_responses[topic],
                input_path = os.path.join(".", "Data", "Generated_questions", "Claude4", "All", topic, f"{topic}_split_v3.json"),
                output_path = os.path.join(".", "Data", "Evaluated_questions", evaluator_model, "All", topic, f"{topic}_claude_v3_{version}.json"),
                model = evaluator_model
            )

Current round: 2/5
Current model: GPT4.1, Current corresponding human evaluator: Sarah (3/3) Current question: O3-SPMulProb_O3-SPFndPrCE_sonnet4_Recreation_04 (64/64)                                                   Current round: 3/5
Current model: GPT4.1, Current corresponding human evaluator: Sarah (3/3) Current question: O3-SPMulProb_O3-SPFndPrCE_sonnet4_Recreation_04 (64/64)                                                   Current round: 4/5
Current model: GPT4.1, Current corresponding human evaluator: Sarah (3/3) Current question: O3-SPMulProb_O3-SPFndPrCE_sonnet4_Recreation_04 (64/64)                                                   Current round: 5/5
Current model: GPT4.1, Current corresponding human evaluator: Sarah (3/3) Current question: O3-SPMulProb_O3-SPFndPrCE_sonnet4_Recreation_04 (64/64)                                                   

In [44]:
# This code is to turn the evaluations to NA where applicable (e.g. turn all to NA when "answerable" is 0)
# Set your directory path
directory = os.path.join(".", "Data", "Evaluated_questions", "Claude4", "All", "recreation")

# Loop through all JSON files
for filename in os.listdir(directory):
    # Skip everything that is not a JSON file. Previously the rewrite below ran
    # for every directory entry and reused the data of the last JSON file.
    if not filename.endswith(".json"):
        continue
    json_path = os.path.join(directory, filename)

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # exercise_NA_evaluation updates `data` in place; write it back to the same file.
    exercise_NA_evaluation(data)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent = 4)

In [20]:
import os
import json
import pandas as pd
import re

def sanitize_excel_cell(val):
    """Make a single cell value safe to write to an Excel sheet.

    Args:
        val: Any cell value.

    Returns:
        * ``val`` unchanged when it is a missing scalar (``None``/NaN) or an
          ``int``/``float``;
        * otherwise ``str(val)`` with ASCII control characters removed and
          LaTeX-style commands (a backslash word plus any ``{...}`` groups)
          replaced by ``"[FORMULA]"``;
        * ``"'NA"`` when the cleaned text is exactly ``"NA"``;
        * ``"[INVALID]"`` if the value cannot be converted.
    """
    # pd.isna returns an array for list-like values, which cannot be used as a
    # condition; only scalars are tested for missingness.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return val
    try:
        if isinstance(val, (int, float)):
            return val
        val_str = str(val)
        # Strip control characters (this includes tabs and newlines).
        val_str = re.sub(r'[\x00-\x1F\x7F]', '', val_str)
        # Collapse LaTeX commands such as \frac{1}{2} into a placeholder.
        val_str = re.sub(r'\\[a-zA-Z]+(?:\{.*?\})*', '[FORMULA]', val_str)
        if val_str == "NA":
            return "'NA"  # Force Excel to show it as a string
        return val_str
    except Exception:
        return '[INVALID]'

# Set your directory path
directory = os.path.join(".", "Data", "Evaluated_questions", "Claude4", "All", "recreation")

# Loop through all JSON files
for filename in os.listdir(directory):
    if not (filename.endswith(".json") and filename.startswith("recreation_claude")):
        continue
    json_path = os.path.join(directory, filename)

    # Load JSON
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Nesting depth, following the first value of each level. The default of
    # None keeps an empty file or an empty evaluator set from raising.
    depth = 1
    next_layer = next(iter(data.values()), None)
    while isinstance(next_layer, dict):
        depth += 1
        next_layer = next(iter(next_layer.values()), None)

    # Build row-wise records
    rows = []
    # Depth = 3 means the dataset is split for 3 evaluator
    # Depth = 2 means common dataset
    if depth == 3:
        for evaluator, questions in data.items():
            for question_id, question_data in questions.items():
                response_dict = question_data.get("response_dict", {})
                row = {"Evaluator": evaluator, "Question": question_id}
                row.update(response_dict)
                rows.append(row)
    else:
        for question_id, question_data in data.items():
            response_dict = question_data.get("response_dict", {})
            # The common dataset has no per-evaluator split, so there is no
            # evaluator name here; a fixed label keeps the column layout.
            row = {"Evaluator": "Common", "Question": question_id}
            row.update(response_dict)
            rows.append(row)

    df = pd.DataFrame(rows)

    # Sanitize each cell. DataFrame.map replaces the deprecated applymap in
    # newer pandas; fall back to applymap where map is not available.
    sanitize_frame = df.map if hasattr(df, "map") else df.applymap
    df = sanitize_frame(sanitize_excel_cell)

    # Save to Excel
    excel_filename = filename.replace(".json", ".xlsx")
    excel_path = os.path.join(directory, excel_filename)
    df.to_excel(excel_path, index=False)

    print(f"Saved sanitized file: {excel_filename}")


  df = df.applymap(sanitize_excel_cell)


Saved sanitized file: recreation_claude_v2_1.xlsx
Saved sanitized file: recreation_claude_v2_2.xlsx
Saved sanitized file: recreation_claude_v2_3.xlsx
Saved sanitized file: recreation_claude_v2_4.xlsx
Saved sanitized file: recreation_claude_v2_5.xlsx


In [23]:
import os
import pandas as pd

# Set your directory path
directory = os.path.join(".", "Data", "Evaluated_questions", "Claude4", "All", "recreation")

# List all matching Excel files in the folder; sorted so the sheet order does
# not depend on the order in which the file system lists them.
excel_files = sorted(
    f for f in os.listdir(directory)
    if f.endswith(".xlsx") and f.startswith("recreation_claude")
)

if not excel_files:
    # Saving a workbook that contains no sheet fails, so stop before creating it.
    print("No matching Excel files found; nothing to combine")
else:
    # Create a new Excel writer for the combined output
    with pd.ExcelWriter(os.path.join(directory, "combined_output_claude.xlsx"), engine="openpyxl") as writer:
        for file in excel_files:
            file_path = os.path.join(directory, file)
            # Read the first sheet (read_excel's default)
            df = pd.read_excel(file_path)
            # Use filename (without .xlsx) as sheet name, limit to 31 chars
            sheet_name = os.path.splitext(file)[0][:31]
            df.to_excel(writer, sheet_name=sheet_name, index=False)

            print(f"✅ Added sheet: {sheet_name}")

✅ Added sheet: recreation_claude_v2_1
✅ Added sheet: recreation_claude_v2_2
✅ Added sheet: recreation_claude_v2_3
✅ Added sheet: recreation_claude_v2_4
✅ Added sheet: recreation_claude_v2_5


In [None]:
# Scratch note: location of the GPT-4.1 householdfinance split file on one machine.
# Written as a raw string because "\U" in an ordinary string literal starts a
# Unicode escape, which makes this cell a SyntaxError.
r"C:\Users\Amin\A-Star-AI-for-education\AStar Internship - Question\Data\Generated_questions\GPT4.1\All\householdfinance\householdfinance_v8_v1_split.json"

In [8]:
import json
import pandas as pd

def convert_json_to_table(json_file_path, output_file_path, output_format="csv"):
    """Flatten an evaluation JSON file into one table row per question.

    Args:
        json_file_path: JSON file mapping question id -> record, each record
            holding its scores under ``"response_dict"``.
        output_file_path: Destination file.
        output_format: ``"csv"`` (default) or ``"excel"``.

    Raises:
        ValueError: If ``output_format`` is neither ``"csv"`` nor ``"excel"``.

    Notes:
        * The answerability score is read from ``"answerability"`` and, for
          files written with the older misspelled key, from ``"answerablitily"``.
        * A score that is absent from a record (for example
          ``"solution_correctness"``, which is not part of the MWP schema)
          yields an empty cell instead of a KeyError.
    """
    if output_format not in ("csv", "excel"):
        raise ValueError("Invalid output format. Choose 'csv' or 'excel'.")

    # Step 1: Load the JSON file
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Column title -> candidate keys in response_dict, in lookup order.
    column_keys = [
        ("Answerability", ("answerability", "answerablitily")),
        ("Primary KC Alignment", ("primary_kc_alignment",)),
        ("Secondary KC Alignment", ("secondary_kc_alignment",)),
        ("Synergy", ("synergy",)),
        ("Topic Alignment", ("topic_alignment",)),
        ("Grade Alignment", ("grade_alignment",)),
        ("Real-World Feasibility", ("real_world_feasibility",)),
        ("Clarity", ("clarity",)),
        ("Conciseness and Relevance", ("conciseness_and_relevance",)),
        ("Language Quality", ("language_quality",)),
        ("Content Appropriateness", ("content_appropriateness",)),
        ("Solution Correctness", ("solution_correctness",)),
    ]

    # Step 2: Extract relevant fields for each question
    rows = []
    for idx, content in enumerate(data.values(), start=1):
        r = content.get("response_dict", {})
        row = {"Question no": idx}
        for title, keys in column_keys:
            row[title] = next((r[key] for key in keys if key in r), None)
        rows.append(row)

    # Step 3: Convert to DataFrame (explicit columns keep the header even when
    # the input holds no questions)
    df = pd.DataFrame(rows, columns=["Question no"] + [title for title, _ in column_keys])

    # Step 4: Save as CSV or Excel
    if output_format == "csv":
        df.to_csv(output_file_path, index=False)
    else:
        df.to_excel(output_file_path, index=False)

    print(f"File saved to {output_file_path}")


In [9]:
# Export one evaluated-question file as a CSV table written to the working directory.
convert_json_to_table(
    json_file_path=r"Data\Evaluated_questions\GPT4.1\O1\O1_GPT4.1_questions_20_sample_updated_v5_v1.json",
    output_file_path=r"O1_GPT4.1_questions_20_sample_updated_v5_v1_GPT4.1_v2.csv",
    output_format="csv",
)

File saved to O1_GPT4.1_questions_20_sample_updated_v5_v1_GPT4.1_v2.csv
