### Import important libraries

In [1]:
#Import important libraries
import ast
import json
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn as sk
import torch

from dotenv import load_dotenv
from openai import OpenAI
from together import Together

### Import original PSLE dataset

In [2]:
# Load the generated question set. The encoding is stated explicitly so the
# parsed text does not depend on the platform's default locale encoding.
with open("./dataset/generated_questions.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)

In [12]:
def flatten_dict_to_depth(d, max_depth, separator='_'):
    """Flatten nested dictionaries into one level, down to a depth limit.

    Args:
        d: The dictionary to flatten.
        max_depth: A nested dict is expanded only while the depth of the dict
            that contains it is below this value (the top level has depth 0).
            With ``max_depth <= 0`` nothing is expanded.
        separator: String placed between a parent key and a child key.

    Returns:
        A new dict. Top-level keys are kept unchanged; keys that come from an
        expanded dict are strings of the form ``parent<separator>child``.
        Dicts nested deeper than the limit are stored as values untouched.
        An expanded dict that is empty contributes no entries.
    """
    # A unique sentinel marks "no parent yet". Testing the parent key for
    # truthiness instead would drop the prefix for falsy keys such as "" or 0.
    no_parent = object()
    flat = {}

    def _walk(node, parent_key, depth):
        for key, value in node.items():
            joined = key if parent_key is no_parent else f"{parent_key}{separator}{key}"
            if isinstance(value, dict) and depth < max_depth:
                _walk(value, joined, depth + 1)
            else:
                flat[joined] = value

    _walk(d, no_parent, 0)
    return flat

def get_max_depth(d):
    """Return how many levels of dictionaries are nested inside ``d``.

    Args:
        d: Any object.

    Returns:
        0 when ``d`` is not a dict or is an empty dict; otherwise 1 plus the
        largest depth found among the values of ``d`` that are dicts (a dict
        with no dict values therefore has depth 1).
    """
    if not isinstance(d, dict) or not d:
        return 0
    # `default=0` covers the case where no value is a dict.
    return 1 + max(
        (get_max_depth(v) for v in d.values() if isinstance(v, dict)),
        default=0,
    )

In [13]:
# Flatten the dataset, using its measured nesting depth minus one as the limit.
dataset_flat = flatten_dict_to_depth(dataset, get_max_depth(dataset) - 1, separator='_')
# NOTE(review): requires a top-level 'gpt41' key in the dataset -- confirm this
# matches the file loaded above.
dataset_gpt41 = dataset['gpt41']
# NOTE(review): `kc_df` is not defined in any cell visible in this notebook, so
# this line raises NameError on a fresh kernel. It groups the values of the
# 6th column by the values of the 5th column into lists; presumably `kc_df` is
# a pandas DataFrame -- confirm where it should be loaded from.
result = kc_df.groupby(kc_df.columns[4])[kc_df.columns[5]].apply(list).to_dict()
print(result.keys())

# Initialize keys and token

## OpenRouter with OpenAI wrapper

In [5]:
# Read the OpenRouter API key from the environment (load_dotenv() also picks
# up a local .env file).
load_dotenv()
TOKEN = os.getenv("AStarPrivate")
# An empty value is as unusable as a missing one, so reject both here instead
# of failing later inside the first API request.
if not TOKEN:
    raise RuntimeError("Token parsing failed: environment variable 'AStarPrivate' is missing or empty")

# OpenRouter is accessed through the OpenAI client by overriding the base URL.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=TOKEN,
)

# Model identifiers used by the loops below.
models = ["openai/gpt-4.1", "anthropic/claude-3.7-sonnet", "deepseek/deepseek-r1", 
          "microsoft/phi-4", "qwen/qwen3-14b", 
          "meta-llama/llama-4-scout"]
models_free = ["microsoft/phi-4-reasoning-plus:free", "deepseek/deepseek-r1-distill-qwen-32b:free", "meta-llama/llama-3.3-8b-instruct:free",
              "qwen/qwen3-32b:free"]
# Used by the loops whenever the selected model is None.
model_default = "deepseek/deepseek-r1-distill-qwen-32b:free"
model = None

# Main LLM solving PSLE questions loop (correct solutions)

This code lets each LLM solve the PSLE questions and stores the answers in the ```solution``` folder.

## OpenRouter Loop

In [54]:
# For every selected model: send each dataset question together with the
# model-specific prompt templates, parse the dict found in the reply, and
# write all parsed replies to one JSON file per model.
for model in models[0:1]:
    # Resolve the model name and its filesystem-safe form *before* anything
    # else: the prompt files below are located through `model_path`.
    model_name = model if model is not None else model_default
    model_path = model_name.replace("/","_")
    dataset_length = len(dataset)

    print("Currently working with model: ", model)

    all_outputs_dct = {}    # kcID -> parsed reply
    all_outputs_text = {}   # kcID -> raw reply text
    errors = {}             # kcID -> raw reply text that could not be parsed

    # The templates depend only on the model, so they are read once per model.
    # A missing file keeps the original best-effort fallback (empty prompt)
    # but is now reported instead of being silently ignored.
    prompt_templates = {}
    for prompt_kind in ("system", "user"):
        prompt_file = f"./prompt/correct solution/{model_path}_{prompt_kind}_prompt.txt"
        try:
            with open(prompt_file, "r", encoding="utf-8") as file:
                prompt_templates[prompt_kind] = file.read()
        except OSError as e:
            print(f"WARNING: could not read '{prompt_file}' ({e}); using an empty {prompt_kind} prompt")
            prompt_templates[prompt_kind] = ""
    system_prompt = prompt_templates["system"]
    user_prompt = prompt_templates["user"]

    for kc_index, kcID in enumerate(dataset, start=1):
        info = {
            "question" : dataset[kcID]["Question"],
            "answer" : None,
            "kc": None
        }

        prompt = [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role" : "user",
                "content" : user_prompt.format(question = info["question"])
            }
        ]

        request_kwargs = {"model": model_name, "messages": prompt, "temperature": 0.5}
        # Only this model is given an explicit completion-token limit.
        if model_name == "deepseek/deepseek-r1-distill-qwen-14b":
            request_kwargs["max_tokens"] = 30000

        try:
            response = client.chat.completions.create(**request_kwargs)

            message = response.choices[0].message
            if message is None or message.content is None:
                print(f"No valid message returned for question: {info['question']}")
                continue

            response_text = message.content.strip()
            all_outputs_text[kcID] = response_text

            # Greedy match from the first "{" to the last "}" so that nested
            # braces inside the reply stay in one piece.
            match = re.search(r'\{.*\}', response_text, re.DOTALL)
            if match is None:
                errors[kcID] = response_text
                print("NO JSON FORMAT OUTPUT FOUND! ERROR!")
                print(response_text)
                continue

            try:
                response_dct = ast.literal_eval(match.group(0))
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                errors[kcID] = response_text
                print("Found error at", kcID, "error: ", e)
                print(response_text)
                continue

            all_outputs_dct[kcID] = response_dct
            print(f"Knowledge component {kcID} ({kc_index}/{dataset_length}) loaded successfully")

        except Exception as e:
            # Keep going with the next question; one failed request must not
            # discard the answers collected so far.
            print(f"Error during API call for KC: {kcID} - {e}")

    with open(f'./solution/correct solution/td_questions/{model_path}_solution.json', 'w') as file:
        json.dump(all_outputs_dct, file, indent=4)

Currently working with model:  openai/gpt-4.1
Knowledge component O1NR (1/23) loaded successfully
Knowledge component O2NA (2/23) loaded successfully
Knowledge component O2NE (3/23) loaded successfully
Knowledge component O2NR (4/23) loaded successfully
Knowledge component O2SP (5/23) loaded successfully
Knowledge component P2FF (6/23) loaded successfully
Knowledge component P3FE (7/23) loaded successfully
Knowledge component P3MM (8/23) loaded successfully
Knowledge component P3WA (9/23) loaded successfully
Knowledge component P3WM (10/23) loaded successfully
Knowledge component P4DA (11/23) loaded successfully
Knowledge component P4DD (12/23) loaded successfully
Knowledge component P4DM (13/23) loaded successfully
Knowledge component P4FA (14/23) loaded successfully
Knowledge component P4FF (15/23) loaded successfully
Knowledge component P4FM (16/23) loaded successfully
Knowledge component P4WF (17/23) loaded successfully
Knowledge component P4WN (18/23) loaded successfully
Knowledge

In [56]:
# Manual re-save of the parsed solutions currently held in `all_outputs_dct`,
# written under the name of the model stored in `model_path`.
solution_file = f'./solution/correct solution/td_questions/{model_path}_solution.json'
with open(solution_file, mode='w') as file:
    json.dump(all_outputs_dct, fp=file, indent=4)

In [42]:
# Scratch cell: dump the string `a` to ./testing.txt for manual inspection.
# NOTE(review): `a` is not assigned in any cell visible in this notebook, so
# this raises NameError on a fresh kernel -- confirm where it should come from.
with open("./testing.txt","w") as file:
    file.write(a)

In [53]:
def process_latex_string(s):
    r"""Normalise backslashes in a string and convert literal ``\n`` escapes.

    Two replacements are applied in order:

    1. every pair of backslashes (``\\``) collapses to a single backslash;
    2. every backslash followed by the letter ``n`` becomes two backslashes
       (``\\``), the LaTeX line-break command.

    Real newline characters are left untouched.

    NOTE(review): step 2 matches any text starting with ``\n``, so LaTeX
    commands such as ``\neq`` or ``\nabla`` are rewritten as well -- confirm
    the inputs never contain them.

    Args:
        s: The string to process.

    Returns:
        The processed string.
    """
    s = s.replace('\\\\', '\\')   # Normalize double backslashes
    s = s.replace('\\n', r'\\')   # Replace \n with LaTeX newline command
    return s

# Scratch cell: run `a` through process_latex_string and overwrite ./testing.txt.
# NOTE(review): `a` must already exist in the kernel (it is not assigned in any
# visible cell). Because `a` is rebound to its processed form, re-running this
# cell processes already-processed text again.
a = process_latex_string(a)

with open("./testing.txt","w") as file:
    file.write(a)

## Amazon Bedrock loop

# Main LLM evaluating given solutions loop

This code lets the ```deepseek/deepseek-r1``` model evaluate the performance of the other models.

In the future, this code will be extended so that the models evaluate one another.

## Helper function to get evaluation summary

The summary scores are calculated as follows:
```given_info_score, kc_score, final_ans_score, grammar_score``` are each calculated as the ```total number of YES``` answers divided by the ```total number of questions```.

```understand_score, fluency_score``` are each calculated as the average of the scores given across all the questions.

All of the scores are then scaled so that the maximum score is 100.

In [8]:
# Get evaluation score for models over multiple question
def get_evaluation_score(evaluation):
    """Aggregate per-question gradings into summary scores on a 0-100 scale.

    Each value of ``evaluation`` is expected to be a mapping with the keys
    'given_info_correct', 'kc_correct', 'final_ans_correct', 'grammar_correct'
    (counted as a "yes" only when equal to the string 'y') and 'understand',
    'fluency' (converted with ``int``; the final division by 5 treats 5 as the
    top mark).

    An item that lacks a key or whose numeric fields cannot be converted is
    reported, skipped as a whole, and does not count towards any score.

    Args:
        evaluation: Mapping of question id -> grading mapping.

    Returns:
        ``None`` when no item could be evaluated; otherwise the tuple
        ``(given_info_score, kc_score, final_ans_score, understand_score,
        fluency_score, grammar_score, error_flag)`` where ``error_flag`` is
        True if at least one item was skipped.
    """
    yes_no_fields = ('given_info_correct', 'kc_correct', 'final_ans_correct', 'grammar_correct')
    yes_counts = dict.fromkeys(yes_no_fields, 0)
    understand_total, fluency_total = 0, 0
    evaluated_evaluations = 0
    error_flag = False

    for item, graded in evaluation.items():
        try:
            # Read and convert every field before touching any counter, so a
            # malformed item can never be counted in only some of the scores.
            answers = {field: graded[field] for field in yes_no_fields}
            understand = int(graded['understand'])
            fluency = int(graded['fluency'])
        except (KeyError, TypeError, ValueError) as e:
            print(f"Failed to get evaluation score for item {item}. Error message: {e}")
            print(f"Item content: {evaluation[item]}")
            error_flag = True
            continue

        evaluated_evaluations += 1
        for field, answer in answers.items():
            if answer == 'y':
                yes_counts[field] += 1
        understand_total += understand
        fluency_total += fluency

    if evaluated_evaluations == 0:
        return None

    def _scaled(total):
        # Share (or mean) per evaluated question, expressed out of 100.
        return total * 100 / evaluated_evaluations

    given_info_score = _scaled(yes_counts['given_info_correct'])
    kc_score = _scaled(yes_counts['kc_correct'])
    final_ans_score = _scaled(yes_counts['final_ans_correct'])
    grammar_score = _scaled(yes_counts['grammar_correct'])
    # The 1-5 ratings are divided by 5 so that a mean of 5 maps to 100.
    understand_score = _scaled(understand_total) / 5
    fluency_score = _scaled(fluency_total) / 5
    return given_info_score, kc_score, final_ans_score, understand_score, fluency_score, grammar_score, error_flag

## Main loop

In [38]:
# Model that grades the other models' answers, and the OpenRouter client
# (OpenAI-compatible endpoint) through which it is called.
model_evaluator = "deepseek/deepseek-r1"

evaluator = OpenAI(api_key=TOKEN, base_url="https://openrouter.ai/api/v1")

In [40]:
all_evaluations_summary = {}

# For every selected model: load its saved solutions, ask the evaluator model
# to grade each answered question, and save the parsed gradings (plus any
# unparseable replies) to JSON.
for model in models[:1]:
    print("Currently evaluating model: ", model)

    model_path = model.replace("/","_")
    model_evaluator_path = model_evaluator.replace("/","_")

    with open(f'./solution/{model_path}_solution.json', 'r') as file:
        all_outputs = json.load(file)

    evaluation = {}   # question number -> parsed grading
    errors = {}       # question number -> raw reply that could not be parsed
    sample_shown = False

    # NOTE(review): this assumes the dataset and the solution file are keyed
    # "1".."N", whereas the solving loop in this notebook iterates over the
    # dataset's own keys -- confirm which dataset this cell is meant for.
    for i in range(1, len(dataset.keys())+1):
        info = {
            "question" : dataset[str(i)],
            "answer" : all_outputs.get(str(i), None),
            "kc": None
        }

        if info["answer"] is None:
            continue
    
        prompt = {
            "role": "user",
            "content": f""" You are an expert educator and your role is to grade the student's answer. 
            You are given the question, the student's answer, and you need to grade them based on these rubrics:
    
            -Task-oriented dimensions:
                •Given information is correct: y/n
                •Knowledge component (KC) applied is correct: y/n
                •Final answer is correct: y/n
                
    
            -Linguistic dimension:
                •Understandable (1 to 5)
                •Grammatically correct (y/n)
                •Fluency, clarity, conciseness (1 to 5)
    
            The question is: {info["question"]}
    
            The answer is: {info["answer"]}
    
            Your final response should be in the json format as below, without any other comments, use strings 'y' or 'n' for y/n:
            {{'given_info_correct' : ,
                'kc_correct' : ,
                'final_ans_correct' : ,
                'understand' : ,
                'grammar_correct' : ,
                'fluency' :
            }}
    
    """
        }

        # A failed request skips this question instead of aborting the run and
        # losing every grading collected so far.
        try:
            response = evaluator.chat.completions.create(
                    model=model_evaluator,
                    messages=[prompt],
                    temperature=0.5
                )
            message = response.choices[0].message
        except Exception as e:
            print(f"Error during API call for question {i} - {e}")
            continue

        if message is None or message.content is None:
            print(f"No valid message returned for question: {info['question']}")
            continue

        response_text = message.content.strip()

        # The expected grading dict is flat, so the first "{...}" span without
        # an inner "}" is taken as the answer.
        match = re.search(r'\{[^}]*\}', response_text)
        if match is None:
            errors[i] = response_text
            print("NO JSON FORMAT OUTPUT FOUND! ERROR!")
            print(response_text)
            continue

        try:
            evaluation[i] = ast.literal_eval(match.group(0))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            errors[i] = response_text
            print("Found error at", i, "error: ", e)
            continue

        print("Question ", i, " loaded successfully")

        # Show one parsed grading as a sanity check. Using the first success
        # (rather than question 1 specifically) cannot fail with KeyError when
        # question 1 was skipped or unparseable.
        if not sample_shown:
            print(evaluation[i], type(evaluation[i]))
            sample_shown = True

    with open(f'./evaluation/correct solution/{model_evaluator_path}_evaluating_{model_path}.json', 'w') as file:
        json.dump(evaluation, file, indent=4)

    if errors:
        with open(f'./error/{model_evaluator_path}_evaluating_{model_path}.json', 'w') as file:
            json.dump(errors, file, indent=4)

  '''


Currently evaluating model:  openai/gpt-4.1
Question  1  loaded successfully
{'given_info_correct': 'y', 'kc_correct': 'y', 'final_ans_correct': 'y', 'understand': '5', 'grammar_correct': 'y', 'fluency': '5'} <class 'dict'>
Question  2  loaded successfully
Question  3  loaded successfully


  '''


KeyboardInterrupt: 

## Model evaluation summary

In [18]:
# Summarise every evaluation file of the correct-solution run into
# `all_evaluations_summary` (file name -> dict of scores).
all_evaluations_summary = {}

folder_path = Path("./evaluation/correct solution")

# glob() yields files in an arbitrary order; sorting keeps the log and the
# key order of the summary stable between runs.
for json_file in sorted(folder_path.glob("*.json")):
    evaluation_name = json_file.name

    with open(json_file, 'r') as f:
        try:
            evaluation = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Failed to parse {json_file.name}: {e}")
            # Skip the file: scoring would otherwise reuse the previous
            # file's data (or fail on an undefined name for the first file).
            continue

    evaluation_summary = get_evaluation_score(evaluation)
    # get_evaluation_score returns None when no item could be scored.
    if evaluation_summary is None:
        print(f"WARNING: {evaluation_name} contains no usable evaluation and was skipped")
        continue

    given_info_score, kc_score, final_ans_score, understand_score, fluency_score, grammar_score, error_flag = evaluation_summary

    if error_flag:
        print(f"WARNING: {evaluation_name} contains erronous evaluation")
    else:
        print(f"{evaluation_name} analyzed successfully")
    
    all_evaluations_summary[evaluation_name] = {"given_info_score" : given_info_score, "kc_score":kc_score, "final_ans_score":final_ans_score,
                                                "understand_score" : understand_score, "fluency_score":fluency_score, "grammar_score":grammar_score}

deepseek_deepseek-r1_evaluating_anthropic_claude-3.7-sonnet.json analyzed successfully
deepseek_deepseek-r1_evaluating_deepseek_deepseek-r1-distill-qwen-14b.json analyzed successfully
deepseek_deepseek-r1_evaluating_deepseek_deepseek-r1.json analyzed successfully
deepseek_deepseek-r1_evaluating_meta-llama_llama-3.3-70b-instruct.json analyzed successfully
deepseek_deepseek-r1_evaluating_meta-llama_llama-4-scout.json analyzed successfully
deepseek_deepseek-r1_evaluating_microsoft_phi-3.5-mini-128k-instruct.json analyzed successfully
deepseek_deepseek-r1_evaluating_microsoft_phi-4-reasoning-plus.json analyzed successfully
deepseek_deepseek-r1_evaluating_openai_chatgpt-4o-latest.json analyzed successfully
deepseek_deepseek-r1_evaluating_openai_gpt-4.1.json analyzed successfully
deepseek_deepseek-r1_evaluating_qwen_qwen3-14b.json analyzed successfully
Failed to get evaluation score for item 8. Error message: 'kc_correct'
Item content: {'given_info_correct': 'y', 'kc_c': 'y', 'final_ans_corr

In [19]:
# Persist the per-file score summary built in the previous cell.
summary_file = "./final_summary.json"
with open(summary_file, mode="w") as file:
    json.dump(all_evaluations_summary, fp=file, indent=4)

# Main LLM solving PSLE questions loop (wrong solutions) (in progress)

In [23]:
# Categories of mistakes, one per line.
type_of_mistakes = [
    "Arithmetic Operation Errors",
    "Decimal Place Value Errors",
    "Missing Key Information",
    "Overlooking Units and Conversion",
    "Common Formula Mix-ups",
    "Incorrect Formula Application",
    "Geometry and Measurement Mistakes",
    "Pattern Recognition Failures",
    "Algebraic Thinking Gaps",
    "Difficulty Translating Words to Mathematical Operations",
    "Multiple-Step Problem Confusion",
]

In [5]:
# For every selected model: ask it to produce a deliberately wrong,
# student-like solution for each question and save the raw replies per model.
# NOTE(review): the `models` list defined in this notebook has six entries, so
# `models[6:]` is empty and this loop does nothing on a fresh kernel -- confirm
# the intended slice.
for model in models[6:]:

    print("Currently working with model: ", model)

    # Resolve the model once so that a None entry falls back to the default
    # model for both the request and the output file name.
    model_name = model if model is not None else model_default
    model_path = model_name.replace("/","_")

    all_outputs = {}   # question number -> raw reply text

    for i in range(1, len(dataset.keys())+1):
        info = {
            "question" : dataset[str(i)],
            "answer" : None,
            "kc": None
        }
    
        prompt = {
            "role": "user",
            "content": f""" You are a Singaporean primary school student taking the Primary School Leaving Examination (PSLE).
            I will give you a question which you should try to mimic a student's common mistake. The criteria for a good "wrong" question are as follows:

            - Task-oriented dimensions (wrong solution):
                •Is wrong solution
                •Type of error is correct
                
            - Linguistic dimension:
                •Varied understandability for different solutions
                •Grammar not always correct
                •Fluency, clarity, conciseness not always met

            Your response should follow this structure:

            Solution:

            (Your solution)

            Error explanation:

            (Your explanation of why the above solution is wrong)
    
            The question is: {info["question"]}
    
    """
        }

        request_kwargs = {"model": model_name, "messages": [prompt], "temperature": 0.5}
        # Only this model is given an explicit completion-token limit.
        if model_name == 'deepseek/deepseek-r1-distill-qwen-14b':
            request_kwargs["max_tokens"] = 30000

        try:
            response = client.chat.completions.create(**request_kwargs)

            message = response.choices[0].message
            if message is None or message.content is None:
                print(f"No valid message returned for question: {info['question']}")
                continue

            response_text = message.content.strip()

            all_outputs[i] = response_text

            print("Question ", i, " loaded successfully")

        except Exception as e:
            # Keep going with the next question; one failed request must not
            # discard the replies collected so far.
            print(f"Error during API call for KC: {info['question']} - {e}")

    with open(f'./solution/{model_path}_wrong_solution.json', 'w') as file:
        json.dump(all_outputs, file, indent=4)

Currently working with model:  qwen/qwen3-235b-a22b
Question  1  loaded successfully
Question  2  loaded successfully
Question  3  loaded successfully
Question  4  loaded successfully
Question  5  loaded successfully
Question  6  loaded successfully
Question  7  loaded successfully
Question  8  loaded successfully
Question  9  loaded successfully
Question  10  loaded successfully
Question  11  loaded successfully
Question  12  loaded successfully
Question  13  loaded successfully
Question  14  loaded successfully
Question  15  loaded successfully
Question  16  loaded successfully
Question  17  loaded successfully
Question  18  loaded successfully
Question  19  loaded successfully
Question  20  loaded successfully
Question  21  loaded successfully
Question  22  loaded successfully
Question  23  loaded successfully
Question  24  loaded successfully
Currently working with model:  qwen/qwen3-14b
Question  1  loaded successfully
Question  2  loaded successfully
Question  3  loaded successfu

# Model evaluation for wrong questions

In [30]:
all_evaluations_summary = {}

# For every model: load its saved wrong solutions, ask the evaluator model to
# grade each one, and save the parsed gradings (plus any unparseable replies).
# NOTE(review): results are written to ./evaluation/, while the summary cell
# for wrong solutions reads ./evaluation/wrong solution/ -- confirm the
# intended folder.
for model in models:
    print("Currently evaluating model: ", model)

    model_path = model.replace("/","_")
    model_evaluator_path = model_evaluator.replace("/","_")

    with open(f'./solution/{model_path}_wrong_solution.json', 'r') as file:
        all_outputs = json.load(file)

    evaluation = {}   # question number -> parsed grading
    errors = {}       # question number -> raw reply that could not be parsed
    sample_shown = False

    for i in range(1, len(dataset.keys())+1):
        info = {
            "question" : dataset[str(i)],
            "answer" : all_outputs.get(str(i), None),
            "kc": None
        }

        if info["answer"] is None:
            continue
    
        prompt = {
            "role": "user",
            "content": f""" You are an expert educator and your role is to grade the student's answer. 
            You are given the question, the working example which consists of 2 parts: Solution, and error explanations (if it exists)
            You need to grade them based on these rubrics:
    
            -Solution rubrics:
                •Given information is correct: y/n
                •Knowledge component (KC) applied is correct: y/n
                •Final answer is correct: y/n

            -Error explanation rubrics:
                •Correct type of error: y/n
    
            -Linguistic dimension:
                •Understandable: 1 to 5
                •Grammatically correct: y/n
                •Fluency, clarity, conciseness: 1 to 5
    
            The question is: {info["question"]}
    
            The working example is: {info["answer"]}
    
            Your response should be in the json format as below, without any other comments, use strings 'y' or 'n' for yes/no:
            {{'given_info_correct' : ,
                'kc_correct' : ,
                'final_ans_correct' : ,
                'error_type_correct' : ,
                'understand' : ,
                'grammar_correct' : ,
                'fluency' :
            }}
                
    
    """
        }

        # A failed request skips this question instead of aborting the run and
        # losing every grading collected so far.
        try:
            response = evaluator.chat.completions.create(
                    model=model_evaluator,
                    messages=[prompt],
                    temperature=0.5
                )
            message = response.choices[0].message
        except Exception as e:
            print(f"Error during API call for question {i} - {e}")
            continue

        if message is None or message.content is None:
            print(f"No valid message returned for question: {info['question']}")
            continue

        response_text = message.content.strip()

        # Try the whole reply first; if the model wrapped the dict in extra
        # text, fall back to the first flat "{...}" span, as the
        # correct-solution evaluation cell does.
        candidates = [response_text]
        braces = re.search(r'\{[^}]*\}', response_text)
        if braces is not None and braces.group(0) != response_text:
            candidates.append(braces.group(0))

        parsed = False
        parse_error = None
        for candidate in candidates:
            try:
                response_dct = ast.literal_eval(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
                parse_error = e
            else:
                parsed = True
                break

        if not parsed:
            # Record the failure and move on. Storing `response_dct` here would
            # save the previous question's grading under this question.
            errors[i] = response_text
            print("Found error at", i, "error: ", parse_error)
            print(response_text)
            continue

        evaluation[i] = response_dct

        print("Question ", i, " loaded successfully")

        # Show one parsed grading as a sanity check (first success, so it
        # cannot fail when question 1 was skipped or unparseable).
        if not sample_shown:
            print(evaluation[i], type(evaluation[i]))
            sample_shown = True

    with open(f'./evaluation/{model_evaluator_path}_evaluating_wrong_{model_path}.json', 'w') as file:
        json.dump(evaluation, file, indent=4)

    if errors:
        with open(f'./error/{model_evaluator_path}_evaluating_wrong_{model_path}.json', 'w') as file:
            json.dump(errors, file, indent=4)

Currently evaluating model:  openai/chatgpt-4o-latest
Question  1  loaded successfully
{'given_info_correct': 'y', 'kc_correct': 'n', 'final_ans_correct': 'n', 'error_type_correct': 'y', 'understand': 5, 'grammar_correct': 'y', 'fluency': 5} <class 'dict'>
Question  2  loaded successfully
Question  3  loaded successfully
Question  4  loaded successfully
Question  5  loaded successfully
Question  6  loaded successfully
Question  7  loaded successfully
Question  8  loaded successfully
Question  9  loaded successfully
Question  10  loaded successfully
Question  11  loaded successfully
Question  12  loaded successfully
Question  13  loaded successfully
Question  14  loaded successfully
Question  15  loaded successfully
Question  16  loaded successfully
Question  17  loaded successfully
Question  18  loaded successfully
Question  19  loaded successfully
Question  20  loaded successfully
Question  21  loaded successfully
Question  22  loaded successfully
Question  23  loaded successfully
Que

In [9]:
# Summarise every evaluation file of the wrong-solution run into
# `all_evaluations_summary` (file name -> dict of scores).
# NOTE(review): the wrong-solution gradings are requested with an additional
# 'error_type_correct' field, which get_evaluation_score does not read, so it
# is absent from this summary -- confirm whether it should be reported.
all_evaluations_summary = {}

folder_path = Path("./evaluation/wrong solution")

# glob() yields files in an arbitrary order; sorting keeps the log and the
# key order of the summary stable between runs.
for json_file in sorted(folder_path.glob("*.json")):
    evaluation_name = json_file.name

    with open(json_file, 'r') as f:
        try:
            evaluation = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Failed to parse {json_file.name}: {e}")
            # Skip the file: scoring would otherwise reuse the previous
            # file's data (or fail on an undefined name for the first file).
            continue

    evaluation_summary = get_evaluation_score(evaluation)
    # get_evaluation_score returns None when no item could be scored.
    if evaluation_summary is None:
        print(f"WARNING: {evaluation_name} contains no usable evaluation and was skipped")
        continue

    given_info_score, kc_score, final_ans_score, understand_score, fluency_score, grammar_score, error_flag = evaluation_summary

    if error_flag:
        print(f"WARNING: {evaluation_name} contains erronous evaluation")
    else:
        print(f"{evaluation_name} analyzed successfully")
    
    all_evaluations_summary[evaluation_name] = {"given_info_score" : given_info_score, "kc_score":kc_score, "final_ans_score":final_ans_score,
                                                "understand_score" : understand_score, "fluency_score":fluency_score, "grammar_score":grammar_score}

openai_chatgpt-4o-latest_evaluating_wrong_anthropic_claude-3.7-sonnet.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_deepseek_deepseek-r1-distill-qwen-14b.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_deepseek_deepseek-r1.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_meta-llama_llama-3.3-70b-instruct.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_meta-llama_llama-4-scout.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_microsoft_phi-3.5-mini-128k-instruct.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_microsoft_phi-4-reasoning-plus.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_openai_chatgpt-4o-latest.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_qwen_qwen3-14b.json analyzed successfully
openai_chatgpt-4o-latest_evaluating_wrong_qwen_qwen3-235b-a22b.json analyzed successfully


In [10]:
# Persist the wrong-solution score summary built in the previous cell.
summary_file = "./final_wrong_summary.json"
with open(summary_file, mode="w") as file:
    json.dump(all_evaluations_summary, fp=file, indent=4)