### Import important libraries

In [11]:
#Import important libraries
import ast
import json
import os
import re
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import sklearn as sk
import torch
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from together import Together

from site_wrapper.openrouter import OpenRouter
from site_wrapper.platform_migration.api_utils import Azure
from site_wrapper.platform_migration.api_utils import Aws

file_id = "2025-06-09_14_16_30"

### Import original PSLE dataset

In [2]:
# Load the generated-question file selected by `file_id` into `dataset`.
questions_path = f"./dataset/{file_id}_generated_questions.json"
with open(questions_path, "r") as questions_file:
    dataset = json.load(questions_file)

# Initialize keys and token

In [3]:
# Models grouped by the wrapper that serves them (Azure / AWS / OpenRouter).
models_azure = [
    "openai/gpt-4.1",
    "meta-llama/llama-4-scout",
    "deepseek/deepseek-r1",
    "microsoft/phi-4",
]
models_aws = ["anthropic/claude-3.7-sonnet"]
models_open_router = ["qwen/qwen3-14b"]

# Every model the notebook can iterate over, in evaluation order.
models = models_azure + models_open_router + models_aws

# Notebook model id -> model/deployment name passed to the platform wrapper.
# NOTE(review): the "meta-llama/llama-4-scout" key maps to a Maverick name and
# the "anthropic/claude-3.7-sonnet" key maps to a claude-sonnet-4 id — confirm
# these substitutions are intentional.
models_platform_translation = {
    "openai/gpt-4.1": "gpt-4.1",
    "deepseek/deepseek-r1": "DeepSeek-R1-0528",
    "microsoft/phi-4": "Phi-4",
    "meta-llama/llama-4-scout": "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "qwen/qwen3-14b": "qwen/qwen3-14b",
    "anthropic/claude-3.7-sonnet": "apac.anthropic.claude-sonnet-4-20250514-v1:0",
}

# Free-tier model ids and the default pick among them.
models_free = [
    "microsoft/phi-4-reasoning-plus:free",
    "deepseek/deepseek-r1-distill-qwen-32b:free",
    "meta-llama/llama-3.3-8b-instruct:free",
    "qwen/qwen3-32b:free",
]
model_default = "deepseek/deepseek-r1-distill-qwen-32b:free"

# Placeholder; rebound by the `for model in models` loops below.
model = None

# Main LLM generating correct solution loop

This code lets each LLM solve the PSLE questions; the answers are then stored in the ```solution``` folder

In [18]:
# Load the taxonomy definition text that is injected into every user prompt.
with open('./dataset/taxonomy definitions/BLOOM.txt', 'r', encoding='utf-8', errors='replace') as f:
    taxonomy_definition = f.read()


def _read_prompt_template(path):
    """Return the prompt template stored at ``path``, or "" if it cannot be read.

    A missing or unreadable template is kept as a best-effort empty prompt,
    but a warning is printed so the fallback is no longer silent.
    """
    try:
        with open(path, "r") as prompt_file:
            return prompt_file.read()
    except (OSError, UnicodeError) as e:
        print(f"WARNING: could not read prompt template {path!r} ({e}); using an empty prompt")
        return ""


# For each model: ask it to solve each question, then parse the dict-formatted
# answer out of the reply. Results are collected in:
#   all_outputs_dct  - kcID -> parsed answer dict (successful parses only)
#   all_outputs_text - kcID -> raw response text (every response)
#   errors           - kcID -> raw response text that could not be parsed
for model in models[:1]:  # currently limited to the first model
    dataset_length = len(dataset)

    print("Currently working with model: ", model)
    model_path = model.replace("/","_")
    system_prompt_fpath = f"./prompt/correct solution/{model_path}_system_prompt.txt"
    user_prompt_fpath = f"./prompt/correct solution/{model_path}_user_prompt.txt"
    generated_solution_fpath = f"./solution/correct solution/dataset_{file_id}/{model_path}_solution.json"

    # The templates only depend on the model, so read them once per model
    # instead of once per question.
    system_prompt = _read_prompt_template(system_prompt_fpath)
    user_prompt = _read_prompt_template(user_prompt_fpath)

    all_outputs_dct = {}
    all_outputs_text = {}
    errors = {}

    # Currently limited to the first five questions; the progress denominator
    # is still the full dataset size.
    for kc_index, kcID in enumerate(list(dataset.keys())[:5], start=1):
        info = {
            "id" : dataset[kcID]["ID"],
            "question" : dataset[kcID]["Question"],
            "taxonomy definition" : taxonomy_definition,
            "answer" : None,
            "kc": None
        }

        this_system_prompt = system_prompt.format()
        this_user_prompt = user_prompt.format(question = info["question"], ID = info["id"], taxonomy_definition = info["taxonomy definition"])

        # Pick the platform wrapper that serves this model.
        if model in models_azure:
            wrapper = Azure()
        elif model in models_aws:
            wrapper = Aws()
        else:
            wrapper = OpenRouter()

        if model == "openai/gpt-4.1":
            message = wrapper.get_GPT_txt_response(this_system_prompt, this_user_prompt)
        else:
            message = wrapper.get_LLM_response(models_platform_translation[model], this_system_prompt, this_user_prompt)

        response_text = message.strip()
        # Drop the reasoning preamble, but only when the closing tag is really
        # there: partition() returns an empty tail when the separator is
        # missing, which used to throw the whole response away.
        if model == "deepseek/deepseek-r1" and "</think>" in response_text:
            response_text = response_text.partition("</think>")[2].strip()

        all_outputs_text[kcID] = response_text

        # Greedy match from the first "{" to the last "}" so nested dicts are kept.
        match = re.search(r'\{.*\}', response_text, re.DOTALL)
        if match is None:
            print("NO JSON FORMAT OUTPUT FOUND! ERROR!")
            print(response_text)
            errors[kcID] = response_text
            continue
        response_text_extracted = match.group(0)

        try:
            response_dct = ast.literal_eval(response_text_extracted)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            errors[kcID] = response_text
            print("Found error at", kcID, "error: ", e)
            print(response_text)
            continue

        all_outputs_dct[kcID] = response_dct

        # Only reached after a successful parse, so the dict printed here
        # always belongs to the current question.
        print(f"Knowledge component {kcID} ({kc_index}/{dataset_length}) loaded successfully")
        print(response_dct)

Currently working with model:  openai/gpt-4.1
Knowledge component P1N100 (1/32) loaded successfully
{'question': 'Sarah has 37 stickers and Ben has 48 stickers. How many stickers do they have altogether?', 'solution': {'Step 1': "We need to find the total number of stickers by adding Sarah's and Ben's stickers.", 'Step 2': 'Sarah has 37 stickers. Ben has 48 stickers.', 'Step 3': 'Add the two numbers together: \n\n\\[ 37 + 48 = 85 \\]', 'Step 4': 'So, Sarah and Ben have 85 stickers altogether.'}}
Knowledge component P1AddSub (2/32) loaded successfully
{'question': 'Amir has 8 toy cars. His friend gives him 6 more toy cars. How many toy cars does Amir have now?', 'solution': {'Step 1': 'First, we find out how many toy cars Amir starts with. Amir has 8 toy cars.', 'Step 2': 'Next, we see how many toy cars his friend gives to him. His friend gives him 6 more toy cars.', 'Step 3': 'To find the total number of toy cars Amir has now, we need to add the toy cars he started with and the toy car



In [6]:
# Display the current value of `message` for manual inspection.
message

{'response': '', 'status': -1, 'response_time': 2.7644898891448975}

In [19]:
# Display the parsed answers collected so far (kcID -> answer dict).
all_outputs_dct

{'P1N100': {'question': 'Sarah has 37 stickers and Ben has 48 stickers. How many stickers do they have altogether?',
  'solution': {'Step 1': "We need to find the total number of stickers by adding Sarah's and Ben's stickers.",
   'Step 2': 'Sarah has 37 stickers. Ben has 48 stickers.',
   'Step 3': 'Add the two numbers together: \n\n\\[ 37 + 48 = 85 \\]',
   'Step 4': 'So, Sarah and Ben have 85 stickers altogether.'}},
 'P1AddSub': {'question': 'Amir has 8 toy cars. His friend gives him 6 more toy cars. How many toy cars does Amir have now?',
  'solution': {'Step 1': 'First, we find out how many toy cars Amir starts with. Amir has 8 toy cars.',
   'Step 2': 'Next, we see how many toy cars his friend gives to him. His friend gives him 6 more toy cars.',
   'Step 3': 'To find the total number of toy cars Amir has now, we need to add the toy cars he started with and the toy cars his friend gave him.',
   'Step 4': 'We use the addition method: 8 + 6.',
   'Step 5': '8 + 6 = 14.',
   'Step

In [56]:
# Write the parsed answers of the last processed model to the td_questions folder.
td_solution_path = f'./solution/correct solution/td_questions/{model_path}_solution.json'
with open(td_solution_path, 'w') as solution_file:
    json.dump(all_outputs_dct, solution_file, indent=4)

In [118]:
def process_latex_string(s):
    """Return ``s`` with a fixed sequence of text substitutions applied.

    The substitutions run in this order:
      1. every newline gets a space inserted in front of it,
      2. every ``$`` becomes ``\\$``,
      3. every ``%`` becomes ``\\%``,
      4. every double backslash is collapsed to a single backslash.
    """
    substitutions = (
        ('\n', ' \n'),
        ('$', '\\$'),
        ('%', '\\%'),
        ('\\\\', '\\'),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return s

In [123]:
# Preview the processed solution text stored under the 'P3WA' key.
p3wa_solution = all_outputs_dct['P3WA']['solution']
print(process_latex_string(p3wa_solution))

Let us solve the problem step by step. 
 
First, Aisha has 256 stickers. 
 
She buys another 137 stickers. To find the total number of stickers she has after buying more, we add: 
 
\[ 
256 + 137 = 393 
\] 
 
Now, Aisha gives 89 stickers to her friend. To find out how many stickers she has left, we subtract: 
 
\[ 
393 - 89 = 304 
\] 
 
So, Aisha has 304 stickers now.


In [7]:
# Write the parsed answers again, this time with `file_id` in the file name.
dated_solution_path = f'./solution/correct solution/td_questions/{model_path}_{file_id}_solution.json'
with open(dated_solution_path, 'w') as solution_file:
    json.dump(all_outputs_dct, solution_file, indent=4)

## Amazon Bedrock loop

# Main LLM evaluating given solutions loop

This code lets the ```deepseek/deepseek-r1``` model evaluate the performance of the other models.

In the future, this code will be extended so that every model evaluates the others

## Helper function to get evaluation summary

The summary scores are calculated as follows:
```given_info_score, kc_score, final_ans_score, grammar_score``` are each calculated as the number of ```y``` answers divided by the number of questions that were evaluated successfully

```understand_score, fluency_score``` are each calculated as the average rating (1 to 5) given across those questions

All scores are then scaled so that the maximum score is 100

In [8]:
# Get evaluation score for models over multiple question
def get_evaluation_score(evaluation):
    """Aggregate per-question rubric results into summary scores out of 100.

    Args:
        evaluation: mapping of question id -> rubric dict with the keys
            'given_info_correct', 'kc_correct', 'final_ans_correct',
            'grammar_correct' (counted when equal to 'y') and
            'understand', 'fluency' (values convertible with ``int``).

    Returns:
        ``None`` when no item could be evaluated; otherwise the tuple
        ``(given_info_score, kc_score, final_ans_score, understand_score,
        fluency_score, grammar_score, error_flag)``. The y/n scores are the
        percentage of evaluated items answered 'y'; the two rating scores are
        the mean rating divided by 5 and scaled to 100. ``error_flag`` is True
        when at least one item was malformed and skipped.

    A malformed item (missing key, non-numeric rating, non-dict value) is
    reported and skipped as a whole: it contributes to neither the totals
    nor the denominator.
    """
    yes_no_fields = ('given_info_correct', 'kc_correct', 'final_ans_correct', 'grammar_correct')
    rating_fields = ('understand', 'fluency')
    totals = dict.fromkeys(yes_no_fields + rating_fields, 0)
    evaluated_evaluations = 0
    error_flag = False

    for item in evaluation.keys():
        try:
            record = evaluation[item]
            # Parse every field before touching the totals so that a failure
            # half-way through cannot leave a partially counted item behind.
            parsed = {field: 1 if record[field] == 'y' else 0 for field in yes_no_fields}
            parsed.update({field: int(record[field]) for field in rating_fields})
        except (KeyError, TypeError, ValueError) as e:
            print(f"Failed to get evaluation score for item {item}. Error message: {e}")
            print(f"Item content: {evaluation[item]}")
            error_flag = True
            continue

        evaluated_evaluations += 1
        for field, value in parsed.items():
            totals[field] += value

    if evaluated_evaluations == 0:
        return None

    final_ans_score, kc_score, given_info_score, understand_score, fluency_score, grammar_score = [
        totals[field]*100/evaluated_evaluations
        for field in ('final_ans_correct', 'kc_correct', 'given_info_correct',
                      'understand', 'fluency', 'grammar_correct')
    ]
    # Ratings are divided by 5 so that a top rating maps to 100.
    understand_score /= 5
    fluency_score /= 5
    return given_info_score, kc_score, final_ans_score, understand_score, fluency_score, grammar_score, error_flag

## Main loop

In [38]:
# Model id used as the judge in the evaluation loop.
model_evaluator = "deepseek/deepseek-r1"

# OpenAI-compatible client pointed at the OpenRouter endpoint.
# NOTE(review): TOKEN is not assigned in any visible cell — confirm where it
# is loaded from before running this on a fresh kernel.
evaluator = OpenAI(api_key=TOKEN, base_url="https://openrouter.ai/api/v1")

In [40]:
all_evaluations_summary = {}

# For each model: load its saved solutions, ask the evaluator model to grade
# every answer against the rubric, and write the parsed grades (and any
# unparseable responses) to disk.
for model in models[:1]:  # currently limited to the first model
    print("Currently evaluating model: ", model)

    model_path = model.replace("/","_")

    # NOTE(review): the generation cells write under "./solution/correct solution/...";
    # confirm this path is where the solutions to grade actually live.
    with open(f'./solution/{model_path}_solution.json', 'r') as file:
        all_outputs = json.load(file)

    evaluation = {}
    errors = {}

    # NOTE(review): questions and answers are looked up by 1-based position
    # rendered as a string, while the generation loop iterates the dataset keys
    # directly — confirm both files really use "1", "2", ... as keys.
    for i in range(1, len(dataset.keys())+1):
        info = {
            "question" : dataset[str(i)],
            "answer" : all_outputs.get(str(i), None),
            "kc": None
        }

        # Nothing to grade when the model produced no answer for this question.
        if info["answer"] is None:
            continue
    
        prompt = {
            "role": "user",
            "content": f""" You are an expert educator and your role is to grade the student's answer. 
            You are given the question, the student's answer, and you need to grade them based on these rubrics:
    
            -Task-oriented dimensions:
                •Given information is correct: y/n
                •Knowledge component (KC) applied is correct: y/n
                •Final answer is correct: y/n
                
    
            -Linguistic dimension:
                •Understandable (1 to 5)
                •Grammatically correct (y/n)
                •Fluency, clarity, conciseness (1 to 5)
    
            The question is: {info["question"]}
    
            The answer is: {info["answer"]}
    
            Your final response should be in the json format as below, without any other comments, use strings 'y' or 'n' for y/n:
            {{'given_info_correct' : ,
                'kc_correct' : ,
                'final_ans_correct' : ,
                'understand' : ,
                'grammar_correct' : ,
                'fluency' :
            }}
    
    """
        }
        
        response = evaluator.chat.completions.create(
                model=model_evaluator,
                messages=[prompt],
                temperature=0.5
            )
    
        message = response.choices[0].message
        if message is None or message.content is None:
            # Single quotes inside the f-string keep this valid before Python 3.12.
            print(f"No valid message returned for question: {info['question']}")
            continue
    
        response_text = message.content.strip()

        # The rubric dict is flat, so match the first brace pair with no "}" inside.
        match = re.search(r'\{[^}]*\}', response_text)
        if match is None:
            print("NO JSON FORMAT OUTPUT FOUND! ERROR!")
            print(response_text)
            errors[i] = response_text
            continue
        response_text_extracted = match.group(0)
    
        try:
            response_dct = ast.literal_eval(response_text_extracted)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            errors[i] = response_text
            print("Found error at", i, "error: ", e)
            continue

        evaluation[i] = response_dct

        # Only reached after a successful parse, so evaluation[i] exists below.
        print("Question ", i, " loaded successfully")
    
        if i == 1:
            print(evaluation[i], type(evaluation[i]))

    model_evaluator_path = model_evaluator.replace("/","_")

    with open(f'./evaluation/correct solution/{model_evaluator_path}_evaluating_{model_path}.json', 'w') as file:
        json.dump(evaluation, file, indent=4)

    if errors:
        with open(f'./error/{model_evaluator_path}_evaluating_{model_path}.json', 'w') as file:
            json.dump(errors, file, indent=4)

  '''


Currently evaluating model:  openai/gpt-4.1
Question  1  loaded successfully
{'given_info_correct': 'y', 'kc_correct': 'y', 'final_ans_correct': 'y', 'understand': '5', 'grammar_correct': 'y', 'fluency': '5'} <class 'dict'>
Question  2  loaded successfully
Question  3  loaded successfully


  '''


KeyboardInterrupt: 

## Model evaluation summary

In [18]:
# Build one summary entry per evaluation file found in the evaluation folder.
all_evaluations_summary = {}

folder_path = Path("./evaluation/correct solution")

# glob() yields files in arbitrary order; sort for a reproducible report.
for json_file in sorted(folder_path.glob("*.json")):
    evaluation_name = json_file.name

    try:
        with open(json_file, 'r') as f:
            evaluation = json.load(f)
    except json.JSONDecodeError as e:
        # Skip the file: carrying on would reuse the previous file's
        # `evaluation` (or hit a NameError on the first file).
        print(f"Failed to parse {json_file.name}: {e}")
        continue

    evaluation_summary = get_evaluation_score(evaluation)
    if evaluation_summary is None:
        # get_evaluation_score returns None when no item could be scored.
        print(f"WARNING: {evaluation_name} contains no usable evaluation, skipped")
        continue

    given_info_score, kc_score, final_ans_score, understand_score, fluency_score, grammar_score, error_flag = evaluation_summary

    if error_flag:
        print(f"WARNING: {evaluation_name} contains erronous evaluation")
    else:
        print(f"{evaluation_name} analyzed successfully")

    all_evaluations_summary[evaluation_name] = {"given_info_score" : given_info_score, "kc_score":kc_score, "final_ans_score":final_ans_score,
                                                "understand_score" : understand_score, "fluency_score":fluency_score, "grammar_score":grammar_score}

deepseek_deepseek-r1_evaluating_anthropic_claude-3.7-sonnet.json analyzed successfully
deepseek_deepseek-r1_evaluating_deepseek_deepseek-r1-distill-qwen-14b.json analyzed successfully
deepseek_deepseek-r1_evaluating_deepseek_deepseek-r1.json analyzed successfully
deepseek_deepseek-r1_evaluating_meta-llama_llama-3.3-70b-instruct.json analyzed successfully
deepseek_deepseek-r1_evaluating_meta-llama_llama-4-scout.json analyzed successfully
deepseek_deepseek-r1_evaluating_microsoft_phi-3.5-mini-128k-instruct.json analyzed successfully
deepseek_deepseek-r1_evaluating_microsoft_phi-4-reasoning-plus.json analyzed successfully
deepseek_deepseek-r1_evaluating_openai_chatgpt-4o-latest.json analyzed successfully
deepseek_deepseek-r1_evaluating_openai_gpt-4.1.json analyzed successfully
deepseek_deepseek-r1_evaluating_qwen_qwen3-14b.json analyzed successfully
Failed to get evaluation score for item 8. Error message: 'kc_correct'
Item content: {'given_info_correct': 'y', 'kc_c': 'y', 'final_ans_corr

In [19]:
# Persist the per-file summary scores collected above.
summary_path = "./final_summary.json"
with open(summary_path, "w") as summary_file:
    json.dump(all_evaluations_summary, summary_file, indent=4)