In [1]:
import base64
import json
import os
import random
from pathlib import Path

import jiter
from openai import AzureOpenAI
from pydantic import BaseModel

from api_utils import *


# Generate MWPs and store them as JSON files


In [9]:
# Maps a short topic key to the longer description of that topic. The key is
# appended to generated question IDs; the description is what gets passed to
# the generator as the `topic` argument.
# NOTE(review): "stationary" in the Education entry is probably meant to be
# "stationery"; left untouched because the text is sent to the model as-is.
topic_descriptions = {
    "Food": "Food such as cooking, baking, food consumption, food sharing, food allocation, buying food, selling food etc.",
    "Education": "Education such as stationary, library, bookstores, classrooms, events at schools, school clubs, teaching and learning activities etc.",
    "Transportation": "Transportation such as traveling, commuting, shipping, trucking, freight, flights etc.",
    "Household Finance": "Household finance such as income, utility bills, money, interest, savings, instalment, mortgage, financial planning etc.",
    "Recreation": "Recreation such as sports, games, exercises, music, movie, dancing, painting, fishing and other recreation activities",
    "Farming": "Farming such as farming of crops, vegetables, fruits, nuts, livestock, aquaculture etc.",
    "Manufacturing": "Manufacturing such as manufacturing of textile, apparel, shoes, electronics, furniture, cars, toys, chemicals etc.",
    "Services": "Services such as installation, maintenance, repairing, cleaning, laundry, hotel, retail, e-commerce, streaming services, digital services etc."}

# Maps a short batch label to the JSON file holding the KC pairs of that batch.
# NOTE(review): the paths use backslashes as separators, so they only resolve
# on Windows.
batch_path_dict = {"P": r"Data\Final_kc_pairs\primary_kc_pairs_final.json",
                    "P3": r"Data\Final_kc_pairs\P3_kc_pairs_final.json",
                    "P4": r"Data\Final_kc_pairs\P4_kc_pairs_final.json",
                    "P5": r"Data\Final_kc_pairs\P5_kc_pairs_final.json",
                    "P6": r"Data\Final_kc_pairs\P6_kc_pairs_final.json",
                    "O": r"Data\Final_kc_pairs\secondary_kc_pairs_final.json",
                    "O1": r"Data\Final_kc_pairs\O1_kc_pairs_final.json",
                    "O2": r"Data\Final_kc_pairs\O2_kc_pairs_final.json",
                    "O3": r"Data\Final_kc_pairs\O3_kc_pairs_final.json",
                    "All": r"Data\Final_kc_pairs\all_kc_pairs_final.json",
                    "50": r"Data\Final_kc_pairs\50_hardest_kc_pairs_final.json"}
    

In [3]:
# System prompt (the "without solution" variant), read once and reused for
# every generation request.
sys_prompt_path = r"Prompts\GPT\generation_system_prompt_without_solution.txt"
with open(sys_prompt_path, "r", encoding="utf-8") as file:
    sys_prompt = file.read()
    
# User prompt template; it is later filled in with str.format using the
# fields kc1, kc2, topic and grade.
user_prompt_path = r"Prompts\GPT\generation_user_prompt.txt"
with open(user_prompt_path, "r", encoding="utf-8") as file:
    user_prompt_template = file.read()


In [4]:
# Client shared by every generation request. create_openai_client is not
# defined in this notebook (presumably it comes from api_utils — confirm).
openai_client = create_openai_client()
# Response schema handed to get_GPT_struct_response: a single string field
# holding the generated math word problem (MWP).
class MWP(BaseModel):
    # Text of the generated word problem.
    word_problem: str

In [5]:
def metadata(response_dict=None, **kwargs):
    """Bundle a model response with arbitrary descriptive fields.

    Returns a dict containing every keyword argument in call order, followed
    by a final ``'response_dict'`` entry holding ``response_dict``.
    """
    record = dict(kwargs)
    record['response_dict'] = response_dict
    return record

def GPT_question_generator(kc1, kc2, topic, grade, qid):
    """Generate one word problem for a KC pair and wrap it with its metadata.

    The user prompt template is filled with ``kc1``, ``kc2``, ``topic`` and
    ``grade`` and sent together with the system prompt. The reply is returned
    inside the dict built by ``metadata``, next to the KC names, topic, grade
    and ``qid``.
    """
    prompt_fields = {"kc1": kc1, "kc2": kc2, "topic": topic, "grade": grade}
    filled_prompt = user_prompt_template.format(**prompt_fields)
    reply = get_GPT_struct_response(openai_client, sys_prompt, filled_prompt, MWP)
    return metadata(
        response_dict=reply,
        Primary_kc=kc1,
        Secondary_kc=kc2,
        Topic=topic,
        Grade=grade,
        QID=qid,
    )

In [6]:
# Smoke test: generate a single question for one hand-picked KC pair and
# display the returned record.
response = GPT_question_generator(
    kc1="WHOLE NUMBERS | Multiplication | multiplication of numbers with 1-4 digits by number with 1 digit or of number with 1-3 digits by number with 2 digits",
    kc2="WHOLE NUMBERS | Subtraction | subtraction of numbers with 1-4 digits",
    topic="Money",
    grade="Primary 6",
    qid=123,
)
response

{'Primary_kc': 'WHOLE NUMBERS | Multiplication | multiplication of numbers with 1-4 digits by number with 1 digit or of number with 1-3 digits by number with 2 digits',
 'Secondary_kc': 'WHOLE NUMBERS | Subtraction | subtraction of numbers with 1-4 digits',
 'Topic': 'Money',
 'Grade': 'Primary 6',
 'QID': 123,
 'response_dict': {'status': 0,
  'word_problem': 'Jasmine is helping her family sell tickets for a school charity event. Each ticket costs \\textdollar12 and she manages to sell 27 tickets. Her school sets a target to collect \\textdollar350 more than what Jasmine collected from ticket sales. \n\n(a) How much money did Jasmine collect from selling the tickets?\n\n(b) How much money must her school collect in total to meet the target?\n\n(c) If the school has already collected \\textdollar500 in total, how much more do they need to reach their target amount?\n\nShow all your working clearly.',
  'response_time': 4.4020836353302}}

In [7]:
def batch_GPT_question_generator(input_path, output_path):
    """Generate one word problem per (KC pair, topic) and checkpoint them to JSON.

    Args:
        input_path: JSON file mapping a question ID to a dict that provides
            "primary_kc_name", "secondary_kc_name" and "primary_kc_grade"
            (missing fields default to an empty string).
        output_path: JSON file that accumulates the responses. If it already
            exists, its content is loaded first and extended.

    For every input entry, one question is requested for each topic in
    ``topic_descriptions``. The output file is rewritten after every response
    so that an interrupted run keeps what was generated so far.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            all_responses = json.load(f)
    else:
        all_responses = {}

    # Create the output folder before any request is sent; otherwise a missing
    # folder is only discovered by the first checkpoint write, after an API
    # call has already been made and its result is lost.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    for qid, data_dict in input_data.items():
        kc1 = data_dict.get("primary_kc_name", "")
        kc2 = data_dict.get("secondary_kc_name", "")
        grade = data_dict.get("primary_kc_grade", "")
        for topic_key, topic_description in topic_descriptions.items():
            # The ID is built from the input ID, the model tag and the topic
            # key, then passed through get_next_question_id (defined outside
            # this notebook) before being used as the output key.
            updated_qid = qid + "_GPT4.1_" + topic_key
            updated_qid = get_next_question_id(updated_qid)
            response = GPT_question_generator(kc1=kc1, kc2=kc2, topic=topic_description, grade=grade, qid=updated_qid)
            if response:
                all_responses[updated_qid] = response
                with open(output_path, "w", encoding="utf-8") as f:
                    json.dump(all_responses, f, indent=2)

In [10]:
def one_topic_GPT_question_generator(input_path, output_path, topic, with_solution=True):
    """Generate one word problem per KC pair for a single topic and checkpoint them.

    Args:
        input_path: JSON file mapping a question ID to a dict that provides
            "primary_kc_name", "secondary_kc_name" and "primary_kc_grade"
            (missing fields default to an empty string).
        output_path: JSON file that accumulates the responses. If it already
            exists, its content is loaded first and extended.
        topic: a key of ``topic_descriptions``.
        with_solution: when False, "_without_solution" is appended to the
            question ID. This flag only affects the ID; it does not select
            which prompt is used.

    Raises:
        KeyError: if ``topic`` is not a key of ``topic_descriptions``.

    The output file is rewritten after every response so that an interrupted
    run keeps what was generated so far.
    """
    # Reject an unknown topic before any file is read or request is sent.
    if topic not in topic_descriptions:
        raise KeyError(f"Unknown topic {topic!r}; expected one of {sorted(topic_descriptions)}")

    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            all_responses = json.load(f)
    else:
        all_responses = {}

    # Create the output folder before any request is sent; otherwise a missing
    # folder is only discovered by the first checkpoint write, after an API
    # call has already been made and its result is lost.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    for qid, data_dict in input_data.items():
        kc1 = data_dict.get("primary_kc_name", "")
        kc2 = data_dict.get("secondary_kc_name", "")
        grade = data_dict.get("primary_kc_grade", "")
        updated_qid = qid + "_GPT4.1_" + topic
        if not with_solution:
            updated_qid = updated_qid + "_without_solution"
        # get_next_question_id is defined outside this notebook; its result is
        # used as the output key.
        updated_qid = get_next_question_id(updated_qid)
        response = GPT_question_generator(kc1=kc1, kc2=kc2, topic=topic_descriptions[topic], grade=grade, qid=updated_qid)
        if response:
            all_responses[updated_qid] = response
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(all_responses, f, indent=2)

In [11]:
# Generate "Services" questions for the 50-hardest KC-pair batch, tagging the
# question IDs as "without solution".
one_topic_GPT_question_generator(
    input_path=batch_path_dict["50"],
    output_path=r"Data\Generated_questions\GPT4.1\50_hardest\50_hardest_services_without_solution_v1.json",
    topic="Services",
    with_solution=False,
)

# KC pairs split

In [None]:
import random
import json

In [6]:
def select_random_samples(sample_size, input_path, output_path, exclude_keys_path=None, seed=None):
    """Write a random subset of the entries of a JSON object to a new JSON file.

    Args:
        sample_size: number of entries to draw, without replacement.
        input_path: JSON file containing a dict.
        output_path: destination JSON file for the sampled entries.
        exclude_keys_path: optional JSON file; every key it contains (the keys
            of a dict, or the items of a list) is removed from the pool before
            sampling.
        seed: optional seed for a reproducible draw. When None, the
            module-level ``random`` state is used.

    Raises:
        ValueError: if fewer than ``sample_size`` entries remain after the
            exclusion.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)

    # Open the exclusion file only when one was given: open(None) raises
    # TypeError, which would make the documented default unusable.
    exclude_keys = set()
    if exclude_keys_path:
        with open(exclude_keys_path, "r", encoding="utf-8") as f:
            exclude_keys = set(json.load(f))

    candidates = [(key, value) for key, value in input_data.items() if key not in exclude_keys]
    if sample_size > len(candidates):
        raise ValueError(
            f"Cannot sample {sample_size} entries: only {len(candidates)} remain after exclusion"
        )

    rng = random.Random(seed) if seed is not None else random
    selected_data = dict(rng.sample(candidates, sample_size))
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(selected_data, f, indent=2)

In [34]:
# Draw 80 recreation MWPs, leaving out the ones listed in the 20-MWP file.
select_random_samples(
    sample_size=80,
    input_path=r"Data\Generated_questions\GPT4.1\All\recreation_v8_v1.json",
    output_path=r"Data\Generated_questions\GPT4.1\All\80_recreation_v8_v1.json",
    exclude_keys_path=r"Data\Generated_questions\GPT4.1\All\20_recreation_v8_v1.json",
)

In [10]:
def split_remaining_MWPs_into_three_parts(input_path, exclude_keys_path=None, seed=None):
    """Shuffle the entries of a JSON object and split them into three dicts.

    Args:
        input_path: JSON file containing a dict.
        exclude_keys_path: optional JSON file; every key it contains (the keys
            of a dict, or the items of a list) is dropped before splitting.
        seed: optional seed for a reproducible shuffle. When None, the
            module-level ``random`` state is used.

    Returns:
        A tuple ``(part1, part2, part3)`` of dicts that together hold every
        remaining entry exactly once. Part sizes differ by at most one.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)

    exclude_keys = set()
    if exclude_keys_path:
        with open(exclude_keys_path, "r", encoding="utf-8") as f:
            exclude_keys = set(json.load(f))

    items = [(key, value) for key, value in input_data.items() if key not in exclude_keys]
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(items)

    # Balanced split: the leftover of the division by three is spread one item
    # at a time over the last parts (76 -> 25, 25, 26 and 77 -> 25, 26, 26),
    # instead of being piled onto the third part.
    base, extra = divmod(len(items), 3)
    sizes = [base + (1 if position >= 3 - extra else 0) for position in range(3)]
    first_end = sizes[0]
    second_end = first_end + sizes[1]

    part1 = dict(items[:first_end])
    part2 = dict(items[first_end:second_end])
    part3 = dict(items[second_end:])
    return part1, part2, part3


In [None]:
# Split the recreation MWPs that are not in the 100-MWP file into three parts
# and store them under the names TD, Minh and Sarah.
part1, part2, part3 = split_remaining_MWPs_into_three_parts(
    r"Data\Generated_questions\GPT4.1\All\recreation_v8_v1.json",
    exclude_keys_path=r"Data\Generated_questions\GPT4.1\All\100_recreation_v8_v1.json",
)
print(len(part1), len(part2), len(part3))
dictionary = dict(TD=part1, Minh=part2, Sarah=part3)
with open(r"Data\Generated_questions\GPT4.1\All\recreation_v8_v1_split.json", "w", encoding="utf-8") as f:
    json.dump(dictionary, f, indent=2)
    

25 25 26


In [None]:
# Derive each person's KC pairs from the MWPs they hold in the split file:
# the KC-pair ID is the first two "_"-separated fields of the MWP ID.
input_path = r"Data\Generated_questions\GPT4.1\All\recreation\recreation_v8_v1_split.json"
with open(input_path, "r", encoding="utf-8") as f:
    input_data = json.load(f)
assigned_KCs = {}
for name, data in input_data.items():
    assigned_KCs[name] = ["_".join(mwp_id.split("_")[:2]) for mwp_id in data.keys()]
with open(r"Data\Generated_questions\GPT4.1\All\assigned_KCs.json", "w", encoding = "utf-8") as f:
     json.dump(assigned_KCs, f, indent=2, ensure_ascii=False)

In [None]:
# KC pairs of the 76 MWPs that were left over after the common evaluation
# round of 100 MWPs. These pairs are excluded from the pool when the
# remaining KC pairs are distributed.
alr_assign = ["P3-WNSub4d_P1-WNAdd2nd",
    "P5-DcDiv3dK_P4-DcRnd3d",
    "P5-DcDiv3dK_P4-DcAdd2nd",
    "P4-DcMul2d1d_P4-DcCnv2Fr",
    "P4-DcDiv2d1d_P4-DcCmp3d",
    "P3-WNMul3d1d_P1-WNCmp",
    "P6-FrDivPP_P5-FrCnv2Dc",
    "P4-DcSub2d_P4-DcRnd3d",
    "O3-MXMul_O3-MXAdd",
    "P6-PcFndWN_P1-WNAdd2nd",
    "P5-FrMulImIm_P2-FrSub2nd",
    "P6-AgRepLrEx_P6-AgSmpLrEx",
    "P4-DcDiv2d1d_P4-DcAdd2nd",
    "O1-PcFndRslt_P1-WNSub2nd",
    "O3-SPFndstd_O2-SPFndmean",
    "P5-DcMul3dK_P4-DcSub2nd",
    "P5-FrMulPIm_P3-FrSmp",
    "P5-PcRepWh_P1-WNDiv2nd",
    "P6-FrDivPN_P2-FrAdd2nd",
    "P5-FrMulPIm_P2-FrCmp",
    "O1-PcRepRvs_O1-PcCnv2Dc",
    "P4-DcAdd2d_P4-DcCmp3d",
    "P5-FrMulImIm_P3-FrSmp",
    "O2-RoRepDP_P1-WNMul2nd",
    "P4-WNDiv4d1d_P4-WNRnd5d",
    "P4-DcAdd2d_P4-DcRnd3d",
    "P6-FrDivPN_P3-FrSmp",
    "P5-DcDiv3dK_P4-DcCnv2Fr",
    "O3-MXMulSM_O3-MXSub",
    "P6-PcFndWN_P1-WNSub2nd",
    "P4-FrSubU12_P2-FrCmp",
    "P6-AgSlvLrN_P6-AgRepLrEx",
    "P5-RtFndR_P2-DcCnvD2N",
    "P5-FrMulPIm_P5-FrCnv2Dc",
    "O3-SPFndQtl_O3-SPFndIQR",
    "O3-SPMulProb_O2-SPRepPrSE",
    "O1-RoRepFr_P6-FrDiv2nd",
    "O2-RoRepIvP_P1-WNDiv2nd",
    "P3-WNDiv3d1d_P1-WNAdd2nd",
    "O2-AgSlvLr2v_O1-AgRepEq",
    "P5-FrMulMixN_P5-FrCnv2Dc",
    "P3-WNDiv3d1d_P1-WNMul2nd",
    "P6-PcFndChg_P1-WNSub2nd",
    "P3-FrAddRl12_P2-FrCmp",
    "P4-DcDiv2d1d_P4-DcSub2nd",
    "P5-RtFndR_P2-DcCnvN2D",
    "P6-PcFndChg_P1-WNMul2nd",
    "O3-STOprUn_O3-STOprIns",
    "P3-WNDivRmd3d_P1-WNAdd2nd",
    "O2-SPFndmdn_O2-SPFndmode",
    "P6-PcFndWN_P1-WNMul2nd",
    "O2-AgSlvIneq_O2-AgRepIneq",
    "O3-SPMulProb_O3-SPFndPrCE",
    "P6-FrDivPN_P2-FrSub2nd",
    "O1-PcRepRvs_O1-PcCnv2Fr",
    "P4-DcMul2d1d_P4-DcCmp3d",
    "O1-RoRepDc_P4-DcAdd2nd",
    "P5-FrMulImN_P2-FrAdd2nd",
    "O1-PcRep2q_O1-PcCnv2Fr",
    "P5-FrMulImIm_P5-FrCnv2Dc",
    "P5-FrMulImN_P5-FrCnv2Dc",
    "O1-AgRepExSq_O1-AgEvlEx",
    "P5-PcRepWh_P1-WNSub2nd",
    "P6-PcFndChg_P1-WNAdd2nd",
    "O1-RoRepFr_P5-FrMul2nd",
    "P5-FrSubMix_P5-FrCnv2Dc",
    "P5-DcDiv3dK_P4-DcSub2nd",
    "P6-PcFndWN_P1-WNDiv2nd",
    "P4-DcMul2d1d_P4-DcSub2nd",
    "O3-SPAddProb_O3-SPFndPrCE",
    "O1-PcFndRslt_P1-WNAdd2nd",
    "P3-WNMul3d1d_P1-WNSub2nd",
    "O2-SPFndmdn_O3-SPFndrng",
    "P5-DcMul3dK_P4-DcAdd2nd",
    "P6-FrDivPN_P5-FrMul2nd",
    "P5-DcDiv3dK_P4-DcCmp3d"]


In [None]:
# Distribute the KC pairs that are not in alr_assign among the three people,
# round-robin, up to each person's quota, and add them to assigned_KCs.json.
target_sizes = [24, 38, 38]
groups = [[] for _ in target_sizes]
counts = [0] * len(target_sizes)

with open(r"Data\Final_kc_pairs\all_kc_pairs_final.json", "r", encoding= "utf-8") as f:
    kc_pairs = json.load(f)
already_assigned = set(alr_assign)  # set lookup instead of scanning the list per pair
remaining_kc_pairs = [k for k in kc_pairs if k not in already_assigned]

# With more pairs than the quotas can hold, every group ends up full and the
# "skip full group" loop below would spin forever; fail loudly instead.
if len(remaining_kc_pairs) > sum(target_sizes):
    raise ValueError(
        f"{len(remaining_kc_pairs)} KC pairs remain but the quotas only hold {sum(target_sizes)}"
    )

# Index that cycles through the groups: 0 -> 1 -> 2 -> 0 ...
i = 0
for kc in remaining_kc_pairs:
    while counts[i] >= target_sizes[i]:
        i = (i + 1) % len(target_sizes)  # skip full group
    groups[i].append(kc)
    counts[i] += 1
    i = (i + 1) % len(target_sizes)  # move to next group

with open(r"Data\Generated_questions\GPT4.1\All\assigned_KCs.json", 'r', encoding = "utf_8") as f:
    assigned_KCs = json.load(f)
# Add only the pairs a person does not have yet, so running this cell again
# does not append the same pairs a second time.
for reviewer, group in zip(("TD", "Minh", "Sarah"), groups):
    known = set(assigned_KCs[reviewer])
    assigned_KCs[reviewer].extend(kc for kc in group if kc not in known)
with open(r"Data\Generated_questions\GPT4.1\All\assigned_KCs.json", 'w', encoding = "utf_8") as f:
    json.dump(assigned_KCs, f, indent=2, ensure_ascii=False)



In [None]:
# Route every services MWP to the person who holds its KC pair. The KC-pair ID
# is the first two "_"-separated fields of the MWP ID.
with open(r"Data\Generated_questions\GPT4.1\All\services\services_v8_v1.json", "r", encoding="utf-8") as f:
    MWPs = json.load(f)

services_v8_v1_split = {reviewer: {} for reviewer in ("TD", "Minh", "Sarah")}

for qid, data in MWPs.items():
    safe_qid = "_".join(qid.split("_")[:2])
    # The first match wins (TD, then Minh, then Sarah); an MWP whose KC pair
    # belongs to nobody is left out of the split.
    for reviewer in ("TD", "Minh", "Sarah"):
        if safe_qid in assigned_KCs[reviewer]:
            services_v8_v1_split[reviewer][qid] = data
            break

with open(r"Data\Generated_questions\GPT4.1\All\services\services_v8_v1_split.json", "w", encoding="utf-8") as f:
    json.dump(services_v8_v1_split, f, indent=2, ensure_ascii=False)

# Convert JSON files into TEX files

In [12]:
import json
from pathlib import Path
# Preamble shared by every generated .tex file (everything before \title).
_TEX_PREAMBLE = (
    "\\documentclass{article}\n"
    "\\usepackage[utf8]{inputenc}\n"
    "\\usepackage{amsmath}\n"
    "\\usepackage{amsfonts}\n"
    "\\usepackage{amssymb}\n"
    "\\usepackage{graphicx}\n"
    "\\usepackage{hyperref}\n"
)


def _escape_underscores(text):
    """Return ``text`` with every "_" written as "\\_" for use in LaTeX text mode."""
    # Undo existing escapes first so an already-escaped "\_" is not doubled.
    return str(text).replace("\\_", "_").replace("_", "\\_")


def _load_mwp_records(source):
    """Return the question dict read from a JSON path, or ``source`` itself if it is not a path."""
    if isinstance(source, (str, Path)):
        with open(source, "r", encoding="utf-8") as f:
            return json.load(f)
    return source


def _write_tex_document(records, output_path, title, include_question, include_solution):
    """Write one LaTeX article with a metadata list per question, plus the selected sections."""
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(_TEX_PREAMBLE)
        # Triple braces: "{{" and "}}" are literal braces, the inner pair is
        # the replacement field. "{ {title} }" would format a set literal and
        # put quotes around the title.
        f.write(f"\\title{{{_escape_underscores(title)}}}\n")
        f.write("\\author{Tien Dung Doan}\n")
        f.write("\\begin{document}\n")
        f.write("\\maketitle\n")
        for number, (qid, data) in enumerate(records.items(), start=1):
            # "response_dict" may be present but None; treat that as empty.
            response = data.get("response_dict") or {}

            f.write(f"\\section*{{Question {number}}}\n")

            # Metadata
            f.write("\\textbf{Metadata}\n\n")
            f.write("\\begin{itemize}\n")
            f.write(f"  \\item Question ID: {_escape_underscores(qid)}\n")
            f.write(f"  \\item Primary KC: {data.get('Primary_kc', '')}\n")
            f.write(f"  \\item Secondary KC: {data.get('Secondary_kc', '')}\n")
            f.write(f"  \\item Topic: {data.get('Topic', '')}\n")
            f.write(f"  \\item Grade: {data.get('Grade', '')}\n")
            f.write("\\end{itemize}\n\n")

            # Question
            if include_question:
                f.write("\\textbf{Question}\n\n")
                f.write(f"{response.get('word_problem', '')}\n\n")

            # Solution
            if include_solution:
                f.write("\\textbf{Solution}\n\n")
                f.write(f"{response.get('solution', '')}\n\n")

        f.write("\\end{document}\n")


def convert_json_into_tex(input_path, output_file, title="recreation_v8_v1"):
    """Write a .tex file holding metadata, question and solution for every MWP.

    Args:
        input_path: path to a JSON file of MWP records, or the dict itself.
        output_file: path of the .tex file to write.
        title: document title; underscores are escaped for LaTeX.

    Raises:
        FileNotFoundError: if ``input_path`` is a path that does not exist.
    """
    _write_tex_document(_load_mwp_records(input_path), output_file, title, True, True)


def convert_json_into_tex_question(input_path, output_path, title):
    """Write a .tex file holding metadata and question (no solution) for every MWP.

    Args:
        input_path: path to a JSON file of MWP records, or the dict itself.
        output_path: path of the .tex file to write.
        title: document title; underscores are escaped for LaTeX.

    Raises:
        FileNotFoundError: if ``input_path`` is a path that does not exist.
    """
    _write_tex_document(_load_mwp_records(input_path), output_path, title, True, False)


def convert_json_into_tex_solution(input_path, output_path, title):
    """Write a .tex file holding metadata and solution (no question) for every MWP.

    Args:
        input_path: path to a JSON file of MWP records, or the dict itself.
        output_path: path of the .tex file to write.
        title: document title; underscores are escaped for LaTeX.

    Raises:
        FileNotFoundError: if ``input_path`` is a path that does not exist.
    """
    _write_tex_document(_load_mwp_records(input_path), output_path, title, False, True)

        
           

In [None]:
# Render the full recreation set (questions and solutions) as one .tex file.
convert_json_into_tex(
    input_path=r"Data\Generated_questions\GPT4.1\All\recreation_v8_v1.json",
    output_file=r"Data\Generated_questions\GPT4.1\All\recreation_v8_v1.tex",
)

In [None]:
# Render the 50-hardest services questions (no solutions) as a .tex file.
convert_json_into_tex_question(
    input_path=r"Data\Generated_questions\GPT4.1\50_hardest\50_hardest_services_without_solution_v1.json",
    output_path=r"Data\Generated_questions\GPT4.1\50_hardest\50_hardest_services_without_solution_v1.tex",
    title="50 hardest services without solution v1",
)

In [None]:
# Write one question sheet and one solution sheet per person from the
# recreation split file.
input_path = r"Data\Generated_questions\GPT4.1\All\recreation_v8_v1_split.json"
with open(input_path, "r", encoding="utf-8") as f:
    input_data = json.load(f)
for name, data in input_data.items():
    convert_json_into_tex_question(
        data,
        fr"Data/Generated_questions/GPT4.1/All/{name}_question_recreation_v8_v1.tex",
        f"{name} Questions recreation v8 v1",
    )
    convert_json_into_tex_solution(
        data,
        fr"Data/Generated_questions/GPT4.1/All/{name}_solution_recreation_v8_v1.tex",
        f"{name} Solutions recreation v8 v1",
    )

In [None]:
# Write one question sheet and one solution sheet per person from the
# services split file.
input_path = r"Data\Generated_questions\GPT4.1\All\services\services_v8_v1_split.json"
with open(input_path, "r", encoding="utf-8") as f:
    input_data = json.load(f)
for name, data in input_data.items():
    convert_json_into_tex_question(
        data,
        fr"Data\Generated_questions\GPT4.1\All\services\{name}_question_services_v8_v1.tex",
        f"{name} Questions services v8 v1",
    )
    convert_json_into_tex_solution(
        data,
        fr"Data\Generated_questions\GPT4.1\All\services\{name}_solution_services_v8_v1.tex",
        f"{name} Solutions services v8 v1",
    )