# 4 Generating the Bank for the MicroTasks

To generate the bank for the microtasks I will use an API for an LLM.
The output will be two questions for each core course of each programme.

Basically, I will craft a prompt that uses the columns of `df_courses` to generate the microtasks.

We use two prompts: broad + disambiguation (Kenneth style).

In [None]:
from pathlib import Path
import pandas as pd
#!pip install --upgrade openai
import os, re
from openai import OpenAI
import json
import numpy as np
from tqdm import tqdm  # optional progress bar, pip install tqdm


## 1 Load the data and filter for max 2 courses

In [None]:
# directory with the silver data; the CSV lists the courses for which we generate tasks
silver = Path("../data_programmes_courses/silver")
courses_csv = silver / "df_courses_tasks_silver.csv"

df_courses_tasks = pd.read_csv(courses_csv, encoding="utf-8-sig")
print("The shape of the courses tasks dataframe is:", df_courses_tasks.shape)

# keep at most the first two rows of every programme, then renumber the index
first_two_per_programme = df_courses_tasks.groupby("programme_title").head(2)
df_courses_tasks = first_two_per_programme.reset_index(drop=True)
print("After keeping only first two courses from each programme the shape is:", df_courses_tasks.shape)

## 2. Set up OpenAI client 

In [None]:

# The API key is read from a plain text file inside the microtasks data folder
key_path = Path("../data_bank_microtasks") / "api_key.txt"

# Read the key and strip surrounding whitespace (spaces and newlines)
api_key = key_path.read_text(encoding="utf8").strip()

# Create the client using this key
client = OpenAI(api_key=api_key)

# Ask the API for the list of models; the result is only used by the
# optional loop below, which prints every model id when uncommented
models = client.models.list()
#for m in models.data:
#    print(m.id)

model_gpt = "gpt-4.1-mini"  # model name passed to the generation requests


In [None]:
# here we list all programme names in alphabetical order
# we drop missing titles first: a NaN title cannot be compared with strings by
# sorted(), and it can never match the equality filters on programme_title
programmes = sorted(df_courses_tasks["programme_title"].dropna().unique())
print("Number of programmes:", len(programmes))
print("First few programmes:", programmes[:5])

def build_programme_context(df_prog: pd.DataFrame, max_content_chars: int = 400) -> str:
    """
    With this function we build a short text snippet that describes one programme.

    The first line names the programme (taken from the first row). Then, for
    every row of df_prog, we add the course name, the objectives and a
    truncated piece of the course content. A field is skipped when it is
    missing, is not a string, or is blank after stripping.

    Parameters
    ----------
    df_prog : pd.DataFrame
        Rows of one programme. The column "programme_title" is required;
        "course_name", "course_objective" and "course_content" are optional.
    max_content_chars : int
        Maximum number of characters kept from each course content,
        used to control the prompt length.

    Returns
    -------
    str
        All context lines joined with newlines.

    Raises
    ------
    KeyError
        If df_prog has no "programme_title" column.
    IndexError
        If df_prog has no rows.
    """
    titles = df_prog["programme_title"]
    if titles.empty:
        # same exception type as the positional lookup would raise, clearer message
        raise IndexError("df_prog has no rows, so there is no programme to describe")

    # we take the programme name from the first row
    lines = [f"Programme: {titles.iloc[0]}"]

    # (column, label in the context, max characters kept; None keeps everything)
    fields = (
        ("course_name", "Course", None),
        ("course_objective", "Objectives", None),
        ("course_content", "Content snippet", max_content_chars),
    )

    # we loop over the courses of the programme
    for _, row in df_prog.iterrows():
        for column, label, limit in fields:
            value = row.get(column, "")
            # NaN cells are floats, so the type check also filters missing values
            if not isinstance(value, str):
                continue
            value = value.strip()
            if value:
                lines.append(f"{label}: {value[:limit]}")

    # we join all lines in a single string
    return "\n".join(lines)

# quick check of the context builder on the first programme of the list
test_prog = programmes[0]
is_test_prog = df_courses_tasks["programme_title"] == test_prog
df_test = df_courses_tasks[is_test_prog]
print("Context for test programme:")
print(build_programme_context(df_test))


In [None]:
def build_aptitude_prompt(programme_name: str,
                          programme_context: str,
                          task_type: str) -> str:
    """
    Assemble the prompt that asks the model for one aptitude microchallenge.

    The prompt is the concatenation of three parts, in this order:
    the general rules (which mention programme_name), the JSON field
    description of the requested task type, and the programme context
    followed by the output format instruction.

    task_type must be "classify", "fillblank", or "puzzle";
    any other value raises ValueError.
    """
    # JSON field description for each supported task type.
    # These are plain strings (not f-strings), so every brace in them,
    # including the doubled ones in the fillblank markers, is sent as written.
    classify_spec = """
Task type: classify.

You must return a JSON object with these fields:
question_code: string, for example "ancient-classify-001"
type: "classify"
signalType: "aptitude"
question: short instruction, for example "Sort these into Greek or Roman origin"
tiny_learn: list of exactly three strings
categories: list of category labels, for example ["Greek", "Roman"]
items: list of objects with fields:
    id: short id such as "a" or "b"
    text: short description of the item
    correctCategory: one of the category labels
hint: short sentence

The categories and items must make sense for this programme.
"""
    fillblank_spec = """
Task type: fillblank.

You must return a JSON object with these fields:
question_code: string, for example "ancient-fillblank-001"
type: "fillblank"
signalType: "aptitude"
question: short instruction, for example "Complete the text"
tiny_learn: list of exactly three strings
textWithBlanks: short text that contains markers {{0}}, {{1}}, etc
blanks: list of objects with fields:
    id: integer index such as 0 or 1
    correctWordId: id of the correct word from the words list
words: list of objects with fields:
    id: string, for example "sumerian"
    text: the word as it should appear in the text
hint: short sentence

The text must describe something that fits the programme context.
"""
    puzzle_spec = """
Task type: puzzle.

You must return a JSON object with these fields:
question_code: string, for example "ancient-puzzle-001"
type: "puzzle"
signalType: "aptitude"
question: short instruction, for example "Which explanation fits best"
tiny_learn: list of exactly three strings
puzzle: object that can include a stem or short description, for example:
    { "variant": "logic", "stem": "..." }
options: list of objects with fields:
    id: short letter id, "A", "B", "C", "D"
    value: the text of the option
correctAnswer: id of the correct option
hint: short sentence

The puzzle must be solvable using the context and standard school knowledge.
"""

    known_specs = (
        ("classify", classify_spec),
        ("fillblank", fillblank_spec),
        ("puzzle", puzzle_spec),
    )
    # pick the description whose name equals task_type, or None when unknown
    specific = next((spec for name, spec in known_specs if task_type == name), None)
    if specific is None:
        raise ValueError(f"Unsupported task_type: {task_type}")

    general_rules = f"""
You are creating an aptitude micro challenge for a high school student
who is curious about the bachelor programme {programme_name}.

You receive a short context that summarises real courses from this programme.
You must anchor the challenge in that context.
Do not invent random domains.
Stay close to the topics and methods in the context.

The task must test ability or reasoning, not personal preference.
Use instructions such as "Sort these", "Choose the correct", "Complete the text".

General requirements:
- tiny_learn must be a list of exactly three short bullet points.
  Each bullet explains one useful idea in simple language.
- hint must be one short sentence that nudges the student without giving the answer away.
- signalType must always be "aptitude".
"""

    closing = f"""
Programme context:
{programme_context}

Output format:
Return a single valid JSON object and nothing else.
"""

    return "".join((general_rules, specific, closing))




In [None]:
import json
import re

def _extract_json_object(raw: str) -> dict:
    """Parse the span from the first "{" to the last "}" of raw as JSON."""
    start = raw.find("{")
    end = raw.rfind("}") + 1

    # end <= start also covers a stray "}" that appears before the first "{"
    if start == -1 or end <= start:
        raise ValueError("We did not find any JSON object in the model output")

    return json.loads(raw[start:end])


def _normalise_tiny_learn(value) -> list:
    """Return value as a list of exactly three bullets (truncated or padded)."""
    if value is None:
        # a JSON null must not turn into the literal bullet "None"
        bullets = []
    elif isinstance(value, list):
        bullets = list(value)
    else:
        bullets = [str(value)]

    bullets = bullets[:3]
    bullets += ["Extra note about the concept."] * (3 - len(bullets))
    return bullets


def generate_aptitude_task_for_programme(programme_name: str,
                                         programme_context: str,
                                         task_type: str) -> dict:
    """
    With this function we call the model one time and return one aptitude task
    parsed as a Python dict.

    The prompt comes from build_aptitude_prompt. In the returned dict we
    overwrite "signalType" with "aptitude" and "type" with task_type, and we
    force "tiny_learn" to be a list of exactly three entries.

    Raises
    ------
    ValueError
        If task_type is not supported, or if the model output contains no
        JSON object. Invalid JSON raises json.JSONDecodeError, which is a
        subclass of ValueError.
    """
    # here we build the full prompt, including programme context and task type
    prompt = build_aptitude_prompt(programme_name, programme_context, task_type)

    # here we call the model
    response = client.responses.create(
        model=model_gpt,
        input=prompt,
        temperature=0.7,
    )

    # output_text is the SDK helper that joins the text of all output items,
    # so we do not depend on the first output item being a text message
    raw = response.output_text.strip()

    # small debug print in case something goes wrong
    # we can comment this out later
    print("RAW MODEL OUTPUT START")
    print(raw[:500])
    print("RAW MODEL OUTPUT END")

    # here we cut the JSON object out of the raw text and parse it
    task = _extract_json_object(raw)

    # here we enforce signalType and type fields from our side
    task["signalType"] = "aptitude"
    task["type"] = task_type

    # here we normalise tiny_learn to three bullets
    task["tiny_learn"] = _normalise_tiny_learn(task.get("tiny_learn", []))

    return task




In [None]:
# smoke test: one model call that creates a classify task for the first programme
test_prog = programmes[0]
df_test = df_courses_tasks.loc[df_courses_tasks["programme_title"] == test_prog]
ctx = build_programme_context(df_test)

test_task = generate_aptitude_task_for_programme(test_prog, ctx, "classify")

# we print the parsed task, then look at the tiny_learn bullets on their own
print(json.dumps(test_task, indent=2, ensure_ascii=False))
print("tiny_learn:", test_task.get("tiny_learn"))
print("Number of bullets:", len(test_task.get("tiny_learn", [])))



In [None]:
from pathlib import Path
import json
from tqdm import tqdm
import random

# here we decide which aptitude task types we want for each programme
# we can change this list later if we add more types
task_types = ["classify", "fillblank", "puzzle"]

aptitude_bank = {}

# here we remember every failed generation as (programme, task type, error message)
# the inline prints are easy to miss between the raw output debug prints
failed_tasks = []

# here we loop over all programmes and create aptitude tasks
for prog in tqdm(programmes, desc="Generating aptitude microchallenges"):
    # we filter the two core courses for this programme
    df_prog = df_courses_tasks[df_courses_tasks["programme_title"] == prog]

    # we build the short context snippet for the programme
    ctx = build_programme_context(df_prog)

    tasks_for_prog = []

    # here we generate one task for each type in task_types
    for task_type in task_types:
        try:
            task = generate_aptitude_task_for_programme(
                programme_name=prog,
                programme_context=ctx,
                task_type=task_type,
            )
        except Exception as e:
            # best effort on purpose: one failed call must not stop the whole run
            # we print the problem and keep going with the next type or programme
            print(f"Problem for programme {prog} task type {task_type}: {e}")
            failed_tasks.append((prog, task_type, str(e)))
        else:
            tasks_for_prog.append(task)

    # we store the aptitude tasks for this programme
    aptitude_bank[prog] = {
        "aptitude": tasks_for_prog
    }

# here we summarise the failures so incomplete programmes are easy to spot
if failed_tasks:
    print(f"{len(failed_tasks)} task(s) could not be generated:")
    for failed_prog, failed_type, failed_message in failed_tasks:
        print(f"  {failed_prog} / {failed_type}: {failed_message}")

# here we show the tasks of the first programme, if there is any programme
if programmes:
    print("Example programme:", programmes[0])
    print(json.dumps(aptitude_bank[programmes[0]]["aptitude"], indent=2, ensure_ascii=False))

# ================================================================
# merge the generated aptitude tasks into the bank stored on disk
# ================================================================

data_dir = Path("../data_bank_microtasks")
data_dir.mkdir(parents=True, exist_ok=True)

# the bank saved earlier, if any, is the starting point of the merge
base_bank_path = data_dir / "microtasks_bank.json"

if not base_bank_path.exists():
    # without a previous bank we start from an empty dict
    microtasks_bank = {}
    print("No existing microtasks_bank.json found, we start from an empty bank")
else:
    with open(base_bank_path, "r", encoding="utf-8") as f:
        microtasks_bank = json.load(f)
    print("Loaded existing microtasks_bank.json")

# every programme gets its aptitude list created or replaced;
# setdefault adds the programme entry when the bank does not have it yet
for prog, block in aptitude_bank.items():
    microtasks_bank.setdefault(prog, {})["aptitude"] = block["aptitude"]

# ================================================================
# write the merged bank to its own file
# ================================================================

full_bank_path = data_dir / "microtasks_bank_full.json"

with open(full_bank_path, "w", encoding="utf-8") as f:
    json.dump(microtasks_bank, f, ensure_ascii=False, indent=2)

print("Saved full microtasks bank to:", full_bank_path)
print("Number of programmes in the full bank:", len(microtasks_bank))

# quick look at the blocks stored for the first programme
# (personality shows up here only if the loaded bank already had it)
sample_prog = programmes[0]
print("Sample programme:", sample_prog)
print("Keys for this programme:", microtasks_bank.get(sample_prog, {}).keys())


In [None]:
from pathlib import Path
import json

import copy

# here we set the directory where we store all banks
data_dir = Path("../data_bank_microtasks")
data_dir.mkdir(parents=True, exist_ok=True)

# here we define the paths of the two source jsons
# NOTE(review): no visible cell writes microchallenges_bank_aptitude.json,
# confirm that the file exists before running this cell
personality_path = data_dir / "microtasks.json"
aptitude_path = data_dir / "microchallenges_bank_aptitude.json"

# here we load the personality bank
with open(personality_path, "r", encoding="utf-8") as f:
    personality_bank = json.load(f)

print("We loaded the personality bank with programmes:", len(personality_bank))

# here we load the aptitude bank
# this rebinds aptitude_bank, so a bank generated earlier in the session is replaced
with open(aptitude_path, "r", encoding="utf-8") as f:
    aptitude_bank = json.load(f)

print("We loaded the aptitude bank with programmes:", len(aptitude_bank))

# here we start from the personality bank as base
# we take a deep copy: a shallow copy shares the per-programme dicts, so the
# merge below would also add the aptitude lists to personality_bank
full_bank = copy.deepcopy(personality_bank)

# here we merge aptitude blocks into the full bank
for prog, block in aptitude_bank.items():
    # we make sure there is a dict for this programme, then we set its
    # aptitude list (empty when the aptitude bank has none for it)
    full_bank.setdefault(prog, {})["aptitude"] = block.get("aptitude", [])

# here we save the full merged bank
# the target is microtasks_bank_full.json, an existing file with that name is overwritten
full_bank_path = data_dir / "microtasks_bank_full.json"

with open(full_bank_path, "w", encoding="utf-8") as f:
    json.dump(full_bank, f, ensure_ascii=False, indent=2)

print("We saved the full bank to:", full_bank_path)

# here we quickly inspect one programme to see the keys
sample_prog = "Ancient Studies"
if sample_prog in full_bank:
    print("Sample programme:", sample_prog)
    print("Blocks for this programme:", list(full_bank[sample_prog].keys()))
else:
    # the message follows sample_prog, so it stays correct if the sample changes
    print(f"Warning, {sample_prog} is not in the merged bank")
