# Validating all aptitude tasks

In [1]:
# We import the standard library modules we need, plus pandas for the error report
from pathlib import Path
import json
import re
from collections import defaultdict

import pandas as pd

## 1. Load the microtasks bank from disk

In [None]:


# Location of the microtasks bank; the path is relative to the current
# working directory.
data_dir = Path("../data_bank_microtasks")
bank_path = data_dir.joinpath("microtasks_bank_full.json")  # change this if your file is named differently

# Read the file as UTF-8 text and parse the whole JSON document in one go.
bank = json.loads(bank_path.read_text(encoding="utf-8"))

print("Programmes in bank:", len(bank))

Programmes in bank: 14


## 2. Helper functions that validate each task type 

In [None]:

# Each function receives one task dict and returns a list of error strings.

def validate_common_fields(task: dict) -> list[str]:
    """
    Validate the fields that every aptitude task shares, whatever its type.

    Checks performed on ``task``:
      * ``question_code`` and ``question`` are strings with non-whitespace content
      * ``signalType`` equals ``"aptitude"``
      * ``tiny_learn`` is a list of exactly three such strings

    Returns the problems found as a list of messages (empty when none).
    """
    problems = []

    def has_text(value) -> bool:
        # True only for strings that still contain something after stripping.
        return isinstance(value, str) and bool(value.strip())

    if not has_text(task.get("question_code")):
        problems.append("question_code must be a non empty string")

    signal = task.get("signalType")
    if signal != "aptitude":
        problems.append("signalType must be 'aptitude'")

    if not has_text(task.get("question")):
        problems.append("question must be a non empty string")

    snippets = task.get("tiny_learn")
    if isinstance(snippets, list):
        count = len(snippets)
        if count != 3:
            problems.append(f"tiny_learn must have length three, found {count}")
        # Entries are still checked one by one when the length is wrong.
        problems.extend(
            f"tiny_learn[{index}] must be a non empty string"
            for index, entry in enumerate(snippets)
            if not has_text(entry)
        )
    else:
        problems.append("tiny_learn must be a list of three strings")

    return problems


def validate_classify(task: dict) -> list[str]:
    """
    Validate the structure of a classify task.

    ``categories`` must be a list of at least two labels and ``items`` a non
    empty list; if either container is unusable the function reports that and
    stops. Each item must be an object with a unique non-blank string ``id``,
    a non-blank ``text`` and a ``correctCategory`` taken from ``categories``.

    Returns the problems found as a list of messages (empty when none).
    """
    labels = task.get("categories")
    if not (isinstance(labels, list) and len(labels) >= 2):
        return ["classify: categories must be a list with at least two labels"]

    problems = []

    # One message covers all bad labels, so stop at the first one found.
    for label in labels:
        if not (isinstance(label, str) and label.strip()):
            problems.append("classify: all category labels must be non empty strings")
            break

    entries = task.get("items")
    if not isinstance(entries, list) or not entries:
        problems.append("classify: items must be a non empty list")
        return problems

    known_ids = set()
    for entry in entries:
        if not isinstance(entry, dict):
            problems.append("classify: each item must be an object")
            continue

        item_id = entry.get("id")
        if isinstance(item_id, str) and item_id.strip():
            if item_id in known_ids:
                problems.append(f"classify: duplicated item id {item_id}")
            known_ids.add(item_id)
        else:
            problems.append("classify: each item id must be a non empty string")

        body = entry.get("text")
        if not (isinstance(body, str) and body.strip()):
            problems.append(f"classify: item {item_id} has empty text")

        target = entry.get("correctCategory")
        if target not in labels:
            problems.append(
                f"classify: item {item_id} has correctCategory '{target}' not in categories"
            )

    return problems


def validate_fillblank(task: dict) -> list[str]:
    """
    Validate the structure of a fillblank task.

    Checks, in order:
      * ``textWithBlanks`` is a non-blank string (otherwise nothing else runs)
      * ``blanks`` and ``words`` are non empty lists (each stops the check
        when it fails)
      * every blank is an object with an integer ``id`` and a
        ``correctWordId`` that matches the ``id`` of one of the words
      * the blank ids equal the distinct ``{{n}}`` markers found in the text

    Malformed ids of any JSON type (lists, objects, booleans, null) are
    reported as errors rather than raising.

    Returns the problems found as a list of messages (empty when none).
    """
    errors = []

    text = task.get("textWithBlanks")
    if not isinstance(text, str) or not text.strip():
        errors.append("fillblank: textWithBlanks must be a non empty string")
        return errors

    # Markers look like {{0}}, {{1}}, ...; a marker repeated in the text
    # counts once.
    marker_ids_int = sorted({int(m) for m in re.findall(r"\{\{(\d+)\}\}", text)})

    blanks = task.get("blanks")
    if not isinstance(blanks, list) or not blanks:
        errors.append("fillblank: blanks must be a non empty list")
        return errors

    words = task.get("words")
    if not isinstance(words, list) or not words:
        errors.append("fillblank: words must be a non empty list")
        return errors

    # Kept as a list, not a set: membership then relies on == only, so an id
    # that is a JSON list or object cannot raise "unhashable type".
    # Words without an id are left out so that a blank lacking correctWordId
    # cannot accidentally "match" them through None.
    word_ids = [
        w["id"] for w in words if isinstance(w, dict) and w.get("id") is not None
    ]

    blank_ids = []
    for b in blanks:
        if not isinstance(b, dict):
            errors.append("fillblank: each blank must be an object")
            continue

        bid = b.get("id")
        wid = b.get("correctWordId")

        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly or they would pass as ids 1/0.
        if isinstance(bid, int) and not isinstance(bid, bool):
            blank_ids.append(bid)
        else:
            errors.append("fillblank: blank id must be an integer")

        if wid is None or wid not in word_ids:
            errors.append(
                f"fillblank: blank {bid} has correctWordId '{wid}' not in words list"
            )

    # Blank ids must match the markers in the text. The comparison is skipped
    # when the text contains no {{n}} marker at all.
    if marker_ids_int and sorted(blank_ids) != marker_ids_int:
        errors.append(
            f"fillblank: blank ids {sorted(blank_ids)} do not match markers {marker_ids_int}"
        )

    return errors


def validate_puzzle(task: dict) -> list[str]:
    """
    Validate the structure of a puzzle task.

    The ``puzzle`` field must be an object with a non-blank string
    ``variant``. ``options`` must be a list of at least two objects, each
    with a non-blank string ``id`` and a ``value`` key, and
    ``correctAnswer`` must be one of the valid option ids.

    Returns the problems found as a list of messages (empty when none).
    """
    problems = []

    spec = task.get("puzzle")
    if isinstance(spec, dict):
        kind = spec.get("variant")
        if not (isinstance(kind, str) and kind.strip()):
            problems.append("puzzle: variant must be a non empty string")

        # Only these two variants get a shape check; any other variant value
        # is accepted without further inspection.
        if kind == "pattern":
            sequence = spec.get("sequence")
            if not isinstance(sequence, list) or not sequence:
                problems.append("puzzle: pattern variant must have non empty sequence list")
        elif kind == "balance":
            sides = (spec.get("left"), spec.get("right"))
            if not all(isinstance(side, list) for side in sides):
                problems.append("puzzle: balance variant must have left and right lists")
    else:
        problems.append("puzzle: puzzle field must be an object")

    choices = task.get("options")
    if not (isinstance(choices, list) and len(choices) >= 2):
        problems.append("puzzle: options must be a list with at least two entries")
        return problems

    option_ids = []
    for choice in choices:
        if not isinstance(choice, dict):
            problems.append("puzzle: each option must be an object")
            continue

        option_id = choice.get("id")
        if isinstance(option_id, str) and option_id.strip():
            option_ids.append(option_id)
        else:
            problems.append("puzzle: each option id must be a non empty string")

        if "value" not in choice:
            problems.append(f"puzzle: option {option_id} must have a value field")

    # Only ids that passed the check above count as valid answers.
    answer = task.get("correctAnswer")
    if answer not in option_ids:
        problems.append(
            f"puzzle: correctAnswer '{answer}' is not one of the option ids {option_ids}"
        )

    return problems


def validate_graph(task: dict) -> list[str]:
    """
    Validate the structure of a graph task.

    Checks, in order:
      * ``graphData`` is an object (otherwise nothing else is checked)
      * ``graphData.type`` is ``"bar"`` or ``"line"``
      * ``labels`` and ``values`` are lists of equal length
      * line graphs carry a string ``yAxisLabel``
      * ``clickableRegions`` is a non empty list of objects with non-blank
        string ids, and for line graphs each region has a ``dataIndex`` key
      * ``correctRegion`` is one of the valid region ids

    A ``correctRegion`` of any JSON type (including lists and objects) is
    reported as an error rather than raising.

    Returns the problems found as a list of messages (empty when none).
    """
    errors = []

    graph_data = task.get("graphData")
    if not isinstance(graph_data, dict):
        errors.append("graph: graphData must be an object")
        return errors

    gtype = graph_data.get("type")
    if gtype not in ("bar", "line"):
        errors.append("graph: graphData.type must be 'bar' or 'line'")

    labels = graph_data.get("labels")
    values = graph_data.get("values")
    if not isinstance(labels, list) or not isinstance(values, list):
        errors.append("graph: labels and values must be lists")
    elif len(labels) != len(values):
        errors.append(
            f"graph: labels and values must have same length, got {len(labels)} and {len(values)}"
        )

    is_line = gtype == "line"

    # Line graphs must carry a string yAxisLabel; other types are not checked.
    if is_line and not isinstance(graph_data.get("yAxisLabel"), str):
        errors.append("graph: line graph must have yAxisLabel as string")

    regions = task.get("clickableRegions")
    if not isinstance(regions, list) or not regions:
        errors.append("graph: clickableRegions must be a non empty list")
        return errors

    region_ids = set()
    for r in regions:
        if not isinstance(r, dict):
            errors.append("graph: each clickable region must be an object")
            continue

        rid = r.get("id")
        if not isinstance(rid, str) or not rid.strip():
            errors.append("graph: region id must be a non empty string")
        else:
            region_ids.add(rid)

        # Line graph regions must have a dataIndex key; its value is not
        # inspected.
        if is_line and "dataIndex" not in r:
            errors.append("graph: line graph region must have dataIndex field")

    correct_region = task.get("correctRegion")
    # region_ids holds only strings, so a non-string can never match. Testing
    # the type first also keeps unhashable values (JSON lists/objects) away
    # from the set lookup, which would raise TypeError.
    if not isinstance(correct_region, str) or correct_region not in region_ids:
        errors.append(
            f"graph: correctRegion '{correct_region}' is not one of {sorted(region_ids)}"
        )

    return errors


def validate_codeorder(task: dict) -> list[str]:
    """
    Validate the structure of a codeorder task.

    ``language`` must be ``"python"``. ``lines`` must be a list of at least
    three objects (otherwise the check stops there), each with a non-blank
    string ``id``, non-blank ``code`` and an integer ``correctPosition``.
    The integer positions collected must be exactly 1..k, where k is how
    many of them were collected. ``expectedOutput`` must be a non-blank
    string.

    Returns the problems found as a list of messages (empty when none).
    """
    problems = []

    language = task.get("language")
    if language != "python":
        problems.append(f"codeorder: language must be 'python', found {language}")

    rows = task.get("lines")
    if not (isinstance(rows, list) and len(rows) >= 3):
        problems.append("codeorder: lines must be a list with at least three entries")
        return problems

    slots = []
    for row in rows:
        if not isinstance(row, dict):
            problems.append("codeorder: each line must be an object")
            continue

        row_id = row.get("id")
        if not (isinstance(row_id, str) and row_id.strip()):
            problems.append("codeorder: each line id must be a non empty string")

        source = row.get("code")
        if not (isinstance(source, str) and source.strip()):
            problems.append(f"codeorder: line {row_id} must have non empty code")

        slot = row.get("correctPosition")
        if isinstance(slot, int):
            slots.append(slot)
        else:
            problems.append(f"codeorder: line {row_id} must have integer correctPosition")

    # The positions gathered above must form the run 1, 2, ..., k.
    if slots:
        slots.sort()
        wanted = list(range(1, len(slots) + 1))
        if slots != wanted:
            problems.append(
                f"codeorder: correctPosition values must be {wanted}, found {slots}"
            )

    output = task.get("expectedOutput")
    if not (isinstance(output, str) and output.strip()):
        problems.append("codeorder: expectedOutput must be a non empty string")

    return problems


## 3. Run the validators on every aptitude task and collect all errors.

In [6]:

all_errors = []

# Type-specific validator for each known value of a task's "type" field.
type_validators = {
    "classify": validate_classify,
    "fillblank": validate_fillblank,
    "puzzle": validate_puzzle,
    "graph": validate_graph,
    "codeorder": validate_codeorder,
}

for programme_title, block in bank.items():
    for task in block.get("aptitude", []):
        t_type = task.get("type")
        course_name = task.get("course_name", "")
        qcode = task.get("question_code", "")

        # The common checks always run first
        errs = validate_common_fields(task)

        # Then the type specific checks. Only a string can name a validator;
        # every other value is reported as an unknown type.
        checker = type_validators.get(t_type) if isinstance(t_type, str) else None
        if checker is None:
            errs.append(f"Unknown task type '{t_type}'")
        else:
            errs.extend(checker(task))

        # One row per message, tagged with where the task comes from
        all_errors.extend(
            {
                "programme_title": programme_title,
                "course_name": course_name,
                "question_code": qcode,
                "type": t_type,
                "error": msg,
            }
            for msg in errs
        )

In [7]:
# STEP 4
# Here we report how many aptitude tasks the bank holds and how many errors
# were collected, and we show the first twenty errors as a table if there are any.

task_counts = [len(entry.get("aptitude", [])) for entry in bank.values()]
total_aptitude = sum(task_counts)
print("Total aptitude tasks in bank:", total_aptitude)
print("Total validation errors:", len(all_errors))

if not all_errors:
    print("All aptitude tasks passed validation.")
else:
    df_err = pd.DataFrame(all_errors)
    print("\nFirst twenty errors:")
    # display is not imported in this file; presumably the notebook provides it
    display(df_err.head(20))

Total aptitude tasks in bank: 90
Total validation errors: 0
All aptitude tasks passed validation.
