In [1]:
import json
from math import exp
from collections import defaultdict, Counter


In [2]:
# Load the question bank from disk; `questions` is whatever is stored under
# the top-level "questions" key and is the object every later cell iterates.
with open("mixed_questions.json", "r", encoding="utf-8") as src:
    data = json.load(src)

questions = data["questions"]

In [3]:
# Tunable settings for the recency features.
lambda_decay = 3       # divisor used in exp(-gap / lambda_decay)
target_year = 2027     # year the gap is measured against (year to predict for)

In [4]:
# Compute the last-seen year per concept.

# concept_last_seen_year[cid] = latest year an exam question appeared in this concept
concept_last_seen_year = {}

for q in questions:
    # Only exam questions contribute to a concept's exam history
    if q.get("source", "").lower() != "exam":
        continue

    cid = q["concept_id"]

    # Use the existing last_seen_year if it holds a value, else fall back to 'year'.
    # dict.get() only applies its default when the key is ABSENT, so a key that is
    # present but null (or a null derived_features) must be handled explicitly;
    # otherwise the question would be dropped or raise instead of using 'year'.
    year = (q.get("derived_features") or {}).get("last_seen_year")
    if year is None:
        year = q.get("year")
    if year is None:
        # No usable year at all: this question cannot inform the concept's history
        continue

    previous = concept_last_seen_year.get(cid)
    concept_last_seen_year[cid] = year if previous is None else max(previous, year)

In [5]:
# Attach concept-level recency features to every question (exam or not).
for q in questions:
    cid = q["concept_id"]

    # Create the feature container the first time it is needed
    if "derived_features" not in q:
        q["derived_features"] = {}
    features = q["derived_features"]

    # Latest exam year recorded for this question's concept, if any
    last_exam_year = concept_last_seen_year.get(cid)

    if last_exam_year is None:
        # Concept never appeared in exams: no gap, zero recency weight
        features["gap_since_last_seen"] = None
        features["recency_decay"] = 0.0
        features["concept_last_seen_year"] = None
        continue

    years_elapsed = target_year - last_exam_year
    features["gap_since_last_seen"] = years_elapsed
    features["recency_decay"] = exp(-years_elapsed / lambda_decay)
    features["concept_last_seen_year"] = last_exam_year

    # Exam questions keep their own last_seen_year (defaulting to 'year' if the key is absent)
    is_exam = q.get("source", "").lower() == "exam"
    if is_exam:
        features["last_seen_year"] = features.get("last_seen_year", q.get("year"))



In [6]:
# Section frequency table (from exam questions only).
# cluster_section_counts[cid] = {"A": n, "B": n, "C": n}

cluster_section_counts = defaultdict(lambda: {"A": 0, "B": 0, "C": 0})

for q in questions:
    # Match the source case-insensitively and tolerate a missing "source" key,
    # the same way the recency and section-assignment cells do; a plain
    # q["source"] == "exam" would skip "Exam"/"EXAM" and raise on a missing key.
    if q.get("source", "").lower() == "exam":
        cid = q["concept_id"]
        sec = q["exam_meta"]["section"]
        # A section other than "A"/"B"/"C" raises KeyError here (inner dict is a plain dict)
        cluster_section_counts[cid][sec] += 1

In [7]:
# Convert counts -> probabilities (additive smoothing so no section gets probability 0)

alpha = 1.0  # smoothing strength added to each section's count

cluster_section_probs = {}

for cid, counts in cluster_section_counts.items():
    # Denominator: observed count plus one alpha per section (three sections)
    total = sum(counts.values()) + 3 * alpha
    cluster_section_probs[cid] = {
        label: (counts[label] + alpha) / total for label in ("A", "B", "C")
    }



In [8]:
# Global section distribution over all exam questions (unsmoothed).
# Used later as the fallback for noise (-1) or unseen clusters.

global_counts = {"A": 0, "B": 0, "C": 0}

for q in questions:
    # Case-insensitive, missing-key-tolerant source check, consistent with the
    # cells that use q.get("source", "").lower(); a plain q["source"] == "exam"
    # would skip "Exam"/"EXAM" and raise KeyError when "source" is absent.
    if q.get("source", "").lower() == "exam":
        sec = q["exam_meta"]["section"]
        global_counts[sec] += 1

total = sum(global_counts.values())

if total > 0:
    global_probs = {
        label: global_counts[label] / total for label in ("A", "B", "C")
    }
else:
    # No exam questions at all: fall back to a uniform distribution
    global_probs = {"A": 1/3, "B": 1/3, "C": 1/3}

In [9]:
# Assign section probabilities, a confidence score and (for textbook
# questions) a predicted section to every question.

for q in questions:
    cid = q["concept_id"]

    # Use the cluster's own distribution when it has one; noise (-1) and
    # clusters without exam history fall back to the global distribution
    has_cluster_probs = cid != -1 and cid in cluster_section_probs
    probs = cluster_section_probs[cid] if has_cluster_probs else global_probs

    # Make sure the feature container exists
    if "derived_features" not in q:
        q["derived_features"] = {}
    features = q["derived_features"]

    # Store a per-question copy of the section probabilities
    section_prob = {label: probs[label] for label in ("A", "B", "C")}
    features["section_prob"] = section_prob

    # Confidence = probability of the most likely section
    features["section_confidence"] = max(section_prob.values())

    # Most likely section for this question
    predicted_section = max(probs, key=probs.get)

    # Textbook questions also get the prediction mirrored into exam_meta
    is_textbook = q.get("source", "").lower() == "textbook"
    if is_textbook:
        if q.get("exam_meta") is None:
            q["exam_meta"] = {}
        q["exam_meta"]["section"] = predicted_section



In [10]:
# Write the dataset back out, including the derived features added to each
# question in place by the cells above.
with open("checking.json", "w", encoding="utf-8") as out_file:
    json.dump(data, out_file, indent=2)