In [129]:
import json
import math
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize
from collections import defaultdict

In [130]:
# Read the question bank from disk; `data` holds the whole parsed document.
with open("mixed_questions.json", encoding="utf-8") as source_file:
    data = json.load(source_file)

# `questions` is the same list object that lives inside `data`, so any field
# added to a question below is visible through both names.
questions = data["questions"]

# Collect each question's "embedding" entry into one NumPy array
# (one row per question, in the same order as `questions`).
embeddings = np.array([item["embedding"] for item in questions])

In [131]:
# Rescale every embedding row to unit L2 length (sklearn's defaults, spelled
# out here) so that vector direction rather than magnitude drives the
# distance computations in the clustering cells below.
embeddings = normalize(embeddings, norm="l2", axis=1)


In [132]:
# Number of clusters requested from KMeans in the next cell. The value is a
# provisional choice rather than a tuned one.
K = 15  # say 15 clusters for now

In [133]:
# Topic clustering: fit KMeans with K clusters on the normalized embeddings.
# random_state is pinned so repeated runs produce the same clustering.
kmeans = KMeans(
    n_clusters=K,
    n_init=10,
    random_state=42,
)

# One cluster index per row of `embeddings`, i.e. per question.
topic_labels = kmeans.fit_predict(embeddings)

# Cluster centres of the fitted model, kept for later inspection.
centroids = kmeans.cluster_centers_


In [134]:
# Store each question's KMeans cluster index under "topic_id".
# int() turns the NumPy integer into a plain Python int so the value can be
# written out with json later in the notebook.
for q, label in zip(data["questions"], topic_labels):
    q["topic_id"] = int(label)


In [135]:
# Count how many exam-sourced questions fall into each KMeans topic.
# These raw counts are turned into an importance score in the next cell.
# Topics that contain no exam question never receive a key here.
# (defaultdict comes from the notebook's import cell; it is not re-imported.)
topic_counts = defaultdict(int)

for q in questions:
    if q["source"] != "exam":
        continue  # only exam questions contribute to the count
    topic_counts[q["topic_id"]] += 1

In [136]:
# Log-scale the per-topic exam counts: importance = log(1 + count).
# (Log scaling was chosen by the author as being "tree friendly".)
topic_importance = {
    topic: math.log1p(n_exam) for topic, n_exam in topic_counts.items()
}

In [137]:
# Attach the topic-level importance to every question, whatever its source.
# A topic with no exam questions has no entry in topic_importance, so its
# questions get 0.0.
# Each question must already carry a "derived_features" dict; a missing key
# raises KeyError here.
for q in questions:
    score = topic_importance.get(q["topic_id"], 0.0)
    q["derived_features"]["topic_importance"] = score


In [138]:
# Concept clustering with DBSCAN.
#   eps=0.25         maximum distance between two points for them to count
#                    as neighbours in a cluster
#   min_samples=2    two questions are enough to form a concept
#   metric="cosine"  distances between embeddings are cosine distances
dbscan = DBSCAN(metric="cosine", eps=0.25, min_samples=2)

# One integer label per question: questions sharing a label belong to the
# same concept cluster, and -1 marks an outlier (noise).
concept_labels = dbscan.fit_predict(embeddings)

In [139]:
# Copy each DBSCAN label onto its question as a plain Python int
# (-1 is kept as-is and means "noise / no concept").
for idx, q in enumerate(questions):
    q["concept_id"] = int(concept_labels[idx])


In [140]:
# Concept importance: how strongly a DBSCAN concept is represented in exams.

# Step 1 - count exam questions in each real (non-noise) concept cluster.
concept_counts = defaultdict(int)
for q in questions:
    if q["source"] != "exam" or q["concept_id"] == -1:
        continue
    concept_counts[q["concept_id"]] += 1

# Step 2 - log-scale the counts: log(1 + count).
concept_importance = {}
for c, count in concept_counts.items():
    concept_importance[c] = math.log1p(count)

# Importance given to a single, unclustered exam question:
# log(1 + 1) = log(2), about 0.693.
base_importance = math.log(2)

# Step 3 - write the feature onto every question.
for q in questions:
    cid = q["concept_id"]
    if cid in concept_importance:
        # The concept has at least one exam question: use its score.
        value = concept_importance[cid]
    elif q["source"] == "exam" and cid == -1:
        # Exam question labelled as noise: counts as a concept of size one.
        value = base_importance
    else:
        # Non-exam question with no exam-backed concept.
        value = 0.0
    q["derived_features"]["concept_importance"] = value

In [141]:
#since

In [142]:
# Save the enriched questions back to mixed_questions.json.
#
# NOTE: this overwrites the same file the notebook loaded its data from, and
# only the "subject" and "questions" keys are written.
#
# The document is serialized to a string *before* the file is opened.
# Opening with "w" truncates the file immediately, so if building or encoding
# the payload failed (missing "subject" key, a value json cannot serialize)
# while the file was already open, the source data would be left truncated.
serialized = json.dumps(
    {"subject": data["subject"], "questions": questions},
    indent=2,
)

with open("mixed_questions.json", "w", encoding="utf-8") as f:
    f.write(serialized)


In [143]:
# Report: log-scaled importance of each KMeans topic.

# Rebuild the per-topic exam question counts from the current `questions`.
topic_counts = defaultdict(int)
for q in questions:
    if q["source"] != "exam":
        continue
    topic_counts[q["topic_id"]] += 1

# importance = log(1 + number of exam questions in the topic)
topic_importance_log = {
    topic: math.log1p(count) for topic, count in topic_counts.items()
}

# Topics without any exam question have no entry and are not printed.
print("\nTopic Importance (Log-scaled):")
for t in sorted(topic_importance_log):
    print(f"Topic {t}: {topic_importance_log[t]:.3f}")


Topic Importance (Log-scaled):
Topic 0: 3.466
Topic 1: 2.485
Topic 2: 2.079
Topic 3: 1.792
Topic 5: 3.296
Topic 6: 2.303
Topic 7: 2.996
Topic 8: 2.708
Topic 9: 2.639
Topic 10: 3.714
Topic 11: 2.708
Topic 12: 2.833
Topic 13: 2.197
Topic 14: 1.609


In [146]:
# Manual check: list every question under its KMeans topic so the semantic
# coherence of each cluster can be judged by reading it.
topic_groups = defaultdict(list)
for q in questions:
    bucket = topic_groups[q["topic_id"]]
    bucket.append((q["source"], q["raw_text"]))

# Topics are printed in the order they were first encountered in `questions`.
for topic_id in topic_groups:
    print(f"\n=== Topic {topic_id} ===")
    for i, (source, text) in enumerate(topic_groups[topic_id], start=1):
        print(f"{i}. [{source}] {text}")



=== Topic 10 ===
1. [exam] Describe different steps for the development of a C program.
2. [exam] Write a C program to swap (interchange) values of two variables with the help of function and pointer.
3. [exam] Write a C program that illustrates how an array of structures is passed to a function, and how a pointer to a particular structure is returned.
4. [exam] Differentiate pass by value with pass by reference. Write a program to illustrate three library functions related to dynamic memory allocation in C programming.
5. [exam] Briefly explain different types of storage class specifiers that are used in C programming language.  
    Write a C program to copy and compare structure variables.
6. [exam] Summarize the rules for naming identifiers. Are uppercase letters equivalent to lowercase letters? Name the different classes of statements in C. Describe the composition of each.
7. [exam] Describe the output that will be generated by each of the following C programs:  

   a.  #includ

In [147]:
# Manual check: list all questions grouped by DBSCAN concept, together with
# the importance assigned to each concept.
concept_groups = defaultdict(list)
for q in questions:
    members = concept_groups[q["concept_id"]]
    members.append((q["source"], q["raw_text"]))

for cid, q_list in concept_groups.items():
    if cid == -1:
        # Noise bucket: there is no shared concept score, so show the
        # importance of each question individually.
        print(f"\n=== Concept {cid} (Noise Section) ===")
        for i, (source, text) in enumerate(q_list, 1):
            # Only exam questions receive the base importance.
            if source == "exam":
                q_importance = base_importance
            else:
                q_importance = 0.0
            print(f"{i}. [{source}] (Importance: {q_importance:.3f}) {text}")
        continue

    # Regular cluster: one importance value for the whole concept
    # (0.0 when the concept contains no exam question).
    importance = concept_importance.get(cid, 0.0)
    print(f"\n=== Concept {cid} (Importance: {importance:.3f}) ===")
    for i, (source, text) in enumerate(q_list, 1):
        print(f"{i}. [{source}] {text}")


=== Concept 0 (Importance: 1.792) ===
1. [exam] Describe different steps for the development of a C program.
2. [exam] Define structured programming language. Describe the general structure of a C program.
3. [exam] Explain the four steps of development of a C program.
4. [exam] Discuss the characteristics of C programming language. Describe general programming rules.
5. [exam] Explain basic structure of a C program with examples. Write syntax and examples of `puts()`, `scanf()`, `printf()`, and `putchar()`.
6. [textbook] What are the major components of a C program? What significance is attached to the function name main?
7. [textbook] Categorise programming languages on the basic of their uses and applications. Among them which programming language is C programming?
8. [textbook] Explain structure of C program with an appropriate example.

=== Concept 1 (Importance: 1.099) ===
1. [exam] With example, describe any four types of operators used in C programming.
2. [exam] What do you m