In [1]:
import json
import numpy as np
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

In [2]:
# Load the question bank from disk.
# The encoding is pinned to UTF-8 (the standard encoding for JSON text) so the
# file decodes the same way on every platform; without it, open() falls back
# to the locale's default encoding, which can raise or mis-decode non-ASCII
# question text on some systems.
with open("Data/e_book.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Every later cell works on this list; each entry is indexed by string keys
# ("embedding", "raw_text", and later "cluster").
questions = data["questions"]

In [3]:
# Pull the "embedding" entry out of every question, preserving list order so
# that entry i of X belongs to questions[i], then hand the collection to NumPy.
# NOTE(review): assumes all embeddings are numeric vectors of equal length -
# confirm against the data file.
embedding_rows = [question["embedding"] for question in questions]
X = np.array(embedding_rows)

In [4]:
# Standardise the embedding columns with scikit-learn's StandardScaler
# (default settings), fitting and transforming on the same data.
# Note that this rebinds X: the unscaled embeddings are no longer reachable
# under that name after this cell runs.
scaler = StandardScaler()
X = scaler.fit(X).transform(X)

In [5]:
# Density-based clustering of the scaled embeddings.
#   eps=0.55      - neighbourhood radius (DBSCAN's default metric is Euclidean)
#   min_samples=2 - two points within eps of each other are enough for a cluster
# NOTE(review): in the recorded output every cluster holds exactly two
# questions with identical text, so this radius may only be catching
# duplicates - confirm that is the intent before tuning eps.
dbscan = DBSCAN(eps=0.55, min_samples=2)

# After fitting, labels_ holds one label per row of X; -1 marks noise points.
labels = dbscan.fit(X).labels_


In [6]:
# Attach each question's cluster label to its dict under the "cluster" key.
# tolist() converts the NumPy label array to built-in ints up front, so the
# stored values are plain Python integers rather than NumPy scalars.
for question, cluster_id in zip(questions, labels.tolist()):
    question["cluster"] = cluster_id

In [7]:
# Tally how many questions received each label.
cluster_counts = Counter()
cluster_counts.update(labels)

# Remove the -1 (noise) bucket so only real clusters remain in the tally.
# The pop is deliberately the last expression of the cell: the notebook shows
# its return value, i.e. the number of noise questions (nothing is shown when
# there is no noise bucket, because the default None is returned).
cluster_counts.pop(-1, None)

1078

In [8]:
# Group the question texts by cluster label in one pass over `questions`.
# The report below then looks each cluster up directly instead of rescanning
# the full question list once per cluster. Order inside each group follows the
# order of `questions`, so the printed output is unchanged.
# Keys are the values stored under q["cluster"]; NumPy integer keys coming from
# `cluster_counts` hash and compare equal to the matching Python ints, so the
# lookups below work for either kind of key.
texts_by_cluster = {}
for q in questions:
    texts_by_cluster.setdefault(q["cluster"], []).append(q["raw_text"])

# Section 1: one line per cluster with its size.
print(" CLUSTER SUMMARY")
print("-" * 40)
for cid, count in cluster_counts.items():
    print(f"Cluster {cid}: {count} questions")

# Section 2: the member questions of every cluster, in cluster_counts order.
print(" CLUSTERED QUESTIONS")
print("-" * 40)
for cid in cluster_counts:
    print(f"\n Cluster {cid}")
    for text in texts_by_cluster.get(cid, []):
        print("  -", text)

# Section 3: questions labelled -1, i.e. those DBSCAN left unclustered.
print(" NOISE (RARE) QUESTIONS")
print("-" * 40)
for text in texts_by_cluster.get(-1, []):
    print("  -", text)

 CLUSTER SUMMARY
----------------------------------------
Cluster 0: 2 questions
Cluster 1: 2 questions
Cluster 2: 2 questions
Cluster 3: 2 questions
Cluster 4: 2 questions
Cluster 5: 2 questions
Cluster 6: 2 questions
Cluster 7: 2 questions
Cluster 8: 2 questions
Cluster 9: 2 questions
Cluster 10: 2 questions
Cluster 11: 2 questions
Cluster 12: 2 questions
Cluster 13: 2 questions
 CLUSTERED QUESTIONS
----------------------------------------

 Cluster 0
  - What are the advantages of using functions?
  - What are the advantages of using functions?

 Cluster 1
  - What is null pointer?
  - What is null pointer?

 Cluster 2
  - Discuss in brief about different generation of programming languages.
  - Discuss in brief about different generation of programming languages.

 Cluster 3
  - What is an expression?
  - What is an expression?

 Cluster 4
  - Explain about input and output function available in C with syntax and example of each part.
  - Explain about input and output function ava