In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
# Read the question dataset from a UTF-8 encoded JSON file into `data`.
# The path is relative to the notebook's working directory.
with open("Data/e_book.json","r",encoding="utf-8") as f:
    data = json.load(f)
    

In [3]:
# Question records from the loaded JSON; later cells read each record's
# "embedding", "year" and "cleaned_text" entries.
questions = data["questions"]

In [4]:
# Stack the per-question embedding vectors into a single NumPy array,
# in the same order as `questions`.
embeddings= np.array([q["embedding"] for q in questions])

In [5]:
# Vector norm of every row; keepdims=True keeps a trailing axis of length 1
# so the result broadcasts against `embeddings` when dividing.
norms = np.linalg.norm(embeddings, axis=1, keepdims= True)

In [6]:
# Scale every embedding to unit length. A row whose norm is exactly zero is
# divided by 1 instead, so it stays a zero vector rather than turning into NaN
# (NaN rows would poison the distance computation and the centroid means).
embeddings_normalized = embeddings / np.where(norms == 0, 1.0, norms)

In [7]:
def random_centroids(embeddings, k, rng=None):
    """Pick ``k`` distinct rows of ``embeddings`` to serve as initial centroids.

    Parameters
    ----------
    embeddings : numpy.ndarray
        Array whose first axis indexes the candidate points.
    k : int
        Number of centroids to draw; must satisfy ``0 <= k <= len(embeddings)``.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) uses NumPy's global
        ``np.random`` state exactly as before, so ``np.random.seed`` still
        controls the draw. An int seed or a ``Generator`` makes the call
        reproducible without touching global state.

    Returns
    -------
    numpy.ndarray
        The ``k`` selected rows (a copy, because integer-array indexing copies).

    Raises
    ------
    ValueError
        If ``k`` is negative or larger than the number of rows.
    """
    n_points = len(embeddings)
    if k < 0 or k > n_points:
        raise ValueError(
            f"k must be between 0 and the number of embeddings ({n_points}), got {k}"
        )

    # The legacy module and a Generator both expose choice(a, size, replace).
    sampler = np.random if rng is None else np.random.default_rng(rng)
    indices = sampler.choice(n_points, size=k, replace=False)
    return embeddings[indices]

In [8]:
# Trial draw: 20 starting centroids taken from the raw (un-normalized) embeddings.
# `centroids` is drawn again from the normalized vectors in the clustering cell.
centroids = random_centroids(embeddings, 20)

In [9]:
# Display the sampled centroids (one row per centroid).
centroids

array([[-0.03235121,  0.07171778, -0.0416294 , ...,  0.0682628 ,
         0.063637  ,  0.01755362],
       [ 0.05193331,  0.09175792, -0.08754157, ...,  0.04039731,
        -0.02830567,  0.01961418],
       [-0.0421259 , -0.05476788, -0.00357007, ...,  0.04630165,
         0.06651292, -0.05963337],
       ...,
       [-0.00939784,  0.07033402,  0.01088581, ..., -0.02206365,
         0.05388596, -0.11250428],
       [ 0.05984091,  0.02535134, -0.06599137, ...,  0.04029744,
         0.00369559,  0.00288598],
       [-0.01066494,  0.01049393, -0.02429506, ...,  0.15088186,
         0.11224185, -0.02774519]], shape=(20, 384))

In [10]:
def get_labels(embeddings, centroids, batch_size=256):
    """Assign every embedding to its nearest centroid (Euclidean distance).

    The distance computation broadcasts to a ``(rows, k, d)`` array. Doing that
    for all rows at once can need a very large temporary, so the rows are
    processed ``batch_size`` at a time; the per-row arithmetic is unchanged.

    Parameters
    ----------
    embeddings : array-like, shape (n, d)
        Points to label.
    centroids : array-like, shape (k, d)
        Candidate cluster centres.
    batch_size : int, optional
        Number of embedding rows handled per step (default 256). Only affects
        peak memory, not the result.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Index of the closest centroid for each embedding. Ties resolve to the
        lowest index, as ``np.argmin`` does.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    embeddings = np.asarray(embeddings)
    centroids = np.asarray(centroids)

    n_points = embeddings.shape[0]
    labels = np.empty(n_points, dtype=np.intp)

    for start in range(0, n_points, batch_size):
        stop = start + batch_size
        chunk = embeddings[start:stop]
        # (rows, 1, d) - (1, k, d) -> (rows, k, d); the norm over the last axis
        # gives the (rows, k) distance table for this batch.
        distances = np.linalg.norm(
            chunk[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
        )
        labels[start:stop] = np.argmin(distances, axis=1)

    return labels

In [11]:
# Trial assignment using the raw embeddings and the 20 sampled centroids.
# `labels` is recomputed inside the clustering loop further down.
labels = get_labels(embeddings, centroids)


In [12]:
def update_centroids(embeddings, labels, k, rng=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    embeddings : numpy.ndarray, shape (n, d)
        The data points.
    labels : numpy.ndarray, shape (n,)
        Cluster index of every point, compared against ``0 .. k-1``.
    k : int
        Number of clusters.
    rng : None, int or numpy.random.Generator, optional
        Randomness used to re-seed empty clusters. ``None`` (the default) uses
        NumPy's global ``np.random`` state exactly as before; an int seed or a
        ``Generator`` makes the re-seeding reproducible without global state.

    Returns
    -------
    numpy.ndarray, shape (k, d)
        The updated centroids (float64, since they are written into a
        ``np.zeros`` buffer).
    """
    generator = None if rng is None else np.random.default_rng(rng)

    d = embeddings.shape[1]
    new_centroids = np.zeros((k, d))

    for i in range(k):
        cluster_points = embeddings[labels == i]

        if len(cluster_points) == 0:
            # An empty cluster has no mean: restart it at a random data point.
            if generator is None:
                replacement = np.random.randint(len(embeddings))
            else:
                replacement = generator.integers(len(embeddings))
            new_centroids[i] = embeddings[replacement]
        else:
            new_centroids[i] = cluster_points.mean(axis=0)

    return new_centroids


In [13]:
# K-means on the unit-length embeddings, with centroids projected back onto the
# unit sphere after every update.
max_iterations = 100
k = 50
tolerance = 1e-6
seed = 42

# Fix NumPy's global random state so the initial centroids (and any
# empty-cluster re-seeding) are identical on every Restart & Run All.
np.random.seed(seed)

centroids = random_centroids(embeddings_normalized, k)
iteration = 0

while iteration < max_iterations:
    old_centroids = centroids.copy()

    labels = get_labels(embeddings_normalized, centroids)
    centroids = update_centroids(embeddings_normalized, labels, k)

    # Re-normalize centroids. A centroid that averaged to exactly zero is
    # divided by 1 so it does not become NaN.
    centroid_norms = np.linalg.norm(centroids, axis=1, keepdims=True)
    centroids /= np.where(centroid_norms == 0, 1.0, centroid_norms)

    # Convergence check
    if np.allclose(centroids, old_centroids, atol=tolerance):
        print(f"Converged at iteration {iteration}")
        break

    iteration += 1
else:
    # Reached only when the loop ends without hitting `break`.
    print(f"Did not converge within {max_iterations} iterations")

# Inside the loop `labels` is computed before the centroid update, so refresh
# it once more to make the final labels match the final centroids.
labels = get_labels(embeddings_normalized, centroids)


Converged at iteration 15


In [14]:
# Record the cluster assignment on each question record. int() turns the
# NumPy integer coming out of `labels` into a built-in Python int.
for question, label in zip(questions, labels):
    question["cluster_id"] = int(label)


In [15]:
cluster_id_tocheck = 0

# Keep, in their original order, only the questions assigned to the cluster
# under inspection.
cluster_0_questions = list(
    filter(
        lambda question: question["cluster_id"] == cluster_id_tocheck,
        questions,
    )
)


In [16]:
# Print at most the first 125 questions of the inspected cluster, numbered
# from 1, as "<n>. (<year>) <cleaned_text>".
for i,q in enumerate(cluster_0_questions[:125]):
    print(f"{i+1}. ({q['year']}) {q['cleaned_text']}")

1. (None) what is a function why are functions needed in c programming
2. (None) what are the advantages of using functions
3. (None) differentiate between a function declaration, function definition, and function call.
4. (None) what is a function prototype why is it required
5. (None) what is a nested function call explain with an example.
6. (None) what is a library function give examples.
7. (None) what is a userdefined function give examples.
8. (None) what is function overloading is it supported in c explain.
9. (None) what are stringhandling functions name some commonly used string functions.
10. (None) pointer to function as argument declaration and usage in host and guest functions.
11. (None) applications where passing one function to another is useful.
12. (None) why are addresses of functions stored on the stack
13. (None) if a function is to be called, is it necessary to mention its prototype if yes, why
14. (None) what is the difference between function declaration and fu