In [41]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [42]:
# Read the question export (UTF-8 encoded JSON) into memory.
# NOTE(review): this is an absolute, machine-specific path — consider making
# the data directory configurable.
with open("D:\\React\\neplearn\\Data\\e_book.json", mode="r", encoding="utf-8") as json_file:
    data = json.load(json_file)
    

In [43]:
# Pull the collection of question records out of the loaded JSON document.
questions = data["questions"]

In [44]:
# Collect each question's "embedding" entry into a single NumPy array
# (one row per question, assuming every embedding has the same length).
embeddings = np.array([question["embedding"] for question in questions])

In [45]:
# Euclidean (L2) length of every embedding row; keepdims=True keeps a trailing
# axis of size 1 so the result broadcasts against `embeddings` when dividing.
norms = np.linalg.norm(embeddings, ord=2, axis=1, keepdims=True)

In [46]:
# Scale every embedding to unit length. Rows whose norm is exactly 0 are
# divided by 1 instead, so an all-zero embedding stays all-zero rather than
# becoming NaN (0/0) and contaminating everything computed from it afterwards.
embeddings_normalized = embeddings / np.where(norms == 0, 1.0, norms)

In [47]:
def random_centroids(embeddings, k, rng=None):
    """Pick ``k`` distinct rows of ``embeddings`` to serve as initial centroids.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_features)
        Candidate points. Converted with ``np.asarray`` so a plain list of
        rows is accepted as well as an array.
    k : int
        Number of centroids to draw; must not exceed ``n_samples``.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (the original behaviour), so ``np.random.seed`` still controls
        the draw.

    Returns
    -------
    numpy.ndarray of shape (k, n_features)
        The selected rows. Integer-array indexing yields a copy, so changing
        the result does not modify ``embeddings``.

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of rows (rows are sampled without
        replacement) or ``embeddings`` is empty.
    """
    embeddings = np.asarray(embeddings)
    # The legacy module-level API and Generator/RandomState objects expose the
    # same ``choice`` signature, so one call covers both cases.
    sampler = np.random if rng is None else rng
    indices = sampler.choice(len(embeddings), size=k, replace=False)
    return embeddings[indices]

In [48]:
# Draw 20 starting centroids from the raw (un-normalized) embeddings.
# NOTE(review): the clustering loop further down re-draws its centroids from
# embeddings_normalized with k = 50, so this looks like an exploratory check — confirm.
centroids = random_centroids(embeddings, 20)

In [49]:
# Inspect the sampled centroids; NumPy abbreviates large arrays in the displayed repr.
centroids

array([[-0.02283542,  0.05938179, -0.05373811, ...,  0.04383835,
         0.06179833, -0.03994607],
       [ 0.04218721,  0.07180932, -0.10987258, ...,  0.0062459 ,
         0.01043634, -0.04397617],
       [ 0.01686457,  0.07122346,  0.00729055, ..., -0.02234727,
         0.1047065 ,  0.00612486],
       ...,
       [-0.03592495, -0.00092659, -0.05379568, ...,  0.03292235,
         0.07190356, -0.031355  ],
       [-0.00684903,  0.00739852, -0.0032756 , ..., -0.02183557,
        -0.02191116, -0.01095674],
       [ 0.06999835,  0.03872629, -0.07964125, ...,  0.01045959,
        -0.04391257, -0.06256887]], shape=(20, 384))

In [50]:
def get_labels(embeddings, centroids, chunk_size=1024):
    """Assign every embedding to its nearest centroid (Euclidean distance).

    The distances are computed by broadcasting, which needs a temporary of
    shape ``(rows, n_centroids, n_features)``. Doing this for all rows at once
    can exhaust memory, so the rows are processed in slices of ``chunk_size``;
    each row's result is independent of the others, so the labels are the same
    as for a single all-at-once computation.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_features)
        Points to label.
    centroids : array-like of shape (n_centroids, n_features)
        Current cluster centres.
    chunk_size : int, optional
        Maximum number of embedding rows handled per step (default 1024).

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Index of the closest centroid for each row. Ties go to the lowest
        index, as ``np.argmin`` returns the first minimum.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, or if there are rows to label but
        no centroids (``np.argmin`` of an empty axis).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    embeddings = np.asarray(embeddings)
    centroids = np.asarray(centroids)

    n_samples = embeddings.shape[0]
    labels = np.empty(n_samples, dtype=np.intp)

    for start in range(0, n_samples, chunk_size):
        stop = start + chunk_size  # slicing clips this to n_samples on the last chunk
        block = embeddings[start:stop]
        # (rows, 1, d) - (1, k, d) -> (rows, k, d); the norm collapses the feature axis.
        distances = np.linalg.norm(
            block[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
        )
        labels[start:stop] = np.argmin(distances, axis=1)

    return labels

In [51]:
# Label every raw embedding with the index of its nearest sampled centroid.
labels = get_labels(embeddings, centroids)


In [52]:
def update_centroids(embeddings, labels, k):
    """Recompute the ``k`` centroids as the mean of the points assigned to each.

    Parameters
    ----------
    embeddings : numpy.ndarray of shape (n_samples, n_features)
        The points being clustered.
    labels : numpy.ndarray of shape (n_samples,)
        Cluster index of each row of ``embeddings``; ``labels == cluster`` is
        used as a boolean row mask.
    k : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray of shape (k, n_features)
        Row ``c`` holds the mean of the points labelled ``c``. A cluster with
        no points is re-seeded with one embedding drawn at random via NumPy's
        global random state.
    """
    n_features = embeddings.shape[1]
    refreshed = np.zeros((k, n_features))

    for cluster in range(k):
        members = embeddings[labels == cluster]
        if len(members):
            refreshed[cluster] = members.mean(axis=0)
            continue
        # Nothing was assigned to this cluster: restart it from a random point
        # instead of leaving an all-zero centroid.
        refreshed[cluster] = embeddings[np.random.randint(len(embeddings))]

    return refreshed


In [63]:
# --- k-means on the unit-length embeddings ---------------------------------
max_iterations = 100   # hard cap on the number of assignment/update rounds
k = 50                 # number of clusters
tolerance = 1e-6       # absolute tolerance for "centroids stopped moving"
seed = 42              # fixed so a fresh Restart & Run All reproduces the clustering

# random_centroids and the empty-cluster re-seed inside update_centroids both
# draw from NumPy's global random state, so seeding it here pins the whole run.
np.random.seed(seed)

centroids = random_centroids(embeddings_normalized, k)
iteration = 0

while iteration < max_iterations:
    old_centroids = centroids.copy()

    # Assignment step, then update step.
    labels = get_labels(embeddings_normalized, centroids)
    centroids = update_centroids(embeddings_normalized, labels, k)

    # Re-normalize centroids so they stay unit length like the embeddings.
    # A centroid whose mean is exactly zero is divided by 1 (left unchanged)
    # instead of 0, which would fill it with NaNs and block convergence.
    centroid_norms = np.linalg.norm(centroids, axis=1, keepdims=True)
    centroids /= np.where(centroid_norms == 0, 1.0, centroid_norms)

    # Convergence check
    if np.allclose(centroids, old_centroids, atol=tolerance):
        print(f"Converged at iteration {iteration}")
        break

    iteration += 1
else:
    # Only reached when the loop ran out of iterations without hitting `break`.
    print(f"Did not converge within {max_iterations} iterations")

# The labels from the loop were computed against the centroids *before* the
# last update; refresh them so `labels` matches the final `centroids`.
labels = get_labels(embeddings_normalized, centroids)


Converged at iteration 14


In [64]:
# Tag each question record with the cluster it was assigned to, stored as a
# plain Python int rather than a NumPy integer.
for question, label in zip(questions, labels):
    question["cluster_id"] = int(label)


In [65]:
# Choose the cluster to inspect, then gather the questions assigned to it.
cluster_id_tocheck = 0

cluster_0_questions = [
    question
    for question in questions
    if question["cluster_id"] == cluster_id_tocheck
]


In [66]:
# Print a numbered preview (at most the first 125 questions) of the selected
# cluster, showing each question's year and cleaned text.
preview = cluster_0_questions[:125]
for i, q in enumerate(preview):
    print(f"{i+1}. ({q['year']}) {q['cleaned_text']}")

1. (None) what is an expression what kind of information is represented by an expression
2. (None) can statements be embedded within other statements explain.
3. (None) what is the conditional operator how does it work
4. (None) what is an expression how does an expression differ from a statement
5. (None) what is a conditional expression how does it differ from an if statement
6. (None) how is the address of a variable obtained which operator is used
7. (None) what is the indirection operator () how does it work
8. (None) what is the purpose of the indirection () operator
9. (None) what is the purpose of the indirection operator () to what type of operand must it be applied
10. (None) can the address operator () act upon an arithmetic expression like 2  (u  v) explain.
11. (None) can an expression with the indirection operator appear on the lefthand side of an assignment
12. (None) what is an expression
13. (None) what are the types of expression
14. (None) define tokens expression an