In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
# Read the exam-question dataset from disk (UTF-8 JSON) into `data`.
with open("Data/KUexam_questions.json","r",encoding="utf-8") as source:
    data = json.loads(source.read())
    

In [3]:
# Collection of question records (presumably a list of dicts -- verify against
# the JSON file). Later cells read each record's "embedding", "year" and
# "cleaned_text" keys and add a "cluster_id" key.
questions = data["questions"]

In [4]:
# Stack each question's embedding vector into one array, one row per question,
# in the same order as `questions`.
embeddings = np.array([question["embedding"] for question in questions])

In [5]:
# Euclidean (L2) length of every embedding row. keepdims=True keeps a trailing
# axis of size 1 so the result broadcasts row-wise against `embeddings`.
norms = np.linalg.norm(embeddings, axis=1, keepdims= True)

In [6]:
# Scale every embedding to unit length. A row whose norm is exactly zero is
# divided by 1 instead, so it stays a zero vector rather than becoming a NaN
# row (0 / 0) that would propagate into every later distance and argmin.
embeddings_normalized = embeddings / np.where(norms == 0, 1.0, norms)

In [7]:
def random_centroids(embeddings, k, rng=None):
    """Pick ``k`` distinct rows of ``embeddings`` to use as initial centroids.

    Parameters
    ----------
    embeddings : np.ndarray
        Array whose first axis indexes the candidate points.
    k : int
        Number of centroids to draw; must not exceed ``len(embeddings)``
        because rows are sampled without replacement.
    rng : np.random.Generator, optional
        Random generator to draw from, e.g. ``np.random.default_rng(seed)``,
        for reproducible initialisation. When omitted, the legacy global
        NumPy RNG is used, so existing callers and ``np.random.seed`` keep
        working exactly as before.

    Returns
    -------
    np.ndarray
        New array containing the ``k`` selected rows (integer-array indexing
        returns a copy, so the caller may modify it freely).

    Raises
    ------
    ValueError
        If ``k`` is larger than the number of available rows.
    """
    n_points = len(embeddings)
    if k > n_points:
        raise ValueError(
            f"cannot draw k={k} distinct centroids from only {n_points} points"
        )

    # Both the legacy `np.random` module and a Generator expose `.choice`.
    sampler = np.random if rng is None else rng
    indices = sampler.choice(n_points, size=k, replace=False)
    return embeddings[indices]

In [8]:
# Exploratory draw: 20 centroids sampled from the raw (un-normalized) embeddings.
# NOTE(review): the k-means cell further down re-draws `centroids` from
# `embeddings_normalized` with k = 15, so this value is overwritten there.
centroids = random_centroids(embeddings, 20)

In [9]:
# Display the sampled centroids (NumPy abbreviates the repr of large arrays).
centroids

array([[ 0.02638713,  0.07180556, -0.04829141, ...,  0.02310626,
         0.01141715,  0.0026505 ],
       [-0.01272829,  0.06331618, -0.17631204, ...,  0.05157461,
         0.00928811,  0.00795231],
       [ 0.0207739 ,  0.03801021, -0.07432288, ..., -0.00027406,
        -0.01151048, -0.0360148 ],
       ...,
       [ 0.06485282,  0.02092258, -0.11152431, ...,  0.0847462 ,
         0.01573033, -0.00885365],
       [-0.03333984, -0.01569868, -0.02977037, ...,  0.05985083,
         0.08096741, -0.01027963],
       [-0.04550707, -0.01095176, -0.08004905, ...,  0.02629869,
         0.06886209, -0.07461203]], shape=(20, 384))

In [10]:
def get_labels(embeddings, centroids):
    """Assign each embedding to its nearest centroid by Euclidean distance.

    Parameters
    ----------
    embeddings : np.ndarray
        2-D array of points, one per row.
    centroids : np.ndarray
        2-D array of centroids, one per row, with the same number of columns.

    Returns
    -------
    np.ndarray
        1-D integer array; entry ``i`` is the row index of the centroid
        closest to ``embeddings[i]`` (the lowest index wins on ties).
    """
    # Broadcast (n, 1, d) against (1, k, d) to get every point-centroid offset.
    offsets = embeddings[:, None, :] - centroids[None, :, :]
    return np.linalg.norm(offsets, axis=2).argmin(axis=1)

In [11]:
# Exploratory assignment of the raw embeddings to the current `centroids`.
# NOTE(review): `labels` is recomputed inside the k-means loop below, so this
# value does not feed the final clustering.
labels = get_labels(embeddings, centroids)


In [12]:
def update_centroids(embeddings, labels, k):
    """Recompute each of the ``k`` centroids as the mean of its members.

    Parameters
    ----------
    embeddings : np.ndarray
        2-D array of points, one per row.
    labels : np.ndarray
        Cluster index of every row of ``embeddings``.
    k : int
        Number of clusters.

    Returns
    -------
    np.ndarray
        Float array with ``k`` rows and ``embeddings.shape[1]`` columns. A
        cluster with no members is re-seeded with a randomly chosen
        embedding, drawn from the global NumPy RNG.
    """
    updated = np.zeros((k, embeddings.shape[1]))

    for cluster in range(k):
        members = embeddings[labels == cluster]

        if len(members) > 0:
            updated[cluster] = members.mean(axis=0)
        else:
            # Empty cluster: restart it from a random data point.
            fallback = np.random.randint(len(embeddings))
            updated[cluster] = embeddings[fallback]

    return updated


In [13]:
# K-means on the unit-normalized embeddings; centroids are rescaled to unit
# length after every update so they stay comparable with the data.
max_iterations = 100
k = 15
tolerance = 1e-6

# Seed the legacy global NumPy RNG, which `random_centroids` and
# `update_centroids` draw from, so Restart & Run All yields the same clusters.
np.random.seed(42)

centroids = random_centroids(embeddings_normalized, k)

for iteration in range(max_iterations):
    old_centroids = centroids.copy()

    labels = get_labels(embeddings_normalized, centroids)
    centroids = update_centroids(embeddings_normalized, labels, k)

    # Re-normalize centroids
    centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)

    # Convergence check (np.allclose also applies its default relative
    # tolerance on top of `atol`).
    if np.allclose(centroids, old_centroids, atol=tolerance):
        print(f"Converged at iteration {iteration}")
        break
else:
    # Loop ran out without converging: say so instead of ending silently,
    # and leave `iteration` at the cap as the original while-loop did.
    iteration = max_iterations
    print(f"Did not converge within {max_iterations} iterations")

# Inside the loop `labels` is computed before the centroid update; recompute
# once so the stored labels correspond to the final centroids.
labels = get_labels(embeddings_normalized, centroids)


Converged at iteration 4


In [14]:
# Store each question's cluster on the record itself, converted to a plain
# Python int (rather than a NumPy integer).
for question, label in zip(questions, labels):
    question["cluster_id"] = int(label)


In [15]:
# Cluster to inspect; change this id to browse a different cluster.
cluster_id_tocheck = 0

# Keep only the questions assigned to that cluster. Despite the "cluster_0"
# name, this holds whichever cluster `cluster_id_tocheck` selects.
cluster_0_questions = list(
    filter(lambda question: question["cluster_id"] == cluster_id_tocheck, questions)
)


In [16]:
# Print at most the first 125 questions of the selected cluster as a 1-based
# numbered list, each prefixed with the record's "year" in parentheses.
for i,q in enumerate(cluster_0_questions[:125]):
    print(f"{i+1}. ({q['year']}) {q['cleaned_text']}")

1. (2013) write a program to check whether a given number is armstrong or not hint
2. (2014) write a loop that will generate every fourth integer beginning with variable i 3 and containing all integers less than 150 calculate the sum of those integers that are divisible by 7
3. (2014) write a program to read integers n1 and n2 and display all odd numbers between those two numbers
4. (2015) write a c program to check whether a number is a strong number or not
5. (2015) write a program that reads twelve numbers entered by the user and prints if any of them match
6. (2016) given a square matrix a of size 10 x 10 write a program that identifies whether the matrix is diagonal a diagonal matrix has all nondiagonal elements equal to zero
7. (2017) write a program to print all even numbers between 101 and 201 except those divisible by 14
8. (2017) write a function called `isdivisible` that takes two positive integers as arguments assume the first argument is less than the second the function r