In [11]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.cluster import KMeans

# Load the saved feature records (used below as a list of dicts).
# weights_only is passed explicitly so the call does not depend on the
# version-specific default of torch.load, matching the reload of the reduced
# file later in this notebook.
# NOTE(review): weights_only=False runs the full unpickler, which can execute
# arbitrary code — only point this at files generated by this project.
feature_space = torch.load(
    "/workspaces/dbm25/data/extracted_features.pt", weights_only=False
)

# Key of the per-record entry to cluster on; it is also embedded in the
# filename of the reduced output written later.
feature = "layer3"

# Pull the chosen entry out of every record, then combine them into a single
# array whose first axis runs over the records.
a = [element[feature] for element in feature_space]
a_array = np.array(a)


In [12]:
# Cluster the feature vectors into k groups; random_state is pinned so the
# fit is repeatable.
k = 3
km = KMeans(n_clusters=k, random_state=0).fit(a_array)

# cluster_centers_ -> one centroid per cluster, shape (k, n_features)
# labels_          -> cluster index of every sample, shape (n_samples,)
centroids, labels = km.cluster_centers_, km.labels_

print(centroids.shape, labels.shape, sep="\n")

(3, 1024)
(3006,)


In [13]:
def rank_centroids_by_cosine(points, centers):
    """Rank every centroid by cosine similarity, separately for each point.

    Args:
        points: 2-D array-like, one feature vector per row.
        centers: 2-D array-like, one centroid per row, same width as ``points``.

    Returns:
        float64 array of shape (n_points, n_centers, 2). For each point the
        rows run from the most to the least similar centroid, and every row
        is ``[cosine_similarity, centroid_index]``.
    """
    points = np.asarray(points)
    centers = np.asarray(centers)

    point_norms = np.linalg.norm(points, axis=1)
    # Centroid norms are loop-invariant: compute them once, not once per point.
    center_norms = np.linalg.norm(centers, axis=1)

    dot_products = points @ centers.T
    denominators = np.outer(point_norms, center_norms)

    # A zero-length vector has no direction. Report similarity 0.0 for it
    # instead of dividing by zero, which would yield NaN and (after the
    # reversed argsort) push that NaN to the top of the ranking.
    nonzero = denominators > 0
    safe_denominators = np.where(nonzero, denominators, 1.0)
    cosine = np.where(nonzero, dot_products / safe_denominators, 0.0)

    # Highest similarity first, i.e. a per-row argsort()[::-1].
    order = np.argsort(cosine, axis=1)[:, ::-1]
    ranked = np.take_along_axis(cosine, order, axis=1)

    # Pair each similarity with the id of the centroid it belongs to.
    return np.stack([ranked, order], axis=-1).astype(np.float64)


point_similarities = rank_centroids_by_cosine(a_array, centroids)
# Shape is (n_samples, k, 2); the last axis holds [similarity, centroid_id]
print(point_similarities.shape)
print(point_similarities[0])

(3006, 3, 2)
[[0.9760344  1.        ]
 [0.96796572 2.        ]
 [0.93644494 0.        ]]


In [14]:
# Keep only the m best-ranked centroids for every sample (1 <= m <= k).
# So for m = 5 and k = 10 we would go from (n_samples, 10, 2) to (n_samples, 5, 2)
m = 1
test_subject = 0  # index of the sample printed as a spot check

# Fail fast on an invalid m: merely printing a message would leave
# reduced_point_similarities undefined (or stale from an earlier run) for the
# cells that consume it, and a non-positive m would slice the wrong rows.
if not 1 <= m <= k:
    raise ValueError(f"m must be between 1 and k={k}, got m={m}")

reduced_point_similarities = point_similarities[:, :m, :]
print(reduced_point_similarities.shape)
print(reduced_point_similarities[test_subject])

(3006, 1, 2)
[[0.9760344 1.       ]]


In [15]:
# Ask the fitted KMeans model which cluster it assigns to sample test_subject.
# The result is expected to match the centroid id in the first (most similar)
# row of point_similarities[test_subject].
# NOTE(review): KMeans assigns by Euclidean distance while the ranking uses
# cosine similarity, so the two can disagree — treat this as a sanity check.
sample_row = a_array[[test_subject]]  # list index keeps the (1, n_features) shape
predicted_label = km.predict(sample_row)
print(predicted_label)

[1]


In [16]:
# TODO reconstruct the dict with image_id, class, reduced_similarities

In [17]:
# Show which fields a single record carries before the records are rebuilt.
first_record = feature_space[0]
print(first_record.keys())

dict_keys(['file_path', 'class', 'cm', 'hog', 'avgpool', 'layer3', 'fc'])


In [18]:
# Build a slimmed-down copy of every record: keep file_path and class, drop
# the other entries, and attach that sample's reduced similarities.
# The copies go into a new list so feature_space itself stays intact and the
# cells that read the original records can still be re-run afterwards.

# Records and similarity rows are matched by position, so they must line up.
if len(feature_space) != len(reduced_point_similarities):
    raise ValueError(
        f"feature_space has {len(feature_space)} records but "
        f"reduced_point_similarities has {len(reduced_point_similarities)} rows"
    )

reduced_feature_space = [
    {
        "file_path": element["file_path"],
        "class": element["class"],
        "similarities": similarities,
    }
    for element, similarities in zip(feature_space, reduced_point_similarities)
]

# Save it to a pt file
torch.save(reduced_feature_space, f"/workspaces/dbm25/data/extracted_features_reduced_{feature}.pt")

In [22]:
# Round-trip check: reload the reduced file and inspect its first record.
# The full (non weights-only) unpickler is requested explicitly; only use it
# on files this notebook wrote itself.
reduced_path = f"/workspaces/dbm25/data/extracted_features_reduced_{feature}.pt"
loaded_feature_space = torch.load(reduced_path, weights_only=False)
first_loaded = loaded_feature_space[0]
print(first_loaded.keys())
print(first_loaded["similarities"])

dict_keys(['file_path', 'class', 'similarities'])
[[0.9760344 1.       ]]
