In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import matplotlib

In [26]:

def svd_decomposition(feature_space, feature):
   """Run a reduced SVD on one feature collected from every item.

   Args:
      feature_space: iterable of mappings; ``item[feature]`` is read from each.
      feature: key of the feature to collect (for example ``"hog"``).

   Returns:
      Tuple ``(U, S, Vh)`` as produced by ``np.linalg.svd`` called with
      ``full_matrices=False``.
   """
   # Stack the selected feature of every item along the first axis, in iteration order.
   feature_matrix = np.array([item[feature] for item in feature_space])

   left_vectors, singular_values, right_vectors_h = np.linalg.svd(
      feature_matrix,
      full_matrices=False,
      compute_uv=True,
      hermitian=False,
   )

   return left_vectors, singular_values, right_vectors_h


def get_top_k_latent_features(U, k):
   """Return the first ``k`` columns of ``U``, keeping every row."""
   leading_columns = slice(None, k)
   return U[:, leading_columns]


In [27]:
def plot_latent_features(S, threshold=100000):
   """Plot the singular values and report the ones above ``threshold``.

   Args:
      S: array of singular values; it must provide ``tolist()``.
      threshold: values strictly greater than this are printed, followed by
         how many there are. Defaults to 100000, the cut-off that used to be
         hard-coded in the body.
   """
   S_list = S.tolist()

   top_k_list = [elem for elem in S_list if elem > threshold]

   plt.plot(S_list, marker='o')  # 'marker' draws a dot at every value
   plt.title('Andamento continuo dei dati')
   plt.xlabel('Indice')
   plt.ylabel('Valore')
   plt.grid(True)
   plt.show()


   print(top_k_list)
   print(len(top_k_list))

def find_elbow_point(y):
   """Find the elbow of a curve as the point farthest from its chord.

   The curve is the sequence of points ``(i, y[i])``. The chord is the straight
   line from the first point to the last one; the elbow is the index whose
   perpendicular distance to that line is largest.

   Args:
      y: non-empty 1-D array-like of numbers (a list or a NumPy array).

   Returns:
      Index (as returned by ``np.argmax``) of the point farthest from the
      chord. A single-point input yields 0.

   Raises:
      ValueError: if ``y`` is empty or not one-dimensional.
   """
   # Accept plain Python sequences too; element-wise arithmetic below needs an array.
   y = np.asarray(y)
   if y.ndim != 1 or y.size == 0:
      raise ValueError("y must be a non-empty 1-D sequence")

   x = np.arange(len(y))

   # Reference line: from the first point to the last point.
   line_vec = np.array([x[-1] - x[0], y[-1] - y[0]])
   line_norm = np.linalg.norm(line_vec)
   # A single point gives a zero-length chord; skip the normalisation instead of
   # dividing by zero (which would fill every distance with NaN).
   if line_norm != 0:
      line_vec = line_vec / line_norm

   # Distance of every point from the reference line: subtract from each
   # point its projection onto the line and measure what is left.
   vec_from_first = np.stack((x - x[0], y - y[0]), axis=1)
   scalar_proj = np.dot(vec_from_first, line_vec)
   proj = np.outer(scalar_proj, line_vec)
   vec_to_line = vec_from_first - proj
   dist_to_line = np.linalg.norm(vec_to_line, axis=1)

   # Index with the largest distance.
   elbow_idx = np.argmax(dist_to_line)

   return elbow_idx

In [28]:
def serialize_latent_space(feature_space, U, path="/workspaces/dbm25/data/svd_latent_features.pt"):
   """Save each item's path, class and latent vector with ``torch.save``.

   Args:
      feature_space: records accepted by ``pd.DataFrame``; only the
         ``"file_path"`` and ``"class"`` columns are kept.
      U: array whose ``tolist()`` yields one latent vector per record.
      path: destination file handed to ``torch.save``.

   The saved object is a list of dicts with the keys ``"file_path"``,
   ``"class"`` and ``"svd_latent"``.
   """
   latent_rows = U.tolist()

   # Keep only the identifying columns, then attach the latent vectors row by row.
   frame = pd.DataFrame(feature_space)[["file_path", "class"]]
   frame["svd_latent"] = latent_rows

   records = [row.to_dict() for _, row in frame.iterrows()]

   torch.save(records, f=path)

**Find top k latent features:**

In [29]:
# Load the previously extracted features and choose which one to decompose.
# NOTE(review): depending on the torch version / weights_only setting, torch.load
# may unpickle arbitrary objects — only load files from a trusted source.
feature_space = torch.load("/workspaces/dbm25/data/extracted_features.pt")
feature = "hog"

# SVD of the matrix built from the selected feature of every item.
U, S, Vh = svd_decomposition(feature_space, feature)

# find the elbow point of the S values
elbow_idx = find_elbow_point(S)
print(f"Elbow point index: {elbow_idx}")
# extract the top k latent features based on the elbow point
# NOTE(review): k is the 0-based elbow index, so U[:, :k] keeps the k columns
# before the elbow and leaves the elbow column itself out — confirm this is
# intended (elbow_idx + 1 would include it).
k = elbow_idx
top_k_latent_features = get_top_k_latent_features(U, k)
# The output file name embeds k, so a different elbow produces a different file.
serialize_latent_space(feature_space, top_k_latent_features, path=f"/workspaces/dbm25/data/top_{k}_svd_latent_features.pt")



Elbow point index: 42


**Find latent feature weights:**

In [30]:
def extract_latent_features_weight(latent_feature_space):
   """Rank the latent-feature weights of every item, largest first.

   Args:
      latent_feature_space: iterable of mappings with the keys
         ``"svd_latent"`` (sequence of weights) and ``"file_path"`` (string).

   Returns:
      List of ``(image_id, ranked)`` tuples. ``image_id`` is the text after the
      last ``"/"`` of the file path. ``ranked`` is a list of
      ``("Latent Feature <i>: ", weight)`` pairs sorted by weight in descending
      order; equal weights keep their original index order.
   """
   def describe(entry):
      # Label each weight with its position before sorting so the index survives.
      ranked = sorted(
         (
            (f"Latent Feature {position}: ", value)
            for position, value in enumerate(entry["svd_latent"])
         ),
         key=lambda pair: pair[1],
         reverse=True,
      )
      image_id = entry["file_path"].rsplit("/", 1)[-1]
      return image_id, ranked

   return [describe(entry) for entry in latent_feature_space]

In [31]:
# Reload the latent features that the earlier cell serialized.
# NOTE(review): the "42" in the file name is hard-coded. It matches the elbow
# index printed earlier, but goes stale if k changes — consider building the
# path from k instead.
latent_feature_space = torch.load("/workspaces/dbm25/data/top_42_svd_latent_features.pt")

# For every item: (image file name, latent-feature weights sorted descending).
features_weight = extract_latent_features_weight(latent_feature_space)

# Show the ranking of the first item only.
print(features_weight[0])




('brain_glioma_0051.jpg', [('Latent Feature 10: ', 0.022267003272862993), ('Latent Feature 31: ', 0.02106213429556215), ('Latent Feature 15: ', 0.018750588877928066), ('Latent Feature 8: ', 0.012947843189133564), ('Latent Feature 38: ', 0.010463016688095673), ('Latent Feature 16: ', 0.010195627293968907), ('Latent Feature 22: ', 0.00938306858419813), ('Latent Feature 7: ', 0.007813018349134778), ('Latent Feature 20: ', 0.006428312048048199), ('Latent Feature 39: ', 0.005242971217400467), ('Latent Feature 28: ', 0.005222683414431162), ('Latent Feature 27: ', 0.005143947051331034), ('Latent Feature 11: ', 0.0050845687437505145), ('Latent Feature 3: ', 0.0045840243409848895), ('Latent Feature 36: ', 0.00223472937582133), ('Latent Feature 13: ', 0.0017734909009584432), ('Latent Feature 32: ', 0.0009282861818511777), ('Latent Feature 26: ', 0.0004552643854857938), ('Latent Feature 24: ', -0.00012317918662769047), ('Latent Feature 23: ', -0.0008848165295434401), ('Latent Feature 12: ', -0.00

In [32]:
# Recompute the SVD of the selected feature (same call as in the earlier cell).
U, S, Vh = svd_decomposition(feature_space, feature)
# Debug aid: uncomment to inspect the shapes of the three factors.
# print(f"U.shape: {U.shape}, S.shape: {S.shape}, Vh.shape: {Vh.shape}")

# U comes from a reduced SVD (full_matrices=False), so its columns are
# orthonormal and its rank is expected to equal its number of columns.
matrix_rank = np.linalg.matrix_rank(U)
print(f"Matrix rank: {matrix_rank}") # expected to equal K, the number of columns of U




Matrix rank: 900
