# Analyze the visual embedding space and the semantic embedding space using given categories

## Define useful tools and metrics

In [3]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, confusion_matrix

In [1]:
def cluster_embeddings(embeddings_array, k):
    """Partition the embeddings into ``k`` groups with k-means.

    Args:
        embeddings_array: Data handed directly to ``KMeans.fit``.
        k: Number of clusters to form.

    Returns:
        A ``(labels, centroids)`` tuple read from the fitted model's
        ``labels_`` and ``cluster_centers_`` attributes.
    """
    # A fixed seed keeps the partition reproducible across runs.
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(embeddings_array)

    return model.labels_, model.cluster_centers_

In [13]:
def analyze_label_relationship(labels1, labels2):
    """Score how well two labelings of the same samples agree.

    Args:
        labels1: First labeling; passed as the first argument to every metric.
        labels2: Second labeling; passed as the second argument.

    Returns:
        A tuple ``(ari, nmi, purity, cm)`` holding the Adjusted Rand Index,
        the Normalized Mutual Information, the purity score and the matrix
        returned by ``confusion_matrix(labels1, labels2)``.
    """
    agreement_ari = adjusted_rand_score(labels1, labels2)
    agreement_nmi = normalized_mutual_info_score(labels1, labels2)

    # Contingency table of the two labelings.
    table = confusion_matrix(labels1, labels2)

    # Purity: the largest count of every row, summed, over the grand total.
    dominant_total = table.max(axis=1).sum()
    purity_score = dominant_total / table.sum()

    return agreement_ari, agreement_nmi, purity_score, table


In [None]:
def analyze(labels, embeddings):
    """Cluster the embeddings and compare the clusters with the given labels.

    The embeddings are clustered with ``cluster_embeddings`` using one cluster
    per distinct value in ``labels``; the resulting assignment is then scored
    against ``labels`` with ``analyze_label_relationship`` and the results are
    printed.

    Args:
        labels: Per-sample category labels, in the same order as the rows of
            ``embeddings``.
        embeddings: Per-sample embedding vectors to cluster.

    Returns:
        A tuple ``(ari, nmi, purity, cm, cluster_centroids)``.
    """
    # Use the number of distinct categories as the cluster count; using
    # len(labels) would request one cluster per sample.
    k = len(np.unique(labels))
    cluster_labels, cluster_centroids = cluster_embeddings(embeddings, k)

    # Compare against the `labels` argument (the original referenced an
    # undefined name here, which raised NameError).
    ari, nmi, purity, cm = analyze_label_relationship(cluster_labels, labels)

    print('The cluster centroids are:\n', cluster_centroids)

    print("Adjusted Rand Index:", ari)
    print("Normalized Mutual Information:", nmi)
    print('Purity:', purity)
    print("Contingency Matrix (confusion matrix):\n", cm)
    return ari, nmi, purity, cm, cluster_centroids
    

In [None]:
def align_labels(labels_df, embeddings):
    """Build label and embedding arrays that share the row order of ``labels_df``.

    Args:
        labels_df: DataFrame with a ``'hanzi'`` column (lookup keys) and a
            ``'label'`` column (category of each row).
        embeddings: Mapping indexed by the values of the ``'hanzi'`` column;
            each lookup yields that entry's embedding vector.

    Returns:
        A tuple ``(labels, result_embeddings)``: the ``'label'`` column as a
        NumPy array, and an array with one embedding per row of ``labels_df``.

    Raises:
        KeyError: If a ``'hanzi'`` value is missing from a dict-like
            ``embeddings``.
    """
    result_embeddings = np.array(
        [embeddings[hanzi] for hanzi in labels_df['hanzi'].to_numpy()]
    )
    # `.values` is already an ndarray and has no `.to_array()` method, so the
    # original call raised AttributeError; convert the Series directly.
    return labels_df['label'].to_numpy(), result_embeddings