In [None]:
from collections import defaultdict
from typing import Dict, List

import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
def _summarize_similarities(values: np.ndarray) -> Dict:
    """
    Summarize a 1-D array of similarity values
    Args:
        values (np.ndarray): Similarity values to summarize (may be empty)

    Returns:
        Dict: "mean", "std", "min" and "max" of ``values``; every entry is NaN
            when ``values`` is empty
    """
    # np.min / np.max raise ValueError on empty input, so report NaN instead.
    if values.size == 0:
        return {"mean": np.nan, "std": np.nan, "min": np.nan, "max": np.nan}
    return {
        "mean": np.mean(values),
        "std": np.std(values),
        "min": np.min(values),
        "max": np.max(values),
    }


def analyze_embeddings(embeddings: np.ndarray, syndrome_ids: List) -> Dict:
    """
    Analyze the embeddings data with advanced statistics
    Args:
        embeddings (np.ndarray): Matrix of embeddings, one row per sample
        syndrome_ids (List): List of syndrome IDs, one per row of ``embeddings``

    Returns:
        Dict: Dictionary containing embedding statistics:
            - "embedding_dim": number of columns of ``embeddings``
            - "mean", "std", "min", "max", "skewness", "kurtosis":
              per-dimension statistics (computed along axis 0)
            - "explained_variance_ratio", "cumulative_variance_ratio":
              PCA explained variance ratios and their running sum
            - "within_class_similarity": mean/std/min/max of the cosine
              similarity over pairs of samples sharing a syndrome ID
            - "between_class_similarity": the same summary over pairs of
              samples with different syndrome IDs
            A similarity summary is all-NaN when no such pair exists.

    Raises:
        ValueError: If ``syndrome_ids`` is not one-dimensional or its length
            differs from the number of rows in ``embeddings``
    """
    labels = np.asarray(syndrome_ids)
    n_samples = embeddings.shape[0]
    if labels.ndim != 1 or labels.shape[0] != n_samples:
        raise ValueError(
            "syndrome_ids must contain exactly one ID per embedding row "
            f"(got shape {labels.shape} for {n_samples} rows)"
        )

    # Basic per-dimension statistics
    embedding_stats = {
        "embedding_dim": embeddings.shape[1],
        "mean": np.mean(embeddings, axis=0),
        "std": np.std(embeddings, axis=0),
        "min": np.min(embeddings, axis=0),
        "max": np.max(embeddings, axis=0),
        "skewness": stats.skew(embeddings, axis=0),
        "kurtosis": stats.kurtosis(embeddings, axis=0),
    }

    # Only the fitted variance ratios are needed, not the projected data.
    pca = PCA()
    pca.fit(embeddings)
    embedding_stats["explained_variance_ratio"] = pca.explained_variance_ratio_
    embedding_stats["cumulative_variance_ratio"] = np.cumsum(
        pca.explained_variance_ratio_
    )

    # Compute every pairwise cosine similarity once, keep each unordered pair
    # a single time (strict upper triangle), then split the pairs by whether
    # both samples carry the same syndrome ID. Counting each pair once instead
    # of twice leaves mean/std/min/max unchanged.
    similarity = cosine_similarity(embeddings)
    rows, cols = np.triu_indices(n_samples, k=1)
    pair_similarity = similarity[rows, cols]
    same_syndrome = np.asarray(labels[rows] == labels[cols], dtype=bool)

    embedding_stats["within_class_similarity"] = _summarize_similarities(
        pair_similarity[same_syndrome]
    )
    embedding_stats["between_class_similarity"] = _summarize_similarities(
        pair_similarity[~same_syndrome]
    )
    return embedding_stats

In [None]:
def calculate_separation_metrics(embeddings: np.ndarray, syndrome_ids: List) -> Dict:
    """
    Calculate metrics that indicate how well the embeddings separate different syndromes

    Each syndrome is represented by its centroid (the mean of its embedding
    rows); separation is the Euclidean distance between pairs of centroids.

    Args:
        embeddings (np.ndarray): Matrix of embeddings, one row per sample
        syndrome_ids (List): List of syndrome IDs, one per row of ``embeddings``

    Returns:
        Dict: Dictionary containing separation metrics:
            - "centroid_distances": dict mapping "<id1>-<id2>" to the distance
              between the two centroids, one entry per unordered pair
            - "mean_centroid_distance", "min_centroid_distance",
              "max_centroid_distance": summary of those distances; NaN when
              fewer than two distinct syndromes are present
    """
    # Build the label array once instead of once per syndrome.
    labels = np.asarray(syndrome_ids)

    # Plain set iteration order depends on string hash randomization, which
    # would make the "<id1>-<id2>" key names vary between runs. Sort when the
    # IDs are orderable; otherwise fall back to first-appearance order.
    try:
        unique_syndromes = sorted(set(syndrome_ids))
    except TypeError:
        unique_syndromes = list(dict.fromkeys(syndrome_ids))

    # Calculate syndrome centroids
    centroids = {
        syndrome: np.mean(embeddings[labels == syndrome], axis=0)
        for syndrome in unique_syndromes
    }

    # Calculate inter-centroid distances, one entry per unordered pair
    centroid_distances = {}
    for i, syndrome1 in enumerate(unique_syndromes):
        for syndrome2 in unique_syndromes[i + 1 :]:
            centroid_distances[f"{syndrome1}-{syndrome2}"] = np.linalg.norm(
                centroids[syndrome1] - centroids[syndrome2]
            )

    distances = np.asarray(list(centroid_distances.values()))
    if distances.size:
        mean_distance = np.mean(distances)
        min_distance = np.min(distances)
        max_distance = np.max(distances)
    else:
        # np.min / np.max raise on empty input; with fewer than two syndromes
        # there is no pair to measure, so report NaN.
        mean_distance = min_distance = max_distance = np.nan

    return {
        "centroid_distances": centroid_distances,
        "mean_centroid_distance": mean_distance,
        "min_centroid_distance": min_distance,
        "max_centroid_distance": max_distance,
    }