In [1]:
import os
import torch
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import dac

def load_dac_model(model_type="44khz", device="cpu"):
    """Load a pretrained DAC model."""
    model_path = dac.utils.download(model_type=model_type)
    model = dac.DAC.load(model_path)
    model = model.to(device)
    return model

def get_codebook_weights(model):
    """Returns a list of codebook weights: [codebook_size, codebook_dim] per codebook"""
    return [quant.codebook.weight.detach().cpu() for quant in model.quantizer.quantizers]

def tsne_project(vectors, seed=42):
    """Project codebook vectors into 2D using t-SNE"""
    tsne = TSNE(n_components=2, perplexity=30, init='pca', learning_rate='auto', n_iter=1000, random_state=seed)
    return tsne.fit_transform(vectors)

def plot_codebook_2d(vectors_2d, codebook_id, save_dir="plots"):
    os.makedirs(save_dir, exist_ok=True)
    plt.figure(figsize=(6, 6))
    plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c='blue', s=20, alpha=0.7)
    plt.title(f"Codebook {codebook_id} — t-SNE Projection")
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f"codebook_{codebook_id}_tsne.png"))
    plt.close()

def main():
    model = load_dac_model(device="cpu")
    codebook_weights = get_codebook_weights(model)

    for i, vectors in enumerate(codebook_weights):
        vectors_2d = tsne_project(vectors)
        plot_codebook_2d(vectors_2d, codebook_id=i)

    print("Saved all codebook visualizations to 'plots/' folder.")

if __name__ == "__main__":
    main()

  model_dict = torch.load(location, "cpu")
  WeightNorm.apply(module, name, dim)


Saved all codebook visualizations to 'plots/' folder.


In [2]:
import os
import torch
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import numpy as np
import dac
import warnings

# Suppress warnings that might interfere
warnings.filterwarnings('ignore', category=FutureWarning)

def load_dac_model(model_type="44khz", device="cpu"):
    model_path = dac.utils.download(model_type=model_type)
    model = dac.DAC.load(model_path)
    model = model.to(device)
    return model


def get_codebook_weights(model):
    """Returns a list of codebook weights: [codebook_size, codebook_dim] per codebook"""
    return [quant.codebook.weight.detach().cpu().numpy() for quant in model.quantizer.quantizers]


def simple_kmeans_numpy(X, n_clusters, max_iters=100, seed=42):
    """Simple K-means implementation using only numpy to avoid threading issues"""
    np.random.seed(seed)
    n_samples, n_features = X.shape
    
    # Initialize centroids randomly
    centroids = X[np.random.choice(n_samples, n_clusters, replace=False)]
    
    for _ in range(max_iters):
        # Assign points to closest centroid
        distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)
        
        # Update centroids
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(n_clusters)])
        
        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    
    return labels


def cluster_and_project(vectors, n_clusters=5, seed=42, method='hierarchical'):
    """Normalize, reduce, cluster, and project codebook vectors"""
    print(f"Processing {vectors.shape[0]} vectors of dimension {vectors.shape[1]}")
    
    # Check if we have enough samples for clustering
    if vectors.shape[0] < n_clusters:
        print(f"Warning: Only {vectors.shape[0]} vectors available, reducing clusters to {vectors.shape[0]}")
        n_clusters = vectors.shape[0]
    
    X = normalize(vectors, norm='l2')

    # PCA reduction
    n_components = min(50, X.shape[1], X.shape[0] - 1)
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X)
    print(f"PCA reduced to {X_reduced.shape[1]} dimensions")

    # Try different clustering methods
    print(f"Attempting clustering with method: {method}")
    
    try:
        if method == 'hierarchical':
            # Agglomerative clustering - no threading issues
            clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            labels = clustering.fit_predict(X_reduced)
            print(f"Hierarchical clustering completed with {n_clusters} clusters")
            
        elif method == 'spectral':
            # Spectral clustering
            clustering = SpectralClustering(n_clusters=n_clusters, random_state=seed, 
                                          affinity='nearest_neighbors', n_neighbors=min(10, X_reduced.shape[0]-1))
            labels = clustering.fit_predict(X_reduced)
            print(f"Spectral clustering completed with {n_clusters} clusters")
            
        elif method == 'dbscan':
            # DBSCAN - automatically determines number of clusters
            # Estimate eps using k-distance
            k = min(5, X_reduced.shape[0] - 1)
            neighbors = NearestNeighbors(n_neighbors=k)
            neighbors_fit = neighbors.fit(X_reduced)
            distances, indices = neighbors_fit.kneighbors(X_reduced)
            distances = np.sort(distances[:, k-1], axis=0)
            eps = np.percentile(distances, 90)  # Use 90th percentile as eps
            
            clustering = DBSCAN(eps=eps, min_samples=max(2, X_reduced.shape[0] // 20))
            labels = clustering.fit_predict(X_reduced)
            n_clusters_found = len(set(labels)) - (1 if -1 in labels else 0)
            print(f"DBSCAN clustering completed with {n_clusters_found} clusters (eps={eps:.3f})")
            
        elif method == 'numpy_kmeans':
            # Simple numpy K-means implementation
            labels = simple_kmeans_numpy(X_reduced, n_clusters, seed=seed)
            print(f"Numpy K-means clustering completed with {n_clusters} clusters")
            
        else:
            raise ValueError(f"Unknown clustering method: {method}")
            
    except Exception as e:
        print(f"Clustering method '{method}' failed with error: {e}")
        print("Falling back to simple distance-based clustering...")
        # Ultimate fallback: simple distance-based clustering
        np.random.seed(seed)
        centers = X_reduced[np.random.choice(X_reduced.shape[0], n_clusters, replace=False)]
        distances = np.sqrt(((X_reduced - centers[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)

    # t-SNE projection with adjusted perplexity
    perplexity = min(30, max(5, X_reduced.shape[0] // 4))
    print(f"Using perplexity: {perplexity}")
    
    try:
        tsne = TSNE(
            n_components=2, 
            perplexity=perplexity, 
            init='pca', 
            learning_rate='auto',
            n_iter=1000, 
            random_state=seed,
            n_jobs=1  # Force single-threaded to avoid issues
        )
        X_2d = tsne.fit_transform(X_reduced)
        print("t-SNE projection completed")
    except Exception as e:
        print(f"t-SNE failed with error: {e}")
        print("Falling back to PCA for 2D projection...")
        # Fallback: use PCA for 2D projection
        pca_2d = PCA(n_components=2)
        X_2d = pca_2d.fit_transform(X_reduced)

    return X_2d, labels


def plot_codebook_2d(vectors_2d, labels, codebook_id, save_dir="plots_heirarchical"):
    os.makedirs(save_dir, exist_ok=True)
    
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c=labels, cmap='tab10', s=30, alpha=0.7)
    plt.colorbar(scatter, label='Cluster')
    plt.title(f"Codebook {codebook_id} — 2D Projection with Clusters")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.grid(True, alpha=0.3)
    
    # Add some statistics to the plot
    n_clusters = len(set(labels))
    plt.text(0.02, 0.98, f'Vectors: {len(vectors_2d)}\nClusters: {n_clusters}', 
             transform=plt.gca().transAxes, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f"codebook_{codebook_id}_clusters.png"), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved plot for codebook {codebook_id}")


def main():
    print("Loading DAC model...")
    model = load_dac_model(device="cpu")
    print("Extracting codebook weights...")
    codebook_weights = get_codebook_weights(model)
    
    # Try different clustering methods in order of preference
    clustering_methods = ['hierarchical', 'numpy_kmeans', 'spectral', 'dbscan']
    
    print(f"Found {len(codebook_weights)} codebooks")
    for i, vectors in enumerate(codebook_weights):
        print(f"\n=== Processing Codebook {i} ===")
        print(f"Shape: {vectors.shape}")
        
        success = False
        for method in clustering_methods:
            try:
                print(f"Trying clustering method: {method}")
                X_2d, labels = cluster_and_project(vectors, n_clusters=5, method=method)
                plot_codebook_2d(X_2d, labels, codebook_id=i)
                success = True
                break
            except Exception as e:
                print(f"Method '{method}' failed: {e}")
                continue
        
        if not success:
            print(f"All clustering methods failed for codebook {i}")

    print("\nCompleted! Check the 'plots/' folder for visualizations.")


if __name__ == "__main__":
    # Alternative approach: Set environment variable to avoid threading issues
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    
    main()

Loading DAC model...
Extracting codebook weights...
Found 9 codebooks

=== Processing Codebook 0 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Using perplexity: 30
t-SNE projection completed
Saved plot for codebook 0

=== Processing Codebook 1 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Using perplexity: 30
t-SNE projection completed
Saved plot for codebook 1

=== Processing Codebook 2 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Using perplexity: 30
t-SNE

In [3]:
import os
import torch
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import numpy as np
import dac
import warnings

# Suppress warnings that might interfere
warnings.filterwarnings('ignore', category=FutureWarning)

def load_dac_model(model_type="44khz", device="cpu"):
    model_path = dac.utils.download(model_type=model_type)
    model = dac.DAC.load(model_path)
    model = model.to(device)
    return model


def get_codebook_weights(model):
    """Returns a list of codebook weights: [codebook_size, codebook_dim] per codebook"""
    return [quant.codebook.weight.detach().cpu().numpy() for quant in model.quantizer.quantizers]


def simple_kmeans_numpy(X, n_clusters, max_iters=100, seed=42):
    """Simple K-means implementation using only numpy to avoid threading issues"""
    np.random.seed(seed)
    n_samples, n_features = X.shape
    
    # Initialize centroids randomly
    centroids = X[np.random.choice(n_samples, n_clusters, replace=False)]
    
    for _ in range(max_iters):
        # Assign points to closest centroid
        distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)
        
        # Update centroids
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(n_clusters)])
        
        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    
    return labels


def cluster_and_project(vectors, n_clusters=5, seed=42, method='hierarchical'):
    """Normalize, reduce, cluster, and project codebook vectors"""
    print(f"Processing {vectors.shape[0]} vectors of dimension {vectors.shape[1]}")
    
    # Check if we have enough samples for clustering
    if vectors.shape[0] < n_clusters:
        print(f"Warning: Only {vectors.shape[0]} vectors available, reducing clusters to {vectors.shape[0]}")
        n_clusters = vectors.shape[0]
    
    X = normalize(vectors, norm='l2')

    # PCA reduction
    n_components = min(50, X.shape[1], X.shape[0] - 1)
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X)
    print(f"PCA reduced to {X_reduced.shape[1]} dimensions")

    # Try different clustering methods
    print(f"Attempting clustering with method: {method}")
    
    try:
        if method == 'hierarchical':
            # Agglomerative clustering - no threading issues
            clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            labels = clustering.fit_predict(X_reduced)
            print(f"Hierarchical clustering completed with {n_clusters} clusters")
            
        elif method == 'spectral':
            # Spectral clustering
            clustering = SpectralClustering(n_clusters=n_clusters, random_state=seed, 
                                          affinity='nearest_neighbors', n_neighbors=min(10, X_reduced.shape[0]-1))
            labels = clustering.fit_predict(X_reduced)
            print(f"Spectral clustering completed with {n_clusters} clusters")
            
        elif method == 'dbscan':
            # DBSCAN - automatically determines number of clusters
            # Estimate eps using k-distance
            k = min(5, X_reduced.shape[0] - 1)
            neighbors = NearestNeighbors(n_neighbors=k)
            neighbors_fit = neighbors.fit(X_reduced)
            distances, indices = neighbors_fit.kneighbors(X_reduced)
            distances = np.sort(distances[:, k-1], axis=0)
            eps = np.percentile(distances, 90)  # Use 90th percentile as eps
            
            clustering = DBSCAN(eps=eps, min_samples=max(2, X_reduced.shape[0] // 20))
            labels = clustering.fit_predict(X_reduced)
            n_clusters_found = len(set(labels)) - (1 if -1 in labels else 0)
            print(f"DBSCAN clustering completed with {n_clusters_found} clusters (eps={eps:.3f})")
            
        elif method == 'numpy_kmeans':
            # Simple numpy K-means implementation
            labels = simple_kmeans_numpy(X_reduced, n_clusters, seed=seed)
            print(f"Numpy K-means clustering completed with {n_clusters} clusters")
            
        else:
            raise ValueError(f"Unknown clustering method: {method}")
            
    except Exception as e:
        print(f"Clustering method '{method}' failed with error: {e}")
        print("Falling back to simple distance-based clustering...")
        # Ultimate fallback: simple distance-based clustering
        np.random.seed(seed)
        centers = X_reduced[np.random.choice(X_reduced.shape[0], n_clusters, replace=False)]
        distances = np.sqrt(((X_reduced - centers[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)

    # t-SNE projection with adjusted perplexity
    perplexity = min(30, max(5, X_reduced.shape[0] // 4))
    print(f"Using perplexity: {perplexity}")
    
    pca_2d = PCA(n_components=2)
    X_2d = pca_2d.fit_transform(X_reduced)

    return X_2d, labels


def plot_codebook_2d(vectors_2d, labels, codebook_id, save_dir="plots_pca"):
    os.makedirs(save_dir, exist_ok=True)
    
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c=labels, cmap='tab10', s=30, alpha=0.7)
    plt.colorbar(scatter, label='Cluster')
    plt.title(f"Codebook {codebook_id} — 2D Projection with Clusters")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.grid(True, alpha=0.3)
    
    # Add some statistics to the plot
    n_clusters = len(set(labels))
    plt.text(0.02, 0.98, f'Vectors: {len(vectors_2d)}\nClusters: {n_clusters}', 
             transform=plt.gca().transAxes, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f"codebook_{codebook_id}_clusters.png"), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved plot for codebook {codebook_id}")


def main():
    print("Loading DAC model...")
    model = load_dac_model(device="cpu")
    print("Extracting codebook weights...")
    codebook_weights = get_codebook_weights(model)
    
    # Try different clustering methods in order of preference
    clustering_methods = ['hierarchical', 'numpy_kmeans', 'spectral', 'dbscan']
    
    print(f"Found {len(codebook_weights)} codebooks")
    for i, vectors in enumerate(codebook_weights):
        print(f"\n=== Processing Codebook {i} ===")
        print(f"Shape: {vectors.shape}")
        
        success = False
        for method in clustering_methods:
            try:
                print(f"Trying clustering method: {method}")
                X_2d, labels = cluster_and_project(vectors, n_clusters=5, method=method)
                plot_codebook_2d(X_2d, labels, codebook_id=i)
                success = True
                break
            except Exception as e:
                print(f"Method '{method}' failed: {e}")
                continue
        
        if not success:
            print(f"All clustering methods failed for codebook {i}")

    print("\nCompleted! Check the 'plots/' folder for visualizations.")


if __name__ == "__main__":
    # Alternative approach: Set environment variable to avoid threading issues
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    
    main()

Loading DAC model...
Extracting codebook weights...
Found 9 codebooks

=== Processing Codebook 0 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Using perplexity: 30
Saved plot for codebook 0

=== Processing Codebook 1 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Using perplexity: 30
Saved plot for codebook 1

=== Processing Codebook 2 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Using perplexity: 30
Saved plot for codebook 2

=== Processing Codebook 3 ===
Sh

In [5]:
import os
import torch
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.neighbors import NearestNeighbors
import numpy as np
import dac
import warnings

# Suppress warnings that might interfere
warnings.filterwarnings('ignore', category=FutureWarning)

def load_dac_model(model_type="44khz", device="cpu"):
    model_path = dac.utils.download(model_type=model_type)
    model = dac.DAC.load(model_path)
    model = model.to(device)
    return model


def get_codebook_weights(model):
    """Returns a list of codebook weights: [codebook_size, codebook_dim] per codebook"""
    return [quant.codebook.weight.detach().cpu().numpy() for quant in model.quantizer.quantizers]


def simple_kmeans_numpy(X, n_clusters, max_iters=100, seed=42):
    """Simple K-means implementation using only numpy to avoid threading issues"""
    np.random.seed(seed)
    n_samples, n_features = X.shape
    
    # Initialize centroids randomly
    centroids = X[np.random.choice(n_samples, n_clusters, replace=False)]
    
    for _ in range(max_iters):
        # Assign points to closest centroid
        distances = np.sqrt(((X - centroids[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)
        
        # Update centroids
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(n_clusters)])
        
        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    
    return labels


def cluster_and_project(vectors, n_clusters=5, seed=42, method='hierarchical'):
    """Normalize, reduce, cluster, and project codebook vectors"""
    print(f"Processing {vectors.shape[0]} vectors of dimension {vectors.shape[1]}")
    
    # Check if we have enough samples for clustering
    if vectors.shape[0] < n_clusters:
        print(f"Warning: Only {vectors.shape[0]} vectors available, reducing clusters to {vectors.shape[0]}")
        n_clusters = vectors.shape[0]
    
    X = normalize(vectors, norm='l2')

    # PCA reduction
    n_components = min(50, X.shape[1], X.shape[0] - 1)
    pca = PCA(n_components=n_components)
    X_reduced = pca.fit_transform(X)
    print(f"PCA reduced to {X_reduced.shape[1]} dimensions")

    # Try different clustering methods
    print(f"Attempting clustering with method: {method}")
    
    try:
        if method == 'hierarchical':
            # Agglomerative clustering - no threading issues
            clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            labels = clustering.fit_predict(X_reduced)
            print(f"Hierarchical clustering completed with {n_clusters} clusters")
        
            from sklearn.metrics import silhouette_score
            try:
                sil_score = silhouette_score(X_reduced, labels)
                print(f"Silhouette Score: {sil_score:.3f}")
            except Exception as e:
                print(f"Silhouette score calculation failed: {e}")
            
        elif method == 'spectral':
            # Spectral clustering
            clustering = SpectralClustering(n_clusters=n_clusters, random_state=seed, 
                                          affinity='nearest_neighbors', n_neighbors=min(10, X_reduced.shape[0]-1))
            labels = clustering.fit_predict(X_reduced)
            print(f"Spectral clustering completed with {n_clusters} clusters")
            
        elif method == 'dbscan':
            # DBSCAN - automatically determines number of clusters
            # Estimate eps using k-distance
            k = min(5, X_reduced.shape[0] - 1)
            neighbors = NearestNeighbors(n_neighbors=k)
            neighbors_fit = neighbors.fit(X_reduced)
            distances, indices = neighbors_fit.kneighbors(X_reduced)
            distances = np.sort(distances[:, k-1], axis=0)
            eps = np.percentile(distances, 90)  # Use 90th percentile as eps
            
            clustering = DBSCAN(eps=eps, min_samples=max(2, X_reduced.shape[0] // 20))
            labels = clustering.fit_predict(X_reduced)
            n_clusters_found = len(set(labels)) - (1 if -1 in labels else 0)
            print(f"DBSCAN clustering completed with {n_clusters_found} clusters (eps={eps:.3f})")
            
        elif method == 'numpy_kmeans':
            # Simple numpy K-means implementation
            labels = simple_kmeans_numpy(X_reduced, n_clusters, seed=seed)
            print(f"Numpy K-means clustering completed with {n_clusters} clusters")
            
        else:
            raise ValueError(f"Unknown clustering method: {method}")
            
    except Exception as e:
        print(f"Clustering method '{method}' failed with error: {e}")
        print("Falling back to simple distance-based clustering...")
        # Ultimate fallback: simple distance-based clustering
        np.random.seed(seed)
        centers = X_reduced[np.random.choice(X_reduced.shape[0], n_clusters, replace=False)]
        distances = np.sqrt(((X_reduced - centers[:, np.newaxis])**2).sum(axis=2))
        labels = np.argmin(distances, axis=0)

    # t-SNE projection with adjusted perplexity
    perplexity = min(30, max(5, X_reduced.shape[0] // 4))
    print(f"Using perplexity: {perplexity}")
    
    try:
        tsne = TSNE(
            n_components=3, 
            perplexity=perplexity, 
            init='pca', 
            learning_rate='auto',
            n_iter=1000, 
            random_state=seed,
            n_jobs=1  # Force single-threaded to avoid issues
        )
        X_2d = tsne.fit_transform(X_reduced)
        print("t-SNE projection completed")
    except Exception as e:
        print(f"t-SNE failed with error: {e}")
        print("Falling back to PCA for 2D projection...")
        # Fallback: use PCA for 2D projection
        pca_2d = PCA(n_components=2)
        X_2d = pca_2d.fit_transform(X_reduced)

    return X_2d, labels


def plot_codebook_2d(vectors_2d, labels, codebook_id, save_dir="plots_silhouette"):
    os.makedirs(save_dir, exist_ok=True)
    
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c=labels, cmap='tab10', s=30, alpha=0.7)
    plt.colorbar(scatter, label='Cluster')
    plt.title(f"Codebook {codebook_id} — 2D Projection with Clusters")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.grid(True, alpha=0.3)
    
    # Add some statistics to the plot
    n_clusters = len(set(labels))
    plt.text(0.02, 0.98, f'Vectors: {len(vectors_2d)}\nClusters: {n_clusters}', 
             transform=plt.gca().transAxes, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f"codebook_{codebook_id}_clusters.png"), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved plot for codebook {codebook_id}")


def main():
    print("Loading DAC model...")
    model = load_dac_model(device="cpu")
    print("Extracting codebook weights...")
    codebook_weights = get_codebook_weights(model)
    
    # Try different clustering methods in order of preference
    clustering_methods = ['hierarchical', 'numpy_kmeans', 'spectral', 'dbscan']
    
    print(f"Found {len(codebook_weights)} codebooks")
    for i, vectors in enumerate(codebook_weights):
        print(f"\n=== Processing Codebook {i} ===")
        print(f"Shape: {vectors.shape}")
        
        success = False
        for method in clustering_methods:
            try:
                print(f"Trying clustering method: {method}")
                X_2d, labels = cluster_and_project(vectors, n_clusters=5, method=method)
                plot_codebook_2d(X_2d, labels, codebook_id=i)
                success = True
                break
            except Exception as e:
                print(f"Method '{method}' failed: {e}")
                continue
        
        if not success:
            print(f"All clustering methods failed for codebook {i}")

    print("\nCompleted! Check the 'plots/' folder for visualizations.")


if __name__ == "__main__":
    # Alternative approach: Set environment variable to avoid threading issues
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    
    main()

Loading DAC model...
Extracting codebook weights...
Found 9 codebooks

=== Processing Codebook 0 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Silhouette Score: 0.037
Using perplexity: 30
t-SNE projection completed
Saved plot for codebook 0

=== Processing Codebook 1 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering completed with 5 clusters
Silhouette Score: 0.027
Using perplexity: 30
t-SNE projection completed
Saved plot for codebook 1

=== Processing Codebook 2 ===
Shape: (1024, 8)
Trying clustering method: hierarchical
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
Attempting clustering with method: hierarchical
Hierarchical clustering comp

In [1]:
import os
import torch
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
import numpy as np
import dac
import umap
import hdbscan
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

def load_dac_model(model_type="44khz", device="cpu"):
    model_path = dac.utils.download(model_type=model_type)
    model = dac.DAC.load(model_path)
    return model.to(device)

def get_codebook_weights(model):
    return [quant.codebook.weight.detach().cpu().numpy() for quant in model.quantizer.quantizers]

def cluster_and_project(vectors, n_clusters=5, seed=42, method='agglomerative_cosine'):
    print(f"Processing {vectors.shape[0]} vectors of dimension {vectors.shape[1]}")
    X = normalize(vectors, norm='l2')

    # Reduce to up to 50 dims with PCA (for clustering stability)
    n_components = min(50, X.shape[1], X.shape[0] - 1)
    X_pca = PCA(n_components=n_components).fit_transform(X)
    print(f"PCA reduced to {X_pca.shape[1]} dimensions")

    # Clustering
    try:
        if method == "agglomerative_cosine":
            clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='average')
            labels = clustering.fit_predict(X_pca)
            print(f"Agglomerative (cosine) clustering completed with {n_clusters} clusters")

        elif method == "hdbscan":
            clusterer = hdbscan.HDBSCAN(
                metric='euclidean',  # or 'cosine' if you skip PCA
                min_cluster_size=10,
                min_samples=5,
                prediction_data=True
            )
            labels = clusterer.fit_predict(X_pca)
            n_clusters_found = len(set(labels)) - (1 if -1 in labels else 0)
            print(f"HDBSCAN found {n_clusters_found} clusters")

        else:
            raise ValueError(f"Unknown clustering method: {method}")

        # Silhouette Score
        try:
            valid = labels != -1  # skip noise points in HDBSCAN
            if valid.sum() >= 2:
                score = silhouette_score(X_pca[valid], labels[valid])
                print(f"Silhouette Score: {score:.3f}")
        except Exception as e:
            print(f"Silhouette score failed: {e}")

    except Exception as e:
        print(f"Clustering failed with error: {e}")
        labels = np.zeros(X.shape[0])

    # UMAP (non-linear 2D or 3D projection)
    try:
        reducer = umap.UMAP(n_components=2, random_state=seed, metric='cosine')
        X_proj = reducer.fit_transform(X)
        print("UMAP projection completed")
    except Exception as e:
        print(f"UMAP failed: {e}, falling back to t-SNE")
        X_proj = TSNE(n_components=2, perplexity=30, random_state=seed, n_iter=1000).fit_transform(X_pca)

    return X_proj, labels

def plot_codebook_2d(vectors_2d, labels, codebook_id, save_dir="plots_hdbscan_umap"):
    os.makedirs(save_dir, exist_ok=True)
    plt.figure(figsize=(8, 6))
    scatter = plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], c=labels, cmap='tab10', s=30, alpha=0.8)
    plt.colorbar(scatter, label='Cluster')
    plt.title(f"Codebook {codebook_id} — UMAP + Clustering")
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.grid(True, alpha=0.3)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    plt.text(0.02, 0.98, f'Vectors: {len(vectors_2d)}\nClusters: {n_clusters}', 
             transform=plt.gca().transAxes, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, f"codebook_{codebook_id}_clusters.png"), dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Saved plot for codebook {codebook_id}")

def main():
    print("Loading DAC model...")
    model = load_dac_model(device="cpu")
    print("Extracting codebook weights...")
    codebook_weights = get_codebook_weights(model)
    print(f"Found {len(codebook_weights)} codebooks")

    for i, vectors in enumerate(codebook_weights):
        print(f"\n=== Processing Codebook {i} ===")
        print(f"Shape: {vectors.shape}")

        # Try HDBSCAN first, fall back to agglomerative
        for method in ['hdbscan', 'agglomerative_cosine']:
            try:
                X_2d, labels = cluster_and_project(vectors, n_clusters=5, method=method)
                plot_codebook_2d(X_2d, labels, codebook_id=i)
                break
            except Exception as e:
                print(f"Method '{method}' failed: {e}")
                continue

    print("\nCompleted! Check the 'plots_hdbscan_umap/' folder for visualizations.")

if __name__ == "__main__":
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    main()


Loading DAC model...
Extracting codebook weights...
Found 9 codebooks

=== Processing Codebook 0 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 2 clusters
Silhouette Score: 0.468


  warn(


UMAP projection completed
Saved plot for codebook 0

=== Processing Codebook 1 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 2 clusters
Silhouette Score: 0.239


  warn(


UMAP projection completed
Saved plot for codebook 1

=== Processing Codebook 2 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 0 clusters


  warn(


UMAP projection completed
Saved plot for codebook 2

=== Processing Codebook 3 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 2 clusters
Silhouette Score: 0.071


  warn(


UMAP projection completed
Saved plot for codebook 3

=== Processing Codebook 4 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 4 clusters
Silhouette Score: 0.127


  warn(


UMAP projection completed
Saved plot for codebook 4

=== Processing Codebook 5 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 3 clusters
Silhouette Score: 0.079


  warn(


UMAP projection completed
Saved plot for codebook 5

=== Processing Codebook 6 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 3 clusters
Silhouette Score: 0.128


  warn(


UMAP projection completed
Saved plot for codebook 6

=== Processing Codebook 7 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 0 clusters


  warn(


UMAP projection completed
Saved plot for codebook 7

=== Processing Codebook 8 ===
Shape: (1024, 8)
Processing 1024 vectors of dimension 8
PCA reduced to 8 dimensions
HDBSCAN found 2 clusters
Silhouette Score: 0.048


  warn(


UMAP projection completed
Saved plot for codebook 8

Completed! Check the 'plots_hdbscan_umap/' folder for visualizations.
