# Basic EVoC Clustering Example

This notebook demonstrates basic usage of EVoC for clustering high-dimensional embedding vectors.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from evoc import EVoC
from sklearn.datasets import make_blobs

## Generate Sample Data

Let's create some sample high-dimensional data that mimics embedding vectors.

In [None]:
# Synthetic data standing in for real embedding vectors
n_samples, n_centers = 1000, 5
n_features = 512  # Common embedding dimension

blob_kwargs = dict(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_centers,
    cluster_std=2.0,
    random_state=42,  # fixed seed so the generated data is the same on every run
)
X, true_labels = make_blobs(**blob_kwargs)

# Rescale every row to unit length (common for embeddings)
row_norms = np.linalg.norm(X, axis=1, keepdims=True)
X = X / row_norms

# Summarise what was generated
n_true_clusters = np.unique(true_labels).size
print(
    f"Created {n_samples} samples with {n_features} dimensions",
    f"True number of clusters: {n_true_clusters}",
    f"Data shape: {X.shape}",
    sep="\n",
)

## Apply EVoC Clustering

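Now let's cluster the data using EVoC. The main parameters are written out explicitly below (at their default values) so they are easy to adjust, and `random_state` is fixed for reproducibility.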

In [None]:
# Configure the clusterer; settings are collected in one place so they are easy to tweak
evoc_params = dict(
    n_neighbors=15,
    noise_level=0.5,
    base_min_cluster_size=5,
    random_state=42,
)
clusterer = EVoC(**evoc_params)

# Fit the model and get one label per sample in a single call
labels = clusterer.fit_predict(X)

# Label -1 is counted as noise below; every non-negative label is a cluster id
is_clustered = labels >= 0
unique_labels, counts = np.unique(labels[is_clustered], return_counts=True)
n_clusters = len(unique_labels)
n_noise = (labels == -1).sum()

# Headline numbers
print(f"EVoC found {n_clusters} clusters")
print(f"Number of noise points: {n_noise}")
print(f"Number of clustering layers: {len(clusterer.cluster_layers_)}")

# Per-cluster membership counts (noise excluded)
for label, count in zip(unique_labels, counts):
    print(f"Cluster {label}: {count} points")