CÓDIGO FINAL:

In [None]:
import numpy as np

class OnlineKMeansCapacitado:
    """Online (streaming) k-means with a hard size limit per cluster.

    The first ``k`` points received become the initial centers of clusters
    ``0 .. k-1`` in arrival order. Every later point is assigned to the
    nearest center (Euclidean distance) among the clusters that still have
    free capacity, and that center is moved to the running mean of its
    members.

    Attributes:
        k: Number of clusters.
        capacities: Maximum number of points per cluster (private copy).
        remaining: Free slots left in each cluster.
        centers: Current center of every cluster created so far.
        counts: Number of points assigned to each cluster so far.
    """

    def __init__(self, k, capacities):
        """Create an empty model.

        Args:
            k: Number of clusters.
            capacities: Sequence with one maximum size per cluster.

        Raises:
            ValueError: If ``capacities`` does not contain exactly ``k``
                entries.
        """
        self.k = k
        # list(...) rather than capacities[:]: slicing a NumPy array returns
        # a view, which would make `capacities`, `remaining` and the caller's
        # array share memory (and slicing a tuple yields an immutable tuple).
        self.capacities = list(capacities)
        if len(self.capacities) != k:
            raise ValueError(
                f"Se esperaba una capacidad por cluster (k={k}), "
                f"pero se recibieron {len(self.capacities)}."
            )
        self.remaining = list(self.capacities)
        self.centers = []
        self.counts = []

    def partial_fit(self, x):
        """Assign one point to a cluster and update that cluster's center.

        Args:
            x: Array-like feature vector of the incoming point.

        Returns:
            int: Index of the cluster the point was assigned to.

        Raises:
            ValueError: If all ``k`` clusters exist and none of them has
                free capacity left.
        """
        x = np.asarray(x, dtype=float)

        # Seeding phase: each of the first k points founds its own cluster.
        # NOTE: the seed is stored even if that cluster's capacity is 0.
        if len(self.centers) < self.k:
            idx = len(self.centers)
            self.centers.append(x.copy())
            self.counts.append(1)
            self.remaining[idx] -= 1
            return idx

        # Without this guard every distance below would be inf and argmin
        # would silently pick cluster 0, exceeding its capacity.
        if not any(free > 0 for free in self.remaining):
            raise ValueError("No queda capacidad disponible en ningún cluster.")

        # Full clusters get an infinite distance so they can never win.
        distances = [
            np.linalg.norm(x - center) if free > 0 else np.inf
            for center, free in zip(self.centers, self.remaining)
        ]
        idx = int(np.argmin(distances))

        self.counts[idx] += 1
        self.remaining[idx] -= 1

        # Incremental mean: with eta = 1/n the center stays the average of
        # all points assigned to this cluster so far.
        eta = 1.0 / self.counts[idx]
        self.centers[idx] = (1.0 - eta) * self.centers[idx] + eta * x

        return idx

In [None]:
def dunn_index(X, labels):
    """Compute the Dunn index of a clustering.

    The index is the smallest Euclidean distance between two points that
    belong to different clusters, divided by the largest Euclidean distance
    between two points of the same cluster (the maximum cluster diameter).

    Args:
        X: Array-like with one sample per row.
        labels: Array-like with the cluster label of every sample.

    Returns:
        float: The Dunn index, or ``0.0`` when there are fewer than two
        clusters or when every cluster has zero diameter.
    """
    # Convert up front so plain Python lists work with boolean-mask indexing.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    clusters = np.unique(labels)

    if len(clusters) < 2:
        return 0.0

    # Flatten each sample into a vector so broadcasting below is uniform.
    X = X.reshape(len(X), -1)
    groups = [X[labels == c] for c in clusters]

    def _pairwise(a, b):
        # Distance matrix between all rows of a and all rows of b.
        # Builds a (len(a), len(b), n_features) temporary array.
        return np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)

    # Minimum distance between points of two different clusters.
    inter_cluster_dist = min(
        _pairwise(groups[i], groups[j]).min()
        for i in range(len(groups))
        for j in range(i + 1, len(groups))
    )

    # Maximum intra-cluster diameter; singleton clusters contribute nothing.
    intra_cluster_dist = max(
        (_pairwise(g, g).max() for g in groups if len(g) > 1),
        default=0.0,
    )

    if intra_cluster_dist == 0:
        return 0.0

    return float(inter_cluster_dist / intra_cluster_dist)

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.utils import shuffle
from sklearn.metrics import (
    silhouette_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
    adjusted_mutual_info_score
)

def dunn_index(X, labels):
    """Compute the Dunn index of a clustering.

    The index is the smallest Euclidean distance between two points that
    belong to different clusters, divided by the largest Euclidean distance
    between two points of the same cluster (the maximum cluster diameter).

    Args:
        X: Array-like with one sample per row.
        labels: Array-like with the cluster label of every sample.

    Returns:
        float: The Dunn index, or ``0.0`` when there are fewer than two
        clusters or when every cluster has zero diameter.
    """
    # Convert up front so plain Python lists work with boolean-mask indexing.
    X = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    clusters = np.unique(labels)

    if len(clusters) < 2:
        return 0.0

    # Flatten each sample into a vector so broadcasting below is uniform.
    X = X.reshape(len(X), -1)
    groups = [X[labels == c] for c in clusters]

    def _pairwise(a, b):
        # Distance matrix between all rows of a and all rows of b.
        # Builds a (len(a), len(b), n_features) temporary array.
        return np.linalg.norm(a[:, None, :] - b[None, :, :], axis=-1)

    # Minimum distance between points of two different clusters.
    inter_cluster_dist = min(
        _pairwise(groups[i], groups[j]).min()
        for i in range(len(groups))
        for j in range(i + 1, len(groups))
    )

    # Maximum intra-cluster diameter; singleton clusters contribute nothing.
    intra_cluster_dist = max(
        (_pairwise(g, g).max() for g in groups if len(g) > 1),
        default=0.0,
    )

    if intra_cluster_dist == 0:
        return 0.0

    return float(inter_cluster_dist / intra_cluster_dist)

# Load Iris: feature matrix and ground-truth class labels.
X, y_true = load_iris(return_X_y=True)

# ===== DATA SHUFFLING =====
# The model is order-dependent (the first k points seed the clusters), so the
# shuffle is seeded to make every metric below reproducible between runs.
SEED = 42
X, y_true = shuffle(X, y_true, random_state=SEED)

# Given parameters: number of clusters and maximum size of each one.
k = 3
capacities = [50, 50, 50]

# ===== THRESHOLD METRIC =====
# Maximum point-to-centroid distance counted as "within threshold".
threshold = 0.85

# ===== INPUT VALIDATION =====
# The capacities must add up to exactly the number of samples being streamed.
if sum(capacities) != X.shape[0]:
    raise ValueError(
        "La suma de los tamaños de clusters no coincide con el total "
        "de instancias del dataset."
    )

# ===== MODEL =====
model = OnlineKMeansCapacitado(k, capacities)

# ONLINE streaming + per-point L2 normalisation.
labels = []
X_norm = []
distancias = []

for x in X:
    # Scale the sample to unit L2 norm; an all-zero vector is left as is.
    norm = np.linalg.norm(x)
    x_norm = x / norm if norm > 0 else x
    X_norm.append(x_norm)

    label = model.partial_fit(x_norm)
    labels.append(label)

    # Distance from the point to its assigned centroid. partial_fit has
    # already moved the centroid, so this uses the updated position.
    centroide = model.centers[label]
    dist = np.linalg.norm(x_norm - centroide)
    distancias.append(dist)

X_norm = np.array(X_norm)
labels = np.array(labels)
distancias = np.array(distancias)

# ===== Metrics =====
# Internal validation (uses only the normalised data and the assignments).
silhouette = silhouette_score(X_norm, labels)
dunn = dunn_index(X_norm, labels)

# External validation (compares the assignments with the true classes).
ari = adjusted_rand_score(y_true, labels)
nmi = normalized_mutual_info_score(y_true, labels)
ami = adjusted_mutual_info_score(y_true, labels)

# ===== Threshold metric =====
# Fraction of points whose recorded distance is within the threshold.
threshold_ratio = float(np.mean(distancias <= threshold))

# Results
print("Tamaños de clusters:", np.bincount(labels))
print("\nValidación interna:")
print("  Silhouette:", round(silhouette, 4))
print("  Dunn index:", round(dunn, 4))

print("\nValidación externa:")
print("  ARI:", round(ari, 4))
print("  NMI:", round(nmi, 4))
print("  AMI:", round(ami, 4))

print("\nNueva métrica threshold:")
print("  Threshold (<=):", threshold)
print("  Proporción dentro del threshold:", round(threshold_ratio, 4))

Tamaños de clusters: [50 50 50]

Validación interna:
  Silhouette: 0.0932
  Dunn index: 0.0

Validación externa:
  ARI: 0.2422
  NMI: 0.268
  AMI: 0.2588

Nueva métrica threshold:
  Threshold (<=): 0.85
  Proporción dentro del threshold: 1.0
