In [2]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score


class StreamKMeansPP:
    def __init__(self, n_clusters=8, max_iter=100, tol=1e-4, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None

    def fit(self, X):
        rng = np.random.default_rng(self.random_state)

        # Choose the first centroid uniformly at random from the dataset
        centroid_indices = [rng.integers(low=0, high=X.shape[0])]
        centroids = X[centroid_indices]

        # Choose the remaining centroids
        for _ in range(1, self.n_clusters):
            distances = np.array([min(np.linalg.norm(x - c)**2 for c in centroids) for x in X])
            probs = distances / np.sum(distances)
            centroid_indices.append(rng.choice(len(X), p=probs))
            centroids = X[centroid_indices]

        self.cluster_centers_ = centroids

        # Assign points to clusters
        self.labels_ = np.array([np.argmin([np.linalg.norm(x - c)**2 for c in centroids]) for x in X])

        # Update centroids iteratively
        for _ in range(self.max_iter):
            new_centroids = np.array([np.mean(X[self.labels_ == i], axis=0) for i in range(self.n_clusters)])

            # Check convergence
            if np.sum(np.linalg.norm(new_centroids - centroids, axis=1)) < self.tol:
                break

            centroids = new_centroids
            self.cluster_centers_ = centroids
            self.labels_ = np.array([np.argmin([np.linalg.norm(x - c)**2 for c in centroids]) for x in X])

        # Calculate inertia
        self.inertia_ = sum(min(np.linalg.norm(x - c)**2 for c in centroids) for x in X)

    def fit_predict(self, X):
        self.fit(X)
        return self.labels_

class SimpleCluStream:
    def __init__(self, k, beta, decay_factor):
        self.k = k
        self.beta = beta
        self.decay_factor = decay_factor
        self.centroids = np.zeros((k, num_attributes))
        self.counts = np.zeros(k)
        self.weights = np.zeros(k)

    def partial_fit(self, X):
        for x in X:
            min_dist = np.inf
            min_idx = -1
            for i, centroid in enumerate(self.centroids):
                dist = np.linalg.norm(x - centroid)
                if dist < min_dist:
                    min_dist = dist
                    min_idx = i
            self.centroids[min_idx] += self.decay_factor * (x - self.centroids[min_idx])
            self.counts[min_idx] += 1
            self.weights[min_idx] += self.decay_factor

            if np.sum(self.weights) > self.beta:
                nonzero_weights = self.weights[self.weights != 0]
                if len(nonzero_weights) > 0:
                    self.centroids[self.weights != 0] /= nonzero_weights[:, None]
                    self.weights = np.zeros(self.k)

    def predict(self, X):
        labels = []
        for x in X:
            min_dist = np.inf
            min_idx = -1
            for i, centroid in enumerate(self.centroids):
                dist = np.linalg.norm(x - centroid)
                if dist < min_dist:
                    min_dist = dist
                    min_idx = i
            labels.append(min_idx)
        return np.array(labels)


# Generating synthetic data stream
def generate_synthetic_data(num_attributes, num_data):
    X, _ = make_blobs(n_samples=num_data, n_features=num_attributes, centers=5, random_state=42)
    return X


# Purity calculation
def purity_score(y_true, y_pred):
    contingency_matrix = np.zeros((len(np.unique(y_true)), len(np.unique(y_pred))))
    for i in range(len(y_true)):
        contingency_matrix[y_true[i], y_pred[i]] += 1
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)


# Sum of Squared Error (SSE) calculation
def calculate_sse(X, centroids, labels):
    sse = 0
    for i in range(len(X)):
        centroid = centroids[labels[i]]
        sse += np.sum((X[i] - centroid) ** 2)
    return sse


# Applying StreamKM++ and simple CluStream and evaluating
def evaluate_clustering(X, k):
    # Applying StreamKM++
    streamkm_pp = StreamKMeansPP(n_clusters=k, random_state=42)
    streamkm_pp.fit(X)
    streamkm_pp_labels = streamkm_pp.labels_

    # Simple CluStream
    simple_clustream = SimpleCluStream(k=k, beta=100, decay_factor=0.01)
    for i in range(len(X)):
        simple_clustream.partial_fit([X[i]])
    clustream_labels = simple_clustream.predict(X)

    # Calculate evaluation metrics
    purity_streamkm_pp = purity_score(labels_true, streamkm_pp_labels)
    sse_streamkm_pp = calculate_sse(X, streamkm_pp.cluster_centers_, streamkm_pp_labels)
    silhouette_streamkm_pp = silhouette_score(X, streamkm_pp_labels)

    purity_clustream = purity_score(labels_true, clustream_labels)
    sse_clustream = calculate_sse(X, simple_clustream.centroids, clustream_labels)
    silhouette_clustream = silhouette_score(X, clustream_labels)

    # Print results
    print("StreamKM++:")
    print("Purity:", purity_streamkm_pp)
    print("SSE:", sse_streamkm_pp)
    print("Silhouette Score:", silhouette_streamkm_pp)

    print("\nCluStream:")
    print("Purity:", purity_clustream)
    print("SSE:", sse_clustream)
    print("Silhouette Score:", silhouette_clustream)


# Generating synthetic data stream
num_attributes = 10
num_data = 20000
X = generate_synthetic_data(num_attributes, num_data)

# Generate some random labels for evaluation
labels_true = np.random.randint(0, 5, size=num_data)

# Applying StreamKM++ and simple CluStream and evaluating
k = 5  # Number of clusters
evaluate_clustering(X, k)

StreamKM++:
Purity: 0.20875
SSE: 199905.34241787193
Silhouette Score: 0.7590772654456193

CluStream:
Purity: 0.2049
SSE: 2312435.5539868665
Silhouette Score: 0.5541789858672842
