In [None]:
import numpy as np

def convert(centroids, labels, mu):
    """
    Relabel predicted cluster ids so they follow the order of the generating distributions.

    A clustering algorithm numbers its clusters arbitrarily: the samples drawn from
    distribution 0 may well end up in cluster 2. To compare predictions with the
    ground truth, every cluster centre (centroid for K-means, mean for a GMM) is
    matched to the distribution whose mean is closest in Euclidean distance, and each
    sample receives the index of the distribution matched to its cluster.

    Note: every cluster picks its nearest mean independently, so two clusters can be
    mapped to the same distribution. Ties go to the lowest distribution index.

    Parameters:
    centroids (ndarray): The cluster centres, of shape (n_clusters, n_features).
    labels (ndarray): The cluster assignment of each data point, of shape (n_samples,).
    mu (ndarray or sequence of ndarrays): The means of the distributions used to
        generate the data, of shape (n_distributions, n_features).

    Returns:
    prediction (ndarray): For each entry of `labels`, the index of the matched distribution.

    Raises:
    ValueError: If a label is not a valid row index into `centroids`.
    """
    centroids = np.asarray(centroids, dtype=float)
    mu = np.asarray(mu, dtype=float)
    labels = np.asarray(labels)

    # Treat 1-D inputs as a column of scalar features so broadcasting below works.
    if centroids.ndim == 1:
        centroids = centroids[:, None]
    if mu.ndim == 1:
        mu = mu[:, None]

    # dist[i, j] = Euclidean distance between centroid i and distribution mean j.
    dist = np.linalg.norm(centroids[:, None, :] - mu[None, :, :], axis=-1)
    # argmin returns the first minimum, i.e. ties go to the lowest distribution index.
    cluster_to_dist = np.argmin(dist, axis=1)

    if labels.size == 0:
        return np.empty(0, dtype=cluster_to_dist.dtype)

    # Fancy indexing needs integer labels; accept integer-valued floats/bools too.
    if not np.issubdtype(labels.dtype, np.integer):
        as_int = labels.astype(np.intp)
        if not np.array_equal(as_int, labels):
            raise ValueError("labels must be integer cluster indices")
        labels = as_int

    # An unknown label would otherwise be silently mis-mapped (negative indices wrap
    # around) or dropped, leaving the result misaligned with the ground truth.
    n_clusters = centroids.shape[0]
    if labels.min() < 0 or labels.max() >= n_clusters:
        raise ValueError(
            "labels must lie in [0, %d); got values in [%d, %d]"
            % (n_clusters, labels.min(), labels.max())
        )

    return cluster_to_dist[labels]


def objective(X, centroids, labels):
    """
    Average squared distance between every sample and the centroid of its cluster.

    This is the K-means within-cluster sum of squared Euclidean distances, divided
    by the number of samples.

    Parameters:
    X (ndarray): The input data of shape (n_samples, n_features).
    centroids (ndarray): The cluster centroids of shape (n_clusters, n_features).
    labels (ndarray): The cluster assignment of each data point, of shape (n_samples,).

    Returns:
    float: The summed squared distances divided by n_samples.
    """
    sample_count = X.shape[0]
    cluster_count = centroids.shape[0]
    total_sq_dist = 0

    for cluster_id in range(cluster_count):
        member_rows = np.where(labels == cluster_id)[0]
        # A cluster without members contributes nothing to the sum.
        if member_rows.size == 0:
            continue
        # Per-sample Euclidean distance to this cluster's centroid, then squared.
        offsets = X[member_rows] - centroids[cluster_id]
        total_sq_dist += np.sum(np.linalg.norm(offsets, axis=1) ** 2)

    return total_sq_dist / sample_count

In [None]:
# Sweep the covariance scale `sigma`: for every value draw a fresh dataset from the
# three generating distributions, fit K-means and a GMM, and record each model's
# objective and accuracy. The results are plotted against sigma at the end of the cell.

# Initialisation strategies, kept in one place so that the figure title always
# reports what was actually run. Set either one to 'random' to compare initialisations.
kmeans_init = 'k-means++'
gmm_init = 'kmeans'  # scikit-learn's default for GaussianMixture(init_params=...)

# Initialize lists to store the results (one entry per sigma value)
kmeans_objective = []
kmeans_accuracy = []
gmm_objective = []
gmm_accuracy = []

# Loop over the sigma values and generate datasets
for sigma in sigma_range:
    # Generate the dataset: 100 samples per distribution, covariance scaled by sigma.
    # Rows are stacked in the order a, b, c.
    a = np.random.multivariate_normal(mu_a, sigma_a*sigma, 100)
    b = np.random.multivariate_normal(mu_b, sigma_b*sigma, 100)
    c = np.random.multivariate_normal(mu_c, sigma_c*sigma, 100)
    data = np.concatenate([a, b, c], axis=0)

    # K-means clustering
    kmeans = KMeans(n_clusters=n_clusters, init=kmeans_init, random_state=0).fit(data)

    kmeans_objective.append(objective(data, kmeans.cluster_centers_, kmeans.labels_))
    # Map cluster ids onto distribution ids before comparing with the ground truth.
    # NOTE(review): `mu` and `ground_truth` come from an earlier cell; presumably
    # ground_truth holds the generating distribution index of each row of `data`.
    prediction = convert(kmeans.cluster_centers_, kmeans.labels_, mu)
    kmeans_accuracy.append(np.mean(prediction == ground_truth))

    # GMM with EM algorithm
    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full',
                          init_params=gmm_init, random_state=0).fit(data)

    labels = gmm.predict(data)
    # gmm.score is the per-sample average log-likelihood; negate it so that, like the
    # K-means objective, lower is better. The two curves are not on the same scale.
    gmm_objective.append(-gmm.score(data))
    prediction = convert(gmm.means_, labels, mu)
    gmm_accuracy.append(np.mean(prediction == ground_truth))

# Plot the results
fig, axs = plt.subplots(2, sharex=True, figsize=(8,8))

axs[0].set_title('Clustering Objective vs Sigma')
axs[0].plot(sigma_range, kmeans_objective, label='K-means')
axs[0].plot(sigma_range, gmm_objective, label='GMM')
axs[0].set_ylabel('Clustering Objective')
axs[0].legend()

axs[1].set_title('Clustering Accuracy vs Sigma')
axs[1].plot(sigma_range, kmeans_accuracy, label='K-means')
axs[1].plot(sigma_range, gmm_accuracy, label='GMM')
axs[1].set_ylabel('Clustering Accuracy')
axs[1].set_xlabel('Sigma')
axs[1].legend()

# Build the title from the settings actually used above, so it cannot go stale
# when an initialisation strategy is switched.
fig.suptitle(f'K-means with {kmeans_init} init, GMM with {gmm_init} init')

# Adjust spacing between subplots
fig.subplots_adjust(hspace=0.3)

plt.show()