## Simulate Data 

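To generate the GAUSSMIXTURE dataset, we sample k=5 centers from a 15-dimensional spherical Gaussian distribution with mean at the origin and variance $R\in \{1,10,100 \}$, and then add points drawn from unit-variance Gaussian distributions around each center. Note that the code cell below actually uses k = 20 centers and n = 10000 points, and, instead of a single variance $R$, draws the centers with a diagonal covariance whose entries cycle through 1, 10 and 100.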

In [3]:
## Simulation settings: k clusters, n points in total, d dimensions
k = 20
n = 10000
d = 15

## Draw the k cluster centers from a zero-mean Gaussian whose diagonal
## covariance repeats the variances 1, 10, 100 five times (15 entries)
mean = np.zeros(d)
cov = np.diag(np.tile([1, 10, 100], 5))
centers = np.random.multivariate_normal(mean, cov, k)

## Draw unit-variance Gaussian points around every center: each cluster gets
## n // k points and the first cluster also absorbs the n % k leftover points
per_cluster, leftover = divmod(n, k)
unit_cov = np.eye(d)
cluster_points = []
for i, mean in enumerate(centers):
    size = per_cluster + leftover if i == 0 else per_cluster
    cluster_points.append(np.random.multivariate_normal(mean, unit_cov, size))
data = np.concatenate(cluster_points, axis=0)

In [4]:
## Inspect the dimensions (rows, columns) of the simulated data array
data.shape

(10000, 15)

## KMeans




In [36]:
def KMeans(data, k, centroids, max_iter = 10000): 
    
    """ Apply the KMeans clustering algorithm
    
    Alternates between assigning every point to its nearest centroid
    (squared Euclidean distance) and moving each centroid to the mean of
    the points assigned to it, until the centroids stop changing or
    max_iter centroid updates have been performed.
    
    Parameters:
      data                        2-D array-like of points, one row per point
      k                           number of clusters (rows of centroids that are updated)
      centroids                   2-D array-like of initial centroids, one row per cluster
      max_iter                    maximum number of centroid updates
    
    Returns:
      A dictionary with the keys
      "Iteration before Coverge"  number of centroid updates performed
      "Centroids"                 the final centroids found by KMeans    
      "Labels"                    index of the nearest final centroid for each point   
    """
    
    ## accept plain nested lists as well as ndarrays
    data = np.asarray(data)
    centroids = np.asarray(centroids)
    iterations = 0
    
    while True:
        ## squared distance between each point and every centroid,
        ## shape (number of points, number of centroids)
        dist = np.sum((data[:, np.newaxis, :] - centroids)**2, axis=2)
        
        ## give each point the label of its nearest centroid
        cluster_label = np.argmin(dist, axis=1)
        
        ## Stop only after labelling, so the returned labels always match the
        ## returned centroids (also when max_iter is 0 or is exhausted)
        if iterations >= max_iter:
            break
        
        ## calculate new centroids
        newCentroids = np.zeros(centroids.shape)
        for j in range(k):
            members = cluster_label == j
            if members.any():
                newCentroids[j] = np.mean(data[members, :], axis=0)
            else:
                ## an empty cluster keeps its previous centroid
                newCentroids[j] = centroids[j]
        
        ## Check if it is converged: the update left every centroid unchanged
        if np.array_equal(centroids, newCentroids):
            print("Converge")
            break
        
        centroids = newCentroids
        iterations += 1
    
    return {"Iteration before Coverge": iterations, 
            "Centroids": centroids, 
            "Labels": cluster_label}

In [37]:
## Pick k distinct rows of data at random to serve as the starting centroids
initial_rows = np.random.choice(data.shape[0], k, replace=False)
centroids_initial = data[initial_rows, :]
KMeans(data, k, centroids_initial)

Converge


{'Centroids': array([[  0.26822612,   0.88701274,  -5.32107689,   0.09211889,
          -0.71405037,  -2.84441004,   0.03114272,   2.22844915,
          -1.98513624,  -0.31061524,   0.47862393,   9.55952361,
           0.33278855,  -0.50233611,  -0.38189866],
        [  0.2475699 ,   0.74202565,   9.16801481,   0.56615326,
          -3.21085381,  -7.56552279,  -0.0637888 ,   2.32174818,
          13.61411942,   0.63486105,  -0.77761489,  -2.83308631,
           0.2002892 ,   1.24559861,  -3.78487688],
        [  0.03561076,   3.61815552,  -0.48844399,   0.027959  ,
           2.19524088,  12.91910162,  -0.78744788,  -0.25747557,
           8.86137572,   0.08075333,   0.46134753,  -4.47722422,
          -0.28661607,  -2.73701299,  -8.06625162]]),
 'Iteration before Coverge': 8,
 'Labels': array([2, 2, 2, ..., 0, 0, 0])}