In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import random
from random import randint
from PIL import Image

### The K-means algorithm is defined as follows:

In [2]:
# class K_means():
def initialize(X_data, centroids):
    """Switch data and centroids to one-row-per-item layout and add a label column.

    Parameters
    ----------
    X_data : array-like, shape (n_dims, n_observations)
        Observations stored column-wise.
    centroids : array-like, shape (n_dims, n_centroids)
        Initial centroids stored column-wise.

    Returns
    -------
    X_data : ndarray, shape (n_observations, n_dims + 1)
        Transposed observations with one extra trailing column of zeros; the
        other functions in this file store each observation's cluster index there.
    centroids : ndarray, shape (n_centroids, n_dims)
        Transposed centroids.
    """
    points = np.array(X_data).T
    centers = np.array(centroids).T
    # One zero per observation, shaped as a column so it can be glued on the right.
    label_column = np.zeros((points.shape[0], 1))
    return np.concatenate((points, label_column), axis=1), centers

def assign_cluster(X_data, centroids):
    """Label every observation with the index of its nearest centroid.

    Parameters
    ----------
    X_data : ndarray, shape (n_observations, n_dims + 1)
        Observations in rows; the last column is the cluster-label slot and is
        ignored when measuring distances.
    centroids : array-like, shape (n_centroids, n_dims)
        Current centroids, one per row.

    Returns
    -------
    ndarray, shape (n_observations, n_dims + 1)
        A copy of ``X_data`` whose last column holds the index of the closest
        centroid (Euclidean distance; ties go to the lowest index).

    Raises
    ------
    ValueError
        If the centroid dimension does not match the data dimension.
    """
    labelled = X_data.copy()  # work on a copy so the caller's array is untouched
    centroids = np.asarray(centroids)
    n_dims = labelled.shape[1] - 1
    n_centroids, centroid_dims = centroids.shape
    if centroid_dims != n_dims:
        # Without this check NumPy broadcasting could silently compare
        # vectors of different lengths.
        raise ValueError(
            'centroids have dimension %d but the data has dimension %d'
            % (centroid_dims, n_dims)
        )
    if labelled.shape[0] == 0:
        return labelled

    points = labelled[:, :n_dims]
    # Loop over the (few) centroids only; each pass measures the distance of
    # all observations to one centroid at once.
    distances = np.empty((labelled.shape[0], n_centroids))
    for j in range(n_centroids):
        distances[:, j] = np.linalg.norm(points - centroids[j, :], axis=1)

    labelled[:, n_dims] = np.argmin(distances, axis=1)
    return labelled

def update_centroids(X_data, centroids):
    """Recompute each centroid as the mean of the observations assigned to it.

    Parameters
    ----------
    X_data : array-like, shape (n_observations, n_dims + 1)
        Observations in rows; the last column holds each observation's
        cluster index.
    centroids : array-like, shape (n_centroids, n_dims)
        Previous centroids. Only their shape is used; the input is not modified.

    Returns
    -------
    ndarray of float, shape (n_centroids, n_dims)
        Updated centroids. A centroid with no assigned observations is set to
        all zeros.

    Raises
    ------
    ValueError
        If the centroid dimension does not match the data dimension.
    """
    X_data = np.asarray(X_data)
    n_centroids, n_dims = np.asarray(centroids).shape

    points = X_data[:, :-1].astype(float)
    labels = X_data[:, -1]
    if points.shape[1] != n_dims:
        raise ValueError(
            'centroids have dimension %d but the data has dimension %d'
            % (n_dims, points.shape[1])
        )

    # Start from zeros so that clusters without members keep the zero vector.
    new_centroids = np.zeros((n_centroids, n_dims), dtype=float)
    for k in range(n_centroids):
        members = points[labels == k]
        if members.shape[0] != 0:
            new_centroids[k, :] = members.sum(axis=0) / members.shape[0]

    return new_centroids

### Generate Data:

In [3]:
def generate_data(img_name='nature2.jpg', width=400, show=True):
    """Load an image, shrink it, and flatten it into a channels-by-pixels matrix.

    Parameters
    ----------
    img_name : str, optional
        Path of the image to load (default ``'nature2.jpg'``).
    width : int, optional
        Target width in pixels passed to ``image_compress`` (default 400).
    show : bool, optional
        If True (default), open the resized image in the system viewer.

    Returns
    -------
    X_data : ndarray of float, shape (K, N * M)
        One row per colour channel, one column per pixel (row-major order).
    shape : list
        ``[N, M, K]`` = image height, image width, number of channels.
    """
    # Force three 8-bit channels so greyscale / palette / RGBA inputs also
    # produce the (height, width, 3) array the rest of the notebook expects.
    im = image_compress(img_name, width).convert('RGB')
    if show:
        im.show()
    pix = np.array(im)

    N, M, K = pix.shape
    # (N, M, K) -> (N*M, K) keeps pixels in row-major order; transpose so each
    # channel becomes one row.
    X_data = pix.reshape(N * M, K).T.astype(float, order='C')
    return X_data, [N, M, K]

def image_compress(img_name, compression):
    """Resize an image to a given width, keeping its aspect ratio.

    The resized image is also written next to the source as
    ``img_name + 'new.jpg'``.

    Parameters
    ----------
    img_name : str
        Path of the image file to open.
    compression : int
        Target width in pixels; the height is scaled by the same factor.

    Returns
    -------
    PIL.Image.Image
        The resized image.
    """
    basewidth = compression
    # The context manager closes the source file once the pixels are resized.
    with Image.open(img_name) as img:
        wpercent = basewidth / float(img.size[0])
        # Never let the scaled height collapse to 0 for very wide images.
        hsize = max(1, int(float(img.size[1]) * wpercent))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        resized = img.resize((basewidth, hsize), Image.LANCZOS)
    resized.save(img_name+'new.jpg')

    return resized
    

### Implement the K-means algorithm:

In [4]:
# Load the image as a (channels, pixels) matrix; img_shape is [height, width, channels].
X_data, img_shape = generate_data()
N, M = X_data.shape    # here: N = dimension of data (channels), M = num. of observations (pixels)
K = 8                  # number of clusters
epsilon = 0.00001      # stop once the total centroid movement drops below this
MAX_ITER = 1000        # safety cap so the loop always terminates
SEED = 0               # fixed seed so the initial centroids are reproducible

# Pick K distinct pixels as initial centroids (sampling with replacement could
# select the same pixel twice and start with duplicate centroids).
rng = np.random.RandomState(SEED)
centroids = X_data[:, rng.choice(M, size=K, replace=False)]
error = 10
iteration = 0
X_data, centroids = initialize(X_data, centroids)
N, M = X_data.shape    # now: N = num. of observations, M = dimension of data + 1 (label column)
while error > epsilon and iteration < MAX_ITER:

    X_data1 = assign_cluster(X_data, centroids)

    centroids_new = update_centroids(X_data1, centroids)

    # Sum over centroids of the Euclidean distance each one moved.
    error = np.sqrt(np.square(centroids-centroids_new).sum(axis=1)).sum()

    centroids = centroids_new

    print('iteration',iteration, ', error =', error)
    iteration += 1
print('Centroids = ')
print(centroids)

iteration 0 , error = 136.2171759372303
iteration 1 , error = 54.44915634291858
iteration 2 , error = 41.433910117132754
iteration 3 , error = 32.38846799064559
iteration 4 , error = 26.101272965843094
iteration 5 , error = 21.821285598918045
iteration 6 , error = 18.972408601606986
iteration 7 , error = 16.117756167632628
iteration 8 , error = 14.2924811811499
iteration 9 , error = 12.014684853210197
iteration 10 , error = 10.299587967599583
iteration 11 , error = 8.644562373438484
iteration 12 , error = 7.242742423676734
iteration 13 , error = 6.038227034363046
iteration 14 , error = 4.675871020945304
iteration 15 , error = 3.985865801098882
iteration 16 , error = 3.473343912952699
iteration 17 , error = 3.3002508961976096
iteration 18 , error = 2.830821629327173
iteration 19 , error = 2.400466390689677
iteration 20 , error = 1.800317472065063
iteration 21 , error = 1.6262250480148086
iteration 22 , error = 1.3645304719872833
iteration 23 , error = 1.2357406095197658
iteration 24 , e

In [5]:
# One random "#RRGGBB" hex color string per cluster.
color = [
    "#" + ''.join(random.choice('0123456789ABCDEF') for _digit in range(6))
    for _cluster in range(K)
]
def hex_to_rgb(hex):
    """Convert a hex color string such as '#FF8000' into a tuple of integers.

    Leading '#' characters are stripped, the remaining digits are cut into
    chunks of ``len // 3`` characters, and each chunk is parsed as base 16.
    """
    digits = hex.lstrip('#')
    width = len(digits) // 3
    channels = []
    for start in range(0, len(digits), width):
        channels.append(int(digits[start:start + width], 16))
    return tuple(channels)
    
    
# Not used for the reconstruction below; kept so RGB_color still ends up as the
# random display color of the last cluster. Indexing K - 1 instead of 1 avoids
# an IndexError when K == 1.
RGB_color = hex_to_rgb(color[K - 1])

# Keep X_data1 as a DataFrame whose last column is named 'cluster'.
X_data1 = pd.DataFrame(X_data1)
X_data1 = X_data1.rename(columns = {M-1:'cluster'})

# Replace every pixel by the color of its cluster's centroid. Indexing the
# centroid table with the label vector keeps the pixels in their original
# order, so no per-cluster concat / dropna / sort is required.
labels = np.asarray(X_data1['cluster'], dtype=int)
X_mod_data = np.asarray(centroids, dtype=float)[labels]

# Back from (pixels, channels) to (height, width, channels).
X_mod_data = X_mod_data.reshape((img_shape[0], img_shape[1], img_shape[2] ) )
print(X_mod_data.shape)

(255, 400, 3)


### View results:

#### The clusters are not explicitly displayed here to keep the output neat. Instead, every pixel is replaced by the color of its cluster centroid, and the resulting image is shown and saved as follows:

In [6]:
# Centroid colors are floats: round to the nearest integer and clamp to the
# 8-bit range before casting, because a bare astype('uint8') truncates the
# fractional part and does not guard against out-of-range values.
pixels = np.clip(np.rint(np.asarray(X_mod_data, dtype=float)), 0, 255).astype('uint8')
image = Image.fromarray(pixels, 'RGB')
image.show()
image.save('nature2_K_means.jpg')