In [85]:
from skimage.io import imread
from skimage import img_as_float
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import operator

In [78]:
image = imread('parrots.jpg')

In [79]:
image.shape

(474, 713, 3)

In [80]:
data = img_as_float(image)

In [81]:
data = data.reshape(474*713, -1)

In [82]:
data.shape

(337962, 3)

In [83]:
df = pd.DataFrame()
df['R'] = data[:, 0]
df['G'] = data[:, 1]
df['B'] = data[:, 2]

In [84]:
df

Unnamed: 0,R,G,B
0,0.011765,0.490196,0.015686
1,0.011765,0.490196,0.015686
2,0.007843,0.494118,0.007843
3,0.007843,0.494118,0.007843
4,0.007843,0.501961,0.011765
5,0.011765,0.505882,0.011765
6,0.007843,0.513725,0.015686
7,0.003922,0.517647,0.007843
8,0.007843,0.521569,0.011765
9,0.000000,0.525490,0.011765


In [65]:
clustersNumber = 16
kmn = KMeans(n_clusters=clustersNumber, init='k-means++', random_state=241)

In [66]:
kmn.fit(df)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=16, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=241, tol=0.0001,
    verbose=0)

In [67]:
df['cluster'] = kmn.labels_

In [68]:
df

Unnamed: 0,R,G,B,cluster
0,0.011765,0.490196,0.015686,7
1,0.011765,0.490196,0.015686,7
2,0.007843,0.494118,0.007843,7
3,0.007843,0.494118,0.007843,7
4,0.007843,0.501961,0.011765,7
5,0.011765,0.505882,0.011765,7
6,0.007843,0.513725,0.015686,7
7,0.003922,0.517647,0.007843,7
8,0.007843,0.521569,0.011765,7
9,0.000000,0.525490,0.011765,7


In [69]:
clusterMean = []
for cluster in range(0, clustersNumber):
    mean = np.mean(df[df.cluster == cluster])
    clusterMean.append([mean.R, mean.G, mean.B])

In [70]:
clusterMean

[[0.019205003565873562, 0.032140331402680702, 0.017161960406337225],
 [0.23472548113654587, 0.46043416288429079, 0.41329667243453849],
 [0.053536595726068961, 0.68365017819685581, 0.029152366789475175],
 [0.83133449246056557, 0.068416977873455256, 0.030609093774556544],
 [0.70428891884681288, 0.53296634845483981, 0.092362366970263815],
 [0.97994085340078418, 0.91280846939524196, 0.045614453639500029],
 [0.69892022909604212, 0.76237871810461011, 0.72096855359570899],
 [0.044835657120936012, 0.38893654850544934, 0.079484474309761702],
 [0.77563967368976394, 0.091351006146150771, 0.6892240389233194],
 [0.90125042228348629, 0.90161578438300594, 0.89143585685382831],
 [0.45683385303811264, 0.39687895283905728, 0.2987208216619735],
 [0.2028485518654313, 0.75533979657821626, 0.13787893940300594],
 [0.049917965023844692, 0.64774421480315392, 0.71336965200502078],
 [0.43619569303477379, 0.80824559242100258, 0.56908013406755475],
 [0.41382127247308115, 0.20421955793359484, 0.051376732613315039],

In [71]:
df_mean = df.copy()

In [72]:
for cluster in range(0, clustersNumber):
    df_mean.ix[df_mean.cluster==cluster, 'R'] = clusterMean[cluster][0]
    df_mean.ix[df_mean.cluster==cluster, 'G'] = clusterMean[cluster][1]
    df_mean.ix[df_mean.cluster==cluster, 'B'] = clusterMean[cluster][2]

In [73]:
df_mean

Unnamed: 0,R,G,B,cluster
0,0.044836,0.388937,0.079484,7
1,0.044836,0.388937,0.079484,7
2,0.044836,0.388937,0.079484,7
3,0.044836,0.388937,0.079484,7
4,0.044836,0.388937,0.079484,7
5,0.044836,0.388937,0.079484,7
6,0.044836,0.388937,0.079484,7
7,0.044836,0.388937,0.079484,7
8,0.044836,0.388937,0.079484,7
9,0.044836,0.388937,0.079484,7


In [74]:
mse = np.mean((df.values - df_mean.values) ** 2)
psnr = 10 * np.log10(float(1) / mse)

In [75]:
mse

0.008596485560229853

In [76]:
psnr

20.65679061983197

In [92]:
res = {}
for clustersNumber in range(2, 21):
    kmn = KMeans(n_clusters=clustersNumber, init='k-means++', random_state=241)
    df_work = df.copy()
    kmn.fit(df_work)
    df_work['cluster'] = kmn.labels_
    
    clusterMean = []
    for cluster in range(0, clustersNumber):
        mean = np.mean(df_work[df_work.cluster == cluster])
        clusterMean.append([mean.R, mean.G, mean.B])
        
    df_mean = df_work.copy()
    for cluster in range(0, clustersNumber):
        df_mean.ix[df_mean.cluster==cluster, 'R'] = clusterMean[cluster][0]
        df_mean.ix[df_mean.cluster==cluster, 'G'] = clusterMean[cluster][1]
        df_mean.ix[df_mean.cluster==cluster, 'B'] = clusterMean[cluster][2]
        
    mse = np.mean((df_work.values - df_mean.values) ** 2)
    psnr = -10 * np.log10(mse)
    res[clustersNumber] = psnr
    
res_sorted = sorted(res.items(), key=operator.itemgetter(1), reverse=True)
for x in res_sorted:
    print x

(20, 24.021213285200794)
(19, 23.825962948000303)
(18, 23.638773120852061)
(17, 23.426929792161459)
(16, 23.167874655394041)
(15, 22.910125927820204)
(14, 22.63340172831964)
(13, 22.326820391385791)
(12, 21.901256346430262)
(11, 21.432313525555191)
(10, 20.946082444318197)
(9, 20.416201023075395)
(8, 19.733989396954588)
(7, 18.948364156360526)
(6, 17.856055527260544)
(5, 16.840196150841553)
(4, 15.665705323836649)
(3, 14.457526695099766)
(2, 13.394436494793444)


In [93]:
res = {}
for clustersNumber in range(2, 21):
    kmn = KMeans(n_clusters=clustersNumber, init='k-means++', random_state=241)
    df_work = df.copy()
    kmn.fit(df_work)
    df_work['cluster'] = kmn.labels_
    
    clusterMean = []
    for cluster in range(0, clustersNumber):
        mean.R = np.median(df_work[df_work.cluster == cluster].R)
        mean.G = np.median(df_work[df_work.cluster == cluster].G)
        mean.B = np.median(df_work[df_work.cluster == cluster].B)
        clusterMean.append([mean.R, mean.G, mean.B])
        
    df_mean = df_work.copy()
    for cluster in range(0, clustersNumber):
        df_mean.ix[df_mean.cluster==cluster, 'R'] = clusterMean[cluster][0]
        df_mean.ix[df_mean.cluster==cluster, 'G'] = clusterMean[cluster][1]
        df_mean.ix[df_mean.cluster==cluster, 'B'] = clusterMean[cluster][2]
        
    mse = np.mean((df_work.values - df_mean.values) ** 2)
    psnr = -10 * np.log10(mse)
    res[clustersNumber] = psnr
    
res_sorted = sorted(res.items(), key=operator.itemgetter(1), reverse=True)
for x in res_sorted:
    print x

(20, 23.868258018109341)
(19, 23.669714259187657)
(18, 23.443969478940595)
(17, 23.234280002249143)
(16, 22.983202317363837)
(15, 22.715424478595065)
(14, 22.430192558241426)
(13, 22.105672465192043)
(12, 21.613491265973472)
(11, 21.1563229072556)
(10, 20.721279993104048)
(9, 20.118300305333015)
(8, 19.42783098807762)
(7, 18.650336518524362)
(6, 17.365000042162443)
(5, 16.504731922130034)
(4, 15.304385942699209)
(3, 13.93782117748893)
(2, 12.973115643954383)
