In [1]:
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances

import numpy as np
import pandas as pd

In [2]:
d = {
    'x' : [1,2,2,3,4,4,5,6,6,3,4,4,5,5,6],
    'y' : [4,4,5,3,4,5,5,4,6,4,1,3,1,2,1],
    'cluster' : ['c1','c1','c1','c3','c3','c3','c3','c3','c3','c2','c2','c2','c2','c2','c2']
}

df = pd.DataFrame(d)
df.head()

Unnamed: 0,x,y,cluster
0,1,4,c1
1,2,4,c1
2,2,5,c1
3,3,3,c3
4,4,4,c3


## Calculate the a-value for each and every data-points

In [3]:
dist = pdist(df[['x','y']], metric='euclidean')

## all the diagonal distance are 0, since a distance to the point it-self is 0.
dist_matrix = pd.DataFrame(squareform(dist))

dist_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,1.0,1.414214,2.236068,3.0,3.162278,4.123106,5.0,5.385165,2.0,4.242641,3.162278,5.0,4.472136,5.830952
1,1.0,0.0,1.0,1.414214,2.0,2.236068,3.162278,4.0,4.472136,1.0,3.605551,2.236068,4.242641,3.605551,5.0
2,1.414214,1.0,0.0,2.236068,2.236068,2.0,3.0,4.123106,4.123106,1.414214,4.472136,2.828427,5.0,4.242641,5.656854
3,2.236068,1.414214,2.236068,0.0,1.414214,2.236068,2.828427,3.162278,4.242641,1.0,2.236068,1.0,2.828427,2.236068,3.605551
4,3.0,2.0,2.236068,1.414214,0.0,1.0,1.414214,2.0,2.828427,1.0,3.0,1.0,3.162278,2.236068,3.605551


In [4]:
## lets calculate a-value for 1st point clusters
df.loc[df['cluster'] == 'c1']

Unnamed: 0,x,y,cluster
0,1,4,c1
1,2,4,c1
2,2,5,c1


<img src = 'images\cl-1 distance.jpg'>

In [5]:
## Similarly calculating the distances for the points in the cluster-2
df.loc[df['cluster'] == 'c2']

Unnamed: 0,x,y,cluster
9,3,4,c2
10,4,1,c2
11,4,3,c2
12,5,1,c2
13,5,2,c2
14,6,1,c2


In [6]:
## Similarly calculating the distances for the points in the cluster-3
df.loc[df['cluster'] == 'c3']

Unnamed: 0,x,y,cluster
3,3,3,c3
4,4,4,c3
5,4,5,c3
6,5,5,c3
7,6,4,c3
8,6,6,c3


In [7]:
sil_dict = {
    'a':[1.207, 1, 1.207, 2.777, 1.731, 1.742, 1.614, 2.163,
         2.544, 3.051, 1.915, 1.979, 1.768, 1.614, 2.297 ]
}

sil = pd.DataFrame(sil_dict)
sil = pd.concat([df,sil], axis=1)

sil.head()

Unnamed: 0,x,y,cluster,a
0,1,4,c1,1.207
1,2,4,c1,1.0
2,2,5,c1,1.207
3,3,3,c3,2.777
4,4,4,c3,1.731


## Calculate b-value, average distance to points in the nearest cluster

In [8]:
## Lets calculate b-value for the point(1,4) in cluster-1

<img src = 'images\1_4_b_1.jpg'>
<img src = 'images\1_4_b_2.jpg'>

In [9]:
## Similarly the b-value for all the points are calculted

sil['dist1'] = pd.Series([3.818, 2.881, 2.953, 1.962, 2.412, 2.466, 3.428, 4.374, 4.66, 1.47, 
                        4.11, 2.74, 4.74, 4.107, 5.496])

sil['dist2'] = pd.Series([4.118, 3.281, 3.936, 2.15, 2.33, 3.195, 3.286, 2.873, 4.469, 2.043,
                        3.725, 2.013, 3.729, 2.832, 3.968])

sil['b'] = sil.apply(lambda x: np.min([x.dist1, x.dist2]), axis=1)
sil.head()

Unnamed: 0,x,y,cluster,a,dist1,dist2,b
0,1,4,c1,1.207,3.818,4.118,3.818
1,2,4,c1,1.0,2.881,3.281,2.881
2,2,5,c1,1.207,2.953,3.936,2.953
3,3,3,c3,2.777,1.962,2.15,1.962
4,4,4,c3,1.731,2.412,2.33,2.33


## Silhouette coefficient for each point

In [10]:
sil['sil'] = sil.apply(lambda x : (x.b - x.a) / np.max([x.a, x.b]), axis=1)
sil.head()

Unnamed: 0,x,y,cluster,a,dist1,dist2,b,sil
0,1,4,c1,1.207,3.818,4.118,3.818,0.683866
1,2,4,c1,1.0,2.881,3.281,2.881,0.652898
2,2,5,c1,1.207,2.953,3.936,2.953,0.591263
3,3,3,c3,2.777,1.962,2.15,1.962,-0.293482
4,4,4,c3,1.731,2.412,2.33,2.33,0.257082


## Silhoutte Coefficient of a cluster = Average Silhouette coefficient of the points in their respective cluster

In [11]:
cluster_sils = pd.DataFrame(sil.groupby('cluster').sil.agg('mean'))
cluster_sils

Unnamed: 0_level_0,sil
cluster,Unnamed: 1_level_1
c1,0.642676
c2,0.226948
c3,0.240649


## Silhouette Coefficient of the whole clustering = Average Silhouette coefficient of the points in the data

In [12]:
print('Slihouette Coefficient of clustering : ',np.round(sil['sil'].mean(),3))

Slihouette Coefficient of clustering :  0.316
