# Objective : Learning the usage of user-defined initial centroids in k-means clustering

# Question : 
# 1. Take a sample of 15 instances randomly from the iris data set 
# 2. Perform hierarchical clustering (with 3 clusters) using the sample data and compute the initial centroids
# 3. Perform k-means clustering using the centroids obtained in (2) for the full iris data set
# 4. Compare the above strategy with the default call using silhouette coefficient

# Step 1: Taking a sample of size 15

In [1]:
import random
# Draw 15 distinct row indices out of the 150 iris rows (indices 0..149).
# NOTE: the generator is not seeded, so a different sample is drawn on each run.
row_pool = range(150)
sa = random.sample(row_pool, k=15)
print("The sample rows are", sa)

The sample rows are [32, 6, 1, 64, 47, 14, 98, 81, 143, 28, 89, 70, 99, 3, 95]


# Step 2 : Slicing the 15 sampled rows of the iris data set

In [2]:
from sklearn.datasets import load_iris
# Load the iris data set and keep only the sampled rows (all columns)
da = load_iris()
X = da.data[sa, :]
X

array([[5.2, 4.1, 1.5, 0.1],
       [4.6, 3.4, 1.4, 0.3],
       [4.9, 3. , 1.4, 0.2],
       [5.6, 2.9, 3.6, 1.3],
       [4.6, 3.2, 1.4, 0.2],
       [5.8, 4. , 1.2, 0.2],
       [5.1, 2.5, 3. , 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [6.8, 3.2, 5.9, 2.3],
       [5.2, 3.4, 1.4, 0.2],
       [5.5, 2.5, 4. , 1.3],
       [5.9, 3.2, 4.8, 1.8],
       [5.7, 2.8, 4.1, 1.3],
       [4.6, 3.1, 1.5, 0.2],
       [5.7, 3. , 4.2, 1.2]])

# Step 3: Getting cluster members from the sample data

In [4]:
from sklearn.cluster import AgglomerativeClustering
# Ward-linkage agglomerative clustering of the sampled rows into 3 groups.
# The `affinity` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4, and Ward linkage only works with the
# Euclidean distance, which is already the default.
clusters = AgglomerativeClustering(n_clusters=3, linkage='ward').fit(X)
# One cluster index (0, 1 or 2) per sampled row, in the row order of X
labels = clusters.labels_



In [5]:
import numpy as np
# For each cluster id, collect the positions of its members in the sample.
# Each result is a 1-tuple holding an index array (same shape as np.where
# returns when called with only a condition).
cl0, cl1, cl2 = (np.nonzero(labels == cluster_id) for cluster_id in (0, 1, 2))

In [6]:
# Split the sample into one sub-matrix per cluster, using the member
# positions stored in the first element of each cl* tuple
X1 = X[cl0[0], :]
X2 = X[cl1[0], :]
X3 = X[cl2[0], :]

In [7]:
# Centroid of a cluster = column-wise mean of its member rows
m1 = X1.mean(axis=0)
m2 = X2.mean(axis=0)
m3 = X3.mean(axis=0)


In [8]:
# Show the three sample-based centroids, one per line
for caption, centroid in (
    ("The first centroid is ", m1),
    ("The second centroid is ", m2),
    ("The third centroids ", m3),
):
    print(caption, centroid)

The first centroid is  [4.98571429 3.45714286 1.4        0.2       ]
The second centroid is  [6.35 3.2  5.35 2.05]
The third centroids  [5.51666667 2.68333333 3.76666667 1.2       ]


# Calling k-means using the above choice of centroids

In [9]:
# Stack the three centroid vectors row-wise into one matrix
# (one row per centroid) to be used as the initial centres for k-means
my_centroids = np.vstack((m1, m2, m3))
my_centroids

array([[4.98571429, 3.45714286, 1.4       , 0.2       ],
       [6.35      , 3.2       , 5.35      , 2.05      ],
       [5.51666667, 2.68333333, 3.76666667, 1.2       ]])

In [10]:
# applying k-means clustering with the above choice
from sklearn.cluster import KMeans
# Pass our own centroids as the initial centres. With explicit centres there
# is only one possible initialisation, so n_init=1 is stated explicitly; this
# gives the same result and avoids the n_init warning scikit-learn emits
# when an init array is combined with the default n_init.
clustering = KMeans(n_clusters=3, init=my_centroids, n_init=1)

In [11]:
# Fit k-means on the full iris data set, then show the label of every row
cluster_labels = clustering.fit(da.data).labels_
print(cluster_labels)

  super()._check_params_vs_input(X, default_n_init=10)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1
 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1
 1 2]


In [12]:
# Read the final cluster centres from the fitted k-means model and display
# them (the fit itself happens in the earlier fit_predict call)
centers=clustering.cluster_centers_
centers

array([[5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85384615, 3.07692308, 5.71538462, 2.05384615],
       [5.88360656, 2.74098361, 4.38852459, 1.43442623]])

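# Conclusion: The first cluster (label 0) contains the flowers with the
# smallest sepal length and petal dimensions, the second cluster (label 1)
# contains the flowers with the largest ones, and the third cluster (label 2)
# contains flowers with intermediate dimensions. (Sepal width is the
# exception: it is largest in the first cluster.)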

In [13]:
# Baseline run: k-means with scikit-learn's own initialisation (no custom
# centroids). n_init is spelled out because its default differs between
# scikit-learn releases (10 in older versions, 'auto' from 1.4), and
# random_state is fixed so that the comparison below is reproducible.
clustering1 = KMeans(n_clusters=3, n_init=10, random_state=0)
# display the cluster labels
cluster_labels1 = clustering1.fit_predict(da.data)
print(cluster_labels1)



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]


In [14]:
from sklearn.metrics import silhouette_score

In [15]:
# Silhouette coefficient on the full data set for both labelings:
# s1 -> k-means started from our centroids, s2 -> baseline k-means call
s1, s2 = (
    silhouette_score(da.data, assignment)
    for assignment in (cluster_labels, cluster_labels1)
)
print(s1, s2)

0.5511916046195919 0.5528190123564095


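# The default call gives a marginally higher silhouette coefficient
# (0.5528 vs 0.5512) in this run. Hence, we recommend the default call here,
# noting that the gap is small and depends on the random sample drawn in Step 1.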