In [57]:
import pandas as pd
import numpy as np
from sklearn import datasets
from numpy.random import choice
from numpy.random import seed
from sklearn.preprocessing import MinMaxScaler


iris = datasets.load_iris()
data = pd.DataFrame(iris.data,columns = iris.feature_names)

target = iris.target_names
labels = iris.target

scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)


datapoints = data.to_numpy()
m, f = datapoints.shape
k = 3


def init_medoids(X, k):
    seed(1)
    samples = choice(len(X), size=k, replace=False)
    return X[samples, :]

def compute_d_p(X, medoids, p):
    m = len(X)
    medoids_shape = medoids.shape

    if len(medoids_shape) == 1: 
        medoids = medoids.reshape((1,len(medoids)))

    k = len(medoids)
    
    S = np.empty((m, k))
    
    for i in range(m):
        d_i = np.linalg.norm(X[i, :] - medoids, ord=p, axis=1)
        S[i, :] = d_i**p

    return S

def assign_labels(S):
    return np.argmin(S, axis=1)


def update_medoids(X, medoids, p):
    S = compute_d_p(datapoints, medoids, p)
    labels = assign_labels(S)
        
    out_medoids = medoids
                
    for i in set(labels):
        
        avg_dissimilarity = np.sum(compute_d_p(datapoints, medoids[i], p))

        cluster_points = datapoints[labels == i]
        
        for datap in cluster_points:
            new_medoid = datap
            new_dissimilarity= np.sum(compute_d_p(datapoints, datap, p))
            
            if new_dissimilarity < avg_dissimilarity :
                avg_dissimilarity = new_dissimilarity
                
                out_medoids[i] = datap
                
    return out_medoids


def has_converged(old_medoids, medoids):
    return set([tuple(x) for x in old_medoids]) == set([tuple(x) for x in medoids])


def kmedoids(X, k, p, max_steps=np.inf):
    medoids = init_medoids(X, k)
        
    converged = False
    labels = np.zeros(len(X))
    i = 1
    while (not converged) and (i <= max_steps):
        old_medoids = medoids.copy()
        
        S = compute_d_p(X, medoids, p)
        
        labels = assign_labels(S)
        
        medoids = update_medoids(X, medoids, p)
        
        converged = has_converged(old_medoids, medoids)
        i += 1
    return (medoids,labels)


max_epoch = 10000


medoids_initial = init_medoids(datapoints, 3)
S = compute_d_p(datapoints, medoids_initial, 2) 
labels = assign_labels(S)
results = kmedoids(datapoints, 3, 2, max_steps=max_epoch)
final_medoids = results[0]
data['clusters'] = results[1]


In [58]:
sse = 0

for i in range(data.shape[0]):
  point = [data.iloc[i, 0], data.iloc[i, 1], data.iloc[i, 2], data.iloc[i, 3]]
  ind = data.iloc[i,4]
  sse += (np.linalg.norm(point - results[0][ind]))**2


print("The SSE for the k medoid method is : ", sse)
print("The silhouette score for k medoid method is : ", silhouette_score(data, labels))

The SSE for the k medoid method is :  20.65462305735546
The silhouette score for k medoid method is :  0.6482147036394829


In [59]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

iris = datasets.load_iris()
data = pd.DataFrame(iris.data,columns = iris.feature_names)

KMean= KMeans(n_clusters=3)
KMean.fit(data)
label=KMean.predict(data)

print("The silhouette score for k mean method is : ", silhouette_score(data, label))

The silhouette score for k mean method is :  0.5528190123564091
