In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn import metrics
from sklearn.cluster import (
    KMeans, AgglomerativeClustering, DBSCAN, MeanShift,
    Birch, AffinityPropagation, MiniBatchKMeans, SpectralClustering,
)
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation/convergence warnings —
# confirm that is intended before relying on the scores below.
warnings.filterwarnings("ignore")
from sklearn.datasets import load_iris

In [3]:
# Load the iris bunch and lay it out as one frame: the measurement columns
# named after the dataset's feature names, plus the class index as `target`.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
# Peek at ten random rows to confirm the frame looks sane.
df.sample(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
88,5.6,3.0,4.1,1.3,1
35,5.0,3.2,1.2,0.2,0
104,6.5,3.0,5.8,2.2,2
2,4.7,3.2,1.3,0.2,0
46,5.1,3.8,1.6,0.2,0
7,5.0,3.4,1.5,0.2,0
37,4.9,3.6,1.4,0.1,0
97,6.2,2.9,4.3,1.3,1
55,5.7,2.8,4.5,1.3,1
24,4.8,3.4,1.9,0.2,0


In [4]:
# Shuffle the rows (frac=1 keeps every row) and renumber the index from 0.
# A fixed random_state makes the row order -- and every table displayed
# below -- identical on each Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,7.7,2.8,6.7,2.0,2
1,4.7,3.2,1.6,0.2,0
2,7.3,2.9,6.3,1.8,2
3,6.0,3.4,4.5,1.6,1
4,7.7,3.0,6.1,2.3,2
5,7.2,3.2,6.0,1.8,2
6,5.6,2.5,3.9,1.1,1
7,4.7,3.2,1.3,0.2,0
8,5.2,3.4,1.4,0.2,0
9,5.6,3.0,4.5,1.5,1


In [6]:
# Ground-truth class index for every (shuffled) row; used only for scoring,
# never passed to the clustering algorithms as a feature.
iris_labels = df['target']
iris_labels

0      2
1      0
2      2
3      1
4      2
      ..
145    0
146    1
147    0
148    2
149    1
Name: target, Length: 150, dtype: int32

In [10]:
def build_model(clustering_model, data, labels):
    """Fit a clustering model and print a one-row table of quality scores.

    Parameters
    ----------
    clustering_model : callable
        Called as ``clustering_model(data)``; must return a fitted object
        exposing the predicted cluster ids as ``labels_``.
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Ground-truth class labels used by the five label-based metrics.

    Returns
    -------
    None
        The scores are printed (tab-separated, two decimals), not returned.

    Notes
    -----
    The silhouette coefficient is only defined when the number of distinct
    predicted clusters is between 2 and ``n_samples - 1``. Outside that range
    the column shows ``nan`` instead of raising and aborting the report.
    """
    model = clustering_model(data)
    predicted = model.labels_

    # Metrics that compare the predicted clusters with the ground truth.
    scores = [
        metrics.homogeneity_score(labels, predicted),
        metrics.completeness_score(labels, predicted),
        metrics.v_measure_score(labels, predicted),
        metrics.adjusted_rand_score(labels, predicted),
        metrics.adjusted_mutual_info_score(labels, predicted),
    ]

    # Silhouette needs only the data and the predicted clusters, but it is
    # undefined for a single cluster or for one cluster per sample.
    n_found = len(set(predicted))
    if 1 < n_found < len(predicted):
        scores.append(metrics.silhouette_score(data, predicted))
    else:
        scores.append(float('nan'))

    print('homo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(50*'-')
    # One newline-terminated row so later output starts on a fresh line.
    print('\t'.join(f'{score:.2f}' for score in scores))

In [11]:
def k_means(data, n_clusters=3, max_iter=10000, random_state=None):
    """Fit a K-Means model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Number of clusters to form.
    max_iter : int, default 10000
        Iteration cap forwarded to ``KMeans``.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. ``None`` keeps the previous unseeded
        behaviour; pass an integer to make the clustering repeatable.

    Returns
    -------
    KMeans
        The fitted estimator (cluster ids are in ``labels_``).
    """
    return KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(data)

In [12]:
# Score K-Means on the measurement columns only; the target column is the
# ground truth and must not be fed to the clusterer.
build_model(k_means, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.75	0.76	0.76	0.73	0.76	0.55	

In [13]:
def agglomerative_fn(data, n_clusters=3):
    """Fit agglomerative (hierarchical) clustering on ``data``.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Number of clusters to produce.

    Returns
    -------
    AgglomerativeClustering
        The fitted estimator.
    """
    return AgglomerativeClustering(n_clusters=n_clusters).fit(data)

In [14]:
# Score agglomerative clustering on the measurement columns (target excluded).
build_model(agglomerative_fn, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.76	0.78	0.77	0.73	0.77	0.55	

In [15]:
def dbscan_fn(data, eps=0.45, min_samples=4):
    """Fit DBSCAN on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    eps : float, default 0.45
        Neighbourhood radius forwarded to ``DBSCAN``.
    min_samples : int, default 4
        Neighbourhood size threshold forwarded to ``DBSCAN``.

    Returns
    -------
    DBSCAN
        The fitted estimator.
    """
    settings = {'eps': eps, 'min_samples': min_samples}
    return DBSCAN(**settings).fit(data)

# Score DBSCAN on the measurement columns (target excluded).
build_model(dbscan_fn, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.58	0.61	0.59	0.51	0.58	0.37	

In [16]:
def mean_shift_fn(data, bandwidth=0.85):
    """Fit mean-shift clustering on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    bandwidth : float, default 0.85
        Kernel bandwidth forwarded to ``MeanShift``.

    Returns
    -------
    MeanShift
        The fitted estimator.
    """
    return MeanShift(bandwidth=bandwidth).fit(data)

# Score mean-shift on the measurement columns (target excluded).
build_model(mean_shift_fn, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.76	0.77	0.77	0.74	0.76	0.55	

In [17]:
def birch_fn(data, n_clusters=3):
    """Fit BIRCH clustering on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Number of final clusters forwarded to ``Birch``.

    Returns
    -------
    Birch
        The fitted estimator.
    """
    return Birch(n_clusters=n_clusters).fit(data)

# Score BIRCH on the measurement columns (target excluded).
build_model(birch_fn, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.73	0.78	0.75	0.66	0.75	0.54	

In [18]:
def affinity_propagation_fn(data, max_iter=1000, damping=0.6):
    """Fit affinity propagation on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    max_iter : int, default 1000
        Iteration cap forwarded to ``AffinityPropagation``.
    damping : float, default 0.6
        Damping factor forwarded to ``AffinityPropagation``.

    Returns
    -------
    AffinityPropagation
        The fitted estimator. The number of clusters is not a parameter
        here; it is whatever the fitted model ends up with.
    """
    settings = {'damping': damping, 'max_iter': max_iter}
    return AffinityPropagation(**settings).fit(data)

# Score affinity propagation on the measurement columns (target excluded).
build_model(affinity_propagation_fn, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.85	0.49	0.62	0.44	0.61	0.35	

In [20]:
def mini_batch_k_means_fn(data, n_clusters=3, max_iter=1000, batch_size=20,
                          random_state=None):
    """Fit mini-batch K-Means on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_clusters : int, default 3
        Number of clusters to form.
    max_iter : int, default 1000
        Iteration cap forwarded to ``MiniBatchKMeans``.
    batch_size : int, default 20
        Mini-batch size forwarded to ``MiniBatchKMeans``. Previously
        hard-coded to 20; the default keeps that value.
    random_state : int or None, default None
        Seed forwarded to ``MiniBatchKMeans``. ``None`` keeps the previous
        unseeded behaviour; pass an integer for repeatable results.

    Returns
    -------
    MiniBatchKMeans
        The fitted estimator.
    """
    return MiniBatchKMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        batch_size=batch_size,
        random_state=random_state,
    ).fit(data)

# Score mini-batch K-Means on the measurement columns (target excluded).
build_model(mini_batch_k_means_fn, df.drop(columns='target'), iris_labels)

homo	compl	v-meas	ARI	AMI	silhouette
--------------------------------------------------
0.74	0.75	0.74	0.72	0.74	0.55	

In [21]:
SS = 1000  # Self-similarity (diagonal)
IS = 10    # Intra-cluster similarity (same group, different item)
LS = 0.01  # Low similarity (items from different groups)

# 9 items arranged as three consecutive groups of 3 (items 0-2, 3-5, 6-8).
# Entry [i][j] is SS on the diagonal, IS when i and j share a group, and
# LS otherwise, giving a symmetric block-diagonal 9x9 similarity matrix.
similarity_mat = [
    [
        SS if i == j else (IS if i // 3 == j // 3 else LS)
        for j in range(9)
    ]
    for i in range(9)
]

In [22]:
# affinity='precomputed' means similarity_mat is used directly as the
# affinity matrix rather than being derived from feature vectors.
spectral_model = (
    SpectralClustering(affinity='precomputed', n_clusters=3)
    .fit(similarity_mat)
)

In [23]:
# Display the cluster index assigned to each of the nine rows of similarity_mat.
spectral_model.labels_

array([1, 1, 1, 2, 2, 2, 0, 0, 0])