In [None]:
# Clustering.
# Applications: 
# preprocessor for data analysis,
# segmentation of images or customers,
# anomaly and novelty detection,
# semi-supervised (starting with few labels),
# search-for-similar image,
# nonlinear dimensionality reduction.

In [4]:
# K-Means Clustering.
# Use supplies the number of clusters, K.
# Invented separately by Lloyd 1957 and Forgy 1965.

# Algorithm:
# centroid selection (random or inspired initially),
# label instances by closest centroid,
# select a new centroid using mean per label,
# repeat.

# Limitations:
# num clusters is predetermined,
# assumes clusters are equal size,
# converges on local optima.

# Expected linear time but worst case O(n^m)  
# (book only said "exponential in #instances")

from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

from sklearn.cluster import KMeans
K = 3
km = KMeans(n_clusters=K)
y_pred = km.fit_predict(X)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32)

In [5]:
km.cluster_centers_

array([[5.006     , 3.428     , 1.462     , 0.246     ],
       [5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])

In [20]:
# Mean squared distance to centroid.
km.inertia_

78.851441426146

In [21]:
# Predict the cluster for a point near a centroid.
import numpy as np
hypothetical = np.array([[5, 3, 1, 0]])
km.predict(hypothetical)

array([0], dtype=int32)

In [22]:
# Compute one point's distance to centroid.
# This is a form of dimensionality reduction.
km.transform(hypothetical)

array([[0.67615087, 3.80110135, 5.49588489]])

In [19]:
# One point's score.
# Score = negative inertia.
# Useful for situations where better score means tighter fit.
km.score(hypothetical)

-0.4571800000000004

In [24]:
# Redo with more parameter settings but get same results.
init_val = 'random' # default = K-means++ 
n_init_val = 10 # maximun number of times to start again with random init
km = KMeans(n_clusters=K,init=init_val,n_init=n_init_val,random_state=42)
y_rand = km.fit_predict(X)
y_rand == y_pred

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [25]:
# Redo with more clusters.
km = KMeans(n_clusters=10)
y_lots = km.fit_predict(X)
y_lots

array([5, 0, 0, 0, 5, 7, 0, 5, 0, 0, 5, 5, 0, 0, 7, 7, 7, 5, 7, 5, 5, 5,
       0, 5, 5, 0, 5, 5, 5, 0, 0, 5, 7, 7, 0, 0, 5, 5, 0, 5, 5, 0, 0, 5,
       5, 0, 5, 0, 5, 5, 3, 3, 3, 6, 3, 6, 3, 2, 3, 6, 2, 6, 6, 3, 6, 3,
       6, 6, 9, 6, 9, 6, 9, 3, 3, 3, 3, 1, 3, 6, 6, 6, 6, 9, 6, 3, 3, 3,
       6, 6, 6, 3, 6, 2, 6, 6, 6, 3, 2, 6, 8, 9, 8, 1, 8, 4, 6, 4, 1, 8,
       1, 1, 1, 9, 9, 1, 1, 4, 4, 9, 8, 9, 4, 9, 8, 4, 9, 9, 1, 8, 4, 4,
       1, 9, 9, 4, 8, 1, 9, 1, 8, 1, 9, 8, 8, 1, 9, 1, 1, 9], dtype=int32)

In [38]:
# Silhouette is a metric for estimating optimal K.
from sklearn.metrics import silhouette_score
# Crashes if K less than 2
for K in range(2,10):
    km = KMeans(n_clusters=K)
    km.fit_predict(X)
    s=silhouette_score(X,km.labels_)
    print("Clusters=%d Score=%f"%(K,s))
# We know there are 3 clusters but 2 of them are similar.
# This analysis says 2 clusters is optimal.

Clusters=2 Score=0.681046
Clusters=3 Score=0.552819
Clusters=4 Score=0.497455
Clusters=5 Score=0.488749
Clusters=6 Score=0.364834
Clusters=7 Score=0.353871
Clusters=8 Score=0.361790
Clusters=9 Score=0.321463


In [39]:
# Score ranges -1 (edge of cluster) to +1 (near its centroid).
# Can take score per point or score overall.
# With "knife diagram" plot of score per point for each cluster,
# try to choose a K with the most uniform knife shapes.