# K-Means Clustering

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import make_blobs

# Synthetic data: 150 two-dimensional samples drawn around three centers
# (cluster_std=0.5); the fixed seed makes the draw repeatable.
X, y = make_blobs(
    n_samples=150,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)

# Show the raw samples without any cluster colouring.
# X.T unpacks into the first and second feature columns.
plt.figure(figsize=(10, 10))
plt.scatter(*X.T, s=50, c='white', marker='o', edgecolor='black')
plt.grid()
plt.show()

In [None]:
from sklearn.cluster import KMeans

# k-means with three clusters and randomly chosen initial centroids:
# 10 restarts, up to 300 iterations each, tolerance 1e-4, fixed seed.
km = KMeans(
    n_clusters=3,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)

# Fit on X and keep the cluster index assigned to every sample.
y_km = km.fit_predict(X)

In [None]:
# Draw every cluster with its own colour and marker, then overlay the centroids.
plt.figure(figsize=(10, 10))

# (cluster index, face colour, marker, legend entry)
cluster_styles = [
    (0, 'lightgreen', 's', 'cluster 1'),
    (1, 'orange', 'o', 'cluster 2'),
    (2, 'lightblue', 'v', 'cluster 3'),
]
for cluster_idx, face_colour, marker_shape, legend_entry in cluster_styles:
    in_cluster = y_km == cluster_idx
    plt.scatter(
        X[in_cluster, 0], X[in_cluster, 1],
        s=50, c=face_colour,
        marker=marker_shape, edgecolor='black',
        label=legend_entry
    )

# Fitted cluster centers as large red stars.
centers = km.cluster_centers_
plt.scatter(
    centers[:, 0], centers[:, 1],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)

plt.legend(scatterpoints=1)
plt.grid()
plt.show()

# Elbow Method and Silhouette Plots

In [None]:
# Report the fitted model's inertia (reported here as "distortion"), rounded to two decimals.
distortion = km.inertia_
print(f'Distortion: {distortion:.2f}')

In [None]:
# Collect the distortion (inertia) of a k-means fit for every cluster count 1..10.
k_range = range(1, 11)
distortions = []
for k in k_range:
    # fit() returns the estimator itself, so `km` ends up as the last fitted model
    km = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=0,
    ).fit(X)
    distortions.append(km.inertia_)

# Elbow curve: distortion against the number of clusters.
plt.figure(figsize=(10, 5))
plt.plot(k_range, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
from matplotlib import cm
from sklearn.metrics import silhouette_samples

# Refit k-means with three clusters, this time seeded with k-means++.
km = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
y_km = km.fit_predict(X)

# Distinct labels present in the assignment, and how many of them there are.
cluster_labels = np.unique(y_km)
n_clusters = len(cluster_labels)

# One silhouette coefficient per sample, using Euclidean distances.
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

In [None]:
# Silhouette plot: one band of sorted horizontal bars per cluster,
# stacked along the y axis in label order.
plt.figure(figsize=(10, 10))
tick_positions = []
band_start = 0
for rank, label in enumerate(cluster_labels):
    # sorted copy of this cluster's coefficients
    band_vals = np.sort(silhouette_vals[y_km == label])
    band_end = band_start + len(band_vals)
    plt.barh(
        range(band_start, band_end),
        band_vals,
        height=1.0,
        edgecolor='none',
        color=cm.jet(float(rank) / n_clusters),
    )
    # the tick sits in the middle of the band
    tick_positions.append((band_start + band_end) / 2.)
    band_start = band_end

# Mean coefficient over all samples, drawn as a dashed reference line.
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

# Cluster labels are shown 1-based on the axis.
plt.yticks(tick_positions, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()

In [None]:
# Fit k-means again, this time asking for only two clusters.
km = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
y_km = km.fit_predict(X)

# Scatter both clusters, then mark the fitted centers.
plt.figure(figsize=(10, 10))
two_cluster_styles = (
    (0, 'lightgreen', 's', 'cluster 1'),
    (1, 'orange', 'o', 'cluster 2'),
)
for cluster_idx, face_colour, marker_shape, legend_entry in two_cluster_styles:
    in_cluster = y_km == cluster_idx
    plt.scatter(
        X[in_cluster, 0], X[in_cluster, 1],
        s=50, c=face_colour,
        edgecolor='black',
        marker=marker_shape, label=legend_entry
    )

centers = km.cluster_centers_
plt.scatter(
    centers[:, 0], centers[:, 1],
    s=250, marker='*',
    c='red', label='centroids'
)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Distinct labels of the current assignment and their count.
cluster_labels = np.unique(y_km)
n_clusters = len(cluster_labels)

# One silhouette coefficient per sample, using Euclidean distances.
silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')

# Silhouette plot: one band of sorted horizontal bars per cluster.
plt.figure(figsize=(10, 10))
tick_positions = []
band_start = 0
for rank, label in enumerate(cluster_labels):
    # sorted copy of this cluster's coefficients
    band_vals = np.sort(silhouette_vals[y_km == label])
    band_end = band_start + len(band_vals)
    plt.barh(
        range(band_start, band_end),
        band_vals,
        height=1.0,
        edgecolor='none',
        color=cm.jet(rank / n_clusters),
    )
    # the tick sits in the middle of the band
    tick_positions.append((band_start + band_end) / 2)
    band_start = band_end

# Mean coefficient over all samples, drawn as a dashed reference line.
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")

# Cluster labels are shown 1-based on the axis.
plt.yticks(tick_positions, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()

# Hierarchical Clustering

In [None]:
# Row and column names for the toy dataset.
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']
variables = ['X', 'Y', 'Z']

# Seed NumPy's global generator so the random sample is reproducible,
# then draw a 5x3 array of uniform values scaled to [0, 10).
np.random.seed(123)
X = 10 * np.random.random_sample((5, 3))

# Wrap the array in a labelled DataFrame and display it.
df = pd.DataFrame(data=X, index=labels, columns=variables)
df

In [None]:
from scipy.spatial.distance import pdist, squareform

# pdist yields the condensed pairwise Euclidean distances between the rows of df;
# squareform expands them into the full square matrix.
condensed_dist = pdist(df, metric='euclidean')
row_dist = pd.DataFrame(
    squareform(condensed_dist),
    index=labels,
    columns=labels,
)
row_dist

In [None]:
from scipy.cluster.hierarchy import linkage

# Print SciPy's built-in documentation for `linkage`.
help(linkage)

In [None]:
# Complete-linkage agglomerative clustering, fed with the condensed
# Euclidean distance vector of the rows of df.
condensed_dist = pdist(df, metric='euclidean')
row_clusters = linkage(condensed_dist, method='complete')

# Alternative: pass the raw observations and let linkage compute the distances.
# row_clusters = linkage(df.values, method='complete', metric='euclidean')

In [None]:
# Present the linkage matrix as a labelled table, one row per merge step.
merge_columns = ['row label 1', 'row label 2', 'distance', 'no. of items in clust.']
merge_index = [f'cluster {(i+1)}' for i in range(len(row_clusters))]
pd.DataFrame(row_clusters, index=merge_index, columns=merge_columns)

In [None]:
from scipy.cluster.hierarchy import dendrogram

# Draw the merge tree stored in row_clusters; leaves are tagged with `labels`.
plt.figure(figsize=(10, 5))
row_dendr = dendrogram(row_clusters, labels=labels)

# Set the axis label BEFORE tight_layout(): the layout pass only makes room
# for artists that already exist, so calling it first would ignore the label.
plt.ylabel('Euclidean distance')
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the data with the row dendrogram attached on its left side.
fig = plt.figure(figsize=(10, 10), facecolor='white')
axd = fig.add_axes([0.09, 0.1, 0.2, 0.6])

# Plot the dendrogram rotated by 90 degrees, drawing explicitly on `axd`
# instead of relying on it being the current axes.
row_dendr = dendrogram(row_clusters, orientation='left', ax=axd)

# Reorder the rows to follow the (reversed) leaf order of the dendrogram.
df_rowclust = df.iloc[row_dendr['leaves'][::-1]]

# Construct the heatmap from the reordered data.
axm = fig.add_axes([0.23, 0.1, 0.6, 0.6])
cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r')

# Remove ticks and spines from the dendrogram axes.
axd.set_xticks([])
axd.set_yticks([])
for spine in axd.spines.values():
    spine.set_visible(False)

# Add the colorbar for the heatmap.
fig.colorbar(cax)

# Label every column/row of the heatmap. Tick positions are pinned to the
# cell indices first, so each label is tied to its cell regardless of what
# the automatic tick locator would have picked (no padding entry needed).
axm.set_xticks(range(len(df_rowclust.columns)))
axm.set_xticklabels(list(df_rowclust.columns))
axm.set_yticks(range(len(df_rowclust.index)))
axm.set_yticklabels(list(df_rowclust.index))
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# cluster with k=3
# Euclidean distance is the estimator's default, so it is not passed
# explicitly: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# later removed, while omitting it works on old and new releases alike.
ac = AgglomerativeClustering(n_clusters=3, linkage='complete')

# NOTE: this rebinds `labels` (the sample-ID list created with the dummy data)
# to the array of cluster assignments returned by fit_predict.
labels = ac.fit_predict(X)
print(f'Cluster labels: {labels}')

In [None]:
# cluster with k=2
# Euclidean distance is the estimator's default, so it is not passed
# explicitly: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# later removed, while omitting it works on old and new releases alike.
ac = AgglomerativeClustering(n_clusters=2, linkage='complete')

# NOTE: this rebinds `labels` to the array of cluster assignments.
labels = ac.fit_predict(X)
print(f'Cluster labels: {labels}')

# DBSCAN

In [None]:
from sklearn.datasets import make_moons

# 200 noisy "two moons" samples; the fixed seed makes the draw repeatable.
X, y = make_moons(
    n_samples=200,
    noise=0.05,
    random_state=0,
)

# Quick look at the raw samples (X.T unpacks into the two feature columns).
plt.scatter(*X.T)
plt.show()

In [None]:
# Side-by-side comparison of k-means and agglomerative clustering on X.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# initialize and fit with k-means
# n_init is set explicitly (matching the other k-means cells) because the
# library default changed between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=0)
y_km = km.fit_predict(X)

# plot the k-means assignment on the left axes
ax1.scatter(
    X[y_km == 0, 0], X[y_km == 0, 1],
    c='lightblue', edgecolor='black',
    marker='o', s=40,
    label='cluster 1'
)
ax1.scatter(
    X[y_km == 1, 0], X[y_km == 1, 1],
    c='red', edgecolor='black',
    marker='s', s=40,
    label='cluster 2'
)
ax1.set_title('K-means clustering')
ax1.legend()

# Euclidean distance is the estimator's default; the deprecated/removed
# `affinity` keyword is therefore omitted.
ac = AgglomerativeClustering(
    n_clusters=2,
    linkage='complete'
)

# fit with AgglomerativeClustering
y_ac = ac.fit_predict(X)

# plot the agglomerative assignment on the right axes
ax2.scatter(
    X[y_ac == 0, 0], X[y_ac == 0, 1],
    c='lightblue', edgecolor='black',
    marker='o', s=40,
    label='cluster 1'
)
ax2.scatter(
    X[y_ac == 1, 0], X[y_ac == 1, 1],
    c='red', edgecolor='black',
    marker='s', s=40,
    label='cluster 2'
)
ax2.set_title('Agglomerative clustering')
# Each axes gets its own legend; a bare plt.legend() would only annotate the
# current (right-hand) axes and leave the k-means panel without one.
ax2.legend()

plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# initialize and fit DBSCAN
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)

# plot the two clusters
plt.scatter(
    X[y_db == 0, 0], X[y_db == 0, 1],
    c='lightblue', edgecolor='black',
    marker='o', s=40,
    label='cluster 1'
)
plt.scatter(
    X[y_db == 1, 0], X[y_db == 1, 1],
    c='red', edgecolor='black',
    marker='s', s=40,
    label='cluster 2'
)

# DBSCAN labels samples it considers noise with -1. Draw them too (when any
# exist) so they are not silently missing from the figure.
is_noise = y_db == -1
if is_noise.any():
    plt.scatter(
        X[is_noise, 0], X[is_noise, 1],
        c='grey', marker='x', s=40,
        label='noise'
    )

plt.legend()
plt.show()