In [97]:
import numpy as np
import ipympl
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider, RadioButtons
from sklearn.cluster import AgglomerativeClustering
from sklearn import datasets
%matplotlib notebook



## Датасеты

In [98]:
# datasets
# Four 2-D toy datasets, each stored as an (X, y) tuple; `no_structure` has no
# labels, so its y is None.
#
# The sklearn module is imported under a local alias because the name
# `datasets` is rebound to a plain list at the end of this cell. Calling
# `datasets.make_circles` directly would fail the second time the cell is run.
from sklearn import datasets as sk_datasets

n_samples = 1500
SEED = 8  # single seed shared by every generator so the figures are reproducible

noisy_circles = sk_datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05,
                                         random_state=SEED)
noisy_moons = sk_datasets.make_moons(n_samples=n_samples, noise=.05, random_state=SEED)
blobs = sk_datasets.make_blobs(n_samples=n_samples, random_state=SEED)
# Uniform noise in the unit square, drawn from a dedicated seeded generator
# instead of the global NumPy random state.
no_structure = np.random.RandomState(SEED).rand(n_samples, 2), None

# Order matters: later cells address these datasets by position.
datasets = [noisy_circles, noisy_moons, blobs, no_structure]

In [99]:
# datasets visualization
# Preview of the raw, unclustered points: one panel per dataset on a 2x2 grid.
fig, axs = plt.subplots(2, 2)

# `axs.flat` walks the grid row by row, so panels are filled in the same
# order as the entries of `datasets`.
for panel, (points, _labels) in zip(axs.flat, datasets):
    panel.scatter(points[:, 0], points[:, 1])
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Применение иерархической кластеризации

In [100]:
from functools import partial
# Shared, mutable widget state: [n_clusters, linkage]. Each callback updates
# its own slot in place, so a redraw always sees the latest value chosen on
# the other control as well.
params = [2, 'ward']


def _redraw_clusters(params):
    """Re-cluster every dataset with the current settings and repaint the grid.

    Parameters
    ----------
    params : list
        Two-element list ``[n_clusters, linkage]``.

    Notes
    -----
    Reads the notebook globals ``datasets``, ``axs`` and ``fig``. They are
    looked up when the function is called, so it draws on whatever figure is
    bound to those names at that moment.
    """
    n_clusters, linkage = params
    for ax, (X, _y) in zip(axs.flat, datasets):
        # Wipe the previous colouring before drawing the new assignment.
        ax.clear()
        clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage).fit(X)
        ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_)
    # Ask the canvas to repaint once the GUI event loop is idle.
    fig.canvas.draw_idle()


def sliders_on_changed(val, params):
    """Slider callback: store the new cluster count and redraw.

    Parameters
    ----------
    val : float
        Value reported by the slider.
    params : list
        Shared ``[n_clusters, linkage]`` state; slot 0 is overwritten.
    """
    # The slider delivers a float (e.g. 3.0) while the estimator expects an
    # integer number of clusters, so convert before storing.
    params[0] = int(round(val))
    _redraw_clusters(params)


def linkage_on_clicked(val, params):
    """Radio-button callback: store the new linkage criterion and redraw.

    Parameters
    ----------
    val : str
        Label of the selected radio button.
    params : list
        Shared ``[n_clusters, linkage]`` state; slot 1 is overwritten.
    """
    params[1] = val
    _redraw_clusters(params)

# Interactive figure: a 2x2 grid of clustered datasets, an `n_clusters`
# slider underneath and a linkage selector on the left.
fig, axs = plt.subplots(2, 2)

# The initial rendering and the initial widget positions are all derived from
# `params`, so the picture and the controls cannot disagree.
for i, dataset in enumerate(datasets):
    X, y = dataset
    clustering = AgglomerativeClustering(n_clusters=params[0], linkage=params[1]).fit(X)

    axs[i//2, i%2].scatter(X[:,0], X[:,1], c=clustering.labels_)

# Shrink the plotting area to leave room for the widgets on the left/bottom.
fig.subplots_adjust(left=0.25, bottom=0.35)

# The widget objects are kept in notebook-level variables: they must stay
# referenced for as long as the figure should react to user input.
ax = fig.add_axes([0.25, 0.1, 0.6, 0.03])
freq_slider = Slider(ax, 'n_clusters', 1, 10, valinit=params[0], valstep=1)
freq_slider.on_changed(partial(sliders_on_changed, params=params))

linkage_options = ('ward', 'complete', 'average', 'single')
ax = fig.add_axes([0.025, 0.5, 0.15, 0.15])
linkage_type = RadioButtons(ax, linkage_options, active=linkage_options.index(params[1]))
linkage_type.on_clicked(partial(linkage_on_clicked, params=params))

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Пример анализа оптимального числа кластеров

In [101]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Silhouette analysis of hierarchical clustering for several cluster counts.
# For every candidate count one figure is produced: the per-cluster silhouette
# profile on the left and the coloured scatter of the data on the right.
X, y = datasets[2]  # third entry of `datasets` (the blobs)
range_n_clusters = [2, 3, 4, 5]

for n_clusters in range_n_clusters:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(9, 4)

    # Horizontal range shown for the silhouette coefficients.
    ax1.set_xlim([-0.1, 1])
    # Vertical room: one row per sample plus a 10-row gap around each band.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(X)

    # Per-sample coefficients are computed once; the average score is their
    # mean, so a separate silhouette_score call would only repeat the
    # pairwise-distance work.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    silhouette_avg = sample_silhouette_values.mean()
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    y_lower = 10
    for i in range(n_clusters):
        # Coefficients of the samples assigned to cluster i, in ascending
        # order so that the band is drawn as a smooth profile.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Put the cluster index next to the middle of its band.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Start the next band 10 rows above the current one.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Dashed vertical line at the average score over all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # Same colour mapping as the silhouette bands, applied per sample.
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    # Title the figure explicitly rather than relying on pyplot's notion of
    # the "current" figure.
    fig.suptitle(("Silhouette analysis for Hierarchical clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

For n_clusters = 2 The average silhouette_score is : 0.786262052469124


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

For n_clusters = 3 The average silhouette_score is : 0.8290743874701529


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

For n_clusters = 4 The average silhouette_score is : 0.6569556280956855


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

For n_clusters = 5 The average silhouette_score is : 0.4909776685284084
