In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from time import time

In [None]:
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [None]:
# Load the xclara sample, then report its dimensions and column names
# before previewing the first rows.
dfClara = pd.read_csv('in.data/xclara.csv')
for summary in (dfClara.shape, dfClara.columns):
    print(summary)
dfClara.head()

In [None]:
# Scatter of the raw (unlabelled) points, drawn through the Axes object.
fig, ax = plt.subplots()
ax.scatter(dfClara.V1, dfClara.V2, marker='.', s=10, alpha=0.3)
ax.set(xlabel='V1', ylabel='V2', title='xclara dataset')
plt.show()

In [None]:
# Seed NumPy's global RNG so the stochastic steps below are repeatable
# when the notebook is run top to bottom.
np.random.seed(1001001)

def eval_k_means(estimator, name, data, k):
    """Fit ``estimator`` on ``data`` and print its silhouette score.

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit(data)`` and, once fitted,
        a ``labels_`` attribute.
    name : str
        Label printed in the first (left-aligned, 10-wide) column.
    data : array-like
        Samples passed both to ``estimator.fit`` and to
        ``metrics.silhouette_score``.
    k : int
        Number of clusters; only used in the printed line.

    Returns
    -------
    None
        The score is printed, not returned.

    Notes
    -----
    Reads the module-level ``sample_size`` at call time, so that name must
    be defined before this function is called.
    """
    estimator.fit(data)
    score = metrics.silhouette_score(data, estimator.labels_,
                                     metric='euclidean',
                                     sample_size=sample_size)
    print('%-10s, k=%d: sil=%0.4f' % (name, k, score))

In [None]:
# Standardize the features. Any 'label' column written by a previous run of
# this cell is dropped first; otherwise re-running the cell would cluster on
# the earlier cluster assignments as if they were a feature.
data = scale(dfClara.drop(columns='label', errors='ignore'))

# Number of points subsampled when estimating the silhouette score.
sample_size=150

# Sentinel lower than any silhouette score (which is bounded below by -1).
best_sil = -999

# Try both initialisation schemes for k = 3..7 and keep the fitted model
# with the highest silhouette score.
for init in ['k-means++', 'random']:
    for k in range(3,8):
        estimator = KMeans(init=init, n_clusters=k, n_init=10, max_iter=100)
        estimator.fit(data)
        sil = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)
        print('%-10s, k=%d: sil=%0.4f' % (init, k, sil))
        if sil > best_sil:
            best_estimator = estimator
            best_sil = sil

print('better estimator silhouette: %0.4f (%d clusters, %s)' % (best_sil, best_estimator.n_clusters, best_estimator.init))
# Attach the winning model's cluster assignment to each row for plotting.
dfClara['label'] = best_estimator.labels_

In [None]:
# Same scatter as before, now coloured by the selected model's cluster label.
fig, ax = plt.subplots()
ax.scatter(dfClara.V1, dfClara.V2, c=dfClara.label, cmap='tab20b',
           marker='.', s=10, alpha=0.3)
ax.set(xlabel='V1', ylabel='V2', title='xclara dataset')
plt.show()

# Drinks dataset

In [None]:
# Load the drinks data, then report its dimensions and column names
# before previewing the first rows.
dfDrinks = pd.read_csv('in.data/drinks.csv')
for summary in (dfDrinks.shape, dfDrinks.columns):
    print(summary)
dfDrinks.head()

In [None]:
# Use the country name as the row index. The guard keeps this cell
# re-runnable: once 'country' has become the index it is no longer a column,
# and calling set_index on it again would raise KeyError.
if 'country' in dfDrinks.columns:
    dfDrinks = dfDrinks.set_index('country')
dfDrinks.head()

In [None]:
# Summary statistics of the drinks columns, shown as the cell output.
dfDrinks.describe()

In [None]:
# Pairwise relationships between all columns of the drinks data.
sns.pairplot(data=dfDrinks)
plt.show()

In [None]:
# Re-seed NumPy's global RNG so this section's stochastic steps are
# repeatable when the notebook is run top to bottom.
np.random.seed(1001001)

# NOTE(review): an equivalent eval_k_means is already defined in the xclara
# section; this redefinition shadows it.
def eval_k_means(estimator, name, data, k):
    """Fit ``estimator`` on ``data`` and print its silhouette score.

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit(data)`` and, once fitted,
        a ``labels_`` attribute.
    name : str
        Label printed in the first (left-aligned, 10-wide) column.
    data : array-like
        Samples passed both to ``estimator.fit`` and to
        ``metrics.silhouette_score``.
    k : int
        Number of clusters; only used in the printed line.

    Returns
    -------
    None
        The score is printed, not returned.

    Notes
    -----
    Reads the module-level ``sample_size`` at call time, so that name must
    be defined before this function is called.
    """
    estimator.fit(data)
    score = metrics.silhouette_score(data, estimator.labels_,
                                     metric='euclidean',
                                     sample_size=sample_size)
    print('%-10s, k=%d: sil=%0.4f' % (name, k, score))

# Standardize the features. Any 'label' column written by a previous run of
# this cell is dropped first; otherwise re-running the cell would cluster on
# the earlier cluster assignments as if they were a feature.
data = scale(dfDrinks.drop(columns='label', errors='ignore'))

# Number of points subsampled when estimating the silhouette score.
sample_size=150

# Sentinel lower than any silhouette score (which is bounded below by -1).
best_sil = -999

# Try both initialisation schemes for k = 3..7 and keep the fitted model
# with the highest silhouette score.
for init in ['k-means++', 'random']:
    for k in range(3,8):
        estimator = KMeans(init=init, n_clusters=k, n_init=10, max_iter=100)
        estimator.fit(data)
        sil = metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)
        print('%-10s, k=%d: sil=%0.4f' % (init, k, sil))
        if sil > best_sil:
            best_estimator = estimator
            best_sil = sil

print('better estimator silhouette: %0.4f (%d clusters, %s)' % (best_sil, best_estimator.n_clusters, best_estimator.init))
# Attach the winning model's cluster assignment to each row for plotting.
dfDrinks['label'] = best_estimator.labels_

In [None]:
# Pair plot restricted to the four consumption columns, coloured by the
# cluster label assigned above.
plot_columns = [
    'beer_servings',
    'spirit_servings',
    'wine_servings',
    'total_litres_of_pure_alcohol',
]
sns.pairplot(dfDrinks, hue='label', vars=plot_columns)
plt.show()

# K-means vs. Spectral vs. Agglomerative clustering

In [None]:
from sklearn import datasets
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

def fit_kmeans(df, k):
    """Cluster a dataset with randomly initialised k-means.

    Parameters
    ----------
    df : tuple
        Despite the name, an ``(X, y)`` pair: the samples and their
        reference targets.
    k : int
        Number of clusters to fit.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` plus ``'y'`` (the supplied targets) and
        ``'label'`` (the cluster assigned to each sample).
    """
    points, targets = df
    clusterer = KMeans(n_clusters=k, init='random', n_init=10, max_iter=100)
    assignments = clusterer.fit_predict(points, targets)
    return pd.DataFrame(points).assign(y=targets, label=assignments)

def fit_agglomerative(df, k):
    """Cluster a dataset with connectivity-constrained agglomerative clustering.

    Uses average linkage with the city-block distance, constrained by a
    symmetrised 10-nearest-neighbour graph built from the samples.

    Parameters
    ----------
    df : tuple
        Despite the name, an ``(X, y)`` pair: the samples and their
        reference targets.
    k : int
        Number of clusters to fit.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` plus ``'y'`` (the supplied targets) and
        ``'label'`` (the cluster assigned to each sample).
    """
    X, y = df
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    shared_params = dict(n_clusters=k, linkage="average", connectivity=connectivity)
    try:
        # Newer scikit-learn releases call the distance parameter ``metric``
        # and no longer accept ``affinity``.
        model = AgglomerativeClustering(metric="cityblock", **shared_params)
    except TypeError:
        # Older releases only know the parameter as ``affinity``.
        model = AgglomerativeClustering(affinity="cityblock", **shared_params)
    labels = model.fit_predict(X, y)
    return pd.DataFrame(X).assign(y=y, label=labels)

def fit_spectral(df, k):
    """Cluster a dataset with nearest-neighbour spectral clustering.

    Parameters
    ----------
    df : tuple
        Despite the name, an ``(X, y)`` pair: the samples and their
        reference targets.
    k : int
        Number of clusters to fit.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` plus ``'y'`` (the supplied targets) and
        ``'label'`` (the cluster assigned to each sample).
    """
    points, targets = df
    clusterer = SpectralClustering(
        n_clusters=k,
        affinity='nearest_neighbors',
        assign_labels='kmeans',
    )
    assignments = clusterer.fit_predict(points, targets)
    return pd.DataFrame(points).assign(y=targets, label=assignments)

    
n_samples = 1500

# Three synthetic datasets, each returned as an (X, y) pair.
noisy_circles = datasets.make_circles(n_samples=n_samples, noise=.05, factor=.5)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=4)
toy_datasets = (noisy_circles, noisy_moons, blobs)

# Fit every algorithm with k=2 on every dataset. The order (spectral, then
# k-means, then agglomerative; circles, moons, blobs within each) is kept so
# the calls happen in the same sequence as before.
dfS1, dfS2, dfS3 = [fit_spectral(dataset, k=2) for dataset in toy_datasets]
dfK1, dfK2, dfK3 = [fit_kmeans(dataset, k=2) for dataset in toy_datasets]
dfA1, dfA2, dfA3 = [fit_agglomerative(dataset, k=2) for dataset in toy_datasets]



In [None]:
def _scatter_clusters(axes, frames, titles=('circles', 'moons', 'blobs')):
    """Draw one labelled scatter per (axis, frame) pair.

    Each frame is plotted as column 0 against column 1, coloured by its
    ``label`` column, and the axis receives the matching title. Pairing the
    axes and titles in one loop avoids titling the wrong panel.
    """
    for ax, frame, title in zip(axes, frames, titles):
        ax.scatter(frame[0], frame[1], c=frame.label)
        ax.set_title(title)


# One row of three panels per algorithm.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
_scatter_clusters((ax1, ax2, ax3), (dfS1, dfS2, dfS3))
plt.suptitle('Spectral', y=0.98)

fig, (ax4, ax5, ax6) = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
_scatter_clusters((ax4, ax5, ax6), (dfA1, dfA2, dfA3))
plt.suptitle('Agglomerative', y=0.98)

# The original titled ax4 (the Agglomerative 'circles' panel) here instead of
# this figure's third axis, leaving the K-means blobs panel untitled.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(20,5))
_scatter_clusters((ax1, ax2, ax3), (dfK1, dfK2, dfK3))
plt.suptitle('K-means', y=0.98)
plt.show()