<font color="#CA3532"><h1 align="left">Clustering</h1></font>
<h2 align="left">Hierarchical clustering on the wine dataset</h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn import metrics

Now we will apply clustering to a wine dataset. The goal is to check whether the clustering recovers the real wine types.

The database describes the parameters of different wine instances. There are 3 types of wine and 13 wine features with the levels of the most important indicators:
- Alcohol
- Malic acid
- Ash
- Ash alcalinity
- Magnesium
- Total phenols
- Flavanoids
- Nonflavanoid phenols
- Proanthocyanins
- Color intensity
- Hue
- OD280_OD315
- Proline

Now we load this database:


In [None]:
# Read the semicolon-separated wine file; the first row holds the column names.
data = pd.read_csv('./wine_dataset.csv', sep=';', header=0)

# Distinct values of the 'Type' column, i.e. the real wine classes.
wine_types = np.unique(data['Type'])

print('Wine Database\n')
print('Number of real classes (wine types):', wine_types.shape[0])
print('Unique class labels:', wine_types)
print('\nFirst instances:')
data.head()

In [None]:
# Per-column summary statistics, one row per column of the dataset.
data.describe().loc[["count", "min", "max", "mean", "std"]].T

After loading the database we need to do some basic preprocessing: standardization and PCA. First we separate the features from the class label:

In [None]:
# The 'Type' column is the class label; every other column is a feature.
y = data["Type"]
X = data.drop(columns="Type")

classes_names = y.unique()
attribute_names = X.columns

print(classes_names)
print(attribute_names)

Now we will perform clustering using a hierarchical strategy:


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import ward, dendrogram

# Now we will analyze the wine dataset with hierarchical clustering

# Preprocessing: standardize every feature, then project the standardized
# data onto its first 5 principal components.
X_std = StandardScaler().fit_transform(X)
pca = PCA(n_components=5)
pca.fit(X_std)
X_pca = pca.transform(X_std)

# Representation that is clustered. Assign X_std here instead to cluster in
# the standardized feature space without the PCA projection.
X_clust = X_pca

# Condensed pairwise Euclidean distances. ward() must receive the condensed
# form produced by pdist: a square distance matrix would be interpreted as a
# set of observation vectors, so no square matrix is computed here.
dists = pdist(X_clust, metric='euclidean')

linkage_matrix = ward(dists)

In [None]:
# Horizontal dendrogram of the Ward linkage computed in the previous cell.
fig, ax = plt.subplots(figsize=(15,20))
dendr = dendrogram(linkage_matrix, orientation="left", ax=ax)

# Order in which the leaves appear in the plot.
inds_leaves = dendr['leaves']

# Hide the x-axis ticks and their labels. Booleans are required: current
# matplotlib versions treat the string 'off' as a truthy value, which leaves
# the ticks visible.
plt.tick_params(axis='x',
                which='both',
                bottom=False,
                top=False,
                labelbottom=False)
# Set the title before tight_layout() so the layout accounts for it.
plt.title('Wine database, hierarchical clustering', fontsize = 20)
plt.tight_layout()
plt.show()

from scipy.cluster.hierarchy import fcluster

# Cut the tree into at most `nselected_clusters` flat clusters. fcluster
# numbers clusters starting at 1, so subtract 1 to get 0-based ids.
nselected_clusters = 3
ids_clusters = fcluster(linkage_matrix, nselected_clusters, criterion='maxclust')-1

# For every cluster, count how many of its members belong to each real class.
unique_y = np.unique(y)
y_values = np.asarray(y)  # positional access, independent of the index of y
for c in np.unique(ids_clusters):
    inds = (np.where(np.array(ids_clusters) == c))[0]
    print('- Cluster %d' % c)
    for real_class in unique_y:
        # %s prints the label itself for both string and integer classes
        # (%c would turn an integer label into the character with that code).
        print('  Number of patterns with real class %s: %d' % (real_class, np.count_nonzero(y_values[inds] == real_class)))
    print()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Build the tree directly from the observations. Other linkage methods that
# can be tried here: 'single', 'complete', 'average'.
Z = linkage(X_clust, method='ward', optimal_ordering=True)

# Draw the tree horizontally, with a grid to help read the merge distances.
plt.figure(figsize=(15, 20))
dendrogram(Z, orientation='left', leaf_rotation=0)
plt.grid(True)

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Ward linkage on the observations with the default leaf ordering. Other
# linkage methods that can be tried here: 'single', 'complete', 'average'.
Z = linkage(X_clust, method='ward')

# Draw the tree with the default orientation on a wide, short figure.
plt.figure(figsize=(15, 5))
dendrogram(Z, leaf_rotation=0)
plt.grid(True)

In [None]:
# Cut the tree Z into at most `nselected_clusters` flat clusters. fcluster
# numbers clusters starting at 1, so subtract 1 to get 0-based ids.
nselected_clusters = 3
ids_clusters = fcluster(Z, nselected_clusters, criterion='maxclust')-1

# For every cluster, count how many of its members belong to each real class.
unique_y = np.unique(y)
y_values = np.asarray(y)  # positional access, independent of the index of y
for c in np.unique(ids_clusters):
    inds = (np.where(np.array(ids_clusters) == c))[0]
    print('- Cluster %d' % c)
    for real_class in unique_y:
        # %s prints the label itself for both string and integer classes
        # (%c would turn an integer label into the character with that code).
        print('  Number of patterns with real class %s: %d' % (real_class, np.count_nonzero(y_values[inds] == real_class)))
    print()

In [None]:
# Indices (0-based) of the principal components shown on the x and y axes.
pcx = 0
pcy = 1

fig = plt.figure(figsize=(6,6))
colors = ['lightgreen', 'lightblue', 'yellow', 'orange', 'magenta']
markers = ['s', 'v', 'o', 'd', 's']

# One scatter series per cluster, drawn on the selected pair of components.
for c in range(nselected_clusters):
    inds = np.where(ids_clusters == c)[0]
    plt.scatter(X_pca[inds, pcx],
                X_pca[inds, pcy],
                s = 60,
                # Cycle through the palettes so more clusters than listed
                # styles does not raise an IndexError.
                c = colors[c % len(colors)], marker = markers[c % len(markers)],
                label = 'cluster %d' % (c))

plt.legend()
plt.grid()
plt.title('Wine database, optimal clustering')
plt.xlabel('Principal component '+str(pcx+1))
plt.ylabel('Principal component '+str(pcy+1))
# Called after the title and labels are set so the layout accounts for them.
plt.tight_layout()
plt.show()

### Automatically finding the number of clusters

In [None]:
# Quality metric used to score each candidate clustering.
# Alternative: from sklearn.metrics import silhouette_score as qmetric
from sklearn.metrics import calinski_harabasz_score as qmetric

Nclusters_max = 15

# Cut the tree Z at every candidate number of clusters and score each cut.
labels_sets, qualities = [], []
for k in range(1, Nclusters_max + 1):
    labels = fcluster(Z, k, criterion='maxclust')
    labels_sets.append(labels)
    # With k == 1 the metric is not evaluated; a placeholder score of 0 is stored.
    score = qmetric(X_clust, labels) if k > 1 else 0
    qualities.append(score)

In [None]:
# Quality score as a function of the number of clusters requested.
candidate_ks = range(1, Nclusters_max+1)

fig = plt.figure(figsize=(14,3))
plt.plot(candidate_ks, qualities, marker='o')
plt.title('clustering quality')
plt.xlabel('number of clusters')
plt.show()

# Position of the highest score; position i holds the result for k = i + 1.
best = pd.Series(qualities).idxmax()
labels = labels_sets[best]
n_clusters = np.unique(labels).size