# Obtención y preparación de los datos.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled handwritten-digits dataset.
digits = load_digits()
# Display the shape of the feature matrix.
digits.data.shape

In [None]:
# Print the description text that ships with the dataset.
dataset_description = digits.DESCR
print(dataset_description)

In [None]:
# Keep a handle on the feature matrix and print it for inspection.
x = digits.data
print(x)

In [None]:
# Print the names of the target classes.
print(digits.target_names)

In [None]:
# Keep a handle on the true labels and print them for inspection.
y = digits.target
print(y)

# Count how many samples carry each distinct label and display the
# result as a {label: count} dictionary.
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

In [None]:
# Display the dataset's image array.
digits.images

In [None]:
# Display the first image of the dataset.
digits.images[0]

In [None]:
# Render the image at index 3 with the reversed gray colormap on a
# small 3x3-inch figure.
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[3], cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()

In [None]:
# Wrap the feature matrix in a DataFrame; this is the input handed to
# the clustering models below.
df = pd.DataFrame(digits.data)
# Display the first rows.
df.head()

# Opción 1) Scikit - Learn: Algoritmo K-Means

<b>Nota:</b> Este algoritmo ya se utilizó previamente; aquí solo se incluye para compararlo con los demás algoritmos de clustering.

In [None]:
from sklearn.cluster import KMeans

# Partition the samples into 10 clusters; the fixed seed keeps the run
# reproducible.
kmeans = KMeans(n_clusters = 10, random_state = 0)
# fit_predict() trains the model and returns each sample's cluster index,
# so a separate fit() call beforehand would only repeat the training.
clusters = kmeans.fit_predict(df)
print(clusters)
# Display the shape of the learned centroid array.
kmeans.cluster_centers_.shape

In [None]:
# Show each K-Means centroid as an 8x8 image on a 2x5 grid of axes.
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    # Hide the tick marks on both axes of this tile.
    axi.set_xticks([])
    axi.set_yticks([])
    axi.imshow(center, cmap=plt.cm.binary, interpolation='nearest')

In [None]:
from scipy.stats import mode

# Cluster indices are arbitrary, so relabel every cluster with the most
# common true digit among its members before comparing with the ground truth.
predicted_labels = np.zeros_like(clusters)
# Visit only the cluster ids that actually occur: this avoids hard-coding
# the number of clusters and never takes the mode of an empty selection.
for i in np.unique(clusters):
    mask = (clusters == i)
    predicted_labels[mask] = mode(y[mask])[0]

print(predicted_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose relabelled cluster equals the true digit.
accuracy_score(y_true=y, y_pred=predicted_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix() puts the true labels on the rows; the transpose below
# moves them to the columns so the heatmap's x axis shows the true digit and
# its y axis the predicted one, matching the axis labels.
cm = confusion_matrix(digits.target, predicted_labels)
# `linewidths` is the keyword seaborn.heatmap documents for the width of the
# lines that separate the cells.
sns.heatmap(cm.T, linewidths = 0.5, annot = True, cmap = 'Reds', fmt = 'g',
            xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('Valores reales')
plt.ylabel('Valores predecidos')
plt.title('Comparación contra los valores reales')
plt.show()

# Opción 2) Scikit - Learn: Mini Batch K-Means

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Seed the estimator so repeated runs of the notebook give the same clusters,
# as the K-Means cell does with its own random_state.
model = MiniBatchKMeans(n_clusters = 10, random_state = 0)
# fit_predict() trains the model and returns each sample's cluster index,
# so a separate fit() call beforehand would only repeat the training.
clusters = model.fit_predict(df)
print(clusters)

In [None]:
from scipy.stats import mode

# Cluster indices are arbitrary, so relabel every cluster with the most
# common true digit among its members before comparing with the ground truth.
predicted_labels = np.zeros_like(clusters)
# Visit only the cluster ids that actually occur: this avoids hard-coding
# the number of clusters and never takes the mode of an empty selection.
for i in np.unique(clusters):
    mask = (clusters == i)
    predicted_labels[mask] = mode(digits.target[mask])[0]

print(predicted_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose relabelled cluster equals the true digit.
accuracy_score(y_true=digits.target, y_pred=predicted_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix() puts the true labels on the rows; the transpose below
# moves them to the columns so the heatmap's x axis shows the true digit and
# its y axis the predicted one, matching the axis labels.
cm = confusion_matrix(digits.target, predicted_labels)
# `linewidths` is the keyword seaborn.heatmap documents for the width of the
# lines that separate the cells.
sns.heatmap(cm.T, linewidths = 0.5, annot = True, cmap = 'Reds', fmt = 'g',
            xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('Valores reales')
plt.ylabel('Valores predecidos')
plt.title('Comparación contra los valores reales')
plt.show()

# Opción 3) Scikit - Learn: Spectral Clustering
Affinity = Nearest Neighbors

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html

In [None]:
from sklearn.cluster import SpectralClustering

# Build the affinity matrix from a nearest-neighbours graph and assign the
# final labels with k-means. The seed makes repeated runs of the notebook
# give the same clusters.
model = SpectralClustering(n_clusters = 10,
                            affinity='nearest_neighbors',
                            assign_labels='kmeans',
                            random_state = 0)
# fit_predict() trains the model and returns each sample's cluster index,
# so a separate fit() call beforehand would only repeat the training.
clusters = model.fit_predict(df)
print(clusters)

In [None]:
from scipy.stats import mode

# Cluster indices are arbitrary, so relabel every cluster with the most
# common true digit among its members before comparing with the ground truth.
predicted_labels = np.zeros_like(clusters)
# Visit only the cluster ids that actually occur: this avoids hard-coding
# the number of clusters and never takes the mode of an empty selection.
for i in np.unique(clusters):
    mask = (clusters == i)
    predicted_labels[mask] = mode(digits.target[mask])[0]

print(predicted_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose relabelled cluster equals the true digit.
accuracy_score(y_true=digits.target, y_pred=predicted_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix() puts the true labels on the rows; the transpose below
# moves them to the columns so the heatmap's x axis shows the true digit and
# its y axis the predicted one, matching the axis labels.
cm = confusion_matrix(digits.target, predicted_labels)
# `linewidths` is the keyword seaborn.heatmap documents for the width of the
# lines that separate the cells.
sns.heatmap(cm.T, linewidths = 0.5, annot = True, cmap = 'Reds', fmt = 'g',
            xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('Valores reales')
plt.ylabel('Valores predecidos')
plt.title('Comparación contra los valores reales')
plt.show()

# Opción 4) Scikit - Learn: Agglomerative Clustering

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering that stops merging at 10 clusters.
model = AgglomerativeClustering(n_clusters = 10)
# fit_predict() trains the model and returns each sample's cluster index,
# so a separate fit() call beforehand would only repeat the training.
clusters = model.fit_predict(df)
print(clusters)

In [None]:
from scipy.stats import mode

# Cluster indices are arbitrary, so relabel every cluster with the most
# common true digit among its members before comparing with the ground truth.
predicted_labels = np.zeros_like(clusters)
# Visit only the cluster ids that actually occur: this avoids hard-coding
# the number of clusters and never takes the mode of an empty selection.
for i in np.unique(clusters):
    mask = (clusters == i)
    predicted_labels[mask] = mode(digits.target[mask])[0]

print(predicted_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose relabelled cluster equals the true digit.
accuracy_score(y_true=digits.target, y_pred=predicted_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix() puts the true labels on the rows; the transpose below
# moves them to the columns so the heatmap's x axis shows the true digit and
# its y axis the predicted one, matching the axis labels.
cm = confusion_matrix(digits.target, predicted_labels)
# `linewidths` is the keyword seaborn.heatmap documents for the width of the
# lines that separate the cells.
sns.heatmap(cm.T, linewidths = 0.5, annot = True, cmap = 'Reds', fmt = 'g',
            xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('Valores reales')
plt.ylabel('Valores predecidos')
plt.title('Comparación contra los valores reales')
plt.show()

# Opción 5) Scikit - Learn: Gaussian Mixture Models

https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture

<b>Actualización 2022:</b>
<br>
<br>La versión más reciente de scikit-learn ha modificado el nombre de la clase GMM; el nombre actual es: <b>GaussianMixture</b>.
<br>
<br>Para evitar el error <b>cannot import name 'GMM'</b> se actualizó la celda de código con el nombre actual.

In [None]:
from sklearn.mixture import GaussianMixture

# Mixture of 10 Gaussian components; the fixed seed keeps the run reproducible.
model = GaussianMixture(n_components = 10, random_state = 150)
# fit_predict() trains the model and returns each sample's component index,
# so a separate fit() call beforehand would only repeat the training.
clusters = model.fit_predict(df)
print(clusters)

In [None]:
from scipy.stats import mode

# Component indices are arbitrary, so relabel every cluster with the most
# common true digit among its members before comparing with the ground truth.
predicted_labels = np.zeros_like(clusters)
# Visit only the cluster ids that actually occur: this avoids hard-coding
# the number of clusters and never takes the mode of an empty selection.
for i in np.unique(clusters):
    mask = (clusters == i)
    predicted_labels[mask] = mode(digits.target[mask])[0]

print(predicted_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose relabelled cluster equals the true digit.
accuracy_score(y_true=digits.target, y_pred=predicted_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix() puts the true labels on the rows; the transpose below
# moves them to the columns so the heatmap's x axis shows the true digit and
# its y axis the predicted one, matching the axis labels.
cm = confusion_matrix(digits.target, predicted_labels)
# `linewidths` is the keyword seaborn.heatmap documents for the width of the
# lines that separate the cells.
sns.heatmap(cm.T, linewidths = 0.5, annot = True, cmap = 'Reds', fmt = 'g',
            xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('Valores reales')
plt.ylabel('Valores predecidos')
plt.title('Comparación contra los valores reales')
plt.show()

# Otros algoritmos de Clustering

<center><img src="img/sphx_glr_plot_cluster_comparison_001.png" width = "100%"></center>
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html#sphx-glr-auto-examples-cluster-plot-cluster-comparison-py