# Problema de Clustering

<p style='text-align: justify;'>El clustering es el problema de agrupar un conjunto de datos, objetos, clientes o productos de tal forma que los elementos de un mismo grupo (clúster) compartan características similares.</p>

<center><img src="img/Clustering1.png" width = "70%"></center>

<p style='text-align: justify;'>En este primer y sencillo ejemplo visual separaremos al conjunto de datos en cuatro clústeres diferentes, tal como se muestra en la siguiente figura.</p>

<center><img src="img/Clustering2.png" width = "70%"></center>

## Ejemplo 1) Empecemos a trabajar con un conjunto de datos sintéticos.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn afterwards.
sns.set()

#### Función "make_blobs"

<b>Actualización 2022:</b>
<br>
<br>Las versiones recientes de scikit-learn ya no incluyen el módulo sklearn.datasets.samples_generator, por lo que la siguiente celda importa la función make_blobs directamente desde sklearn.datasets.
<br>
<br>De esta forma se evita el error <b>No module named 'sklearn.datasets.samples_generator'</b>.

In [None]:
# make_blobs is imported directly from sklearn.datasets.
from sklearn.datasets import make_blobs
# 300 synthetic samples around 4 centers; the fixed random_state keeps the
# generated data identical on every run.
x, y_true = make_blobs(n_samples = 300, centers = 4, random_state = 70)
# Plot the raw points (first feature vs. second feature), without labels.
plt.scatter(x[:, 0], x[:, 1], s=50)
plt.show()

In [None]:
# Same points, now coloured by the label returned by make_blobs (y_true).
plt.scatter(x[:, 0], x[:, 1], c=y_true, s=50, cmap='autumn')
plt.show()

### Algoritmo K-Means.

In [None]:
from sklearn.cluster import KMeans
# n_init is set explicitly because its default depends on the scikit-learn
# version; random_state makes the fitted labels reproducible across runs.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
# Last expression of the cell: fit() returns the estimator, which the
# notebook then displays.
kmeans.fit(x)

In [None]:
# Display the cluster index stored on the fitted estimator for each sample.
kmeans.labels_

In [None]:
# Predict a cluster index for every sample in x with the fitted model.
y_kmeans = kmeans.predict(x)
# Bare expression on the last line so the notebook displays the array.
y_kmeans

In [None]:
# Samples coloured by their predicted cluster.
plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
# Overlay the learned centroids as large, semi-transparent black markers.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
plt.show()

In [None]:
# Display the centroid coordinates taken from kmeans.cluster_centers_.
centers

## Ejemplo 2) Variando el número de clústeres.

In [None]:
# New dataset: only random_state is fixed, so sample count and number of
# centers come from make_blobs' defaults.
x, y_true = make_blobs(random_state=1)

In [None]:
# n_init is set explicitly because its default depends on the scikit-learn
# version; random_state makes the fitted labels reproducible across runs.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
# Last expression of the cell: fit() returns the estimator, which the
# notebook then displays.
kmeans.fit(x)

In [None]:
# Predict a cluster index for every sample in x with the fitted model.
y_kmeans = kmeans.predict(x)
# Bare expression on the last line so the notebook displays the array.
y_kmeans

In [None]:
# Samples coloured by their predicted cluster.
plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
# Overlay the learned centroids as large, semi-transparent black markers.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
plt.show()

### Compactemos lo anterior escribiendo una función.

In [None]:
def BasicKMeans(clusters = 2):
    """Cluster a fixed synthetic blob dataset with K-Means and plot the result.

    The dataset is regenerated on every call with
    ``make_blobs(random_state=1)``, so only the number of clusters changes
    between calls.

    Args:
        clusters: Number of clusters (and centroids) K-Means should find.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    x, _ = make_blobs(random_state=1)

    # n_init is pinned because its default depends on the scikit-learn
    # version; random_state keeps labels (and therefore colours) stable when
    # the function is called repeatedly, e.g. from an interactive slider.
    kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0)
    # fit_predict is the documented shortcut for fit(x) followed by predict(x).
    y_kmeans = kmeans.fit_predict(x)

    # Samples coloured by predicted cluster, centroids overlaid in black.
    plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
    plt.show()

In [None]:
# Same dataset, now asking K-Means for 3 clusters.
BasicKMeans(3)

In [None]:
# Same dataset, now asking K-Means for 4 clusters.
BasicKMeans(4)

### Retomemos la función interact.

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
    
# The (2, 8) tuple is turned into an integer slider for `clusters`;
# BasicKMeans is re-run whenever the slider changes. The trailing semicolon
# suppresses the echo of interact's return value in the cell output.
interact(BasicKMeans, clusters = (2, 8));

## Ejemplo 3) Casos donde el algoritmo de K-Means puede fallar.

### Casos donde la distribución de los datos no es consistente entre clústeres.

In [None]:
# Blobs with a different standard deviation each, so the groups have very
# different spreads.
x, y_true = make_blobs(n_samples=200,
                       cluster_std=[1.0, 2.5, 0.5],
                       random_state=180)

# n_init is pinned because its default depends on the scikit-learn version;
# random_state makes the clustering reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
# fit_predict is the documented shortcut for fit(x) followed by predict(x).
y_kmeans = kmeans.fit_predict(x)

# Samples coloured by predicted cluster, centroids overlaid in black.
plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
plt.show()

In [None]:
def DiferenteDistribucion(clusters = 2):
    """Run K-Means on blobs of unequal spread and plot the result.

    The dataset is regenerated on every call with fixed parameters
    (200 samples, ``cluster_std=[1.0, 2.5, 0.5]``, ``random_state=180``), so
    only the number of clusters changes between calls.

    Args:
        clusters: Number of clusters (and centroids) K-Means should find.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    x, _ = make_blobs(n_samples=200,
                      cluster_std=[1.0, 2.5, 0.5],
                      random_state=180)

    # n_init is pinned because its default depends on the scikit-learn
    # version; random_state keeps colours stable between slider redraws.
    kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0)
    # fit_predict is the documented shortcut for fit(x) followed by predict(x).
    y_kmeans = kmeans.fit_predict(x)

    # Samples coloured by predicted cluster, centroids overlaid in black.
    plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
    plt.show()

# Integer slider for `clusters`; the trailing semicolon hides the return value.
interact(DiferenteDistribucion, clusters = (2, 8));

### Casos donde la distribución de los datos no se puede representar por un espacio circular.

In [None]:
# 600 synthetic samples; the number of centers is left at make_blobs' default.
x, y_true = make_blobs(n_samples = 600,
                       random_state = 170)
plt.scatter(x[:, 0], x[:, 1])
# Render explicitly, as the other plotting cells do, so the cell does not
# echo the object returned by plt.scatter.
plt.show()

In [None]:
# Seeded generator so the random matrix is the same on every run.
rng = np.random.RandomState(74)
# 2x2 matrix of normally distributed entries, used below as a linear map.
transformation = rng.normal(size=(2,2))
# Bare expression on the last line so the notebook displays the matrix.
transformation

In [None]:
# Stretch/skew the blobs with the random linear map.
# NOTE: x is overwritten, so re-running this cell applies the transformation
# again; re-run the data-generation cell first to reset x.
x = np.dot(x, transformation)
plt.scatter(x[:, 0], x[:, 1])
# Render explicitly, as the other plotting cells do, so the cell does not
# echo the object returned by plt.scatter.
plt.show()

In [None]:
# n_init is pinned because its default depends on the scikit-learn version;
# random_state makes the clustering reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
# fit_predict is the documented shortcut for fit(x) followed by predict(x).
y_kmeans = kmeans.fit_predict(x)

# Samples coloured by predicted cluster, centroids overlaid in black.
plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
plt.show()

In [None]:
def DiferenteForma(clusters = 2):
    """Run K-Means on linearly transformed blobs and plot the result.

    The data (600 samples, ``random_state=170``) and the 2x2 transformation
    matrix (drawn from ``RandomState(74)``) are rebuilt on every call, so only
    the number of clusters changes between calls.

    Args:
        clusters: Number of clusters (and centroids) K-Means should find.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    x, _ = make_blobs(n_samples=600,
                      random_state=170)

    # Seeded generator: the same random linear map is applied on every call.
    rng = np.random.RandomState(74)
    transformation = rng.normal(size=(2, 2))

    x = np.dot(x, transformation)

    # n_init is pinned because its default depends on the scikit-learn
    # version; random_state keeps colours stable between slider redraws.
    kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0)
    # fit_predict is the documented shortcut for fit(x) followed by predict(x).
    y_kmeans = kmeans.fit_predict(x)

    # Samples coloured by predicted cluster, centroids overlaid in black.
    plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
    plt.show()

# Integer slider for `clusters`; the trailing semicolon hides the return value.
interact(DiferenteForma, clusters = (2, 8));

### Casos donde los datos generan formas complejas.

#### Función "make_moons"

In [None]:
from sklearn.datasets import make_moons
# 100 samples from make_moons, generated without the noise argument.
x, y_true = make_moons(n_samples = 100,
                       random_state = 100)
plt.scatter(x[:, 0], x[:, 1])
# Render explicitly, as the other plotting cells do, so the cell does not
# echo the object returned by plt.scatter.
plt.show()

In [None]:
# Same generator call, now with noise = 0.1 passed to make_moons.
x, y_true = make_moons(n_samples = 100, noise = 0.1,
                       random_state = 100)
plt.scatter(x[:, 0], x[:, 1])
# Render explicitly, as the other plotting cells do, so the cell does not
# echo the object returned by plt.scatter.
plt.show()

In [None]:
# n_init is pinned because its default depends on the scikit-learn version;
# random_state makes the clustering reproducible across runs.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
# fit_predict is the documented shortcut for fit(x) followed by predict(x).
y_kmeans = kmeans.fit_predict(x)

# Samples coloured by predicted cluster, centroids overlaid in black.
plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
plt.show()

In [None]:
def FormasComplejas(clusters = 2):
    """Run K-Means on a noisy make_moons dataset and plot the result.

    The dataset is regenerated on every call with fixed parameters
    (100 samples, ``noise=0.1``, ``random_state=100``), so only the number of
    clusters changes between calls.

    Args:
        clusters: Number of clusters (and centroids) K-Means should find.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    x, _ = make_moons(n_samples=100, noise=0.1,
                      random_state=100)

    # n_init is pinned because its default depends on the scikit-learn
    # version; random_state keeps colours stable between slider redraws.
    kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0)
    # fit_predict is the documented shortcut for fit(x) followed by predict(x).
    y_kmeans = kmeans.fit_predict(x)

    # Samples coloured by predicted cluster, centroids overlaid in black.
    plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='autumn', alpha=0.6)
    centers = kmeans.cluster_centers_
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.7)
    plt.show()

# Integer slider for `clusters`; the trailing semicolon hides the return value.
interact(FormasComplejas, clusters = (2, 8));