In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Obtención de los datos.

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_lfw_people.html

https://scikit-learn.org/stable/datasets/index.html#labeled-faces-in-the-wild-dataset

In [None]:
from sklearn.datasets import fetch_lfw_people

In [None]:
# Load the LFW faces, keeping only people with at least 30 pictures and
# resizing every picture by a ratio of 0.7.
people = fetch_lfw_people(
    min_faces_per_person=30,
    resize=0.7,
)

In [None]:
# Show the object returned by fetch_lfw_people as the cell output.
people

In [None]:
# Show the pixel array of the first image as the cell output.
people.images[0]

In [None]:
# Number of images and number of target entries (one per line), followed by
# the array of target names as the cell output.
print(len(people.images), len(people.target), sep="\n")
people.target_names

## Mostrar una imagen con la instrucción imshow()

In [None]:
# Shape of a single image (everything after the leading sample axis);
# later cells use it to un-flatten rows back into pictures.
image_shape = people.images.shape[1:]
print(image_shape)

In [None]:
# Draw the first face with matplotlib's default colormap.
fig, ax = plt.subplots()
ax.imshow(people.images[0])
plt.show()

In [None]:
# Draw the same face again, this time in grayscale.
fig, ax = plt.subplots()
ax.imshow(people.images[0], cmap="gray")
plt.show()

In [None]:
# Show one face and use the matching entry of target_names as its title.
person = 5
target = people.target[person]

fig, ax = plt.subplots()
ax.imshow(people.images[person], cmap="gray")
ax.set_title(people.target_names[target])
plt.show()

In [None]:
# Preview the first 40 faces on a 4 x 10 grid with the axis ticks hidden.
fig, axes = plt.subplots(
    nrows=4,
    ncols=10,
    figsize=(20, 8),
    subplot_kw=dict(xticks=[], yticks=[]),
    gridspec_kw={'hspace': 0.2, 'wspace': 0.2},
)
for position, ax in enumerate(axes.ravel()):
    ax.imshow(people.images[position], cmap="gray")
plt.show()

# Preparación de los datos.

## Paso 1) Conservar como máximo cincuenta imágenes de cada persona.

In [None]:
# One boolean flag per sample, all False to start with; the selection loop
# further down switches on the rows that are kept.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros(people.target.shape, dtype=bool)
print(mask)

In [None]:
# Distinct label values present in the dataset, shown as the cell output.
np.unique(people.target)

In [None]:
# Flag at most max_rep samples of every label: take the positions where the
# label occurs and switch on only the first max_rep of them.
max_rep = 50
for target in np.unique(people.target):
    positions = np.flatnonzero(people.target == target)
    mask[positions[:max_rep]] = True
print(mask)

In [None]:
# Keep only the flagged rows of the data matrix and of the label vector.
X_people, y_people = people.data[mask], people.target[mask]

In [None]:
# Print the filtered data matrix.
print(X_people)

In [None]:
# Print the filtered labels, then show a {label: count} summary as the
# cell output.
print(y_people)

unique, counts = np.unique(y_people, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Row counts of the filtered data and labels, one per line.
print(len(X_people), len(y_people), sep="\n")

In [None]:
# Number of values stored per sample (length of one row).
print(X_people.shape[1])

## Paso 2) Escalar los datos.

In [None]:
# Bring the pixel intensities into the [0, 1] range.
# The division is guarded so the cell is idempotent: running it a second time
# no longer shrinks the data again. The guard also skips the division when the
# loader already delivers values in [0, 1].
# NOTE(review): some scikit-learn releases appear to return LFW pixels already
# scaled to [0, 1] -- confirm against the installed version.
if X_people.max() > 1:
    X_people = X_people / 255
print(X_people)

In [None]:
# Preview the first 40 scaled samples on a 4 x 10 grid with the ticks hidden.
fig, axes = plt.subplots(4, 10, figsize=(20, 8),
                         subplot_kw={'xticks':[], 'yticks':[]},
                         gridspec_kw=dict(hspace=0.2, wspace=0.2))
for i, ax in enumerate(axes.flat):
    # Un-flatten with image_shape (taken from the dataset earlier in the
    # notebook) instead of a hard-coded (87, 65), so the cell keeps working
    # when the resize factor of the loader is changed.
    ax.imshow(X_people[i].reshape(image_shape), cmap=plt.cm.gray)
plt.show()

## Paso 3) Filtro de los componentes principales.

<b>Lectura Adicional:</b>
<br>https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the faces onto their first 100 principal components.
# fit_transform already returns the projected data, so it is used directly
# instead of being discarded and recomputed with a second transform() call.
# random_state makes the result repeatable between runs when PCA selects its
# randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
X_pca = pca.fit_transform(X_people)

In [None]:
# Number of projected samples.
print(X_pca.shape[0])

In [None]:
# Number of components kept per sample.
print(X_pca.shape[1])

# Clustering usando el algoritmo DBSCAN.

http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

In [None]:
from sklearn.cluster import DBSCAN

<b>Nota:</b> Los datos que sean etiquetados con el valor de "-1" se considerarán valores que causan ruido al modelo.

### Probemos con el parámetro eps = 10

In [None]:
# Cluster the PCA-projected faces with DBSCAN (eps=10, min_samples=3) and
# print the label assigned to every sample.
dbscan = DBSCAN(eps=10, min_samples=3).fit(X_pca)
labels = dbscan.labels_
print(labels)

In [None]:
# {label: number of samples} for the DBSCAN result, shown as the cell output.
unique, counts = np.unique(labels, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Number of distinct labels; the noise label -1 is counted too when present.
print(len(unique))

In [None]:
# Rows that DBSCAN labelled -1, i.e. the samples treated as noise.
is_noise = labels == -1
noise = X_people[is_noise]
print(noise)

In [None]:
# How many samples were labelled as noise.
print(noise.shape[0])

In [None]:
# Show the noise samples side by side; only as many as there are axes (two)
# get drawn.
fig, axes = plt.subplots(nrows=1, ncols=2, subplot_kw=dict(xticks=(), yticks=()))
for ax, image in zip(axes.ravel(), noise):
    ax.imshow(image.reshape(image_shape), cmap="gray")
plt.show()

### Automaticemos lo anterior, para buscar un factor de eps que nos devuelva la mayor cantidad de clusters

In [None]:
def AnalisisDBSCAN(eps=10):
    """Run DBSCAN on ``X_pca`` and print a short summary of the outcome.

    Three lines are printed, in this order: a ``{label: count}`` dict, the
    number of distinct labels (the noise label ``-1`` is included in that
    number when present) and the number of samples labelled as noise.

    Parameters
    ----------
    eps : int or float, default 10
        Value forwarded to ``DBSCAN(eps=...)``; ``min_samples`` is fixed at 3.

    Returns
    -------
    None
    """
    model = DBSCAN(eps=eps, min_samples=3)
    assigned = model.fit_predict(X_pca)

    found, sizes = np.unique(assigned, return_counts=True)
    print({label: size for label, size in zip(found, sizes)})
    print(found.size)

    noise_rows = X_people[assigned == -1]
    print(noise_rows.shape[0])

In [None]:
# Single trial of the summary helper with eps=5.
AnalisisDBSCAN(5)

In [None]:
# Sweep eps over 1..15, printing the DBSCAN summary for each value and a
# blank line between runs.
for eps_value in range(1, 16):
    print(f"eps = {eps_value}")
    AnalisisDBSCAN(eps_value)
    print()

In [None]:
# Re-run DBSCAN with eps=8 and print the resulting labels.
dbscan = DBSCAN(eps=8, min_samples=3).fit(X_pca)
labels = dbscan.labels_
print(labels)

In [None]:
# For every cluster 0..max(labels) (the noise label -1 is skipped) print its
# size and show up to three members, each titled with the last word of the
# person's name.
for cluster in range(labels.max() + 1):
    members = labels == cluster
    print(f"Cluster: {cluster}")
    print(f"Total de elementos: {members.sum()}")

    fig, axes = plt.subplots(nrows=1, ncols=3, subplot_kw=dict(xticks=(), yticks=()))
    for ax, image, target in zip(axes, X_people[members], y_people[members]):
        ax.imshow(image.reshape(image_shape), cmap="gray")
        last_word = people.target_names[target].split()[-1]
        ax.set_title(last_word)
    plt.show()

In [None]:
# Re-run DBSCAN with eps=9 and print the resulting labels.
dbscan = DBSCAN(eps=9, min_samples=3).fit(X_pca)
labels = dbscan.labels_
print(labels)

In [None]:
# For every cluster 0..max(labels) (the noise label -1 is skipped) print its
# size and show up to three members, each titled with the last word of the
# person's name.
for cluster in range(labels.max() + 1):
    members = labels == cluster
    print(f"Cluster: {cluster}")
    print(f"Total de elementos: {members.sum()}")

    fig, axes = plt.subplots(nrows=1, ncols=3, subplot_kw=dict(xticks=(), yticks=()))
    for ax, image, target in zip(axes, X_people[members], y_people[members]):
        ax.imshow(image.reshape(image_shape), cmap="gray")
        last_word = people.target_names[target].split()[-1]
        ax.set_title(last_word)
    plt.show()

# Clustering usando el algoritmo K-Means.

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Partition the PCA-projected faces into n_clusters groups with K-Means.
# n_init is pinned explicitly: its default moved from 10 to 'auto' across
# scikit-learn releases (with a FutureWarning in between), so relying on the
# default gives version-dependent clusters even with a fixed random_state.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=100)
labels_km = kmeans.fit_predict(X_pca)
print(labels_km)

In [None]:
# {cluster: number of samples} for the K-Means result, shown as the cell output.
unique, counts = np.unique(labels_km, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Number of distinct K-Means labels.
print(len(unique))

In [None]:
# Draw each cluster centre as a face: map it back from PCA space to pixel
# space and un-flatten it. The 2 x 5 grid has room for ten centres.
fig, axes = plt.subplots(
    nrows=2,
    ncols=5,
    figsize=(12, 4),
    subplot_kw=dict(xticks=(), yticks=()),
)
centers = kmeans.cluster_centers_
for ax, center in zip(axes.ravel(), centers):
    center_pixels = pca.inverse_transform(center)
    ax.imshow(center_pixels.reshape(image_shape), cmap="gray")
plt.show()

In [None]:
# For every K-Means cluster print its size, then show the reconstructed
# cluster centre in the first panel followed by up to four members, each
# titled with the last word of the person's name.
for cluster in range(labels_km.max() + 1):
    members = labels_km == cluster
    print(f"Cluster: {cluster}")
    print(f"Total de elementos: {members.sum()}")

    center = kmeans.cluster_centers_[cluster]
    center_face = pca.inverse_transform(center).reshape(image_shape)

    fig, axes = plt.subplots(
        nrows=1,
        ncols=5,
        figsize=(10, 8),
        subplot_kw=dict(xticks=(), yticks=()),
    )
    center_ax, *member_axes = axes
    center_ax.imshow(center_face, cmap="gray")

    for ax, image, target in zip(member_axes, X_people[members], y_people[members]):
        ax.imshow(image.reshape(image_shape), cmap="gray")
        last_word = people.target_names[target].split()[-1]
        ax.set_title(last_word)

    plt.show()

# Clustering usando el algoritmo Agglomerative Clustering.

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Group the PCA-projected faces into n_clusters clusters with agglomerative
# clustering and print the label of every sample.
n_clusters = 10
agglomerative = AgglomerativeClustering(n_clusters=n_clusters).fit(X_pca)
labels_agg = agglomerative.labels_
print(labels_agg)

In [None]:
# {cluster: number of samples} for the agglomerative result, shown as the
# cell output.
unique, counts = np.unique(labels_agg, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Number of distinct agglomerative-clustering labels.
print(len(unique))

In [None]:
from scipy.cluster.hierarchy import dendrogram, ward

In [None]:
# Build the Ward linkage of the PCA-projected samples with SciPy; the
# dendrogram cell below consumes this array.
linkage_array = ward(X_pca)
print(linkage_array)

In [None]:
# Plot the hierarchy on a wide figure, truncated by level with p=10 and
# without leaf labels.
plt.figure(figsize=(40, 10))
dendrogram(
    linkage_array,
    truncate_mode='level',
    p=10,
    no_labels=True,
)
plt.show()

In [None]:
# For every agglomerative cluster print its size and show up to five members,
# each titled with the last word of the person's name.
for cluster in range(labels_agg.max() + 1):
    members = labels_agg == cluster
    print(f"Cluster: {cluster}")
    print(f"Total de elementos: {members.sum()}")

    fig, axes = plt.subplots(
        nrows=1,
        ncols=5,
        figsize=(10, 8),
        subplot_kw=dict(xticks=(), yticks=()),
    )

    for ax, image, target in zip(axes, X_people[members], y_people[members]):
        ax.imshow(image.reshape(image_shape), cmap="gray")
        last_word = people.target_names[target].split()[-1]
        ax.set_title(last_word)

    plt.show()