# Obtención de los datos.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer
# Load the breast cancer dataset shipped with scikit-learn. Later cells read
# its .data, .feature_names, .target and .DESCR attributes.
data = load_breast_cancer()

In [None]:
# Print the description text bundled with the dataset.
print(data.DESCR)

In [None]:
# Feature table: the dataset's value matrix with one named column per feature.
df = pd.DataFrame(data=data.data, columns=data.feature_names)
# Preview the first rows (the notebook renders the cell's last expression).
df.head()

In [None]:
# Box plot of every raw feature on one shared axis, to compare their scales.
# rot=90 rotates the x tick labels so the feature names do not overlap.
df.boxplot(figsize=(40, 10), rot=90)
plt.show()

In [None]:
# Reference labels from the dataset, as a Series aligned row-by-row with df.
y = pd.Series(data=data.target)
# Count how many samples fall in each class.
y.value_counts()

# Preparación de los datos.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

### Escalamiento de los datos.

In [None]:
# Fit the scaler on the feature table and transform it in one step, then wrap
# the resulting array back into a DataFrame that keeps the original column names.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [None]:
# Preview the first rows of the scaled feature table.
scaled_df.head()

In [None]:
# Same box plot as for the raw data, now on the scaled features.
# rot=90 rotates the x tick labels so the feature names do not overlap.
scaled_df.boxplot(figsize=(40, 10), rot=90)
plt.show()

### Normalización de los datos.

In [None]:
# Apply sklearn's `normalize` to the scaled features (with its defaults this
# rescales each sample/row to unit L2 norm) and keep the original column names.
normalized_df = pd.DataFrame(normalize(scaled_df), columns=df.columns)

In [None]:
# Preview the first rows of the normalized feature table.
normalized_df.head()

In [None]:
# Box plot of the normalized features, for comparison with the earlier plots.
# rot=90 rotates the x tick labels so the feature names do not overlap.
normalized_df.boxplot(figsize=(40, 10), rot=90)
plt.show()

# Análisis de Componentes Principales (PCA)
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the normalized features onto their first two principal components.
# These are only used for 2-D plotting; clustering runs on the full feature set.
pca = PCA(n_components=2)
x_principal = pd.DataFrame(
    pca.fit_transform(normalized_df),
    columns=['P1', 'P2'],
)
x_principal.head()

In [None]:
# Scatter the samples in the plane of the two principal components.
fig = plt.figure(figsize=(10, 10))
plt.scatter(x=x_principal['P1'], y=x_principal['P2'], s=80)
plt.show()

# Algoritmo K-Means.
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit K-Means on the normalized features.
n_clusters = 2
# random_state pins the centroid initialization so that cluster ids, plots and
# metrics are reproducible between runs. n_init is set explicitly because its
# default changed across scikit-learn releases (10 restarts -> 'auto').
km = KMeans(n_clusters = n_clusters, n_init = 10, random_state = 42)
# Trailing ';' suppresses the estimator repr in the notebook output.
km.fit(normalized_df);

In [None]:
# Cluster id assigned by the fitted K-Means model to each row of normalized_df.
y_predict = km.predict(normalized_df)

In [None]:
# Wrap the cluster ids in a Series so they share y's default integer index.
y_predict = pd.Series(data=y_predict)
# Number of samples assigned to each cluster.
y_predict.value_counts()

In [None]:
# Same principal-component scatter as before, coloured by assigned cluster id.
fig = plt.figure(figsize=(10, 10))
plt.scatter(x=x_principal['P1'], y=x_principal['P2'], c=y_predict, s=80)
plt.show()

# Matriz de confusión para verificar el desempeño.

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the reference labels against the raw cluster ids.
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predictions on the columns; the heatmap draws rows along the y-axis and
# columns along the x-axis, so the axis labels must follow that layout
# (they were previously swapped).
cm = confusion_matrix(y, y_predict)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de entrenamiento')
plt.show()

# Reporte de clasificación.

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 of the raw cluster ids against the reference labels.
print(classification_report(y_true=y, y_pred=y_predict))

# Las etiquetas encontradas por el algoritmo no necesariamente coinciden con nuestras etiquetas originales: K-Means numera los clústeres de forma arbitraria, por lo que hay que reasignarlas.

In [None]:
# Show the reference labels for visual comparison with the cluster ids.
print(y)

In [None]:
# Show the cluster ids produced by K-Means.
print(y_predict)

In [None]:
# Zero-filled buffer with y_predict's shape and dtype; it will receive the
# cluster ids translated into the dataset's own label values.
predicted_labels = np.zeros_like(y_predict)
# Show the buffer and, on the next line, its number of elements.
print(predicted_labels, predicted_labels.size, sep="\n")

In [None]:
from scipy.stats import mode

# Translate cluster ids into dataset labels: every sample in a cluster gets the
# most frequent reference label found among that cluster's members.
# Iterate over the cluster ids actually present instead of a hard-coded
# range(2), so the mapping stays complete if the number of clusters changes.
for cluster_id in np.unique(y_predict):
    mask = (y_predict == cluster_id)
    predicted_labels[mask] = mode(y[mask])[0]

print(predicted_labels)

# Matriz de confusión para verificar el desempeño.

In [None]:
# Confusion matrix of the reference labels against the relabelled clusters.
# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predictions on the columns; the heatmap draws rows along the y-axis and
# columns along the x-axis, so the axis labels must follow that layout
# (they were previously swapped).
cm = confusion_matrix(y, predicted_labels)
sns.heatmap(cm, linewidth = 0.5, annot = True, cmap = 'Reds', fmt = 'g')
plt.ylabel('Valores reales')
plt.xlabel('Valores predecidos')
plt.title('Comparación con valores de entrenamiento')
plt.show()

# Reporte de clasificación.

In [None]:
# Per-class precision/recall/F1 once cluster ids have been mapped to dataset labels.
print(classification_report(y_true=y, y_pred=predicted_labels))

# Agreguemos nuestra clasificación al df original.

In [None]:
# Store the relabelled cluster assignment as a new 'cluster' column.
# NOTE: this mutates df in place, so earlier cells that consume df (scaling,
# box plots) would also pick up this column if they are re-run afterwards.
df['cluster'] = predicted_labels
df.head()

In [None]:
# For every feature, overlay one histogram per assigned cluster.
# The 'cluster' column is skipped: it is the hue variable, and a histogram of
# it split by its own value carries no information.
for column in df.columns.drop("cluster"):
    grid = sns.FacetGrid(df, hue="cluster")
    grid.map(plt.hist, column, alpha = 0.7).add_legend()

plt.show();