# Obtención y preparación de los datos.

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the wine dataset that ships with scikit-learn and report the
# dimensions of its feature matrix.
from sklearn.datasets import load_wine
wine = load_wine()
print(wine.data.shape)

In [None]:
# Print the description text attached to the dataset object.
dataset_description = wine.DESCR
print(dataset_description)

In [None]:
# Names of the feature columns, in the same order as the columns of wine.data.
print(wine.feature_names)

In [None]:
# Keep a short alias for the raw feature matrix and display it.
x = wine.data
print(x)

In [None]:
# Human-readable names of the target classes.
print(wine.target_names)

In [None]:
# Class label of every sample.
y = wine.target
print(y)

# Number of samples per class; the resulting mapping is the cell's output.
unique, counts = np.unique(y, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Wrap the raw feature matrix in a DataFrame with named columns and preview
# the first rows.
df = pd.DataFrame(
    data=wine.data,
    columns=wine.feature_names,
)
df.head()

In [None]:
# Box plot of every column on its original scale; the tick labels are
# rotated so the long feature names do not overlap.
df.boxplot(figsize=(28,10))
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Box plot of log(1 + x) for every column, which compresses the large
# differences in scale between features.
# np.log1p is the dedicated ufunc for log(1 + x): it avoids building the
# temporary `1 + df` and stays accurate for values close to zero.
np.log1p(df).boxplot(figsize=(28, 10))
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.xticks(rotation=90);

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

In [None]:
# Standardize the columns of df with StandardScaler. Despite its name,
# scaled_df is the array returned by fit_transform, not a DataFrame; it is
# wrapped back into a DataFrame later in the notebook.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

In [None]:
# Rescale every sample (row) of the standardized data to unit L2 norm.
# norm="l2" and axis=1 are normalize()'s defaults, spelled out for clarity.
normalized_df = normalize(scaled_df, norm="l2", axis=1)

In [None]:
# Turn the normalized array back into a DataFrame, reusing df's column labels.
normalized_df = pd.DataFrame(
    data=normalized_df,
    columns=df.columns,
)

In [None]:
# Preview the first rows of the normalized data.
normalized_df.head()

In [None]:
# Box plot of the normalized columns, for comparison with the box plots of
# the unscaled data.
normalized_df.boxplot(figsize=(28,10))
plt.xticks(rotation = 90)
plt.show()

# Algoritmo K-Means.

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
from sklearn.cluster import KMeans

## Probemos con dos clústeres.

In [None]:
# Fit K-Means with two clusters on the normalized data.
n_clusters = 2
# random_state fixes the centroid initialization so the notebook gives the
# same clustering on every run; n_init is pinned explicitly because its
# default differs between scikit-learn releases.
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# The trailing semicolon suppresses the notebook's echo of the estimator.
km.fit(normalized_df);

### Análisis de componentes principales (Principal Component Analysis).
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the normalized data onto its first two principal components so
# the clusters can be drawn in 2-D, then preview the projected points.
pca = PCA(n_components=2)
x_principal = pd.DataFrame(
    pca.fit_transform(normalized_df),
    columns=['P1', 'P2'],
)
x_principal.head()

In [None]:
# Scatter plot of the 2-D projection, coloring each point by the cluster
# that the fitted K-Means model assigns to it.
plt.scatter(
    x=x_principal.P1,
    y=x_principal.P2,
    c=km.predict(normalized_df),
)
plt.show()

## Método del Codo para determinar el número de Clústeres.

In [None]:
# Elbow method: fit K-Means for k = 1..14 and record the inertia (sum of
# squared distances of samples to their closest centroid) for each k.
sum_of_squared_distances = []
for k in range(1, 15):
    # A throwaway name is used on purpose: rebinding `km` here would leave a
    # 14-cluster model behind and silently change any plotting cell that is
    # re-run afterwards.
    # random_state / n_init make the inertia curve reproducible across runs
    # and scikit-learn versions.
    candidate = KMeans(n_clusters=k, n_init=10, random_state=42)
    candidate.fit(normalized_df)
    sum_of_squared_distances.append(candidate.inertia_)

In [None]:
# Plot inertia against the number of clusters.
# The list holds one entry per k starting at k = 1, so the x values must be
# given explicitly: plotting the bare list would place k = 1 at x = 0 and
# make the elbow appear one cluster too low.
ks = list(range(1, len(sum_of_squared_distances) + 1))
plt.plot(ks, sum_of_squared_distances, marker='o')
plt.xticks(ks)
plt.xlabel('Número de clústeres (k)')
plt.ylabel('Inercia')
plt.show()

## Ahora probemos con tres clústeres.

In [None]:
# Fit K-Means with three clusters and keep the label assigned to each sample.
n_clusters = 3
# random_state fixes the centroid initialization so the labels (and every
# result derived from them) are reproducible; n_init is pinned explicitly
# because its default differs between scikit-learn releases.
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# fit_predict fits the model and returns the labels in a single pass; a
# separate fit() beforehand would only be thrown away by this second fit.
clusters = km.fit_predict(normalized_df)
print(clusters)
# Shape of the centroid matrix; shown as the cell's output.
km.cluster_centers_.shape

In [None]:
# Project the normalized data onto two principal components for plotting
# the three-cluster result, then preview the projected points.
pca = PCA(n_components=2)
x_principal = pd.DataFrame(
    pca.fit_transform(normalized_df),
    columns=['P1', 'P2'],
)
x_principal.head()

In [None]:
# Scatter plot of the 2-D projection, colored by the cluster predicted by
# the current K-Means model.
plt.scatter(
    x=x_principal.P1,
    y=x_principal.P2,
    c=km.predict(normalized_df),
)
plt.show()

In [None]:
# Attach the predicted cluster id to the original (unscaled) data.
# NOTE: this mutates df in place, so from here on df has an extra "cluster"
# column next to the feature columns.
df["cluster"] = km.predict(normalized_df)
df.head()

In [None]:
# Apply seaborn's default styling to the plots that follow.
sns.set()

In [None]:
# For every column of df (this includes the "cluster" column itself), draw
# one figure with the histograms of the clusters overlaid in different hues.
for columna in df.columns:
    grid = sns.FacetGrid(data=df, hue="cluster")
    grid.map(plt.hist, columna, alpha=0.7)

In [None]:
# Same histograms as a row of panels: one panel per cluster, each drawn in
# a fixed color. The color mapping does not depend on the column, so it is
# built once before the loop.
d = {'color': ['blue', 'green', 'red']}
for columna in df.columns:
    grid = sns.FacetGrid(data=df, col="cluster", hue="cluster", hue_kws=d)
    grid.map(plt.hist, columna, alpha=0.7)

# Comprobemos nuestra agrupación contra la realidad

In [None]:
# Cluster id assigned to each sample (shown as the cell's output).
df["cluster"]

In [None]:
# True class label of each sample, for visual comparison with the clusters.
wine.target

In [None]:
# Allocate an all-zero array with the same shape and dtype as `clusters`;
# it will receive the class label derived for every sample.
predicted_labels = np.zeros(clusters.shape, dtype=clusters.dtype)
# Show the empty array and, on the next line, how many entries it has.
print(predicted_labels, predicted_labels.size, sep="\n")

In [None]:
from scipy.stats import mode

# K-Means cluster ids are arbitrary, so translate each cluster into the most
# frequent true class among its members; the result can then be compared
# with wine.target directly.
# Iterate over the ids actually present in `clusters` rather than a
# hard-coded range(3): this keeps working if the number of clusters changes
# and never evaluates mode() on an empty selection.
for i in np.unique(clusters):
    mask = (clusters == i)
    predicted_labels[mask] = mode(wine.target[mask])[0]

print(predicted_labels)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of samples whose cluster-derived label equals the true class.
accuracy_score(y_true=wine.target, y_pred=predicted_labels)

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix puts true classes on rows and predictions on columns;
# the heatmap is drawn from the transpose so that predictions run along the
# y axis and true classes along the x axis, matching the axis labels below.
cm = confusion_matrix(y_true=wine.target, y_pred=predicted_labels)
sns.heatmap(
    cm.T,
    annot=True,
    fmt='g',
    cmap='Reds',
    linewidth=0.5,
)
plt.title('Comparación contra los valores reales')
plt.xlabel('Valores reales')
plt.ylabel('Valores predecidos')
plt.show()

In [None]:
# True labels, followed by the number of samples in each true class.
print(wine.target)

unique, counts = np.unique(wine.target, return_counts=True)
{label: count for label, count in zip(unique, counts)}

In [None]:
# Cluster-derived labels, followed by the number of samples assigned to
# each label, for comparison with the true class sizes.
print(predicted_labels)

unique, counts = np.unique(predicted_labels, return_counts=True)
{label: count for label, count in zip(unique, counts)}