In [None]:
"""
Autores:
        - Raúl Jiménez Juárez
        - Beatriz Magán Pinto
"""

In [None]:
!pip install scikit-learn-extra
!pip install sklearn_som
!pip install kneed
!pip install seaborn
!pip install dataframe_image

In [None]:
%reset
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import Birch, AgglomerativeClustering, KMeans, DBSCAN, BisectingKMeans
from sklearn_extra.cluster import KMedoids
from sklearn_som.som import SOM
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from kneed import KneeLocator
from matplotlib import rcParams
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import dataframe_image as dfi
import numpy as np

# 2. Carga y análisis descriptivo de los datos

In [None]:
# Load the pre-computed diabetes tables from CSV.
# Naming as used further down: frames without "_t" feed df_predicted_train and frames with
# "_t" feed df_predicted_test; "_bal" presumably marks a class-balanced variant and
# "numerico"/"onehot" the encoding of categorical columns -- TODO confirm against the
# notebook that produced these files.
X_numerico = pd.read_csv("dataset_diabetes/X_numerico.csv")
X_t_numerico = pd.read_csv("dataset_diabetes/X_t_numerico.csv")
Y_numerico = pd.read_csv("dataset_diabetes/Y_numerico.csv")
Y_t_numerico = pd.read_csv("dataset_diabetes/Y_t_numerico.csv")
X_numerico_bal = pd.read_csv("dataset_diabetes/X_numerico_bal.csv")
# NOTE(review): this file name ("X_t_bal_numerico.csv") breaks the "..._numerico_bal.csv"
# pattern of its siblings -- confirm it matches the file on disk.
X_t_numerico_bal = pd.read_csv("dataset_diabetes/X_t_bal_numerico.csv")
Y_numerico_bal = pd.read_csv("dataset_diabetes/Y_numerico_bal.csv")
Y_t_numerico_bal = pd.read_csv("dataset_diabetes/Y_t_numerico_bal.csv")
X_onehot = pd.read_csv("dataset_diabetes/X_onehot.csv")
X_t_onehot = pd.read_csv("dataset_diabetes/X_t_onehot.csv")
X_onehot_bal = pd.read_csv("dataset_diabetes/X_onehot_bal.csv")
X_t_onehot_bal = pd.read_csv("dataset_diabetes/X_t_onehot_bal.csv")

In [None]:
def normalizar(dataset):
    """Min-max scale every column of ``dataset`` using that frame's own minimum and maximum.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric table to rescale.

    Returns
    -------
    pandas.DataFrame
        ``(value - column_min) / (column_max - column_min)`` for every cell. Any NaN in
        the result (for example from a constant column, where the range is zero) is
        replaced by 0.
    """
    minimos = dataset.min()
    rangos = dataset.max() - minimos
    escalado = (dataset - minimos) / rangos
    return escalado.fillna(0)

# 3. Desarrollo de algoritmos

In [None]:
# Accumulators for the predicted cluster labels: the cells below add one column per
# algorithm / dataset combination to each of these frames.
df_predicted_train = pd.DataFrame()
df_predicted_test = pd.DataFrame()

In [None]:
def plot_metrica(metrica, nombre_metrica, nombre_caso):
    """Plot one metric against the number of clusters and save the figure to disk.

    Parameters
    ----------
    metrica : 2-D array
        Column 0 holds the x values (plotted as "n_clusters"), column 1 the metric values.
    nombre_metrica : str
        Metric name; used as y-axis label, in the title and in the output file name.
    nombre_caso : str
        Case label; appended to the title and to the output file name.

    Side effects
    ------------
    Overwrites the global matplotlib ``figure.figsize`` and ``font.size`` settings and
    writes ``plots/metricas/<nombre_metrica>_<nombre_caso>`` (the directory must exist).
    """
    rcParams['figure.figsize'] = 10, 10
    plt.rcParams.update({'font.size': 15})
    fig, ax = plt.subplots()
    ax.scatter(metrica[:,0], metrica[:,1])
    ax.plot(metrica[:,0], metrica[:,1])
    ax.set(xlabel="n_clusters", ylabel=nombre_metrica, title="Variación " + nombre_metrica + " " + nombre_caso)
    ax.grid()
    # Tighten the layout BEFORE saving; the original saved first, so the file on disk
    # never received the adjusted layout.
    fig.tight_layout()
    fig.savefig("plots/metricas/"+nombre_metrica+"_"+nombre_caso)

In [None]:
def plot_metricas(metricas, nombres_metricas, nombre_caso):
    """Draw several metric curves side by side in one figure and save it.

    Parameters
    ----------
    metricas : sequence of 2-D arrays
        One ``(n_points, 2)`` table per metric: column 0 is the number of clusters,
        column 1 the metric value. Four metrics are laid out on a 2x2 grid; any other
        count uses a single row of three panels.
    nombres_metricas : sequence of str
        Metric names, in the same order as ``metricas``.
    nombre_caso : str
        Case label; appended to each panel title and to the output file name.

    Side effects
    ------------
    Overwrites the global matplotlib ``figure.figsize`` and ``font.size`` settings and
    writes ``plots/metricas/metricas_<nombre_caso>`` (the directory must exist).
    """
    if len(metricas) == 4:
        rows, cols = 2, 2
        rcParams['figure.figsize'] = 20, 20
        plt.rcParams.update({'font.size': 15})
    else:
        rows, cols = 1, 3
        rcParams['figure.figsize'] = 65, 18
        plt.rcParams.update({'font.size': 48})

    # squeeze=False always returns a 2-D array of axes, so one loop serves both layouts.
    fig, axes = plt.subplots(rows, cols, squeeze=False)
    # axes.flat walks the grid row by row, matching the original panel order.
    for panel, metrica, nombre in zip(axes.flat, metricas, nombres_metricas):
        metrica = np.asarray(metrica)
        panel.scatter(metrica[:,0], metrica[:,1])
        panel.plot(metrica[:,0], metrica[:,1])
        panel.set(xlabel="n_clusters", ylabel=nombre, title= nombre + " "  + nombre_caso)
        panel.grid()

    # Tighten the layout BEFORE saving so the saved file gets it too.
    fig.tight_layout()
    fig.savefig("plots/metricas/metricas_"+nombre_caso)

## 3.1 K-means

In [None]:
# Sweep the number of clusters and record the inertia plus three internal validity
# indices, to help choose the number of clusters.
# Background: https://jarroba.com/seleccion-del-numero-optimo-clusters/
def comparativa_clusters_kmeans(dataset, n_clusters, nombre_caso):
    """Fit K-means for k = 2 .. n_clusters - 1 and plot how each metric varies with k.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table; it is min-max normalised with ``normalizar`` before clustering.
    n_clusters : int
        Exclusive upper bound of the sweep (``range(2, n_clusters)``).
    nombre_caso : str
        Label forwarded to the plotting helpers (titles and output file names).

    Notes
    -----
    Clustering runs on the normalised copy, but silhouette, Calinski-Harabasz and
    Davies-Bouldin are scored against the raw ``dataset``.
    NOTE(review): confirm that scoring in the un-normalised space is intended.
    """
    # Normalise once; the original recomputed this twice per iteration.
    datos_norm = normalizar(dataset).values

    inertias_arr = []
    silhouette_arr = []
    calinski_harabasz_arr = []
    davies_bouldin_arr = []
    for k in range(2, n_clusters):
        print(k)
        modelo = KMeans(init="random", n_clusters=k, n_init=10, max_iter=300, random_state=0)
        # One fit yields both labels and inertia; the original fitted the same seeded
        # model twice on the same data.
        predicted = modelo.fit_predict(datos_norm)
        inertias_arr.append([k, modelo.inertia_])
        silhouette_arr.append([k, silhouette_score(dataset, predicted)])
        calinski_harabasz_arr.append([k, calinski_harabasz_score(dataset, predicted)])
        davies_bouldin_arr.append([k, davies_bouldin_score(dataset, predicted)])

    inertias_arr = np.array(inertias_arr)
    silhouette_arr = np.array(silhouette_arr)
    calinski_harabasz_arr = np.array(calinski_harabasz_arr)
    davies_bouldin_arr = np.array(davies_bouldin_arr)

    plot_metrica(inertias_arr, "inercia", nombre_caso)
    plot_metrica(silhouette_arr, "silhouette", nombre_caso)
    plot_metrica(calinski_harabasz_arr, "calinski_harabasz", nombre_caso)
    plot_metrica(davies_bouldin_arr, "davies_bouldin_score", nombre_caso)

    plot_metricas(np.array([inertias_arr, silhouette_arr, calinski_harabasz_arr, davies_bouldin_arr]),
                  ["inercia", "silhouette", "calinski_harabasz", "davies_bouldin_score"], nombre_caso)
    

In [None]:
# Exclusive upper bound for the cluster-count sweeps below (each runs range(2, max_clusters)).
max_clusters = 30

### 3.1.1 Primer caso (Dataset inicial numérico con categóricas One-hot)

In [None]:
# K-means sweep on X_t_onehot; the label is used in plot titles and output file names.
comparativa_clusters_kmeans(X_t_onehot, max_clusters, "- caso 1 - K-means")

In [None]:
# Final K-means model for case 1 (one-hot data) with 4 clusters.
n_clusters = 4
cluster_kmeans = KMeans(init="random",n_clusters=n_clusters,n_init=10,max_iter=300,random_state=0)
cluster_kmeans.fit(normalizar(X_onehot).values)
# NOTE(review): normalizar() rescales each frame with its own min/max, so X_onehot and
# X_t_onehot are not scaled with the same parameters -- confirm this is intended.
df_predicted_train["kmeans_df_onehot"] = cluster_kmeans.predict(normalizar(X_onehot).values)
df_predicted_test["kmeans_df_onehot"] = cluster_kmeans.predict(normalizar(X_t_onehot).values)

### 3.1.2 Segundo caso (Dataset balanceado numérico con categóricas One-hot)

In [None]:
# K-means sweep on X_t_onehot_bal (case 2).
comparativa_clusters_kmeans(X_t_onehot_bal, max_clusters, "- caso 2 - K-means")

In [None]:
# Final K-means model for case 2; n_clusters is whatever an earlier cell last assigned.
cluster_kmeans = KMeans(init="random",n_clusters=n_clusters,n_init=10,max_iter=300,random_state=0)
cluster_kmeans.fit(normalizar(X_onehot_bal).values)
df_predicted_train["kmeans_df_onehot_balanceado"] = cluster_kmeans.predict(normalizar(X_onehot_bal).values)
df_predicted_test["kmeans_df_onehot_balanceado"] = cluster_kmeans.predict(normalizar(X_t_onehot_bal).values)

### 3.1.3 Tercer caso  (Dataset inicial numérico) 

In [None]:
# K-means sweep on X_t_numerico (case 3).
comparativa_clusters_kmeans(X_t_numerico, max_clusters, "- caso 3 - K-means")

In [None]:
# Final K-means model for case 3 (numeric data); n_clusters is whatever an earlier cell
# last assigned.
cluster_kmeans = KMeans(init="random",n_clusters=n_clusters,n_init=10,max_iter=300,random_state=0)
cluster_kmeans.fit(normalizar(X_numerico).values)
# The train column must carry the same name as the test column: the analysis section
# builds its column lists from df_predicted_test and uses them to index
# df_predicted_train, so the former "kmeans_df" name raised a KeyError there.
df_predicted_train["kmeans_df_numerico"] = cluster_kmeans.predict(normalizar(X_numerico).values)
df_predicted_test["kmeans_df_numerico"] = cluster_kmeans.predict(normalizar(X_t_numerico).values)

### 3.1.4 Cuarto caso  (Dataset balanceado numérico) 

In [None]:
# K-means sweep on X_t_numerico_bal (case 4).
comparativa_clusters_kmeans(X_t_numerico_bal, max_clusters, "- caso 4 - K-means")

In [None]:
# Final K-means model for case 4 (balanced numeric data); n_clusters is whatever an
# earlier cell last assigned.
cluster_kmeans = KMeans(init="random",n_clusters=n_clusters,n_init=10,max_iter=300,random_state=0)
cluster_kmeans.fit(normalizar(X_numerico_bal).values)
# The train column must carry the same name as the test column: the analysis section
# builds its column lists from df_predicted_test and uses them to index
# df_predicted_train, so the former "kmeans_df_balanceado" name raised a KeyError there.
df_predicted_train["kmeans_df_numerico_balanceado"] = cluster_kmeans.predict(normalizar(X_numerico_bal).values)
df_predicted_test["kmeans_df_numerico_balanceado"] = cluster_kmeans.predict(normalizar(X_t_numerico_bal).values)

## 3.2 Mapas Autoorganizativos

In [None]:
def comparativa_clusters_som(dataset, lado_maximo, nombre_caso):
    """Fit square self-organising maps of growing size and plot three validity indices.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table; it is min-max normalised with ``normalizar`` before training.
    lado_maximo : int
        Exclusive upper bound for the grid side; sides ``2 .. lado_maximo - 1`` are tried.
    nombre_caso : str
        Label forwarded to the plotting helpers (titles and output file names).

    Notes
    -----
    The x value recorded for each map is its number of nodes (``side * side``); the
    plotting helpers label that axis "n_clusters".
    The indices are scored against the raw ``dataset`` while the map is trained on the
    normalised copy. NOTE(review): confirm that this is intended.
    """
    # Normalise once; the original recomputed this twice per iteration.
    datos_norm = normalizar(dataset).values

    silhouette_arr = []
    calinski_harabasz_arr = []
    davies_bouldin_arr = []

    for lado in range(2, lado_maximo):
        n_nodos = lado * lado
        print(n_nodos)
        # Seed the map so the sweep is reproducible, like the seeded sweeps of the
        # other algorithms in this notebook.
        cluster_som = SOM(m=lado, n=lado, dim=len(dataset.columns), random_state=0)
        cluster_som.fit(datos_norm)
        predicted = cluster_som.predict(datos_norm)
        silhouette_arr.append([n_nodos, silhouette_score(dataset, predicted)])
        calinski_harabasz_arr.append([n_nodos, calinski_harabasz_score(dataset, predicted)])
        davies_bouldin_arr.append([n_nodos, davies_bouldin_score(dataset, predicted)])

    silhouette_arr = np.array(silhouette_arr)
    calinski_harabasz_arr = np.array(calinski_harabasz_arr)
    davies_bouldin_arr = np.array(davies_bouldin_arr)

    plot_metrica(silhouette_arr, "silhouette", nombre_caso)
    plot_metrica(calinski_harabasz_arr, "calinski_harabasz", nombre_caso)
    plot_metrica(davies_bouldin_arr, "davies_bouldin_score", nombre_caso)

    plot_metricas(np.array([silhouette_arr, calinski_harabasz_arr, davies_bouldin_arr]),
                  ["silhouette", "calinski_harabasz", "davies_bouldin_score"], nombre_caso)
    

In [None]:
# Exclusive upper bound for the SOM grid side swept by comparativa_clusters_som (sides 2..8).
lado_maximo = 9

### 3.2.1 Primer caso (Dataset inicial numérico con categóricas One-hot)

In [None]:
# SOM grid-size sweep on X_t_onehot (case 1).
comparativa_clusters_som(X_t_onehot, lado_maximo, "- caso 1 - SOM")

In [None]:
# Final SOM for case 1. n_clusters is the grid SIDE here, so the map has 2 x 2 = 4 nodes.
n_clusters = 2
# random_state seeds the weight initialisation so the labels are reproducible across
# runs, like the seeded K-means / K-medoids / Bisecting K-means models.
cluster_som = SOM(m=n_clusters, n=n_clusters, dim = len(X_t_onehot.columns), random_state=0)
cluster_som.fit(normalizar(X_onehot).values)
df_predicted_train["som_df_onehot"] = cluster_som.predict(normalizar(X_onehot).values)
df_predicted_test["som_df_onehot"] = cluster_som.predict(normalizar(X_t_onehot).values)

### 3.2.2 Segundo caso (Dataset balanceado numérico con categóricas One-hot)

In [None]:
# SOM grid-size sweep on X_t_onehot_bal (case 2).
comparativa_clusters_som(X_t_onehot_bal, lado_maximo, "- caso 2 - SOM")

In [None]:
# Final SOM for case 2. n_clusters is the grid SIDE here, so the map has 2 x 2 = 4 nodes.
n_clusters = 2
# random_state seeds the weight initialisation so the labels are reproducible across runs.
cluster_som = SOM(m=n_clusters, n=n_clusters, dim = len(X_onehot_bal.columns), random_state=0)
cluster_som.fit(normalizar(X_onehot_bal).values)
df_predicted_train["som_df_onehot_balanceado"] = cluster_som.predict(normalizar(X_onehot_bal).values)
df_predicted_test["som_df_onehot_balanceado"] = cluster_som.predict(normalizar(X_t_onehot_bal).values)

### 3.2.3 Tercer caso  (Dataset inicial numérico) 

In [None]:
# SOM grid-size sweep on X_t_numerico (case 3).
comparativa_clusters_som(X_t_numerico, lado_maximo, "- caso 3 - SOM")

In [None]:
# Final SOM for case 3. n_clusters is the grid SIDE here, so the map has 2 x 2 = 4 nodes.
n_clusters = 2
# random_state seeds the weight initialisation so the labels are reproducible across runs.
cluster_som = SOM(m=n_clusters, n=n_clusters, dim = len(X_t_numerico.columns), random_state=0)
cluster_som.fit(normalizar(X_numerico).values)
df_predicted_train["som_df_numerico"] = cluster_som.predict(normalizar(X_numerico).values)
df_predicted_test["som_df_numerico"] = cluster_som.predict(normalizar(X_t_numerico).values)

### 3.2.4 Cuarto caso  (Dataset balanceado numérico) 

In [None]:
# SOM grid-size sweep for case 4. Uses the X_t_* frame like SOM cases 1-3; the original
# passed X_numerico_bal here, the only SOM sweep run on the other split.
comparativa_clusters_som(X_t_numerico_bal, lado_maximo, "- caso 4 - SOM")

In [None]:
# Final SOM for case 4. n_clusters is the grid SIDE here, so the map has 2 x 2 = 4 nodes.
n_clusters = 2
# random_state seeds the weight initialisation so the labels are reproducible across runs.
cluster_som = SOM(m=n_clusters, n=n_clusters, dim = len(X_t_numerico_bal.columns), random_state=0)
cluster_som.fit(normalizar(X_numerico_bal).values)
df_predicted_train["som_df_numerico_balanceado"] = cluster_som.predict(normalizar(X_numerico_bal).values)
df_predicted_test["som_df_numerico_balanceado"] = cluster_som.predict(normalizar(X_t_numerico_bal).values)

## 3.4 K-medoids


In [None]:
def comparativa_clusters_kmedoids(dataset, max_clusters, nombre_caso):
    """Fit K-medoids for k = 2 .. max_clusters - 1 and plot how each metric varies with k.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table; it is min-max normalised with ``normalizar`` before clustering.
    max_clusters : int
        Exclusive upper bound of the sweep (``range(2, max_clusters)``).
    nombre_caso : str
        Label forwarded to the plotting helpers (titles and output file names).

    Notes
    -----
    The validity indices are scored against the raw ``dataset`` while clustering runs on
    the normalised copy. NOTE(review): confirm that this is intended.
    """
    datos_norm = normalizar(dataset).values

    inertias_arr = []
    silhouette_arr = []
    calinski_harabasz_arr = []
    davies_bouldin_arr = []
    for k in range(2, max_clusters):
        print(k)
        modelo = KMedoids(n_clusters=k, random_state=0)
        # Fit once, on the normalised data, and take BOTH the labels and the inertia from
        # that fit. The original read the inertia from a separate fit on the raw data,
        # so the inertia curve described a different model than the other metrics.
        predicted = modelo.fit_predict(datos_norm)
        inertias_arr.append([k, modelo.inertia_])
        silhouette_arr.append([k, silhouette_score(dataset, predicted)])
        calinski_harabasz_arr.append([k, calinski_harabasz_score(dataset, predicted)])
        davies_bouldin_arr.append([k, davies_bouldin_score(dataset, predicted)])

    inertias_arr = np.array(inertias_arr)
    silhouette_arr = np.array(silhouette_arr)
    calinski_harabasz_arr = np.array(calinski_harabasz_arr)
    davies_bouldin_arr = np.array(davies_bouldin_arr)

    plot_metrica(inertias_arr, "inercia", nombre_caso)
    plot_metrica(silhouette_arr, "silhouette", nombre_caso)
    plot_metrica(calinski_harabasz_arr, "calinski_harabasz", nombre_caso)
    plot_metrica(davies_bouldin_arr, "davies_bouldin_score", nombre_caso)

    plot_metricas(np.array([inertias_arr, silhouette_arr, calinski_harabasz_arr, davies_bouldin_arr]),
                  ["inercia", "silhouette", "calinski_harabasz", "davies_bouldin_score"], nombre_caso)

### 3.4.1 Primer caso (Dataset inicial numérico con categóricas One-hot)

In [None]:
# K-medoids sweep on X_t_onehot (case 1).
comparativa_clusters_kmedoids(X_t_onehot, max_clusters, "- caso 1 - K-medoids")

In [None]:
# Final K-medoids model for case 1 (one-hot data) with 4 clusters.
n_clusters = 4
cluster_kmedoids = KMedoids(n_clusters=n_clusters, random_state=0)
# NOTE(review): the model is fitted on X_t_onehot (the frame that feeds
# df_predicted_test) and then labels both frames, whereas the K-means, Birch and
# Bisecting cells fit on the frame that feeds df_predicted_train -- confirm this is
# deliberate.
cluster_kmedoids.fit(normalizar(X_t_onehot).values)
df_predicted_train["kmedoids_df_onehot"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_onehot).values))
df_predicted_test["kmedoids_df_onehot"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_t_onehot).values))

### 3.4.2 Segundo caso (Dataset balanceado numérico con categóricas One-hot)

In [None]:
# K-medoids sweep on X_t_onehot_bal (case 2).
comparativa_clusters_kmedoids(X_t_onehot_bal, max_clusters, "- caso 2 - K-medoids")

In [None]:
# Final K-medoids model for case 2; n_clusters is whatever an earlier cell last assigned.
cluster_kmedoids = KMedoids(n_clusters=n_clusters, random_state=0)
# NOTE(review): fitted on the X_t_* frame, then used to label both frames -- confirm.
cluster_kmedoids.fit(normalizar(X_t_onehot_bal).values)
df_predicted_train["kmedoids_df_onehot_balanceado"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_onehot_bal).values))
df_predicted_test["kmedoids_df_onehot_balanceado"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_t_onehot_bal).values))

### 3.4.3 Tercer caso  (Dataset inicial numérico) 

In [None]:
# K-medoids sweep on X_t_numerico (case 3).
comparativa_clusters_kmedoids(X_t_numerico, max_clusters, "- caso 3 - K-medoids")

In [None]:
# Final K-medoids model for case 3; n_clusters is whatever an earlier cell last assigned.
cluster_kmedoids = KMedoids(n_clusters=n_clusters, random_state=0)
# NOTE(review): fitted on the X_t_* frame, then used to label both frames -- confirm.
cluster_kmedoids.fit(normalizar(X_t_numerico).values)
df_predicted_train["kmedoids_df_numerico"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_numerico).values))
df_predicted_test["kmedoids_df_numerico"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_t_numerico).values))

### 3.4.4 Cuarto caso  (Dataset balanceado numérico) 

In [None]:
# K-medoids sweep on X_t_numerico_bal (case 4). The label now follows the
# "- caso N - <algorithm>" pattern of every other sweep, so plot titles and output
# file names stay consistent (it was "Caso 4 - K-medoids").
comparativa_clusters_kmedoids(X_t_numerico_bal, max_clusters, "- caso 4 - K-medoids")

In [None]:
# Final K-medoids model for case 4; n_clusters is whatever an earlier cell last assigned.
cluster_kmedoids = KMedoids(n_clusters=n_clusters, random_state=0)
# NOTE(review): fitted on the X_t_* frame, then used to label both frames -- confirm.
cluster_kmedoids.fit(normalizar(X_t_numerico_bal).values)
df_predicted_train["kmedoids_df_numerico_balanceado"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_numerico_bal).values))
df_predicted_test["kmedoids_df_numerico_balanceado"] = cluster_kmedoids.predict(np.ascontiguousarray(normalizar(X_t_numerico_bal).values))

## 3.5 Agglomerative

In [None]:
def comparativa_clusters_agglomerative(dataset, max_clusters, nombre_caso):
    """Run agglomerative clustering for k = 2 .. max_clusters - 1 and plot three indices.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table; it is min-max normalised with ``normalizar`` before clustering.
    max_clusters : int
        Exclusive upper bound of the sweep (``range(2, max_clusters)``).
    nombre_caso : str
        Label forwarded to the plotting helpers (titles and output file names).

    Notes
    -----
    The validity indices are scored against the raw ``dataset`` while clustering runs on
    the normalised copy. NOTE(review): confirm that this is intended.
    """
    # Normalise once; the data does not change between iterations.
    datos_norm = normalizar(dataset).values

    silhouette_arr = []
    calinski_harabasz_arr = []
    davies_bouldin_arr = []
    for k in range(2, max_clusters):
        print(k)
        # Euclidean distance is the estimator's default, so the explicit `affinity=`
        # keyword (deprecated and later removed in scikit-learn) is simply omitted.
        modelo = AgglomerativeClustering(n_clusters=k)
        predicted = modelo.fit_predict(datos_norm)
        silhouette_arr.append([k, silhouette_score(dataset, predicted)])
        calinski_harabasz_arr.append([k, calinski_harabasz_score(dataset, predicted)])
        davies_bouldin_arr.append([k, davies_bouldin_score(dataset, predicted)])

    silhouette_arr = np.array(silhouette_arr)
    calinski_harabasz_arr = np.array(calinski_harabasz_arr)
    davies_bouldin_arr = np.array(davies_bouldin_arr)

    plot_metrica(silhouette_arr, "silhouette", nombre_caso)
    plot_metrica(calinski_harabasz_arr, "calinski_harabasz", nombre_caso)
    plot_metrica(davies_bouldin_arr, "davies_bouldin_score", nombre_caso)

    plot_metricas(np.array([silhouette_arr, calinski_harabasz_arr, davies_bouldin_arr]),
                  ["silhouette", "calinski_harabasz", "davies_bouldin_score"], nombre_caso)

### 3.5.1 Primer caso (Dataset inicial numérico con categóricas One-hot)

In [None]:
# Agglomerative sweep on X_t_onehot (case 1).
comparativa_clusters_agglomerative(X_t_onehot, max_clusters, "- caso 1 - Agglomerative")

In [None]:
# Agglomerative clustering for case 1 (one-hot data) with 4 clusters.
n_clusters = 4
# Euclidean distance is the estimator's default; the `affinity=` keyword is omitted
# because it is deprecated and later removed in scikit-learn.
cluster_agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
# Each frame is clustered independently with fit_predict, so label ids are not
# comparable between the two frames.
df_predicted_train["agglomerative_df_onehot"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_onehot).values))
df_predicted_test["agglomerative_df_onehot"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_t_onehot).values))

### 3.5.2 Segundo caso (Dataset balanceado numérico con categóricas One-hot)

In [None]:
# Agglomerative sweep on X_t_onehot_bal (case 2).
comparativa_clusters_agglomerative(X_t_onehot_bal, max_clusters, "- caso 2 - Agglomerative")

In [None]:

# Agglomerative clustering for case 2; n_clusters is whatever an earlier cell last assigned.
# Euclidean distance is the estimator's default; the `affinity=` keyword is omitted
# because it is deprecated and later removed in scikit-learn.
cluster_agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
# Each frame is clustered independently with fit_predict.
df_predicted_train["agglomerative_df_onehot_balanceado"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_onehot_bal).values))
df_predicted_test["agglomerative_df_onehot_balanceado"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_t_onehot_bal).values))

### 3.5.3 Tercer caso  (Dataset inicial numérico)  

In [None]:
# Agglomerative sweep on X_t_numerico (case 3).
comparativa_clusters_agglomerative(X_t_numerico, max_clusters, "- caso 3 - Agglomerative")

In [None]:
# Agglomerative clustering for case 3; n_clusters is whatever an earlier cell last assigned.
# Euclidean distance is the estimator's default; the `affinity=` keyword is omitted
# because it is deprecated and later removed in scikit-learn.
cluster_agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
# Each frame is clustered independently with fit_predict.
df_predicted_train["agglomerative_df_numerico"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_numerico).values))
df_predicted_test["agglomerative_df_numerico"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_t_numerico).values))

### 3.5.4 Cuarto caso  (Dataset balanceado numérico) 

In [None]:
# Agglomerative sweep on X_t_numerico_bal (case 4).
comparativa_clusters_agglomerative(X_t_numerico_bal, max_clusters, "- caso 4 - Agglomerative")

In [None]:
# Agglomerative clustering for case 4; n_clusters is whatever an earlier cell last assigned.
# Euclidean distance is the estimator's default; the `affinity=` keyword is omitted
# because it is deprecated and later removed in scikit-learn.
cluster_agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
# Each frame is clustered independently with fit_predict.
df_predicted_train["agglomerative_df_numerico_balanceado"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_numerico_bal).values))
df_predicted_test["agglomerative_df_numerico_balanceado"] = cluster_agglomerative.fit_predict(np.ascontiguousarray(normalizar(X_t_numerico_bal).values))

In [None]:
# AgglomerativeClustering has no `cluster_centers` attribute (the original line raised
# AttributeError). Derive the centroids as the per-cluster mean of the data the model was
# last fitted on in the cell above: the normalised X_t_numerico_bal frame.
centroides_agglomerative = (
    normalizar(X_t_numerico_bal)
    .groupby(cluster_agglomerative.labels_)
    .mean()
    .values
)

##  3.7 Birch

In [None]:
def comparativa_clusters_birch(dataset, max_clusters, nombre_caso):
    """Run Birch for k = 2 .. max_clusters - 1 and plot three internal validity indices.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table; it is min-max normalised with ``normalizar`` before clustering.
    max_clusters : int
        Exclusive upper bound of the sweep (``range(2, max_clusters)``).
    nombre_caso : str
        Label forwarded to the plotting helpers (titles and output file names).

    Notes
    -----
    The indices are scored against the raw ``dataset`` while clustering runs on the
    normalised copy.
    """
    datos = np.ascontiguousarray(normalizar(dataset).values)

    # Metric name -> scoring function; insertion order fixes the plotting order.
    funciones = {
        "silhouette": silhouette_score,
        "calinski_harabasz": calinski_harabasz_score,
        "davies_bouldin_score": davies_bouldin_score,
    }
    puntuaciones = {nombre: [] for nombre in funciones}

    for k in range(2, max_clusters):
        print(k)
        etiquetas = Birch(n_clusters=k).fit_predict(datos)
        for nombre, funcion in funciones.items():
            puntuaciones[nombre].append([k, funcion(dataset, etiquetas)])

    curvas = {nombre: np.array(valores) for nombre, valores in puntuaciones.items()}
    for nombre, curva in curvas.items():
        plot_metrica(curva, nombre, nombre_caso)

    plot_metricas(np.array(list(curvas.values())),
                  list(curvas), nombre_caso)

### 3.7.1 Primer caso (Dataset inicial numérico con categóricas One-hot)

In [None]:
# Birch sweep on X_t_onehot (case 1).
comparativa_clusters_birch(X_t_onehot, max_clusters, "- caso 1 - Birch")

In [None]:
# Final Birch model for case 1 (one-hot data) with 4 clusters: fitted on the normalised
# X_onehot frame, then used to label both frames.
n_clusters = 4
cluster_birch = Birch(n_clusters=n_clusters)
cluster_birch.fit(np.ascontiguousarray(normalizar(X_onehot).values))
# NOTE(review): normalizar() rescales each frame with its own min/max, so the two frames
# are not scaled with the same parameters -- confirm this is intended.
df_predicted_train["birch_df_onehot"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_onehot).values))
df_predicted_test["birch_df_onehot"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_t_onehot).values))

### 3.7.2 Segundo caso (Dataset balanceado numérico con categóricas One-hot)

In [None]:
# Birch sweep on X_t_onehot_bal (case 2).
comparativa_clusters_birch(X_t_onehot_bal, max_clusters, "- caso 2 - Birch")

In [None]:
# Final Birch model for case 2; n_clusters is whatever an earlier cell last assigned.
cluster_birch = Birch(n_clusters=n_clusters)
cluster_birch.fit(np.ascontiguousarray(normalizar(X_onehot_bal).values))
df_predicted_train["birch_df_onehot_balanceado"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_onehot_bal).values))
df_predicted_test["birch_df_onehot_balanceado"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_t_onehot_bal).values))

### 3.7.3 Tercer caso  (Dataset inicial numérico) 

In [None]:
# Birch sweep on X_t_numerico (case 3).
comparativa_clusters_birch(X_t_numerico, max_clusters, "- caso 3 - Birch")

In [None]:
# Final Birch model for case 3; n_clusters is whatever an earlier cell last assigned.
cluster_birch = Birch(n_clusters=n_clusters)
cluster_birch.fit(np.ascontiguousarray(normalizar(X_numerico).values))
df_predicted_train["birch_df_numerico"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_numerico).values))
df_predicted_test["birch_df_numerico"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_t_numerico).values))

### 3.7.4 Cuarto caso  (Dataset balanceado numérico) 

In [None]:
# Birch sweep on X_t_numerico_bal (case 4).
comparativa_clusters_birch(X_t_numerico_bal, max_clusters, "- caso 4 - Birch")

In [None]:
# Final Birch model for case 4; n_clusters is whatever an earlier cell last assigned.
cluster_birch = Birch(n_clusters=n_clusters)
cluster_birch.fit(np.ascontiguousarray(normalizar(X_numerico_bal).values))
df_predicted_train["birch_df_numerico_balanceado"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_numerico_bal).values))
df_predicted_test["birch_df_numerico_balanceado"] = cluster_birch.predict(np.ascontiguousarray(normalizar(X_t_numerico_bal).values))

## 3.8  BisectingKMeans

In [None]:
def comparativa_clusters_bisectingkmeans(dataset, max_clusters, nombre_caso):
    """Fit Bisecting K-means for k = 2 .. max_clusters - 1 and plot each metric against k.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table; it is min-max normalised with ``normalizar`` before clustering.
    max_clusters : int
        Exclusive upper bound of the sweep (``range(2, max_clusters)``).
    nombre_caso : str
        Label forwarded to the plotting helpers (titles and output file names).

    Notes
    -----
    The validity indices are scored against the raw ``dataset`` while clustering runs on
    the normalised copy. NOTE(review): confirm that this is intended.
    """
    datos_norm = normalizar(dataset).values

    inertias_arr = []
    silhouette_arr = []
    calinski_harabasz_arr = []
    davies_bouldin_arr = []
    for k in range(2, max_clusters):
        print(k)
        modelo = BisectingKMeans(n_clusters=k, random_state=0)
        # Fit once, on the normalised data, and take BOTH the labels and the inertia from
        # that fit. The original read the inertia from a separate fit on the raw data,
        # so the inertia curve described a different model than the other metrics.
        predicted = modelo.fit_predict(datos_norm)
        inertias_arr.append([k, modelo.inertia_])
        silhouette_arr.append([k, silhouette_score(dataset, predicted)])
        calinski_harabasz_arr.append([k, calinski_harabasz_score(dataset, predicted)])
        davies_bouldin_arr.append([k, davies_bouldin_score(dataset, predicted)])

    inertias_arr = np.array(inertias_arr)
    silhouette_arr = np.array(silhouette_arr)
    calinski_harabasz_arr = np.array(calinski_harabasz_arr)
    davies_bouldin_arr = np.array(davies_bouldin_arr)

    plot_metrica(inertias_arr, "inercia", nombre_caso)
    plot_metrica(silhouette_arr, "silhouette", nombre_caso)
    plot_metrica(calinski_harabasz_arr, "calinski_harabasz", nombre_caso)
    plot_metrica(davies_bouldin_arr, "davies_bouldin_score", nombre_caso)

    plot_metricas(np.array([inertias_arr, silhouette_arr, calinski_harabasz_arr, davies_bouldin_arr]),
                  ["inercia", "silhouette", "calinski_harabasz", "davies_bouldin_score"], nombre_caso)

### 3.8.1 Primer caso (Dataset inicial numérico con categóricas One-hot)

In [None]:
# Bisecting K-means sweep for case 1.
# NOTE(review): the Bisecting sweeps run on the frames without "_t", whereas the
# K-means, K-medoids, Agglomerative and Birch sweeps use the X_t_* frames -- confirm.
comparativa_clusters_bisectingkmeans(X_onehot, max_clusters, "- caso 1 - Bisecting K-means")

In [None]:
# Number of clusters used by the four Bisecting K-means models below.
n_clusters = 4

In [None]:
# Final Bisecting K-means model for case 1: fitted on the normalised X_onehot frame,
# then used to label both frames.
cluster_bisectic = BisectingKMeans(n_clusters=n_clusters, random_state=0)
cluster_bisectic.fit(np.ascontiguousarray(normalizar(X_onehot).values))
df_predicted_train["bisecting_df_onehot"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_onehot).values))
df_predicted_test["bisecting_df_onehot"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_t_onehot).values))

### 3.8.2 Segundo caso (Dataset balanceado numérico con categóricas One-hot)

In [None]:
# Bisecting K-means sweep on X_onehot_bal (case 2).
comparativa_clusters_bisectingkmeans(X_onehot_bal, max_clusters, "- caso 2 - Bisecting K-means")

In [None]:
# Final Bisecting K-means model for case 2 (fit on X_onehot_bal, label both frames).
cluster_bisectic = BisectingKMeans(n_clusters=n_clusters, random_state=0)
cluster_bisectic.fit(np.ascontiguousarray(normalizar(X_onehot_bal).values))
df_predicted_train["bisecting_df_onehot_balanceado"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_onehot_bal).values))
df_predicted_test["bisecting_df_onehot_balanceado"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_t_onehot_bal).values))

### 3.8.3 Tercer caso  (Dataset inicial numérico) 

In [None]:
# Bisecting K-means sweep on X_numerico (case 3).
comparativa_clusters_bisectingkmeans(X_numerico, max_clusters, "- caso 3 - Bisecting K-means")

In [None]:
# Final Bisecting K-means model for case 3 (fit on X_numerico, label both frames).
cluster_bisectic = BisectingKMeans(n_clusters=n_clusters, random_state=0)
cluster_bisectic.fit(np.ascontiguousarray(normalizar(X_numerico).values))
df_predicted_train["bisecting_df_numerico"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_numerico).values))
df_predicted_test["bisecting_df_numerico"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_t_numerico).values))

### 3.8.4 Cuarto caso  (Dataset balanceado numérico) 

In [None]:
# Bisecting K-means sweep on X_numerico_bal (case 4).
comparativa_clusters_bisectingkmeans(X_numerico_bal, max_clusters, "- caso 4 - Bisecting K-means")

In [None]:
# Final Bisecting K-means model for case 4 (fit on X_numerico_bal, label both frames).
cluster_bisectic = BisectingKMeans(n_clusters=n_clusters, random_state=0)
cluster_bisectic.fit(np.ascontiguousarray(normalizar(X_numerico_bal).values))
df_predicted_train["bisecting_df_numerico_balanceado"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_numerico_bal).values))
df_predicted_test["bisecting_df_numerico_balanceado"] = cluster_bisectic.predict(np.ascontiguousarray(normalizar(X_t_numerico_bal).values))

In [None]:
# Cluster centres of the most recently fitted Bisecting K-means model (case 4 when the
# cells run in order); a later cell overlays them on scatter plots.
centroides_bisecting = cluster_bisectic.cluster_centers_

### Guardamos los resultados obtenidos 

In [None]:
# Write both label tables to CSV (without the index column).
df_predicted_train.to_csv("dataset_diabetes/df_predicted_train.csv", index = False)
df_predicted_test.to_csv("dataset_diabetes/df_predicted_test.csv", index = False)

In [None]:
# Reload the label tables from the CSV files written by the previous cell -- presumably
# so the analysis section can run without refitting every model.
df_predicted_train = pd.read_csv("dataset_diabetes/df_predicted_train.csv")
df_predicted_test = pd.read_csv("dataset_diabetes/df_predicted_test.csv")

# 4. Análisis de resultados

In [None]:
def mostrar_plot(columna_1, columna_2, data_label, X_data, name):
    """Scatter two columns of ``X_data`` against each other, coloured by ``data_label``.

    Parameters
    ----------
    columna_1, columna_2 : str
        Names of the columns drawn on the x and y axes.
    data_label : array-like
        Per-row values passed as the scatter colour argument.
    X_data : pandas.DataFrame
        Table holding both columns.
    name : str
        Figure title.

    Side effects
    ------------
    Overwrites the global matplotlib ``figure.figsize`` and ``font.size`` settings and
    opens a new figure; nothing is written to disk.
    """
    plt.rcParams.update({'figure.figsize': (10, 10), 'font.size': 14})

    figura = plt.figure()
    ejes = figura.add_subplot()
    ejes.set_title(name)
    ejes.scatter(X_data[columna_1], X_data[columna_2], c=data_label)
    ejes.set_xlabel(columna_1)
    ejes.set_ylabel(columna_2)
    ejes.grid(visible=True)

In [None]:
# Split the prediction column names of df_predicted_test into one list per dataset
# variant, based on the substring each name contains.
columnas_df_onehot = []
columnas_df_onehot_bal = []
columnas_df_numerico = []
columnas_df_numerico_bal = []

# Markers are tested in this order; "df_onehot_bal" must come before "df_onehot"
# because the latter is a substring of the former.
_destinos_por_marca = (
    ("df_onehot_bal", columnas_df_onehot_bal),
    ("df_onehot", columnas_df_onehot),
    ("df_numerico_bal", columnas_df_numerico_bal),
)

for col in df_predicted_test.columns:
    for _marca, _destino in _destinos_por_marca:
        if _marca in col:
            _destino.append(col)
            break
    else:
        # No marker matched: everything else counts as the plain numeric variant.
        columnas_df_numerico.append(col)

## 4.3 Análisis de métricas 

In [None]:
def print_metricas(nombre_caso, metricas):
    """Print six metric values, one per algorithm, rounded to two decimals.

    Parameters
    ----------
    nombre_caso : str
        Prefix printed at the start of every line.
    metricas : sequence of numbers
        Values read by position 0..5 and printed under the labels kmeans, SOM,
        kmedoids, agglomerative, birch and bisecting, in that order.
    """
    etiquetas = (" kmeans: ", " SOM: ", " kmedoids: ", " agglomerative: ", " birch: ", " bisecting: ")

    print("\n===============================================================")
    for posicion, etiqueta in enumerate(etiquetas):
        print(nombre_caso + etiqueta + str(round(metricas[posicion], 2)))
    print("===============================================================")

In [None]:
def obtener_metricas(dataset, predicted, nombre_caso):
    """Score every label column of ``predicted`` against ``dataset`` and print the results.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table the labels refer to (one row per label).
    predicted : pandas.DataFrame
        One column of cluster labels per algorithm. ``print_metricas`` reads the results
        by position 0..5, so six columns are expected, in the order kmeans, SOM,
        kmedoids, agglomerative, birch, bisecting.
    nombre_caso : str
        Prefix for the printed lines.

    Returns
    -------
    dict
        Keys ``"silhouette"``, ``"calinski_harabasz"`` and ``"davies_bouldin"``, each
        mapping to the list of scores in ``predicted`` column order. The original built
        this dictionary but discarded it.
    """
    metricas_silhouette = []
    metricas_calinski_harabasz = []
    metricas_davies_bouldin = []

    for col in predicted.columns:
        metricas_silhouette.append(silhouette_score(dataset, predicted[col]))
        metricas_calinski_harabasz.append(calinski_harabasz_score(dataset, predicted[col]))
        metricas_davies_bouldin.append(davies_bouldin_score(dataset, predicted[col]))

    metricas = {
        "silhouette": metricas_silhouette,
        "calinski_harabasz": metricas_calinski_harabasz,
        "davies_bouldin": metricas_davies_bouldin,
    }

    print_metricas(nombre_caso + " silhouette", metricas_silhouette)
    print_metricas(nombre_caso + " calinski_harabasz", metricas_calinski_harabasz)
    print_metricas(nombre_caso + " davies_bouldin", metricas_davies_bouldin)

    return metricas

In [None]:
# Validity scores of the case-1 labels stored in df_predicted_test.
obtener_metricas(X_t_onehot, df_predicted_test[columnas_df_onehot], "caso 1")

In [None]:
# Validity scores of the case-2 labels stored in df_predicted_test.
obtener_metricas(X_t_onehot_bal, df_predicted_test[columnas_df_onehot_bal], "caso 2")

In [None]:
# Validity scores of the case-3 labels stored in df_predicted_test.
obtener_metricas(X_t_numerico, df_predicted_test[columnas_df_numerico], "caso 3")

In [None]:
# Validity scores of the case-4 labels stored in df_predicted_test.
obtener_metricas(X_t_numerico_bal, df_predicted_test[columnas_df_numerico_bal], "caso 4")

In [None]:
# Validity scores of the case-1 labels stored in df_predicted_train (the column names
# were collected from df_predicted_test).
obtener_metricas(X_onehot, df_predicted_train[columnas_df_onehot], "caso 1")

In [None]:
# Validity scores of the case-2 labels stored in df_predicted_train.
obtener_metricas(X_onehot_bal, df_predicted_train[columnas_df_onehot_bal], "caso 2")

In [None]:
# Validity scores of the case-3 labels stored in df_predicted_train.
obtener_metricas(X_numerico, df_predicted_train[columnas_df_numerico], "caso 3")

In [None]:
# Validity scores of the case-4 labels stored in df_predicted_train.
obtener_metricas(X_numerico_bal, df_predicted_train[columnas_df_numerico_bal], "caso 4")

## 4.1 Dendrogramas 

In [None]:
def mostrar_dendrograma(dataset, nombre):
    """Draw, save and show a truncated Ward-linkage dendrogram of ``dataset``.

    Parameters
    ----------
    dataset : array-like or pandas.DataFrame
        Observations passed to ``scipy.cluster.hierarchy.linkage``.
    nombre : str
        Appended to the figure title and to the output file name.

    Side effects
    ------------
    Overwrites the global matplotlib ``figure.figsize`` and ``font.size`` settings and
    writes ``plots/metricas/dendrograma_<nombre>`` (the directory must exist).
    """
    # Configure size and font BEFORE drawing and open a fresh figure. The original
    # changed these settings after the dendrogram was already drawn, so they only took
    # effect for the next figure.
    rcParams['figure.figsize'] = 40, 15
    plt.rcParams.update({'font.size': 30})
    plt.figure()

    # Only the top 3 levels of the tree are shown, without leaf labels.
    sch.dendrogram(sch.linkage(dataset, method = 'ward'), p=3, truncate_mode = 'level', no_labels= True)
    plt.title('Dendrograma ' + nombre)
    plt.xlabel('Registros')
    plt.ylabel('Distancias Euclidianas')
    plt.savefig("plots/metricas/dendrograma_" +nombre)
    plt.show()

In [None]:
# Ward dendrogram of the normalised X_t_onehot frame.
mostrar_dendrograma(normalizar(X_t_onehot), "df_onehot")

In [None]:
# Ward dendrogram of the normalised X_t_onehot_bal frame.
mostrar_dendrograma(normalizar(X_t_onehot_bal), "df_onehot_balanceado")

In [None]:
# Ward dendrogram of the normalised X_t_numerico frame.
mostrar_dendrograma(normalizar(X_t_numerico), "df_numerico")

In [None]:
# Ward dendrogram of the normalised X_t_numerico_bal frame.
mostrar_dendrograma(normalizar(X_t_numerico_bal), "df_numerico_balanceado")

## 4.2 Gráficas clústeres algoritmos 

In [None]:
def mostrar_plot_centroides(columna_1, columna_2, i1, i2,data_label, X_data, name, centroides):
    """Scatter two columns coloured by cluster label and overlay the cluster centres.

    Parameters
    ----------
    columna_1, columna_2 : str
        Names of the ``X_data`` columns drawn on the x and y axes.
    i1, i2 : int
        Column positions of those same features inside ``centroides``.
    data_label : array-like
        Per-row values passed as the scatter colour argument.
    X_data : pandas.DataFrame
        Table holding both columns.
    name : str
        Figure title; also the suffix of the output file name.
    centroides : 2-D numpy array
        One row per cluster centre, one column per feature.

    Side effects
    ------------
    Overwrites the global matplotlib ``figure.figsize`` and ``font.size`` settings and
    writes ``plots/medoides/<columna_1>_<columna_2>_<name>`` (the directory must exist).
    """
    paleta = ['orange', 'lime', 'red', 'black']
    # One colour per centre, cycling through the palette. The original passed the fixed
    # four-colour list, which only works when there are exactly four centres.
    colors = [paleta[k % len(paleta)] for k in range(len(centroides))]

    rcParams['figure.figsize'] = 10, 10
    plt.rcParams.update({'font.size': 14})

    plt.figure()
    plt.title(name)
    plt.scatter(X_data[columna_1], X_data[columna_2], c=data_label)
    plt.scatter(centroides[:,i1], centroides[:,i2], marker='*', s=150, edgecolor='black', c = colors)
    plt.xlabel(columna_1)
    plt.ylabel(columna_2)
    plt.grid(visible=True)
    plt.savefig("plots/medoides/"+columna_1+"_"+columna_2+"_"+name)
    plt.show()
   

In [None]:
# Plot every pair of X_t_numerico_bal columns once, coloured by the case-4 Bisecting
# K-means labels, with the Bisecting cluster centres overlaid.
columnas = X_t_numerico_bal.columns
# Names of columns already used as the x axis; a pair is skipped when its y column is
# in this list, which avoids drawing the mirrored pair (b, a) after (a, b).
cols = []
for i in range(len(columnas)):
    for j in range(len(columnas)):
        if(i != j and columnas[j] not in cols):
            mostrar_plot_centroides(columnas[i], columnas[j], i, j, 
                                    df_predicted_test["bisecting_df_numerico_balanceado"], normalizar(X_t_numerico_bal), "bisecting - caso 4", centroides_bisecting)
            # Appended once per plotted pair, so `cols` accumulates repeated names; only
            # membership is ever checked.
            cols.append(columnas[i])

In [None]:
def mostrar_plots_cluster(df_predicted, X_data, nombre_caso):
    """For every pair of feature columns, draw one scatter panel per algorithm.

    Parameters
    ----------
    df_predicted : pandas.DataFrame
        One column of cluster labels per algorithm; the column name is the panel title.
    X_data : pandas.DataFrame
        Feature table whose columns are plotted pairwise (each unordered pair once).
    nombre_caso : str
        Suffix of the output file names.

    Side effects
    ------------
    Overwrites the global matplotlib ``font.size`` setting and writes one file per column
    pair to ``plots/clusteres/<col_x>_<col_y>_<nombre_caso>`` (the directory must exist).
    """
    columns = X_data.columns
    algoritmos = list(df_predicted.columns)
    cols = 2
    # Ceiling division: enough rows for every algorithm. round() rounds halves to the
    # nearest even number, so e.g. 5 algorithms produced only 2 rows.
    rows = max(1, -(-len(algoritmos) // cols))
    # Set the font before the figures are created so the first one uses it as well.
    plt.rcParams.update({'font.size': 30})

    for column in range(len(columns)):
        # Starting at column + 1 visits each unordered pair exactly once.
        for column2 in range(column + 1, len(columns)):
            # squeeze=False keeps `axs` two-dimensional even when there is a single row.
            fig, axs = plt.subplots(rows, cols, figsize = (45,20*rows), squeeze=False)
            # axs.flat walks the grid row by row, matching the original panel order.
            for panel, algoritmo in zip(axs.flat, algoritmos):
                panel.set(title= algoritmo)
                panel.scatter(X_data[columns[column]], X_data[columns[column2]], s=400, c=df_predicted[algoritmo], alpha=0.2)
                panel.set(xlabel= columns[column])
                panel.set(ylabel = columns[column2])
                panel.grid()

            plt.savefig("plots/clusteres/"+columns[column]+"_"+columns[column2]+"_"+nombre_caso)
            plt.show()
            # Release the figure; one is created per column pair and they would
            # otherwise accumulate in memory.
            plt.close(fig)

                #plt.close()

In [None]:
def grafico_barras_unico_clusters(columna, df, etiquetas):
    """Draw one histogram of ``columna`` per cluster, stacked vertically.

    Parameters
    ----------
    columna : str
        Name of the column of ``df`` to plot.
    df : pandas.DataFrame
        Must contain ``columna`` and a ``"label"`` column; rows are selected
        per cluster with ``df["label"] == cluster``.
    etiquetas : pandas.Series
        Labels whose distinct values decide which clusters are drawn.

    Notes
    -----
    Updates the global matplotlib ``font.size`` to 15. The figure is neither
    saved nor explicitly shown here.
    """
    # Sorted so each panel position follows the label order. Panels are
    # addressed by position, not by label value, so labels need not be
    # 0..k-1.
    clusters = np.sort(etiquetas.unique())
    print(clusters)
    if len(clusters) == 0:
        # No labels, so no panel can be created.
        return

    # Set before the figure is created so it applies to this figure too.
    plt.rcParams.update({'font.size': 15})
    # squeeze=False always returns an array of axes, even for one cluster.
    fig, ax = plt.subplots(len(clusters), figsize=(9, 20), squeeze=False)

    for panel, cluster in zip(ax.ravel(), clusters):
        panel.hist(df[df["label"] == cluster][columna], bins=10, edgecolor='black')
        panel.set(title=columna+" cluster " + str(cluster))
        panel.grid(True)

    fig.tight_layout()
    # Saving is disabled; to enable it:
    # plt.savefig("plots/"+"hist_"+columna+"_clusters")

In [None]:
# Case 1: feature-pair scatter grids of X_t_onehot, coloured with the label
# columns selected by columnas_df_onehot.
mostrar_plots_cluster(
    df_predicted=df_predicted_test[columnas_df_onehot],
    X_data=X_t_onehot,
    nombre_caso="caso_1",
)

In [None]:
# Case 2: feature-pair scatter grids of X_t_onehot_bal, coloured with the
# label columns selected by columnas_df_onehot_bal.
mostrar_plots_cluster(
    df_predicted=df_predicted_test[columnas_df_onehot_bal],
    X_data=X_t_onehot_bal,
    nombre_caso="caso_2",
)

In [None]:
# Case 3: feature-pair scatter grids of X_t_numerico, coloured with the label
# columns selected by columnas_df_numerico.
mostrar_plots_cluster(
    df_predicted=df_predicted_test[columnas_df_numerico],
    X_data=X_t_numerico,
    nombre_caso="caso_3",
)

In [None]:
# Case 4: scatter grids coloured with the label columns selected by
# columnas_df_numerico_bal.
# NOTE(review): cases 1-3 plot an X_t_* frame, but this call plots the columns
# of Y_t_numerico_bal. Confirm X_t_numerico_bal was not intended.
mostrar_plots_cluster(
    df_predicted=df_predicted_test[columnas_df_numerico_bal],
    X_data=Y_t_numerico_bal,
    nombre_caso="caso_4",
)

In [None]:
# Case 4 again, restricted to three algorithms.
# NOTE(review): plots the columns of Y_t_numerico_bal (not X_t_numerico_bal)
# and reuses the suffix "caso_4", so it writes to the same file names as any
# earlier "caso_4" run. Confirm both are intended.
mostrar_plots_cluster(
    df_predicted=df_predicted_test[[
        "agglomerative_df_numerico_balanceado",
        "bisecting_df_numerico_balanceado",
        "kmedoids_df_numerico_balanceado",
    ]],
    X_data=Y_t_numerico_bal,
    nombre_caso="caso_4",
)

In [None]:
# num_lab_procedures vs num_medications, using the "kmeans" label column.
# NOTE(review): X_train and the "kmeans" column are not created in the visible
# part of the notebook. Confirm they exist after Restart & Run All.
mostrar_plot(
    "num_lab_procedures",
    "num_medications",
    df_predicted_train["kmeans"],
    X_train,
    "kmeans",
)

In [None]:
# num_lab_procedures vs num_medications, using the "som" label column.
# NOTE(review): X_train and the "som" column are not created in the visible
# part of the notebook. Confirm they exist after Restart & Run All.
mostrar_plot(
    "num_lab_procedures",
    "num_medications",
    df_predicted_train["som"],
    X_train,
    "som",
)

In [None]:
# num_lab_procedures vs num_medications, using Y_train as the label argument.
# NOTE(review): the last argument is "kmeans" although the labels come from
# Y_train, and X_train / Y_train are not created in the visible part of the
# notebook. Confirm both.
mostrar_plot(
    "num_lab_procedures",
    "num_medications",
    Y_train,
    X_train,
    "kmeans",
)

### 4.3.2 Gráfico de barras de los clústeres

In [None]:
def grafico_barras_conjunto(df, nombre):
    """Draw a grid of histograms, one per column of ``df``, and save it.

    The grid has three panels per row. The figure is written to
    ``plots/clusteres/histograma_clusteres<nombre>``, shown and then closed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; every column gets its own 10-bin histogram.
    nombre : str
        Suffix appended to the output file name.

    Notes
    -----
    Updates the global matplotlib ``font.size`` to 15. Nothing is drawn or
    saved when ``df`` has no columns.
    """
    columnas = df.columns
    if len(columnas) == 0:
        # A grid with zero rows cannot be created.
        return

    cols = 3
    # Ceiling division so that every column gets a panel; round() produced
    # too few rows (e.g. 1 row for 4 columns) and 0 rows for 1 column.
    rows = (len(columnas) + cols - 1) // cols

    plt.rcParams.update({'font.size': 15})
    # squeeze=False keeps a 2-D axes array even when rows == 1.
    fig, axs = plt.subplots(rows, cols, figsize=(cols*8, rows*8), squeeze=False)
    paneles = axs.ravel()  # row-major order: panel n is at (n // cols, n % cols)

    for panel, columna in zip(paneles, columnas):
        panel.hist(df[columna], bins=10, edgecolor='black')
        panel.set(title=columna)
        panel.tick_params(labelrotation=45)

    # Hide the cells of the last row that have no column to show.
    for panel in paneles[len(columnas):]:
        panel.set_visible(False)

    fig.tight_layout()
    plt.savefig("plots/clusteres/histograma_clusteres"+nombre)
    plt.show()
    # Release the figure; callers invoke this function once per cluster.
    plt.close(fig)

In [None]:
def mostrar_grafico_barras_clusteres(df, predicted, name):
    """Draw the histogram grid of ``df`` separately for every cluster.

    Clusters are processed in ascending label order. For each one a header
    line is printed and ``grafico_barras_conjunto`` is called with the rows
    of that cluster and the suffix ``<name>_cluster_<label>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is not modified.
    predicted : array-like or pandas.Series
        Cluster label of every row of ``df``.
    name : str
        Prefix used to build the suffix of each saved figure.
    """
    # assign() works on a copy, so the caller's frame does not end up with an
    # extra "predicted" column.
    etiquetado = df.assign(predicted=predicted)

    for cluster in np.sort(etiquetado["predicted"].unique()):
        print("Histograma para el cluster "+str(cluster))

        miembros = etiquetado[etiquetado["predicted"] == cluster]
        # The label column is dropped so it does not get its own histogram.
        grafico_barras_conjunto(miembros.drop(columns=['predicted']), name+"_cluster_"+str(cluster))

In [None]:
# Per-cluster histograms for the bisecting k-means labels of the train split.
# NOTE(review): the histograms are built from Y_numerico_bal, not from
# X_numerico_bal. Confirm this is the intended frame.
mostrar_grafico_barras_clusteres(
    df=Y_numerico_bal,
    predicted=df_predicted_train["bisecting_df_numerico_balanceado"],
    name='bisecting_df_numerico_balanceado',
)

In [None]:
# Per-cluster histograms for the agglomerative labels of the train split.
# NOTE(review): the histograms are built from Y_numerico_bal, not from
# X_numerico_bal. Confirm this is the intended frame.
mostrar_grafico_barras_clusteres(
    df=Y_numerico_bal,
    predicted=df_predicted_train["agglomerative_df_numerico_balanceado"],
    name="agglomerative_df_numerico_balanceado",
)

## 4.4 Descripción breve de cada caso 

In [None]:
def mostrar_descripcion_clusteres(dataframe, predicted, nombre_caso):
    """Print ``describe()`` statistics for every cluster of every algorithm.

    For each column of ``predicted`` the labels are attached to a working copy
    of ``dataframe``. For each distinct label, in order of first appearance,
    a ``<label>_<column>_<nombre_caso>`` header, the ``describe()`` table of
    the matching rows (label column included) and a separator are printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to summarise. It is copied, not modified.
    predicted : pandas.DataFrame
        One column of cluster labels per algorithm.
    nombre_caso : str
        Text appended to every printed header.
    """
    separador = "\n=====================================================================\n"
    trabajo = dataframe.copy(deep=True)

    for algoritmo in predicted.columns:
        etiquetas = predicted[algoritmo]
        trabajo[algoritmo] = etiquetas

        for etiqueta in etiquetas.unique():
            print("_".join([str(etiqueta), algoritmo, nombre_caso]))
            seleccion = trabajo.loc[trabajo[algoritmo] == etiqueta]
            print(seleccion.describe())
            print(separador)
            # PNG export is disabled; to enable it:
            # dfi.export(seleccion.drop(algoritmo, axis=1).describe(), "plots/descipcion_clusteres/"+str(etiqueta)+"_"+algoritmo+"_"+nombre_caso+".png")

        # Remove this algorithm's labels before attaching the next ones.
        trabajo = trabajo.drop(columns=algoritmo)

In [None]:
def obtener_descripcion_comparativas(dataframe, predicted, col, nombre_caso):
    """Print, per feature, its ``describe()`` statistics side by side per cluster.

    For every column of ``dataframe`` one table is printed whose columns are
    named ``<feature>_<label + 1>``, ordered by ascending cluster label, and
    whose rows are the ``describe()`` statistics of that feature inside the
    cluster. Columns that ``describe()`` leaves out print as an empty table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to summarise. It is copied, not modified.
    predicted : pandas.DataFrame
        Frame holding the cluster labels; only ``predicted[col]`` is read.
    col : str
        Name of the label column to use. Labels must support ``label + 1``.
    nombre_caso : str
        Only referenced by the disabled PNG export below.
    """
    df_conjunto = dataframe.copy(deep=True)
    df_conjunto[col] = predicted[col]

    # feature name -> {output column name -> describe() series}
    estadisticas = {c: {} for c in dataframe.columns}

    # Iterating in ascending label order keeps x_2 before x_10, which sorting
    # the generated names as strings did not.
    for cluster in sorted(predicted[col].unique()):
        miembros = df_conjunto[df_conjunto[col] == cluster]
        # describe() is computed once per cluster instead of once per column.
        # The label column is removed by name rather than by position.
        descripcion = miembros.describe().drop(columns=col, errors="ignore")
        for c in descripcion.columns:
            estadisticas[c][str(c)+"_"+str(cluster+1)] = descripcion[c]

    for c in dataframe.columns:
        print(pd.DataFrame(estadisticas[c]))
        # PNG export is disabled; to enable it:
        # dfi.export(pd.DataFrame(estadisticas[c]), "plots/descipcion_clusteres/"+c+"_"+nombre_caso+".png")


In [None]:
# Case 4: describe() tables of X_t_numerico_bal for every cluster of the
# agglomerative and bisecting label columns.
mostrar_descripcion_clusteres(
    dataframe=X_t_numerico_bal,
    predicted=df_predicted_test[[
        'agglomerative_df_numerico_balanceado',
        'bisecting_df_numerico_balanceado',
    ]],
    nombre_caso="caso 4",
)

In [None]:
# Case 4: per-feature comparison of the agglomerative clusters of
# X_t_numerico_bal.
obtener_descripcion_comparativas(
    dataframe=X_t_numerico_bal,
    predicted=df_predicted_test,
    col='agglomerative_df_numerico_balanceado',
    nombre_caso="caso 4_agglomerative",
)

In [None]:
"""
Enlaces de interés:

https://scikit-learn-extra.readthedocs.io/en/stable/auto_examples/cluster/plot_clustering.html#sphx-glr-auto-examples-cluster-plot-clustering-py
https://scikit-learn.org/stable/modules/clustering.html
https://colab.research.google.com/github/jumafernandez/BDM/blob/master/Guias/Guia_Clustering.ipynb#scrollTo=zBwQLD4aVXoJ
https://sci-hub.hkvisa.net/10.1007/s13755-018-0054-0
https://www.sngular.com/es/data-science-crisp-dm-metodologia/
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
https://som-learn.readthedocs.io/en/latest/somlearn.html
https://scikit-learn-extra.readthedocs.io/en/stable/generated/sklearn_extra.cluster.KMedoids.html
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
https://jarroba.com/seleccion-del-numero-optimo-clusters/
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
https://scikit-learn.org/stable/auto_examples/cluster/plot_bisect_kmeans.html#sphx-glr-auto-examples-cluster-plot-bisect-kmeans-py
https://scikit-learn.org/stable/modules/model_evaluation.html
https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.davies_bouldin_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.calinski_harabasz_score.html#sklearn.metrics.calinski_harabasz_score
https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
"""