In [1]:
# K-Modes
from kmodes.kmodes import KModes

# DBSCAN
from sklearn.cluster import DBSCAN

# ROCK
from pyclustering.cluster.rock import rock


# Para visualização e manipulação 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import umap
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import squareform


# Para avaliação
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Location of the factors dataset.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs on a machine where this exact file exists.
CAMINHO_BASE_FATORES = r'C:\Users\maype\Desktop\projetos\Trabalho Prático AM2\data\base_fatores.csv'

# Load the dataset into the frame used by every cell below.
fatores_df = pd.read_csv(CAMINHO_BASE_FATORES)

In [16]:
# K-Modes
# Build the feature matrix from the original columns only: the diagnosis is the
# reference label, and any `cluster_*` column written by this or another
# clustering cell must not be fed back in as a feature. `errors='ignore'` keeps
# the cell idempotent on a fresh kernel, where those columns do not exist yet.
fatores_kmodes = (
    fatores_df
    .drop('diagnostico_hipertensao', axis=1)
    .drop(columns=['cluster_kmodes', 'cluster_dbscan', 'cluster_rock'], errors='ignore')
)

# Fixed seed so the Huang initialisation (and therefore the clustering) is
# reproducible across runs; 42 matches the seed used for UMAP in this notebook.
k_modes = KModes(n_clusters=3, init='Huang', n_init=10, verbose=1, random_state=42)
clusters_kmodes = k_modes.fit_predict(fatores_kmodes)

# Store the resulting labels on the DataFrame
fatores_df['cluster_kmodes'] = clusters_kmodes


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 6, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 7, iteration: 1/100, moves: 0, cost: 465439.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 8, ite

In [17]:
# DBSCAN
# Build the feature matrix from the original columns only. Dropping just the
# diagnosis is not enough: earlier cells add `cluster_*` label columns to
# `fatores_df`, and clustering on them would leak another model's output into
# this one. `errors='ignore'` tolerates label columns that do not exist yet.
fatores_dbscan = (
    fatores_df
    .drop('diagnostico_hipertensao', axis=1)
    .drop(columns=['cluster_kmodes', 'cluster_dbscan', 'cluster_rock'], errors='ignore')
)

dbscan = DBSCAN(eps=0.5, min_samples=5)  # tune eps and min_samples as needed
clusters_dbscan = dbscan.fit_predict(fatores_dbscan)

# Store the resulting labels on the DataFrame
fatores_df['cluster_dbscan'] = clusters_dbscan


In [None]:
# Remove the diagnosis column AND any cluster-label columns added by other
# cells, so that ROCK and UMAP only see the original factors (otherwise the
# K-Modes / DBSCAN labels would be used as features).
fatores_df_sem_diagnostico = (
    fatores_df
    .drop('diagnostico_hipertensao', axis=1)
    .drop(columns=['cluster_kmodes', 'cluster_dbscan', 'cluster_rock'], errors='ignore')
)

# Convert to the plain numeric list-of-lists passed to pyclustering:
# every column is re-encoded as integer category codes.
fatores_numericos = (
    fatores_df_sem_diagnostico
    .apply(lambda coluna: coluna.astype('category').cat.codes)
    .values
    .tolist()
)

# Instantiate and run the ROCK algorithm
rock_instance = rock(fatores_numericos, eps=0.5, number_clusters=3, threshold=0.5)
rock_instance.process()

# Retrieve the clusters; the loop below treats each one as a list of row indices
clusters_rock = rock_instance.get_clusters()

# Reduce dimensionality with UMAP for visualisation
umap_2d = umap.UMAP(n_components=2, random_state=42)
fatores_umap = umap_2d.fit_transform(fatores_df_sem_diagnostico)

# Flatten the per-cluster index lists into one label per row.
# Rows that appear in no cluster keep the placeholder label -1.
cluster_labels = [-1] * len(fatores_numericos)
for cluster_id, cluster_points in enumerate(clusters_rock):
    for point_index in cluster_points:
        cluster_labels[point_index] = cluster_id

In [None]:
# Apply UMAP to reduce the factors to 2 dimensions.
# The feature frame is built here from `fatores_df` (the original code read an
# undefined name, `fatores_sem_diagnostico`, and raised NameError). The diagnosis
# and any cluster-label columns are excluded so the embedding reflects only the
# original factors.
fatores_sem_diagnostico = (
    fatores_df
    .drop('diagnostico_hipertensao', axis=1)
    .drop(columns=['cluster_kmodes', 'cluster_dbscan', 'cluster_rock'], errors='ignore')
)
umap_2d = umap.UMAP(n_components=2, random_state=42)
fatores_umap = umap_2d.fit_transform(fatores_sem_diagnostico)

# Visualizando os clusters gerados pelos diferentes modelos
def plot_clusters(data, labels, title):
    """Show a scatter plot of a 2-D embedding coloured by cluster label.

    Parameters
    ----------
    data : 2-D array indexed as ``data[:, 0]`` (x) and ``data[:, 1]`` (y).
    labels : per-point values passed to the scatter's ``c`` argument.
    title : text used as the figure title.

    Returns
    -------
    None. The figure is displayed via ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    pontos = ax.scatter(data[:, 0], data[:, 1], c=labels, cmap='Spectral', s=30, alpha=0.7)
    fig.colorbar(pontos, ax=ax, label='Clusters')
    ax.set_title(title)
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    plt.show()

# Plot the clusters produced by each model on the shared UMAP embedding.
# ROCK is not plotted: `clusters_rock` is a list of per-cluster index lists,
# not one label per point, so it cannot be passed as `labels` directly.
plot_clusters(data=fatores_umap, labels=clusters_kmodes, title='Clusters K-Modes com UMAP')
plot_clusters(data=fatores_umap, labels=clusters_dbscan, title='Clusters DBSCAN com UMAP')

In [None]:
# Per-cluster means of the factors (K-Modes).
# The other algorithms' label columns are removed first: averaging cluster ids
# is meaningless and would clutter the summary. The diagnosis column is kept.
cluster_analysis_kmodes = (
    fatores_df
    .drop(columns=['cluster_dbscan', 'cluster_rock'], errors='ignore')
    .groupby('cluster_kmodes')
    .mean()
)
print(cluster_analysis_kmodes)

# Same summary for DBSCAN
cluster_analysis_dbscan = (
    fatores_df
    .drop(columns=['cluster_kmodes', 'cluster_rock'], errors='ignore')
    .groupby('cluster_dbscan')
    .mean()
)
print(cluster_analysis_dbscan)

# ROCK: no summary is produced because no visible cell writes a `cluster_rock`
# column to `fatores_df`.

In [None]:
def calcular_pureza(y_true, clusters):
    """Compute the purity of a clustering against reference labels.

    A contingency table (rows = values of ``y_true``, columns = values of
    ``clusters``) is built with ``pd.crosstab``. For every cluster the count of
    its most frequent reference label is taken, and those counts are summed and
    divided by the total number of observations in the table.

    Parameters
    ----------
    y_true : reference labels, one per observation.
    clusters : cluster assignments, one per observation.

    Returns
    -------
    The ratio described above (1.0 when every cluster holds a single label).
    """
    tabela = pd.crosstab(y_true, clusters).values
    # Column-wise maximum: size of the dominant reference label in each cluster.
    dominante_por_cluster = tabela.max(axis=0)
    return dominante_por_cluster.sum() / tabela.sum()

# Reference labels shared by every purity computation below
rotulos_reais = fatores_df['diagnostico_hipertensao']

# Purity of the K-Modes clustering
pureza_kmodes = calcular_pureza(rotulos_reais, clusters_kmodes)
print(f'Pureza K-Modes: {pureza_kmodes}')

# Purity of the DBSCAN clustering
pureza_dbscan = calcular_pureza(rotulos_reais, clusters_dbscan)
print(f'Pureza DBSCAN: {pureza_dbscan}')

# ROCK purity is not computed: `clusters_rock` holds per-cluster index lists,
# not one label per observation as `calcular_pureza` expects.