In [None]:
!pip install scikit-learn-extra

In [4]:
import numpy as np
import pandas as pd
import time
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import pairwise_distances_argmin
from sklearn_extra.cluster import KMedoids
from numba import jit
from numba import njit
import os

In [5]:
@jit(nopython=True)
def minkowskiDistance(a, b, p):
    """Return the Minkowski distance of order ``p`` between ``a`` and ``b``.

    The value is ``(sum_i |a_i - b_i| ** p) ** (1 / p)``.
    """
    gap = np.abs(a - b)
    powered_sum = np.sum(gap ** p)
    return powered_sum ** (1 / p)

In [6]:
@jit(nopython=True)
def distanceMatrix(points, p):
  """Build the pairwise Minkowski distance matrix of ``points``.

  Parameters
  ----------
  points : 2-D array with one point per row (label column already removed).
  p : order of the Minkowski distance passed on to ``minkowskiDistance``.

  Returns
  -------
  A ``(n, n)`` float array where entry ``[i, j]`` is the distance between
  row ``i`` and row ``j``.
  """
  size = points.shape[0]
  distance = np.zeros((size, size))
  for i in range(size):
    # |a - b| == |b - a|, so each unordered pair is evaluated once and mirrored.
    # j starts at i so the diagonal is still computed rather than assumed.
    for j in range(i, size):
      pair_distance = minkowskiDistance(points[i], points[j], p)
      distance[i, j] = pair_distance
      distance[j, i] = pair_distance
  return distance

In [7]:
@jit(nopython=True)
def KMeans1Aux(points, distance_matrix, r):
    """Greedily pick centers so that every point lies closer than ``r`` to one.

    Repeatedly draws a random still-uncovered point as a new center and marks
    as covered every uncovered point whose distance to it is below ``r``.

    Parameters
    ----------
    points : 2-D array, one point per row.
    distance_matrix : square array with the pairwise distances of ``points``.
    r : covering radius; points at distance ``>= r`` from the new center stay
        uncovered.

    Returns
    -------
    2-D float array holding only the centers that were actually selected.
    """
    num_points = len(points)
    centers = np.empty((num_points, points.shape[1]))
    remaining_points = np.ones(num_points, dtype=np.bool_)
    count = 0

    while np.any(remaining_points):
        center_idx = np.random.choice(np.where(remaining_points)[0])
        centers[count] = points[center_idx]
        distances = distance_matrix[center_idx, remaining_points]
        remaining_points[remaining_points] = distances >= r
        # The chosen point is always covered by itself. Without this, r <= 0
        # would keep it "remaining" forever and write past the end of `centers`.
        remaining_points[center_idx] = False
        count += 1
    return centers[:count]  # only the centers that were instantiated

In [8]:
@jit(nopython=True)
def KMeans1(k, points, distance_matrix):
  """Approximate k-center by bisecting on the covering radius.

  The radius is searched in ``[0, max(distance_matrix)]``; each probe runs
  ``KMeans1Aux`` and the interval shrinks towards the smallest radius for
  which at most ``k`` centers were produced.

  Parameters
  ----------
  k : maximum number of centers wanted.
  points : 2-D array, one point per row.
  distance_matrix : square array with the pairwise distances of ``points``.

  Returns
  -------
  2-D float array with at most ``k`` centers (assuming ``k >= 1``).
  """
  rmax = np.max(distance_matrix)
  low = 0.0
  high = rmax
  best_centers = np.empty((0, points.shape[1]))
  found = False

  while high - low > 0.000001:
    mid = (high + low) / 2
    candidate = KMeans1Aux(points, distance_matrix, mid)
    if len(candidate) <= k:
      high = mid
      # Remember the feasible solution: the last probe of the search may be
      # an infeasible one (more than k centers) and must not be returned.
      best_centers = candidate
      found = True
    else:
      low = mid

  if not found:
    # Either the loop never ran (all distances ~0) or no probe was feasible.
    # A radius above every pairwise distance covers all points with the first
    # center drawn, which yields exactly one center.
    best_centers = KMeans1Aux(points, distance_matrix, rmax + 1.0)

  return best_centers

In [9]:
@jit(nopython=True)
def KMeans2(k, points, distance_matrix):
    """Greedy farthest-first approximation for k-center.

    Starts from a random point and repeatedly adds the point whose distance
    to its nearest already-chosen center is largest.

    Parameters
    ----------
    k : number of centers wanted (at least one center is always returned).
    points : 2-D array, one point per row.
    distance_matrix : square array with the pairwise distances of ``points``.

    Returns
    -------
    List of the selected center rows, in selection order.
    """
    num_points = len(points)
    newest_idx = np.random.choice(num_points)
    centers = [points[newest_idx]]
    # min_dists[i] = distance from point i to its nearest chosen center.
    # It is kept across rounds and only refreshed against the newest center,
    # instead of being rebuilt against every center on each round.
    min_dists = np.full(num_points, np.inf)

    while len(centers) < k:
        for i in range(num_points):
            to_newest = distance_matrix[i, newest_idx]
            if to_newest < min_dists[i]:
                min_dists[i] = to_newest

        newest_idx = np.argmax(min_dists)
        centers.append(points[newest_idx])

    return centers

In [10]:
def get_k_size(file_name):
  """Return the second ``'_'``-separated token of ``file_name`` as a string.

  The caller interprets this token as the number of clusters. Plain Python is
  used on purpose: JIT-compiling a one-line string split only adds compile
  latency.

  Raises
  ------
  IndexError
      If ``file_name`` contains no ``'_'``.
  """
  return file_name.split('_')[1]

In [11]:
def get_points_and_distance_matrix(data, p):
  """Split off the feature columns of ``data`` and compute their distance matrix.

  Every column except the last one is taken as a coordinate; the pairwise
  distances are computed by ``distanceMatrix`` with order ``p``.

  Returns the tuple ``(points, distance_matrix)``.
  """
  feature_values = data.iloc[:, :-1].values
  return feature_values, distanceMatrix(feature_values, p)

In [12]:
def get_data_from_k_mean_1(k_size, points, dist, metric, labels_true):
  """Run ``KMeans1`` and score the resulting clustering.

  Parameters
  ----------
  k_size : number of centers requested.
  points : 2-D array, one point per row.
  dist : pairwise distance matrix of ``points``.
  metric : metric name used to assign points to their nearest center.
  labels_true : reference labels compared against via the adjusted Rand index.

  Returns
  -------
  ``(max_distance, silhouette, adjusted_rand)`` where ``max_distance`` is the
  largest distance from any point to its nearest center.
  """
  centers = KMeans1(k_size, points, dist)
  # A single call yields both the nearest-center index and the distance to it.
  labels_pred, distances = pairwise_distances_argmin_min(points, centers, metric=metric)
  n_labels = len(np.unique(labels_pred))
  # silhouette_score requires 2 <= n_labels <= n_samples - 1; report 0 otherwise.
  # NOTE(review): silhouette_score is called with its default metric, not `metric` -
  # confirm whether that is intended for the Manhattan runs.
  if 1 < n_labels < len(points):
    silhouette = silhouette_score(points, labels_pred)
  else:
    silhouette = 0
  adjusted_rand = adjusted_rand_score(labels_true, labels_pred)
  return np.max(distances), silhouette, adjusted_rand

In [13]:
def get_data_from_k_mean_2(k_size, points, dist,metric, labels_true):
  """Run ``KMeans2`` and score the resulting clustering.

  Parameters
  ----------
  k_size : number of centers requested.
  points : 2-D array, one point per row.
  dist : pairwise distance matrix of ``points``.
  metric : metric name used to assign points to their nearest center.
  labels_true : reference labels compared against via the adjusted Rand index.

  Returns
  -------
  ``(max_distance, silhouette, adjusted_rand)`` where ``max_distance`` is the
  largest distance from any point to its nearest center.
  """
  centers = KMeans2(k_size, points, dist)
  # A single call yields both the nearest-center index and the distance to it.
  labels_pred, distances = pairwise_distances_argmin_min(points, centers, metric=metric)
  n_labels = len(np.unique(labels_pred))
  # silhouette_score requires 2 <= n_labels <= n_samples - 1; report 0 otherwise.
  # NOTE(review): silhouette_score is called with its default metric, not `metric` -
  # confirm whether that is intended for the Manhattan runs.
  if 1 < n_labels < len(points):
    silhouette = silhouette_score(points, labels_pred)
  else:
    silhouette = 0
  adjusted_rand = adjusted_rand_score(labels_true, labels_pred)
  return np.max(distances), silhouette, adjusted_rand

In [14]:
def get_data_from_kmeans(k_size, points, dist, data,metric):
    """Cluster ``points`` with a library model and score the result.

    ``KMedoids`` is used when ``metric`` is ``'manhattan'``; any other value
    selects ``KMeans``. Both are created with ``random_state=0``. The ``dist``
    argument is accepted but not used here.

    Returns ``(max_distance, silhouette, adjusted_rand)``, where
    ``max_distance`` is the largest distance from a point to its nearest
    cluster center and the reference labels are the last column of ``data``.
    """
    # Pick the model according to the metric.
    if metric != 'manhattan':
        clustering_model = KMeans(n_clusters=k_size, random_state=0, n_init='auto')
    else:
        clustering_model = KMedoids(n_clusters=k_size, metric=metric, random_state=0)

    # Fit and read back the assignment and the centers.
    clustering_model.fit(points)
    assigned = clustering_model.labels_
    fitted_centers = clustering_model.cluster_centers_

    # Largest distance from any point to its nearest center.
    nearest = pairwise_distances_argmin_min(points, fitted_centers, metric=metric)
    max_distance = np.max(nearest[1])

    reference = data.iloc[:, -1].values
    # When every point ends up in the same cluster the silhouette is reported as 0.
    if len(np.unique(assigned)) > 1:
        silhouette = silhouette_score(points, assigned)
    else:
        silhouette = 0
    adjusted_rand = adjusted_rand_score(reference, assigned)

    return max_distance, silhouette, adjusted_rand


In [15]:
def max_distance_for_same_label(matrix, matrix_distances, p):
    """Return the largest distance between two rows that share a label.

    Parameters
    ----------
    matrix : 2-D array whose last column is the label and whose remaining
        columns are the coordinates.
    matrix_distances : pairwise distance matrix of the rows of ``matrix``
        (coordinates only). It is used directly when its shape is ``(n, n)``
        for ``n = len(matrix)``; the caller is responsible for it having been
        computed with the same ``p``. When it is ``None`` or has another
        shape, distances are recomputed per label with ``distanceMatrix``.
    p : Minkowski order, used only when distances have to be recomputed.

    Returns
    -------
    The maximum intra-label distance, or ``0`` when no label has two rows.
    """
    labels_column = matrix[:, -1]
    n_rows = matrix.shape[0]
    use_precomputed = (
        matrix_distances is not None
        and np.shape(matrix_distances) == (n_rows, n_rows)
    )
    if use_precomputed:
        matrix_distances = np.asarray(matrix_distances)

    max_distance = 0
    for label in np.unique(labels_column):
        same_label = labels_column == label
        if use_precomputed:
            # Sub-matrix restricted to the rows/columns carrying this label.
            block = matrix_distances[np.ix_(same_label, same_label)]
        else:
            points = matrix[same_label][:, :-1].astype(np.float64)
            block = distanceMatrix(points, p)

        if block.size == 0:
            continue
        # The block is symmetric with a zero diagonal, so its overall maximum
        # equals the maximum over the distinct pairs.
        block_max = block.max()
        if block_max > max_distance:
            max_distance = block_max

    return max_distance

In [16]:
def _timed_call(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``."""
    start = time.perf_counter()
    result = func(*args)
    return result, time.perf_counter() - start


def get_information(directory_path, output_csv, n_iterations=30):
    """Benchmark the three clustering approaches on every file of a directory.

    Each regular file in ``directory_path`` is read as a header-less CSV whose
    last column holds the reference labels. The number of centers is parsed
    from the file name via ``get_k_size``. For ``p = 1`` (``'manhattan'``) and
    ``p = 2`` (``'euclidean'``) the two approximation algorithms and the
    library model are each run ``n_iterations`` times, and one row per
    iteration is collected.

    Parameters
    ----------
    directory_path : directory containing the dataset files.
    output_csv : path of the CSV file the collected rows are written to.
    n_iterations : repetitions per file and per ``p`` (default 30).

    Returns
    -------
    None. Prints a message and returns early when ``directory_path`` is not a
    directory; otherwise writes ``output_csv``.
    """
    if not os.path.isdir(directory_path):
        print(f"O caminho {directory_path} não é uma pasta válida.")
        return

    # One dict per (file, p, iteration).
    results = []
    metric_by_p = {1: 'manhattan', 2: 'euclidean'}

    # Sorted so that the row order of the output does not depend on the
    # arbitrary order returned by os.listdir.
    for file_name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file_name)
        if not os.path.isfile(file_path):
            continue

        data = pd.read_csv(file_path, header=None)
        print(file_name)
        k_center = int(get_k_size(file_name))
        labels_true = data.iloc[:, -1].values

        for p, metric in metric_by_p.items():
            points, dist = get_points_and_distance_matrix(data, p)
            estimated_distance = max_distance_for_same_label(data.values, dist, p)

            for i in range(n_iterations):
                # perf_counter is the monotonic, high-resolution clock meant
                # for measuring intervals. NOTE: the first call into each
                # numba-jitted function also includes its compilation time.

                # First approximation method (k_mean_1).
                (max_distance_k_mean_1, silhouette_1, adjusted_rand_1), elapsed_time_1 = _timed_call(
                    get_data_from_k_mean_1, k_center, points, dist, metric, labels_true)

                # Second approximation method (k_mean_2).
                (max_distance_k_mean_2, silhouette_2, adjusted_rand_2), elapsed_time_2 = _timed_call(
                    get_data_from_k_mean_2, k_center, points, dist, metric, labels_true)

                # Library implementation.
                (max_distance_k_means, silhouette_3, adjusted_rand_3), elapsed_time_3 = _timed_call(
                    get_data_from_kmeans, k_center, points, dist, data, metric)

                results.append({
                    'instance': file_name,
                    'centers_number': k_center,
                    'size': data.shape[0],
                    'p': p,
                    'iteration': i + 1,
                    'max_distance_k_mean_1': max_distance_k_mean_1,
                    'silhouette_k_mean_1': silhouette_1,
                    'adjusted_rand_k_mean_1': adjusted_rand_1,
                    'execution_time_k_mean_1': elapsed_time_1,
                    'max_distance_k_mean_2': max_distance_k_mean_2,
                    'silhouette_k_mean_2': silhouette_2,
                    'adjusted_rand_k_mean_2': adjusted_rand_2,
                    'execution_time_k_mean_2': elapsed_time_2,
                    'max_distance_k_means_skt': max_distance_k_means,
                    'execution_time_k_means_skt': elapsed_time_3,
                    'silhouette_k_means_skt': silhouette_3,
                    'adjusted_rand_k_means_skt': adjusted_rand_3,
                    'estimated': estimated_distance
                })

    # Build a DataFrame from the collected rows and save it as CSV.
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_csv, index=False)

In [None]:
# Run the benchmark over every file in /content/real_data and write the rows to output_results.csv.
get_information('/content/real_data', 'output_results.csv')