In [79]:
from typing import *
import numpy
import pandas

def compute_cohesion(
    cluster_data : Union[numpy.ndarray, pandas.DataFrame, List[float]],
    label:Union[numpy.ndarray, pandas.Series, List[float]]
) -> float:
    import numpy as np
    from scipy.spatial.distance import cdist
    """
        군집의 응집도를 계산합니다.
    """
    total_cohesion = 0.0

    if type(cluster_data) == numpy.ndarray or type(cluster_data) == list:
        clusters = [cluster_data[label == l] for l in np.unique(label)]
    elif type(cluster_data) == pandas.DataFrame:
        clusters = [[cluster_data[index[0]]for index in [label == l]][0] for l in np.unique(label)]
    else:
        raise TypeError
        
    for cluster in clusters:
        centroid = np.mean(cluster, axis=0)
        distance_sum = np.sum(cdist(cluster, [centroid]))
        total_cohesion += distance_sum

    return total_cohesion

def compute_separation(
    cluster_data: Union[numpy.ndarray, pandas.DataFrame, List[float]],
    label:Union[numpy.ndarray, pandas.Series, List[float]]
) -> float:
    import numpy as np
    from scipy.spatial.distance import cdist
    """
        군집의 분리도를 계산합니다.
    """

    total_separation = 0.0
    if type(cluster_data) == numpy.ndarray or type(cluster_data) == list:
        clusters = [cluster_data[label == l] for l in np.unique(label)]
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance_sum = np.sum(cdist(clusters[i], clusters[j]))
                total_separation += distance_sum
    elif type(cluster_data) == pandas.DataFrame:
        clusters = [[cluster_data[index[0]]for index in [label == l]][0] for l in np.unique(label)]
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance_sum = np.sum(cdist(clusters[i].values, clusters[j].values))
                total_separation += distance_sum

    return total_separation

def compute_silhouette(
    cluster_data: Union[numpy.ndarray, pandas.DataFrame, List[float]],
    label:Union[numpy.ndarray, pandas.Series, List[float]]
) -> float:
    """
        군집의 실루엣 계수를 계산합니다. 군집 내 응집도를 최대화하고 군집간 분리도를 최대화하여야 합니다.
    """
    cohesion = compute_cohesion(cluster_data, label)
    separation = compute_separation(cluster_data, label)
    n_clusters = len(cluster_data)
    silhouette_coefficient = (separation - cohesion) / max(cohesion, separation) / n_clusters

    return silhouette_coefficient

def calculate_dunn_index(cluster_data, label):
    from sklearn.metrics import pairwise_distances
    import numpy as np
    """
    Dunn Index를 계산합니다. 클러스터간 거리 중 최소값과 클러스터내 거리 중 최대값의 비율을 나타냅니다.

    Parameters:
        cluster_data (array-like): 군집 데이터. 각 군집은 하나의 배열로 표현되며,
                                   배열 내에는 군집에 속하는 데이터 포인트들이 포함됩니다.
        labels (array-like): 각 데이터 포인트의 군집 라벨.

    Returns:
        float: Dunn Index 값.
    """
    # 각 군집의 중심점 계산
    if type(cluster_data) == numpy.ndarray or type(cluster_data) == list:
        clusters = [cluster_data[label == l] for l in np.unique(label)]
    elif type(cluster_data) == pandas.DataFrame:
        clusters = [[cluster_data[index[0]] for index in [label == l]][0] for l in np.unique(label)]
    else:
        raise TypeError
    
    centroids = [np.mean(cluster, axis=0) for cluster in clusters]

    # 최대 군집 간 거리 계산
    max_inter_cluster_distance = pairwise_distances(centroids, metric='euclidean').max()

    # 최소 군집 내 거리 계산
    min_intra_cluster_distance = np.inf
    for i in range(len(clusters)):
        intra_cluster_distance = pairwise_distances(clusters[i], metric='euclidean').min()
        if intra_cluster_distance < min_intra_cluster_distance:
            min_intra_cluster_distance = intra_cluster_distance

    # Dunn Index 계산
    dunn_index = min_intra_cluster_distance / max_inter_cluster_distance

    return dunn_index


In [80]:
import pandas as pd
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=500, n_features=2, centers=4, cluster_std=1, \
                  center_box=(-10.0, 10.0), shuffle=True, random_state=42)

X_df, y_df = pd.DataFrame(X), pd.DataFrame(y)

X.shape, y.shape

((500, 2), (500,))

In [81]:
cohesion = compute_cohesion(X, y)
separation = compute_separation(X, y)
silhouette = compute_silhouette(X, y)
dunn_index = calculate_dunn_index(X, y)

print('Cohesion:', cohesion)
print('Separation:', separation)
print('Silhouette Coefficient:', silhouette)
print('Dunn Index:', dunn_index)

Cohesion: 611.3544928662973
Separation: 1207309.073289746
Silhouette Coefficient: 0.0019989872444324463
Dunn Index: 0.0


In [82]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 4, random_state=42, init='k-means++')
labels = kmeans.fit_predict(X_df)
centers = kmeans.cluster_centers_

cohesion = compute_cohesion(X, labels)
separation = compute_separation(X, labels)
silhouette = compute_silhouette(X, labels)
dunn_index = calculate_dunn_index(X, labels)

print('Cohesion:', cohesion)
print('Separation:', separation)
print('Silhouette Coefficient:', silhouette)
print('Dunn Index:', dunn_index)

Cohesion: 611.3544928662973
Separation: 1207309.073289746
Silhouette Coefficient: 0.0019989872444324463
Dunn Index: 0.0


In [83]:
cohesion = compute_cohesion(X_df, y_df)
separation = compute_separation(X_df, y_df)
silhouette = compute_silhouette(X_df, y_df)
dunn_index = calculate_dunn_index(X_df, y_df)

print('Cohesion:', cohesion)
print('Separation:', separation)
print('Silhouette Coefficient:', silhouette)
print('Dunn Index:', dunn_index)

Cohesion: 611.3544928662973
Separation: 1207309.073289746
Silhouette Coefficient: 0.0019989872444324463
Dunn Index: 0.0
