In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [71]:
# Utility functions


In [72]:
def show_2d(data, point):
    plt.scatter(data[:, 0], data[:, 1], c=point)
    plt.show()


In [73]:
def dunn_index(data, data_clusters):
    out_min = None
    in_max = None
    for ind_x, point_x in enumerate(data):
        for ind_y, point_y in enumerate(data[ind_x + 1:]):
            ind_y = ind_y + ind_x + 1
            dist = np.linalg.norm(point_x - point_y)
            if data_clusters[ind_x] == data_clusters[ind_y] and (in_max is None or dist > in_max):
                in_max = dist
            if data_clusters[ind_x] != data_clusters[ind_y] and (out_min is None or dist < out_min):
                out_min = dist
    return out_min / in_max


In [74]:
cancer_data = pd.read_csv("datasets/cancer.csv")
cancer_features = cancer_data.drop(['label'], axis=1).values


In [75]:
blobs_data = pd.read_csv("datasets/blobs.csv")
blobs_features = blobs_data.values


In [76]:
class Kmeans:
    def __init__(self, data, cluster_num, iter_num=50):
        self.data = data
        self.cluster_num = cluster_num
        self.iter_num = iter_num
        self.centers = self.data[np.random.choice(np.arange(self.data.shape[0]), self.cluster_num)]
        self.precalc()

    def precalc(self):
        for _ in range(self.iter_num):
            clusters_sum = np.zeros_like(self.centers)
            clusters_size = np.zeros(self.cluster_num)
            for point in self.data:
                near_center = np.argmin([np.linalg.norm(center - point) for center in self.centers])
                clusters_sum[near_center] += point
                clusters_size[near_center] += 1
            for ind in range(self.cluster_num):
                self.centers[ind] = clusters_sum[ind] / clusters_size[ind]

    def clusterize(self, data):
        cluster = np.zeros(data.shape[0])
        for index, point in enumerate(data):
            cluster[index] = np.argmin([np.linalg.norm(center - point) for center in self.centers])
        return cluster


In [77]:
for cluster_num in range(2, 6):
    result = Kmeans(blobs_features, cluster_num).clusterize(blobs_features)
    dunn_ind = dunn_index(blobs_features, result)
    print(f"Clusters: {cluster_num}; Dunn index: {dunn_ind}")
    show_2d(blobs_features, result)


Clusters: 2; Dunn index: 0.011020999037160844


Clusters: 3; Dunn index: 0.024417820826925618


Clusters: 4; Dunn index: 0.03756164709963606


Clusters: 5; Dunn index: 0.01463555414684757


In [78]:
class Dbscan:
    def __init__(self, data, min_points, epsilon):
        self.data = data
        self.data_labeled = {}

        self.min_points = min_points
        self.epsilon = epsilon

        self.neighbours_eps = {}

        self.precalc()

    def precalc(self):
        point_dists = np.zeros((len(self.data), len(self.data)))
        for ind_x, point_x in enumerate(self.data):
            for ind_y, point_y in enumerate(self.data):
                point_dists[ind_x, ind_y] = np.linalg.norm(point_x - point_y)

        for point_ind, _ in enumerate(self.data):
            self.neighbours_eps[point_ind] = [cur_ind for cur_ind in range(len(self.data))
                                              if point_dists[cur_ind, point_ind] < self.epsilon]

    def clusterize(self):
        total_clusters = 0
        for point_ind, _ in enumerate(self.data):
            near_points = self.neighbours_eps[point_ind]
            if len(near_points) < self.min_points:
                self.data_labeled[point_ind] = None

            if point_ind in self.data_labeled:
                continue

            self.data_labeled[point_ind] = total_clusters
            cluster = set((point for point in near_points if point != point_ind))
            while cluster:
                point = cluster.pop()
                if point not in self.data_labeled:
                    self.data_labeled[point] = None
                    point_nearest = self.neighbours_eps[point]
                    if len(point_nearest) >= self.min_points:
                        cluster.update(point_nearest)
                if self.data_labeled[point] is None:
                    self.data_labeled[point] = total_clusters
            total_clusters += 1

        np_data_labeled = np.ones(self.data.shape[0])
        for ind, _ in enumerate(self.data):
            np_data_labeled[ind] = self.data_labeled[ind]
        return total_clusters, np_data_labeled


In [79]:
clusters_num, result = Dbscan(blobs_features, 3, 0.29).clusterize()
dunn_ind = dunn_index(blobs_features, result)
print(f"Clusters: {clusters_num}; Dunn index: {dunn_ind}")

show_2d(blobs_features, result)

Clusters: 4; Dunn index: 0.026677432372590574
