In [None]:
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

## KMeans

In [None]:
def general_kmeans(X: np.ndarray, k, toler=0.001):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means iteration.

    The initial centroids are ``k`` distinct rows of ``X`` drawn with
    ``np.random.choice``; seed NumPy's global RNG beforehand for reproducible runs.

    Parameters
    ----------
    X : np.ndarray
        Data points, one per row (indexed here as a 2-D ``(n, d)`` array).
    k : int
        Number of clusters. ``np.random.choice`` raises ``ValueError`` if it
        exceeds the number of rows.
    toler : float
        Iteration stops once the norm of the centroid update falls below this.

    Returns
    -------
    centroids : np.ndarray
        The ``k`` centroids the returned labels were assigned against.
    C_i : np.ndarray
        For every row of ``X``, the index of its closest centroid.
    """
    n = X.shape[0]
    # initialize the centroids with k distinct random data points
    centroids = X[np.random.choice(n, k, replace=False)]

    # iterate until the centroids stop moving: ||mu^{t+1} - mu^t|| < toler
    while True:
        # assign every point x_j to its closest centroid mu_i
        # (argmin yields the index of that centroid; ties go to the lowest index)
        C_i = np.argmin(np.linalg.norm(X[:, None] - centroids, axis=-1), axis=-1)

        # move each centroid to the mean of the points assigned to it.
        # A cluster that received no points keeps its previous centroid: the mean
        # of an empty slice is NaN, and a NaN centroid would make the stopping
        # test below fail forever.
        micron_new = np.array([
            X[C_i == i].mean(axis=0) if np.any(C_i == i) else centroids[i]
            for i in range(k)
        ])

        # stop once the update is negligible
        if np.linalg.norm(micron_new - centroids) < toler:
            break
        centroids = micron_new

    return centroids, C_i

## KMeans++

In [None]:
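# k-means++ is a variant of k-means that seeds the centroids more carefully:
# the first centroid is drawn uniformly at random from the data points, and each
# subsequent one is drawn with probability proportional to its squared distance
# to the nearest centroid chosen so far.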

def kmeans_plus_plus(X: np.ndarray, k, toler=0.001):
    """Cluster the rows of ``X`` into ``k`` groups using k-means++ seeding.

    The first centroid is a uniformly random row of ``X``; each further centroid
    is a row sampled with probability proportional to its squared distance to the
    nearest centroid picked so far. The seeds are then refined with the usual
    assign/update k-means iteration. Randomness comes from ``np.random.choice``;
    seed NumPy's global RNG beforehand for reproducible runs.

    Parameters
    ----------
    X : np.ndarray
        Data points, one per row (indexed here as a 2-D ``(n, d)`` array).
    k : int
        Number of clusters.
    toler : float
        Iteration stops once the norm of the centroid update falls below this.

    Returns
    -------
    centroids : np.ndarray
        The ``k`` centroids the returned labels were assigned against.
    C_i : np.ndarray
        For every row of ``X``, the index of its closest centroid.
    """
    n = X.shape[0]
    # the first centroid is a uniformly random data point
    centroids = [X[np.random.choice(n)]]

    for _ in range(k - 1):
        # squared distance from every point to its closest already-chosen centroid
        nearest = np.linalg.norm(X[:, None] - np.asarray(centroids), axis=-1).min(axis=-1)
        sq_dist = nearest ** 2
        total = sq_dist.sum()
        # If every point coincides with a chosen centroid the weights are all
        # zero; 0/0 would give NaN probabilities and np.random.choice would raise,
        # so sample uniformly instead (p=None).
        prob = sq_dist / total if total > 0 else None
        centroids.append(X[np.random.choice(n, p=prob)])

    centroids = np.array(centroids)

    # iterate until the centroids stop moving: ||mu^{t+1} - mu^t|| < toler
    while True:
        # assign every point x_j to its closest centroid mu_i
        C_i = np.argmin(np.linalg.norm(X[:, None] - centroids, axis=-1), axis=-1)

        # move each centroid to the mean of its points; an empty cluster keeps its
        # previous centroid, because a NaN mean would make the loop never terminate
        micron_new = np.array([
            X[C_i == i].mean(axis=0) if np.any(C_i == i) else centroids[i]
            for i in range(k)
        ])

        # stop once the update is negligible
        if np.linalg.norm(micron_new - centroids) < toler:
            break
        centroids = micron_new

    return centroids, C_i

## MeanShift

In [None]:
# Mean shift uses a KD-tree to find the neighbors that lie within the bandwidth radius
from sklearn.neighbors import KDTree

def meanshift(X: np.ndarray, bandwidth=0.5, toler=0.001):
    """Cluster the rows of ``X`` with mean shift using a truncated Gaussian kernel.

    Every point starts at its own position and is repeatedly moved to the
    Gaussian-weighted mean of the original data points that lie within
    ``bandwidth`` of its current position. Once a full sweep moves the points by
    less than ``toler`` (norm over all points), points whose final positions are
    close together are given the same label.

    Parameters
    ----------
    X : np.ndarray
        Data points, one per row. Converted to float so the shifted positions are
        not truncated for integer input; the caller's array is not modified.
    bandwidth : float
        Neighbor search radius and Gaussian kernel width.
    toler : float
        Convergence threshold on the total movement of one sweep.

    Returns
    -------
    C_i : np.ndarray of int
        Cluster label per row, numbered 0, 1, ... in order of first appearance.
    """
    X = np.asarray(X, dtype=float)
    n = X.shape[0]
    if n == 0:
        return np.zeros(0, dtype=int)

    # the tree indexes the ORIGINAL points; its result indices therefore refer to X
    tree = KDTree(X)
    # start every trajectory at its data point: y_i = x_i
    y = X.copy()

    while True:
        # snapshot of the previous sweep, used for the convergence test
        y_prev = y.copy()
        for i in range(n):
            # data points within `bandwidth` of the current position of y_i
            neighbors = tree.query_radius(y_prev[i][None], bandwidth)[0]
            if len(neighbors) == 0:
                # nothing in range: leave y_i where it is instead of dividing by zero
                continue

            # Gaussian weights of those data points relative to y_i
            sq_dist = ((X[neighbors] - y_prev[i]) ** 2).sum(axis=-1)
            kernel_j = np.exp(-sq_dist / (2 * bandwidth**2))
            # move y_i to the weighted mean of the neighboring data points
            y[i] = (kernel_j[:, None] * X[neighbors]).sum(axis=0) / kernel_j.sum()

        # stop once a whole sweep barely moves the points
        if np.linalg.norm(y - y_prev) < toler:
            break

    # Group points that converged to the same mode. Trajectories only agree up to
    # the convergence tolerance, so modes are matched within a radius (heuristic:
    # half the bandwidth) rather than by exact equality.
    merge_radius = bandwidth / 2
    modes = []
    C_i = np.zeros(n, dtype=int)
    for i in range(n):
        for label, mode in enumerate(modes):
            if np.linalg.norm(y[i] - mode) <= merge_radius:
                C_i[i] = label
                break
        else:
            # no existing mode is close enough: y_i starts a new cluster
            C_i[i] = len(modes)
            modes.append(y[i])
    return C_i

## Agglomerative Clustering (unverified)

In [None]:
def agglomerative(D: np.ndarray, distances: list[list[float]], metric='normal', k=1):
    """Agglomerative clustering on a precomputed point-to-point distance matrix.

    Starts with every point in its own cluster and repeatedly merges the two
    closest clusters until only ``k`` clusters remain. ``D`` is only read.

    Parameters
    ----------
    D : np.ndarray
        Square ``(n, n)`` matrix of pairwise point distances.
    distances : list
        Not used by this function; accepted so existing call sites keep working.
    metric : str
        How the distance between two clusters is derived from point distances:

        * ``'normal'``   - minimum pairwise distance (usually called single linkage)
        * ``'single'``   - maximum pairwise distance. NOTE: this is what is usually
          called *complete* linkage; the name is kept for backward compatibility.
        * ``'complete'`` - maximum pairwise distance (alias of ``'single'`` here)
        * ``'average'``  - mean pairwise distance
    k : int
        Number of clusters to stop at. The default of 1 merges everything.

    Returns
    -------
    C_i : np.ndarray of int
        Cluster index for every point.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the names above or ``k`` is smaller than 1.
    """
    linkages = {
        'normal': np.min,
        'single': np.max,
        'complete': np.max,
        'average': np.mean,
    }
    if metric not in linkages:
        raise ValueError(f"unknown metric {metric!r}; expected one of {sorted(linkages)}")
    if k < 1:
        raise ValueError("k must be at least 1")
    linkage = linkages[metric]

    D = np.asarray(D, dtype=float)
    n = D.shape[0]
    # each point x_i starts as its own cluster {C_1, C_2, ..., C_n}
    clusters = [[i] for i in range(n)]

    while len(clusters) > k:
        # Find the closest pair of DISTINCT clusters. The cluster distance is always
        # derived from the original point matrix D, and only pairs a < b are
        # examined so a cluster is never compared with (or merged into) itself.
        best_pair, best_dist = None, np.inf
        for a in range(len(clusters)):
            for b in range(a + 1, len(clusters)):
                d = linkage(D[np.ix_(clusters[a], clusters[b])])
                if best_pair is None or d < best_dist:
                    best_pair, best_dist = (a, b), d

        # replace C_a and C_b by their union C_new = C_a U C_b
        a, b = best_pair
        merged = clusters[a] + clusters[b]
        clusters = [c for idx, c in enumerate(clusters) if idx not in (a, b)]
        clusters.append(merged)

    # assign each point x_i the index of the cluster that contains it
    C_i = np.zeros(n, dtype=int)
    for i, c in enumerate(clusters):
        C_i[c] = i
    return C_i

def build_dendogram(X: np.ndarray, metric='normal'):
    """Compute a between-cluster distance score for every cluster count 1..n-1.

    For each ``k`` the points are clustered into ``k`` groups with
    ``agglomerative`` and the score is the sum of distances over all point pairs
    that ended up in different clusters, divided by the number of points.

    Parameters
    ----------
    X : np.ndarray
        Data points, one per row (indexed here as a 2-D ``(n, d)`` array).
    metric : str
        Linkage name forwarded to ``agglomerative``.

    Returns
    -------
    distances : list of float
        Score for each cluster count.
    n_clusters : list of int
        The matching cluster counts ``1, 2, ..., n - 1``.
    """
    n = X.shape[0]
    # Euclidean distance between every pair of points
    D = np.linalg.norm(X[:, None] - X, axis=-1)
    distances = []
    n_clusters = []
    # for each number of clusters k from 1 to n - 1
    for k in range(1, n):
        # cluster into exactly k groups (reruns the merge procedure for every k)
        C_i = agglomerative(D, distances, metric, k=k)
        # sum D over unordered pairs (strict lower triangle) whose labels differ
        separated = C_i[:, None] != C_i[None, :]
        dist = float(np.tril(D * separated, -1).sum() / n)
        distances.append(dist)
        n_clusters.append(k)
    return distances, n_clusters

def plot_dendogram(X: np.ndarray, metric='normal'):
    """Plot the score returned by ``build_dendogram`` against the cluster count.

    ``X`` and ``metric`` are forwarded unchanged to ``build_dendogram``; the
    resulting curve is drawn with pyplot and shown immediately.
    """
    scores, cluster_counts = build_dendogram(X, metric)
    # x axis: number of clusters, y axis: the matching score
    plt.plot(cluster_counts, scores)
    plt.title('Dendogram')
    plt.ylabel('Average distance between clusters')
    plt.xlabel('Number of clusters')
    plt.show()