In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import silhouette_score, adjusted_rand_score
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt

In [3]:
def euclidean_distance(vi, vj):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    vi, vj : array-like
        Coordinates of the two points. Lists, tuples and NumPy arrays are
        accepted; the shapes must be broadcastable against each other.

    Returns
    -------
    numpy.float64
        Square root of the summed squared coordinate differences.
    """
    # Convert to float up front: plain sequences do not support `-`, and
    # unsigned / small integer arrays would silently wrap around in the
    # subtraction and overflow when squared.
    diff = np.asarray(vi, dtype=float) - np.asarray(vj, dtype=float)
    return np.sqrt(np.sum(diff ** 2))

In [4]:
class ClusterMeasurements(object):
    """Linkage criteria measuring the distance between two clusters.

    Every criterion receives two clusters (re-iterable collections of
    points), computes the Euclidean distance of each cross-cluster pair of
    points and reduces those distances to a single number.

    The criteria are static methods, so they can be referenced from the
    class (``ClusterMeasurements.single_link``) or called on an instance.
    """

    @staticmethod
    def single_link(ci, cj):
        """Return the smallest distance between a point of ``ci`` and one of ``cj``.

        Raises ``ValueError`` when either cluster is empty (no pairs to compare).
        """
        return min(euclidean_distance(vi, vj) for vi in ci for vj in cj)

    @staticmethod
    def complete_link(ci, cj):
        """Return the largest distance between a point of ``ci`` and one of ``cj``.

        Raises ``ValueError`` when either cluster is empty (no pairs to compare).
        """
        return max(euclidean_distance(vi, vj) for vi in ci for vj in cj)

    @staticmethod
    def average_link(ci, cj):
        """Return the mean distance over all pairs of points from ``ci`` and ``cj``.

        Raises ``ZeroDivisionError`` when either cluster is empty.
        """
        # Materialised as a list because both the sum and the pair count are needed.
        distances = [euclidean_distance(vi, vj) for vi in ci for vj in cj]
        return sum(distances) / len(distances)


In [48]:
class AgglomerativeClustering:
    """Bottom-up hierarchical clustering that merges until ``K`` clusters remain.

    Every sample starts in its own cluster. On each iteration the two
    clusters with the smallest ``measure`` value are merged, and the process
    stops once no more than ``K`` clusters are left.

    Attributes set by ``predict``:
        data        -- the input converted to a 2-D NumPy array
        n_samples   -- number of rows in ``data``
        n_features  -- number of columns in ``data``
        clusters    -- dict mapping cluster id -> list of member points
    """

    def __init__(self, K, measure=ClusterMeasurements.single_link):
        """Store the clustering configuration.

        Parameters
        ----------
        K : int
            Number of clusters at which merging stops.
        measure : callable
            Function ``measure(ci, cj)`` returning the distance between two
            clusters, each given as a list of points.
        """
        self.K = K
        self.measure = measure

    def init_clusters(self):
        """Return a dict placing every row of ``self.data`` in its own cluster."""
        return {data_id: [data_point] for data_id, data_point in enumerate(self.data)}

    def find_closest_clusters(self):
        """Return the ids ``(ci_id, cj_id)`` of the two closest clusters.

        Each unordered pair of clusters is evaluated once with
        ``self.measure``. Ties keep the first pair encountered. Returns
        ``None`` when no pair has a distance strictly below infinity (for
        example with fewer than two clusters, or when every distance is NaN).
        """
        min_dist = np.inf
        closest_clusters = None

        clusters_ids = list(self.clusters.keys())

        for i, cluster_i in enumerate(clusters_ids):
            # Only look at ids after position i so each pair is visited once.
            for cluster_j in clusters_ids[i + 1:]:
                dist = self.measure(self.clusters[cluster_i], self.clusters[cluster_j])
                if dist < min_dist:
                    min_dist, closest_clusters = dist, (cluster_i, cluster_j)

        return closest_clusters

    def merge_and_form_new_clusters(self, ci_id, cj_id):
        """Return a new cluster dict with clusters ``ci_id`` and ``cj_id`` merged.

        The merged cluster receives id 0; the remaining clusters keep their
        relative order and are renumbered consecutively from 1.
        """
        new_clusters = {0: self.clusters[ci_id] + self.clusters[cj_id]}
        for cluster_id, members in self.clusters.items():
            if cluster_id in (ci_id, cj_id):
                continue
            new_clusters[len(new_clusters)] = members
        return new_clusters

    def predict(self, X):
        """Cluster the rows of ``X`` and return the resulting clusters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to cluster. Converted with ``np.asarray`` so nested
            lists and DataFrames are iterated row by row.

        Returns
        -------
        dict
            Maps cluster id -> list of the sample vectors in that cluster.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``K`` is smaller than 1 while merging is
            required, or if ``measure`` yields no usable distance.
        """
        # Iterating a DataFrame yields column labels rather than rows, so
        # normalise the input to an array before building the clusters.
        self.data = np.asarray(X)
        if self.data.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features), "
                f"got {self.data.ndim} dimension(s)"
            )
        self.n_samples, self.n_features = self.data.shape

        # Merging can never go below one cluster; fail before doing the
        # expensive work instead of crashing once a single cluster is left.
        if self.K < 1 and self.n_samples > self.K:
            raise ValueError(f"K must be at least 1, got {self.K}")

        self.clusters = self.init_clusters()

        while len(self.clusters) > self.K:
            closest_clusters = self.find_closest_clusters()
            if closest_clusters is None:
                raise ValueError(
                    "measure returned no distance below infinity for any pair "
                    "of clusters; check the data for NaN or infinite values"
                )
            self.clusters = self.merge_and_form_new_clusters(*closest_clusters)

        return self.clusters

In [49]:
from sklearn.model_selection import train_test_split

# Load the Iris dataset and hold out 20% of the samples for testing.
# A fixed random_state makes the split identical on every run, so results
# computed from X_train are reproducible after a kernel restart.
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.2, random_state=42
)

In [69]:
# Group the training samples into three clusters using complete linkage
# (cluster distance = largest pairwise point distance). The result maps
# each cluster id to the list of sample vectors assigned to it.
ac = AgglomerativeClustering(
    K=3,
    measure=ClusterMeasurements.complete_link,
)
predict = ac.predict(X_train)