# Hierarchical Clustering From Scratch

Hierarchical clustering builds a hierarchy of clusters. In this notebook, we implement **Agglomerative Clustering**, which is a "bottom-up" approach: each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy.

## Key Concepts:
- **Proximity Matrix**: Distances between all pairs of clusters
- **Linkage Criteria**: How to measure distance between clusters (Single, Complete, Average)
- **Dendrogram**: Visual representation of the clustering process

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.datasets import make_blobs

## 1. Implementation (Agglomerative)

We will implement a simple version that uses **Single Linkage** (also called minimum linkage): the distance between two clusters is the distance between their two nearest points, one taken from each cluster.

In [None]:
class HierarchicalClustering:
    """Agglomerative (bottom-up) clustering using single linkage.

    Every sample starts in its own cluster. The two clusters whose nearest
    members are closest (Euclidean distance) are merged repeatedly until
    only ``n_clusters`` clusters remain.

    Attributes:
        n_clusters: Number of clusters at which merging stops.
        labels: ``None`` until ``fit`` is called; afterwards an integer
            array holding one cluster index per sample.
    """

    def __init__(self, n_clusters=2):
        self.n_clusters = n_clusters
        self.labels = None

    def fit(self, X):
        """Cluster the samples in ``X`` and store the result in ``self.labels``.

        Args:
            X: Array-like whose first axis indexes the samples. Any remaining
                axes are flattened into one feature vector per sample.

        Returns:
            The fitted instance itself, so calls can be chained.

        Raises:
            ValueError: If ``n_clusters`` is smaller than 1, or if no pair of
                clusters has a distance below infinity (for example because
                ``X`` contains NaN values).
        """
        if self.n_clusters < 1:
            raise ValueError(
                "n_clusters must be at least 1, got {!r}".format(self.n_clusters)
            )

        X = np.asarray(X)
        n_samples = X.shape[0]
        if n_samples == 0:
            self.labels = np.zeros(0, dtype=int)
            return self
        points = X.reshape(n_samples, -1)

        # Initialize each sample as a single cluster
        clusters = [[i] for i in range(n_samples)]

        # Precompute the symmetric point-to-point distance matrix one row at
        # a time. The diagonal stays at infinity so that a point is never
        # considered close to itself.
        dist_matrix = np.full((n_samples, n_samples), np.inf)
        for i in range(n_samples - 1):
            row = np.linalg.norm(points[i + 1:] - points[i], axis=1)
            dist_matrix[i, i + 1:] = row
            dist_matrix[i + 1:, i] = row

        while len(clusters) > self.n_clusters:
            # Find the two closest clusters. Ties are resolved in favour of
            # the first pair encountered because the comparison is strict.
            min_dist = np.inf
            to_merge = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    current_dist = self._get_cluster_dist(
                        clusters[i], clusters[j], dist_matrix
                    )
                    if current_dist < min_dist:
                        min_dist = current_dist
                        to_merge = (i, j)

            if to_merge is None:
                # Without this guard a cluster would be merged with itself
                # and then dropped, silently corrupting the result.
                raise ValueError(
                    "Cannot merge clusters: no pair has a distance below "
                    "infinity (check X for NaN or infinite values)."
                )

            # Merge the later cluster into the earlier one. The earlier one
            # keeps its position, so clusters stay ordered by their lowest
            # sample index.
            idx1, idx2 = to_merge
            clusters[idx1].extend(clusters[idx2])
            clusters.pop(idx2)

        # Assign labels as integer cluster indices
        labels = np.zeros(n_samples, dtype=int)
        for cluster_idx, point_indices in enumerate(clusters):
            labels[point_indices] = cluster_idx
        self.labels = labels

        return self

    def _get_cluster_dist(self, cluster1, cluster2, dist_matrix):
        """Return the single-linkage distance between two clusters.

        Args:
            cluster1: Sample indices belonging to the first cluster.
            cluster2: Sample indices belonging to the second cluster.
            dist_matrix: Square array of point-to-point distances.

        Returns:
            The smallest entry of ``dist_matrix`` over all pairs with one
            index from each cluster, or infinity if either cluster is empty.
        """
        if len(cluster1) == 0 or len(cluster2) == 0:
            return float('inf')
        return dist_matrix[np.ix_(cluster1, cluster2)].min()

## 2. Testing and Visualization

In [None]:
# Generate 50 two-dimensional samples around three centres with a small
# spread (cluster_std=0.5); the fixed random_state keeps the data reproducible.
X, y = make_blobs(n_samples=50, centers=3, cluster_std=0.5, random_state=42)

# fit() returns the estimator itself, so construction and fitting can be chained.
hc = HierarchicalClustering(n_clusters=3).fit(X)

# Scatter the two feature columns, colouring each sample by its assigned cluster.
plt.scatter(*X.T, c=hc.labels, cmap='rainbow')
plt.title("Hierarchical Agglomerative Clustering")
plt.show()

## 3. Visualizing with Dendrogram (using scipy)

In [None]:
# Build scipy's merge history for the same samples with single linkage.
linked = linkage(X, method='single')

# Open the figure first so the dendrogram is drawn onto it.
plt.figure(figsize=(10, 7))
dendrogram(linked)

# Axis labels and the title are independent of one another.
plt.xlabel("Sample Index")
plt.ylabel("Distance")
plt.title("Cluster Dendrogram")
plt.show()