In [1]:
# Question 5: Modify a hierarchical clustering algorithm to perform clustering using a custom distance metric on 2D points.

In [None]:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
from collections import defaultdict

def custom_distance_metric(point1, point2):
    """Return the Manhattan (L1, "city block") distance between two points.

    The distance is the sum of the absolute differences of the points'
    coordinates.

    Parameters
    ----------
    point1, point2 : array_like
        Coordinates of the two points (NumPy arrays, lists or tuples of
        numbers). Both must have the same shape or be broadcastable.

    Returns
    -------
    numpy.float64
        Sum of the absolute coordinate differences.
    """
    # Coerce to float before subtracting: this accepts plain lists/tuples and
    # prevents wrap-around when the inputs are unsigned-integer arrays
    # (e.g. uint8 1 - 3 would otherwise become 254 instead of -2).
    first = np.asarray(point1, dtype=float)
    second = np.asarray(point2, dtype=float)
    return np.sum(np.abs(first - second))

# Seed NumPy's legacy global RNG. The sample points below are hard-coded, so
# nothing in this cell actually draws random numbers.
np.random.seed(42)

# Ten fixed 2D sample points, one (x, y) pair per row.
data_points = np.array([
    [1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6],
    [9, 11], [0.8, 0.5], [8.5, 9.2], [5.5, 8.2], [1.2, 1.1],
])

# Condensed pairwise-distance vector. pdist is handed the custom callable,
# which receives two 1D arrays (points) and returns a scalar distance.
distance_matrix = pdist(data_points, metric=custom_distance_metric)

# Build the cluster hierarchy with 'average' linkage (mean distance between
# all observations of the two sets). Alternatives: 'single' uses the minimum
# and 'complete' the maximum distance between observations in the two sets.
linked_matrix = linkage(distance_matrix, method='average')

# Flatten the hierarchy: with criterion='maxclust' the threshold `t` is the
# number of flat clusters requested.
num_clusters = 3
clusters = fcluster(linked_matrix, t=num_clusters, criterion='maxclust')

# --- Report: the raw points, numbered from 1 ---
print("Data Points:")
for point_number, point in enumerate(data_points, start=1):
    print(f"Point {point_number}: {point}")

# --- Report: the cluster label assigned to each point ---
print("\nCluster Assignments:")
for point_number, (point, cluster_id) in enumerate(zip(data_points, clusters), start=1):
    print(f"Point {point_number} ({point}): Cluster {cluster_id}")

# --- Report: points grouped by cluster label ---
print("\nPoints in each cluster:")
clustered_data = defaultdict(list)
for point, cluster_id in zip(data_points, clusters):
    clustered_data[cluster_id].append(point)

# Clusters are listed in the order their label first appears in `clusters`.
for cluster_id, points_in_cluster in clustered_data.items():
    print(f"Cluster {cluster_id}:")
    for point in points_in_cluster:
        print(f"  {point}")

Data Points:
Point 1: [1. 2.]
Point 2: [1.5 1.8]
Point 3: [5. 8.]
Point 4: [8. 8.]
Point 5: [1.  0.6]
Point 6: [ 9. 11.]
Point 7: [0.8 0.5]
Point 8: [8.5 9.2]
Point 9: [5.5 8.2]
Point 10: [1.2 1.1]

Cluster Assignments:
Point 1 ([1. 2.]): Cluster 1
Point 2 ([1.5 1.8]): Cluster 1
Point 3 ([5. 8.]): Cluster 2
Point 4 ([8. 8.]): Cluster 3
Point 5 ([1.  0.6]): Cluster 1
Point 6 ([ 9. 11.]): Cluster 3
Point 7 ([0.8 0.5]): Cluster 1
Point 8 ([8.5 9.2]): Cluster 3
Point 9 ([5.5 8.2]): Cluster 2
Point 10 ([1.2 1.1]): Cluster 1

Points in each cluster:
Cluster 1:
  [1. 2.]
  [1.5 1.8]
  [1.  0.6]
  [0.8 0.5]
  [1.2 1.1]
Cluster 2:
  [5. 8.]
  [5.5 8.2]
Cluster 3:
  [8. 8.]
  [ 9. 11.]
  [8.5 9.2]
