From bba005ab23b81cc9f724e3ad8b272c06df761c20 Mon Sep 17 00:00:00 2001 From: rkbansal Date: Tue, 21 Oct 2025 14:20:53 +0530 Subject: [PATCH 1/3] Implement K-Medoids clustering algorithm #13488 --- machine_learning/k_medoids.py | 96 +++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 machine_learning/k_medoids.py diff --git a/machine_learning/k_medoids.py b/machine_learning/k_medoids.py new file mode 100644 index 000000000000..6497703fb0aa --- /dev/null +++ b/machine_learning/k_medoids.py @@ -0,0 +1,96 @@ +""" +README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com) + +Requirements: + - numpy + - matplotlib +Python: + - 3.5+ +Inputs: + - X: 2D numpy array of features + - k: number of clusters +Usage: + 1. Define k and X + 2. Create initial medoids: + initial_medoids = get_initial_medoids(X, k, seed=0) + 3. Run kmedoids: + medoids, cluster_assignment = kmedoids( + X, k, initial_medoids, maxiter=100, verbose=True + ) +""" + +import numpy as np +from matplotlib import pyplot as plt +from sklearn.metrics import pairwise_distances + +def get_initial_medoids(data, k, seed=None): + rng = np.random.default_rng(seed) + n = data.shape[0] + indices = rng.choice(n, k, replace=False) + medoids = data[indices, :] + return medoids + +def assign_clusters(data, medoids): + distances = pairwise_distances(data, medoids, metric='euclidean') + cluster_assignment = np.argmin(distances, axis=1) + return cluster_assignment + +def revise_medoids(data, k, cluster_assignment): + new_medoids = [] + for i in range(k): + members = data[cluster_assignment == i] + if len(members) == 0: + continue + # Compute total distance from each point to all others in cluster + total_distances = np.sum(pairwise_distances(members, members), axis=1) + medoid_index = np.argmin(total_distances) + new_medoids.append(members[medoid_index]) + return np.array(new_medoids) + +def compute_heterogeneity(data, k, medoids, cluster_assignment): + heterogeneity = 0.0 + for i in range(k): + members = data[cluster_assignment == i] + if len(members) == 0: + continue + distances = pairwise_distances(members, [medoids[i]]) + heterogeneity += np.sum(distances**2) + return heterogeneity + +def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False): + medoids = initial_medoids.copy() + prev_assignment = None + for itr in range(maxiter): + cluster_assignment = assign_clusters(data, medoids) + medoids = revise_medoids(data, k, cluster_assignment) + + if prev_assignment is not None and (prev_assignment == cluster_assignment).all(): + break + + if verbose and prev_assignment is not None: + changed = np.sum(prev_assignment != cluster_assignment) + print(f"Iteration {itr}: {changed} points changed clusters") + + prev_assignment = cluster_assignment.copy() + + return medoids, cluster_assignment + +# Optional plotting +def plot_clusters(data, medoids, cluster_assignment): + ax = plt.axes(projection='3d') + ax.scatter(data[:,0], data[:,1], data[:,2], c=cluster_assignment, cmap='viridis') + ax.scatter(medoids[:,0], medoids[:,1], medoids[:,2], c='red', s=100, marker='x') + ax.set_xlabel("X") + ax.set_ylabel("Y") + ax.set_zlabel("Z") + ax.set_title("3D K-Medoids Clustering") + plt.show() + +# Optional test +if __name__ == "__main__": + from sklearn import datasets + X = datasets.load_iris()['data'] + k = 3 + medoids = get_initial_medoids(X, k, seed=0) + medoids, clusters = kmedoids(X, k, medoids, maxiter=50, verbose=True) + plot_clusters(X, medoids, clusters) From c0543f7a0dc0665d5daa7772c23ef6e6e9b9879f Mon Sep 17 00:00:00 2001 From: rkbansal Date: Tue, 21 Oct 2025 19:38:06 +0530 Subject: [PATCH 2/3] Added Manhattan and Minkowski distance metrics to KNN algorithm (#13546) --- machine_learning/k_nearest_neighbours.py | 55 ++++++++++++++++-------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index fbc1b8bd227e..24ce6eefb753 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -14,7 +14,6 @@ from collections import Counter from heapq import nsmallest - import numpy as np from sklearn import datasets from sklearn.model_selection import train_test_split @@ -26,23 +25,36 @@ def __init__( train_data: np.ndarray[float], train_target: np.ndarray[int], class_labels: list[str], + distance_metric: str = "euclidean", + p: int = 2, ) -> None: """ - Create a kNN classifier using the given training data and class labels + Create a kNN classifier using the given training data and class labels. + + Parameters: + ----------- + distance_metric : str + Type of distance metric to use ('euclidean', 'manhattan', 'minkowski') + p : int + Power parameter for Minkowski distance (default 2) """ - self.data = zip(train_data, train_target) + self.data = list(zip(train_data, train_target)) self.labels = class_labels + self.distance_metric = distance_metric + self.p = p - @staticmethod - def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: + def _calculate_distance(self, a: np.ndarray[float], b: np.ndarray[float]) -> float: """ - Calculate the Euclidean distance between two points - >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4])) - 5.0 - >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11])) - 10.0 + Calculate distance between two points based on the selected metric. """ - return float(np.linalg.norm(a - b)) + if self.distance_metric == "euclidean": + return float(np.linalg.norm(a - b)) + elif self.distance_metric == "manhattan": + return float(np.sum(np.abs(a - b))) + elif self.distance_metric == "minkowski": + return float(np.sum(np.abs(a - b) ** self.p) ** (1 / self.p)) + else: + raise ValueError("Invalid distance metric. Choose 'euclidean', 'manhattan', or 'minkowski'.") def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: """ @@ -57,23 +69,18 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: >>> knn.classify(point) 'A' """ - # Distances of all points from the point to be classified distances = ( - (self._euclidean_distance(data_point[0], pred_point), data_point[1]) + (self._calculate_distance(data_point[0], pred_point), data_point[1]) for data_point in self.data ) - # Choosing k points with the shortest distances votes = (i[1] for i in nsmallest(k, distances)) - - # Most commonly occurring class is the one into which the point is classified result = Counter(votes).most_common(1)[0][0] return self.labels[result] if __name__ == "__main__": import doctest - doctest.testmod() iris = datasets.load_iris() @@ -84,5 +91,15 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) iris_point = np.array([4.4, 3.1, 1.3, 1.4]) - classifier = KNN(X_train, y_train, iris_classes) - print(classifier.classify(iris_point, k=3)) + + print("\nUsing Euclidean Distance:") + classifier1 = KNN(X_train, y_train, iris_classes, distance_metric="euclidean") + print(classifier1.classify(iris_point, k=3)) + + print("\nUsing Manhattan Distance:") + classifier2 = KNN(X_train, y_train, iris_classes, distance_metric="manhattan") + print(classifier2.classify(iris_point, k=3)) + + print("\nUsing Minkowski Distance (p=3):") + classifier3 = KNN(X_train, y_train, iris_classes, distance_metric="minkowski", p=3) + print(classifier3.classify(iris_point, k=3)) \ No newline at end of file From 907ec7fe7aebec3f9260b679f9ab58be25d8fecd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Oct 2025 14:10:27 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/k_medoids.py | 23 +++++++++++++++++------ machine_learning/k_nearest_neighbours.py | 7 +++++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/machine_learning/k_medoids.py b/machine_learning/k_medoids.py index 6497703fb0aa..9d382d85357f 100644 --- a/machine_learning/k_medoids.py +++ b/machine_learning/k_medoids.py @@ -23,6 +23,7 @@ from matplotlib import pyplot as plt from sklearn.metrics import pairwise_distances + def get_initial_medoids(data, k, seed=None): rng = np.random.default_rng(seed) n = data.shape[0] @@ -30,11 +31,13 @@ def get_initial_medoids(data, k, seed=None): medoids = data[indices, :] return medoids + def assign_clusters(data, medoids): - distances = pairwise_distances(data, medoids, metric='euclidean') + distances = pairwise_distances(data, medoids, metric="euclidean") cluster_assignment = np.argmin(distances, axis=1) return cluster_assignment + def revise_medoids(data, k, cluster_assignment): new_medoids = [] for i in range(k): @@ -47,6 +50,7 @@ def revise_medoids(data, k, cluster_assignment): new_medoids.append(members[medoid_index]) return np.array(new_medoids) + def compute_heterogeneity(data, k, medoids, cluster_assignment): heterogeneity = 0.0 for i in range(k): @@ -57,6 +61,7 @@ def compute_heterogeneity(data, k, medoids, cluster_assignment): heterogeneity += np.sum(distances**2) return heterogeneity + def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False): medoids = initial_medoids.copy() prev_assignment = None @@ -64,7 +69,10 @@ def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False): cluster_assignment = assign_clusters(data, medoids) medoids = revise_medoids(data, k, cluster_assignment) - if prev_assignment is not None and (prev_assignment == cluster_assignment).all(): + if ( + prev_assignment is not None + and (prev_assignment == cluster_assignment).all() + ): break if verbose and prev_assignment is not None: @@ -75,21 +83,24 @@ def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False): return medoids, cluster_assignment + # Optional plotting def plot_clusters(data, medoids, cluster_assignment): - ax = plt.axes(projection='3d') - ax.scatter(data[:,0], data[:,1], data[:,2], c=cluster_assignment, cmap='viridis') - ax.scatter(medoids[:,0], medoids[:,1], medoids[:,2], c='red', s=100, marker='x') + ax = plt.axes(projection="3d") + ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=cluster_assignment, cmap="viridis") + ax.scatter(medoids[:, 0], medoids[:, 1], medoids[:, 2], c="red", s=100, marker="x") ax.set_xlabel("X") ax.set_ylabel("Y") ax.set_zlabel("Z") ax.set_title("3D K-Medoids Clustering") plt.show() + # Optional test if __name__ == "__main__": from sklearn import datasets - X = datasets.load_iris()['data'] + + X = datasets.load_iris()["data"] k = 3 medoids = get_initial_medoids(X, k, seed=0) medoids, clusters = kmedoids(X, k, medoids, maxiter=50, verbose=True) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index 24ce6eefb753..3d4e41b70d7a 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -54,7 +54,9 @@ def _calculate_distance(self, a: np.ndarray[float], b: np.ndarray[float]) -> flo elif self.distance_metric == "minkowski": return float(np.sum(np.abs(a - b) ** self.p) ** (1 / self.p)) else: - raise ValueError("Invalid distance metric. Choose 'euclidean', 'manhattan', or 'minkowski'.") + raise ValueError( + "Invalid distance metric. Choose 'euclidean', 'manhattan', or 'minkowski'." + ) def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: """ @@ -81,6 +83,7 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: if __name__ == "__main__": import doctest + doctest.testmod() iris = datasets.load_iris() @@ -102,4 +105,4 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: print("\nUsing Minkowski Distance (p=3):") classifier3 = KNN(X_train, y_train, iris_classes, distance_metric="minkowski", p=3) - print(classifier3.classify(iris_point, k=3)) \ No newline at end of file + print(classifier3.classify(iris_point, k=3))