diff --git a/machine_learning/k_medoids.py b/machine_learning/k_medoids.py
new file mode 100644
--- /dev/null
+++ b/machine_learning/k_medoids.py
@@ -0,0 +1,164 @@
+"""
+README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com)
+
+Requirements:
+  - numpy
+  - matplotlib
+  - scikit-learn
+Python:
+  - 3.6+
+Inputs:
+  - X: 2D numpy array of features
+  - k: number of clusters
+Usage:
+  1. Define k and X
+  2. Create initial medoids:
+        initial_medoids = get_initial_medoids(X, k, seed=0)
+  3. Run kmedoids:
+        medoids, cluster_assignment = kmedoids(
+            X, k, initial_medoids, maxiter=100, verbose=True
+        )
+"""
+
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn.metrics import pairwise_distances
+
+
+def get_initial_medoids(data, k, seed=None):
+    """
+    Pick k distinct rows of data at random to use as the initial medoids.
+
+    >>> data = np.array([[0.0], [1.0], [2.0], [10.0]])
+    >>> get_initial_medoids(data, 2, seed=0).shape
+    (2, 1)
+    """
+    rng = np.random.default_rng(seed)
+    indices = rng.choice(data.shape[0], k, replace=False)
+    return data[indices, :]
+
+
+def assign_clusters(data, medoids):
+    """
+    Return, for every row of data, the index of its nearest medoid.
+
+    >>> data = np.array([[0.0], [1.0], [2.0], [10.0]])
+    >>> assign_clusters(data, data[[0, 3]]).tolist()
+    [0, 0, 0, 1]
+    """
+    distances = pairwise_distances(data, medoids, metric="euclidean")
+    return np.argmin(distances, axis=1)
+
+
+def revise_medoids(data, k, cluster_assignment, previous_medoids=None):
+    """
+    Recompute every medoid as the cluster member with the smallest total
+    distance to the other members of its cluster.
+
+    An empty cluster keeps its row of previous_medoids so that row i of the
+    result always describes cluster i; silently skipping it would shift every
+    later medoid onto the wrong cluster index.
+
+    >>> data = np.array([[0.0], [1.0], [2.0], [10.0]])
+    >>> revise_medoids(data, 2, np.array([0, 0, 0, 1])).tolist()
+    [[1.0], [10.0]]
+    >>> revise_medoids(data[:3], 2, np.array([1, 1, 1]), data[[3, 0]]).tolist()
+    [[10.0], [1.0]]
+    >>> revise_medoids(data[:3], 2, np.array([1, 1, 1]))
+    Traceback (most recent call last):
+        ...
+    ValueError: cluster 0 is empty and no previous medoid was given
+    """
+    new_medoids = []
+    for i in range(k):
+        members = data[cluster_assignment == i]
+        if len(members) == 0:
+            if previous_medoids is None:
+                raise ValueError(
+                    f"cluster {i} is empty and no previous medoid was given"
+                )
+            new_medoids.append(previous_medoids[i])
+            continue
+        # Total distance from each member to all other members of the cluster
+        total_distances = np.sum(pairwise_distances(members, members), axis=1)
+        new_medoids.append(members[np.argmin(total_distances)])
+    return np.array(new_medoids)
+
+
+def compute_heterogeneity(data, k, medoids, cluster_assignment):
+    """
+    Return the sum of squared distances from each point to its cluster medoid.
+
+    >>> data = np.array([[0.0], [1.0], [2.0], [10.0]])
+    >>> float(compute_heterogeneity(data, 2, data[[1, 3]], np.array([0, 0, 0, 1])))
+    2.0
+    """
+    heterogeneity = 0.0
+    for i in range(k):
+        members = data[cluster_assignment == i]
+        if len(members) == 0:
+            continue
+        distances = pairwise_distances(members, [medoids[i]])
+        heterogeneity += np.sum(distances**2)
+    return heterogeneity
+
+
+def kmedoids(data, k, initial_medoids, maxiter=100, verbose=False):
+    """
+    Alternate between assigning points to their nearest medoid and revising
+    the medoids until the assignment stops changing or maxiter revisions ran.
+
+    Returns the final medoids and the assignment computed from those medoids.
+
+    >>> data = np.array([[0.0], [1.0], [2.0], [10.0], [11.0], [12.0]])
+    >>> medoids, labels = kmedoids(data, 2, data[[0, 3]])
+    >>> medoids.tolist(), labels.tolist()
+    ([[1.0], [11.0]], [0, 0, 0, 1, 1, 1])
+    >>> kmedoids(data, 2, data[[0, 3]], maxiter=0)[1].tolist()
+    [0, 0, 0, 1, 1, 1]
+    """
+    if len(initial_medoids) != k:
+        raise ValueError("initial_medoids must contain exactly k rows")
+    medoids = initial_medoids.copy()
+    # Assign before the loop so the result is defined even when maxiter is 0
+    cluster_assignment = assign_clusters(data, medoids)
+    for itr in range(maxiter):
+        medoids = revise_medoids(data, k, cluster_assignment, medoids)
+        # Re-assign against the revised medoids so both return values agree
+        new_assignment = assign_clusters(data, medoids)
+        changed = int(np.sum(new_assignment != cluster_assignment))
+        cluster_assignment = new_assignment
+        if changed == 0:
+            break
+        if verbose:
+            print(f"Iteration {itr + 1}: {changed} points changed clusters")
+    return medoids, cluster_assignment
+
+
+# Optional plotting
+def plot_clusters(data, medoids, cluster_assignment):
+    """
+    Show a 3D scatter plot of the first three feature columns, coloured by
+    cluster, with the medoids marked as red crosses.
+    """
+    if data.shape[1] < 3:
+        raise ValueError("plot_clusters needs at least three feature columns")
+    ax = plt.axes(projection="3d")
+    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c=cluster_assignment, cmap="viridis")
+    ax.scatter(medoids[:, 0], medoids[:, 1], medoids[:, 2], c="red", s=100, marker="x")
+    ax.set_xlabel("X")
+    ax.set_ylabel("Y")
+    ax.set_zlabel("Z")
+    ax.set_title("3D K-Medoids Clustering")
+    plt.show()
+
+
+# Optional test
+if __name__ == "__main__":
+    from sklearn import datasets
+
+    X = datasets.load_iris()["data"]
+    k = 3
+    medoids = get_initial_medoids(X, k, seed=0)
+    medoids, clusters = kmedoids(X, k, medoids, maxiter=50, verbose=True)
+    plot_clusters(X, medoids, clusters)
diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py
--- a/machine_learning/k_nearest_neighbours.py
+++ b/machine_learning/k_nearest_neighbours.py
@@ -26,23 +26,71 @@ def __init__(
         train_data: np.ndarray[float],
         train_target: np.ndarray[int],
         class_labels: list[str],
+        distance_metric: str = "euclidean",
+        p: float = 2,
     ) -> None:
         """
-        Create a kNN classifier using the given training data and class labels
+        Create a kNN classifier using the given training data and class labels.
+
+        Parameters:
+        -----------
+        distance_metric : str
+            Type of distance metric to use ('euclidean', 'manhattan', 'minkowski')
+        p : float
+            Power parameter for Minkowski distance (default 2); must be >= 1
+
+        >>> KNN(np.array([[0, 0]]), np.array([0]), ["A"], p=0)
+        Traceback (most recent call last):
+            ...
+        ValueError: p must be >= 1 for the Minkowski distance
         """
-        self.data = zip(train_data, train_target)
+        if distance_metric not in ("euclidean", "manhattan", "minkowski"):
+            raise ValueError(
+                "Invalid distance metric. "
+                "Choose 'euclidean', 'manhattan', or 'minkowski'."
+            )
+        if p < 1:
+            raise ValueError("p must be >= 1 for the Minkowski distance")
+        # Materialise the pairs: a bare zip iterator would be exhausted by the
+        # first call to classify(), leaving later calls without training data
+        self.data = list(zip(train_data, train_target))
         self.labels = class_labels
+        self.distance_metric = distance_metric
+        self.p = p
 
     @staticmethod
     def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
         """
         Calculate the Euclidean distance between two points
         >>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
         5.0
         >>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
         10.0
         """
         return float(np.linalg.norm(a - b))
 
+    def _calculate_distance(self, a: np.ndarray[float], b: np.ndarray[float]) -> float:
+        """
+        Calculate the distance between two points based on the selected metric.
+        >>> train, target = np.array([[0, 0]]), np.array([0])
+        >>> knn = KNN(train, target, ["A"], distance_metric="manhattan")
+        >>> knn._calculate_distance(np.array([0, 0]), np.array([3, 4]))
+        7.0
+        >>> knn = KNN(train, target, ["A"], distance_metric="minkowski", p=2)
+        >>> knn._calculate_distance(np.array([0, 0]), np.array([3, 4]))
+        5.0
+        """
+        if self.distance_metric == "euclidean":
+            return self._euclidean_distance(a, b)
+        if self.distance_metric == "manhattan":
+            return float(np.sum(np.abs(a - b)))
+        if self.distance_metric == "minkowski":
+            return float(np.sum(np.abs(a - b) ** self.p) ** (1 / self.p))
+        # Reachable only if distance_metric was reassigned after construction
+        raise ValueError(
+            "Invalid distance metric. "
+            "Choose 'euclidean', 'manhattan', or 'minkowski'."
+        )
+
     def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
         """
@@ -59,7 +107,7 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
         """
         # Distances of all points from the point to be classified
         distances = (
-            (self._euclidean_distance(data_point[0], pred_point), data_point[1])
+            (self._calculate_distance(data_point[0], pred_point), data_point[1])
             for data_point in self.data
         )
 
@@ -84,5 +132,15 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
     iris_point = np.array([4.4, 3.1, 1.3, 1.4])
-    classifier = KNN(X_train, y_train, iris_classes)
-    print(classifier.classify(iris_point, k=3))
+
+    print("\nUsing Euclidean Distance:")
+    classifier1 = KNN(X_train, y_train, iris_classes, distance_metric="euclidean")
+    print(classifier1.classify(iris_point, k=3))
+
+    print("\nUsing Manhattan Distance:")
+    classifier2 = KNN(X_train, y_train, iris_classes, distance_metric="manhattan")
+    print(classifier2.classify(iris_point, k=3))
+
+    print("\nUsing Minkowski Distance (p=3):")
+    classifier3 = KNN(X_train, y_train, iris_classes, distance_metric="minkowski", p=3)
+    print(classifier3.classify(iris_point, k=3))