Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions machine_learning/k_medoids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com)

Requirements:
- numpy
- matplotlib
- scikit-learn
Python:
- 3.6+
Inputs:
- X: 2D numpy array of features
- k: number of clusters
Usage:
1. Define k and X
2. Create initial medoids:
initial_medoids = get_initial_medoids(X, k, seed=0)
3. Run kmedoids:
medoids, cluster_assignment = kmedoids(
X, k, initial_medoids, maxiter=100, verbose=True
)
"""

import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances


def get_initial_medoids(
    data: np.ndarray, k: int, seed: "int | None" = None
) -> np.ndarray:
    """
    Pick ``k`` distinct rows of ``data`` at random to use as starting medoids.

    Parameters:
        data: 2D array-like with one sample per row.
        k: number of medoids (clusters) to draw; must satisfy 1 <= k <= rows.
        seed: seed forwarded to ``np.random.default_rng`` so that the draw is
            reproducible; ``None`` asks NumPy for fresh entropy.

    Returns:
        Array of shape (k, n_features) holding the selected rows.

    Raises:
        ValueError: if ``data`` is not 2D or ``k`` is outside 1..rows.

    >>> points = np.arange(10.0).reshape(5, 2)
    >>> get_initial_medoids(points, 2, seed=0).shape
    (2, 2)
    >>> get_initial_medoids(points, 0)
    Traceback (most recent call last):
        ...
    ValueError: k must be between 1 and 5, got 0
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError("data must be a 2D array of shape (n_samples, n_features)")

    n = data.shape[0]
    # k == 0 would otherwise produce an empty medoid set that only fails much
    # later, inside the assignment step, with an unrelated error message.
    if not 1 <= k <= n:
        raise ValueError(f"k must be between 1 and {n}, got {k}")

    rng = np.random.default_rng(seed)
    # Sampling without replacement guarantees k different rows.
    indices = rng.choice(n, k, replace=False)
    return data[indices, :]


def assign_clusters(data: np.ndarray, medoids: np.ndarray) -> np.ndarray:
    """
    Label every sample with the index of its nearest medoid (Euclidean).

    Parameters:
        data: 2D array-like of shape (n_samples, n_features).
        medoids: 2D array-like of shape (n_medoids, n_features).

    Returns:
        Integer array of length n_samples; entry ``j`` is the row index in
        ``medoids`` closest to sample ``j``. Ties go to the lowest index.

    Raises:
        ValueError: if ``medoids`` is empty or the shapes are incompatible.

    >>> points = np.array([[0.0, 0.0], [1.0, 0.0], [9.0, 0.0], [10.0, 0.0]])
    >>> centers = np.array([[0.0, 0.0], [10.0, 0.0]])
    >>> assign_clusters(points, centers).tolist()
    [0, 0, 1, 1]
    """
    points = np.asarray(data, dtype=float)
    centers = np.asarray(medoids, dtype=float)
    if centers.ndim != 2 or centers.shape[0] == 0:
        raise ValueError("medoids must be a non-empty 2D array")
    if points.ndim != 2 or points.shape[1] != centers.shape[1]:
        raise ValueError(
            "data must be a 2D array with the same number of columns as medoids"
        )

    # One distance column per medoid: keeps the temporary at (n_samples,
    # n_features) instead of materialising an (n, k, d) difference tensor.
    distances = np.stack(
        [np.linalg.norm(points - center, axis=1) for center in centers], axis=1
    )
    return np.argmin(distances, axis=1)


def revise_medoids(
    data: np.ndarray, k: int, cluster_assignment: np.ndarray
) -> np.ndarray:
    """
    Recompute the medoid of every cluster.

    The medoid of a cluster is the member whose summed Euclidean distance to
    all other members of that cluster is smallest.

    Parameters:
        data: 2D array-like of shape (n_samples, n_features).
        k: number of cluster labels to scan (labels 0 .. k-1).
        cluster_assignment: length-n_samples sequence of integer labels.

    Returns:
        Array with one row per NON-EMPTY cluster, in increasing label order.
        Clusters without members are skipped, so the result can have fewer
        than ``k`` rows and row ``i`` is then not necessarily the medoid of
        label ``i``; re-run the assignment step against the returned medoids
        to obtain matching labels. When no cluster has members the result has
        shape (0, n_features).

    >>> points = np.array([[0.0, 0.0], [1.0, 0.0], [2.0, 0.0], [10.0, 0.0]])
    >>> revise_medoids(points, 2, np.array([0, 0, 0, 1])).tolist()
    [[1.0, 0.0], [10.0, 0.0]]
    """
    points = np.asarray(data)
    # A plain list compared with ``== i`` yields a single bool instead of a
    # mask, so normalise the labels to an array first.
    labels = np.asarray(cluster_assignment)

    new_medoids = []
    for cluster in range(k):
        members = points[labels == cluster]
        if len(members) == 0:
            continue
        # Total distance from each member to every member of the same cluster.
        total_distances = np.array(
            [np.linalg.norm(members - member, axis=1).sum() for member in members]
        )
        new_medoids.append(members[np.argmin(total_distances)])

    if not new_medoids:
        # Keep the result 2D so callers can still index columns.
        return np.empty((0,) + points.shape[1:], dtype=points.dtype)
    return np.array(new_medoids)


def compute_heterogeneity(
    data: np.ndarray,
    k: int,
    medoids: np.ndarray,
    cluster_assignment: np.ndarray,
) -> float:
    """
    Sum of squared Euclidean distances from each sample to its medoid.

    Parameters:
        data: 2D array-like of shape (n_samples, n_features).
        k: number of cluster labels to scan (labels 0 .. k-1); samples carrying
            any other label do not contribute.
        medoids: 2D array-like whose row ``i`` is the medoid of label ``i``.
        cluster_assignment: length-n_samples sequence of integer labels.

    Returns:
        The total within-cluster squared distance as a float (0.0 when no
        sample carries a label in 0 .. k-1).

    >>> points = np.array([[0.0, 0.0], [1.0, 0.0], [10.0, 0.0], [12.0, 0.0]])
    >>> centers = np.array([[0.0, 0.0], [10.0, 0.0]])
    >>> compute_heterogeneity(points, 2, centers, np.array([0, 0, 1, 1]))
    5.0
    """
    points = np.asarray(data, dtype=float)
    centers = np.asarray(medoids, dtype=float)
    # Lists compared with ``== i`` give a scalar, not a mask; use an array.
    labels = np.asarray(cluster_assignment)

    heterogeneity = 0.0
    for cluster in range(k):
        members = points[labels == cluster]
        if len(members) == 0:
            continue
        # Squared distances are summed directly; taking a square root only to
        # square it again costs time and precision.
        heterogeneity += float(np.sum((members - centers[cluster]) ** 2))
    return heterogeneity


def kmedoids(
    data: np.ndarray,
    k: int,
    initial_medoids: np.ndarray,
    maxiter: int = 100,
    verbose: bool = False,
) -> "tuple[np.ndarray, np.ndarray]":
    """
    Cluster ``data`` by alternating assignment and medoid-update steps.

    Each iteration labels every sample with its nearest medoid and then
    replaces each medoid by the best representative of its cluster. The loop
    stops as soon as the labels no longer change, or after ``maxiter``
    iterations.

    Parameters:
        data: 2D array of shape (n_samples, n_features).
        k: number of clusters.
        initial_medoids: 2D array-like of starting medoids (it is copied and
            never modified).
        maxiter: maximum number of assignment/update iterations.
        verbose: when true, print how many samples changed cluster in each
            iteration after the first.

    Returns:
        ``(medoids, cluster_assignment)`` where ``cluster_assignment`` holds,
        for every sample, the row index of its medoid in ``medoids``.
    """
    medoids = np.array(initial_medoids)
    prev_assignment = None

    for itr in range(maxiter):
        cluster_assignment = assign_clusters(data, medoids)

        # Unchanged labels mean the medoids derived from them are unchanged
        # as well, so stop before recomputing them.
        if (
            prev_assignment is not None
            and (prev_assignment == cluster_assignment).all()
        ):
            break

        medoids = revise_medoids(data, k, cluster_assignment)

        if verbose and prev_assignment is not None:
            changed = np.sum(prev_assignment != cluster_assignment)
            print(f"Iteration {itr}: {changed} points changed clusters")

        prev_assignment = cluster_assignment.copy()
    else:
        # Reached when the iteration budget runs out (including maxiter == 0,
        # where the loop body never executes). The labels on hand, if any, were
        # computed before the last medoid update, so refresh them to make the
        # returned pair consistent.
        cluster_assignment = assign_clusters(data, medoids)

    return medoids, cluster_assignment


# Optional plotting
def plot_clusters(
    data: np.ndarray, medoids: np.ndarray, cluster_assignment: np.ndarray
) -> None:
    """
    Show a 3D scatter plot of the clustering.

    Only the first three columns of ``data`` and ``medoids`` are drawn.
    Samples are coloured by ``cluster_assignment``; medoids are drawn as large
    red crosses. The call ends with ``plt.show()``.
    """
    axes = plt.axes(projection="3d")

    # Samples, coloured by their cluster label.
    sample_x, sample_y, sample_z = data[:, 0], data[:, 1], data[:, 2]
    axes.scatter(sample_x, sample_y, sample_z, c=cluster_assignment, cmap="viridis")

    # Medoids, highlighted on top of the samples.
    medoid_x, medoid_y, medoid_z = medoids[:, 0], medoids[:, 1], medoids[:, 2]
    axes.scatter(medoid_x, medoid_y, medoid_z, c="red", s=100, marker="x")

    axes.set_xlabel("X")
    axes.set_ylabel("Y")
    axes.set_zlabel("Z")
    axes.set_title("3D K-Medoids Clustering")
    plt.show()


# Optional test
if __name__ == "__main__":
    from sklearn import datasets

    # Demo: cluster the iris measurements into three groups and plot them.
    n_clusters = 3
    iris_features = datasets.load_iris()["data"]
    starting_medoids = get_initial_medoids(iris_features, n_clusters, seed=0)
    final_medoids, labels = kmedoids(
        iris_features, n_clusters, starting_medoids, maxiter=50, verbose=True
    )
    plot_clusters(iris_features, final_medoids, labels)
56 changes: 38 additions & 18 deletions machine_learning/k_nearest_neighbours.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
"""

from collections import Counter
from heapq import nsmallest

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

Check failure on line 19 in machine_learning/k_nearest_neighbours.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (I001)

machine_learning/k_nearest_neighbours.py:15:1: I001 Import block is un-sorted or un-formatted


class KNN:
Expand All @@ -26,23 +25,38 @@
train_data: np.ndarray[float],
train_target: np.ndarray[int],
class_labels: list[str],
distance_metric: str = "euclidean",
p: int = 2,
) -> None:
"""
Create a kNN classifier using the given training data and class labels
Create a kNN classifier using the given training data and class labels.

Parameters:
-----------
distance_metric : str
Type of distance metric to use ('euclidean', 'manhattan', 'minkowski')
p : int
Power parameter for Minkowski distance (default 2)
"""
self.data = zip(train_data, train_target)
self.data = list(zip(train_data, train_target))
self.labels = class_labels
self.distance_metric = distance_metric
self.p = p

@staticmethod
def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float:
def _calculate_distance(self, a: np.ndarray[float], b: np.ndarray[float]) -> float:
"""
Calculate the Euclidean distance between two points
>>> KNN._euclidean_distance(np.array([0, 0]), np.array([3, 4]))
5.0
>>> KNN._euclidean_distance(np.array([1, 2, 3]), np.array([1, 8, 11]))
10.0
Calculate distance between two points based on the selected metric.
"""
return float(np.linalg.norm(a - b))
if self.distance_metric == "euclidean":
return float(np.linalg.norm(a - b))
elif self.distance_metric == "manhattan":
return float(np.sum(np.abs(a - b)))
elif self.distance_metric == "minkowski":
return float(np.sum(np.abs(a - b) ** self.p) ** (1 / self.p))
else:
raise ValueError(
"Invalid distance metric. Choose 'euclidean', 'manhattan', or 'minkowski'."

Check failure on line 58 in machine_learning/k_nearest_neighbours.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E501)

machine_learning/k_nearest_neighbours.py:58:89: E501 Line too long (91 > 88)
)

def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str:
"""
Expand All @@ -57,16 +71,12 @@
>>> knn.classify(point)
'A'
"""
# Distances of all points from the point to be classified
distances = (
(self._euclidean_distance(data_point[0], pred_point), data_point[1])
(self._calculate_distance(data_point[0], pred_point), data_point[1])
for data_point in self.data
)

# Choosing k points with the shortest distances
votes = (i[1] for i in nsmallest(k, distances))

# Most commonly occurring class is the one into which the point is classified
result = Counter(votes).most_common(1)[0][0]
return self.labels[result]

Expand All @@ -84,5 +94,15 @@

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
iris_point = np.array([4.4, 3.1, 1.3, 1.4])
classifier = KNN(X_train, y_train, iris_classes)
print(classifier.classify(iris_point, k=3))

print("\nUsing Euclidean Distance:")
classifier1 = KNN(X_train, y_train, iris_classes, distance_metric="euclidean")
print(classifier1.classify(iris_point, k=3))

print("\nUsing Manhattan Distance:")
classifier2 = KNN(X_train, y_train, iris_classes, distance_metric="manhattan")
print(classifier2.classify(iris_point, k=3))

print("\nUsing Minkowski Distance (p=3):")
classifier3 = KNN(X_train, y_train, iris_classes, distance_metric="minkowski", p=3)
print(classifier3.classify(iris_point, k=3))
Loading