-
-
Notifications
You must be signed in to change notification settings - Fork 48.9k
Implement K-Medoids clustering algorithm #13488 #13644
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
""" | ||
README, Author - Rohit Kumar Bansal (mailto:rohitbansal.dev@gmail.com) | ||
|
||
Requirements: | ||
- numpy | ||
- matplotlib | ||
- scikit-learn | ||
Python: | ||
- 3.6+ | ||
Inputs: | ||
- X: 2D numpy array of features | ||
- k: number of clusters | ||
Usage: | ||
1. Define k and X | ||
2. Create initial medoids: | ||
initial_medoids = get_initial_medoids(X, k, seed=0) | ||
3. Run kmedoids: | ||
medoids, cluster_assignment = kmedoids( | ||
X, k, initial_medoids, maxiter=100, verbose=True | ||
) | ||
""" | ||
|
||
import numpy as np | ||
from matplotlib import pyplot as plt | ||
from sklearn.metrics import pairwise_distances | ||
|
||
|
||
def get_initial_medoids(data, k, seed=None):
    """Draw k rows of ``data`` at random to serve as the starting medoids.

    Row indices are sampled without replacement, so no row index is picked
    twice.

    Args:
        data: 2D array of observations, one row per point.
        k: Number of medoids to draw.
        seed: Value handed to ``np.random.default_rng``; reusing the same seed
            reproduces the same selection.

    Returns:
        Array holding the k selected rows of ``data``.

    >>> import numpy as np
    >>> get_initial_medoids(np.arange(10).reshape(5, 2), 2, seed=0).shape
    (2, 2)
    """
    generator = np.random.default_rng(seed)
    chosen_rows = generator.choice(data.shape[0], k, replace=False)
    return data[chosen_rows, :]
|
||
|
||
def assign_clusters(data, medoids):
    """Label every row of ``data`` with the index of its closest medoid.

    Closeness is the Euclidean distance computed by ``pairwise_distances``.

    Args:
        data: 2D array of observations, one row per point.
        medoids: 2D array of medoids, one row per medoid.

    Returns:
        1D array with one entry per row of ``data``: the row index into
        ``medoids`` of the nearest medoid.
    """
    distance_to_medoids = pairwise_distances(data, medoids, metric="euclidean")
    return distance_to_medoids.argmin(axis=1)
|
||
|
||
def revise_medoids(data, k, cluster_assignment):
    """Choose a new medoid for each non-empty cluster.

    The medoid of a cluster is the member whose summed distance to all members
    of that cluster is smallest.

    Args:
        data: 2D array of observations, one row per point.
        k: Number of cluster labels to examine (labels ``0 .. k-1``).
        cluster_assignment: 1D array giving the cluster label of each row.

    Returns:
        Array of medoids in increasing label order. Clusters without members
        are skipped, so the result can have fewer than k rows.
    """

    def _medoid_of(points):
        # Row sums of the pairwise matrix give each point's total distance to
        # the rest of its cluster.
        cost_per_point = pairwise_distances(points, points).sum(axis=1)
        return points[cost_per_point.argmin()]

    clusters = (data[cluster_assignment == label] for label in range(k))
    return np.array([_medoid_of(points) for points in clusters if len(points) > 0])
|
||
|
||
def compute_heterogeneity(data, k, medoids, cluster_assignment):
    """Return the total squared distance between points and their medoids.

    Args:
        data: 2D array of observations, one row per point.
        k: Number of cluster labels to examine (labels ``0 .. k-1``).
        medoids: Array of medoids; row ``i`` is used for cluster label ``i``.
        cluster_assignment: 1D array giving the cluster label of each row.

    Returns:
        Sum, over every non-empty cluster, of the squared distances from its
        members to its medoid. Empty clusters contribute nothing.
    """
    total = 0.0
    for label in range(k):
        points = data[cluster_assignment == label]
        if points.shape[0]:
            to_medoid = pairwise_distances(points, [medoids[label]])
            total = total + np.sum(np.square(to_medoid))
    return total
|
||
|
||
def kmedoids(
    data: np.ndarray,
    k: int,
    initial_medoids: np.ndarray,
    maxiter: int = 100,
    verbose: bool = False,
) -> tuple:
    """Cluster ``data`` by alternating label assignment and medoid updates.

    Each iteration labels every point with its nearest medoid and then
    recomputes the medoids from those labels. Iteration stops when the labels
    no longer change or after ``maxiter`` iterations.

    Args:
        data: 2D array of observations, one row per point.
        k: Number of clusters.
        initial_medoids: Starting medoids; copied, never modified.
        maxiter: Maximum number of assignment/update iterations.
        verbose: When true, print how many points changed cluster after each
            iteration from the second one on.

    Returns:
        ``(medoids, cluster_assignment)`` where ``cluster_assignment`` holds,
        for every row of ``data``, the index of its nearest returned medoid.
    """
    medoids = initial_medoids.copy()
    prev_assignment = None
    for itr in range(maxiter):
        cluster_assignment = assign_clusters(data, medoids)

        # Unchanged labels would reproduce the same medoids, so stop before
        # recomputing them.
        if prev_assignment is not None and np.array_equal(
            prev_assignment, cluster_assignment
        ):
            break

        if verbose and prev_assignment is not None:
            changed = np.sum(prev_assignment != cluster_assignment)
            print(f"Iteration {itr}: {changed} points changed clusters")

        medoids = revise_medoids(data, k, cluster_assignment)
        prev_assignment = cluster_assignment
    else:
        # Reached without convergence (or with maxiter <= 0): label the points
        # against the medoids actually being returned, so the two results
        # agree and cluster_assignment is always defined.
        cluster_assignment = assign_clusters(data, medoids)

    return medoids, cluster_assignment
|
||
|
||
# Optional plotting | ||
def plot_clusters(data, medoids, cluster_assignment):
    """Show a 3D scatter plot of the clustered points and their medoids.

    Only the first three columns of ``data`` and ``medoids`` are plotted.

    Args:
        data: 2D array of observations with at least three columns.
        medoids: 2D array of medoids with at least three columns.
        cluster_assignment: Per-point cluster labels, used to colour the points.
    """
    axes3d = plt.axes(projection="3d")

    point_x, point_y, point_z = data[:, 0], data[:, 1], data[:, 2]
    axes3d.scatter(point_x, point_y, point_z, c=cluster_assignment, cmap="viridis")

    # Medoids are drawn on top as large red crosses.
    medoid_x, medoid_y, medoid_z = medoids[:, 0], medoids[:, 1], medoids[:, 2]
    axes3d.scatter(medoid_x, medoid_y, medoid_z, c="red", s=100, marker="x")

    axis_labels = (
        (axes3d.set_xlabel, "X"),
        (axes3d.set_ylabel, "Y"),
        (axes3d.set_zlabel, "Z"),
    )
    for apply_label, text in axis_labels:
        apply_label(text)
    axes3d.set_title("3D K-Medoids Clustering")
    plt.show()
|
||
|
||
# Optional test | ||
if __name__ == "__main__":
    from sklearn import datasets

    # Demo run: cluster the iris measurements into three groups and plot them.
    X = datasets.load_iris()["data"]
    k = 3
    medoids, clusters = kmedoids(
        X, k, get_initial_medoids(X, k, seed=0), maxiter=50, verbose=True
    )
    plot_clusters(X, medoids, clusters)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As there is no test file in this pull request nor any test function or class in the file
machine_learning/k_medoids.py
, please provide doctest for the function get_initial_medoids
Please provide return type hint for the function:
get_initial_medoids
. If the function does not return a value, please provide the type hint as: def function() -> None:
Please provide type hint for the parameter:
data
Please provide descriptive name for the parameter:
k
Please provide type hint for the parameter:
k
Please provide type hint for the parameter:
seed