-
Notifications
You must be signed in to change notification settings - Fork 1
/
find_elbow.py
60 lines (49 loc) · 1.74 KB
/
find_elbow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from sklearn.metrics import silhouette_score
from kmeaningful.fit_assign import fit_assign
import numpy as np
def find_elbow(X: np.ndarray) -> int:
    """
    Pick the number of clusters K that maximizes the mean silhouette score.

    The data is clustered with ``fit_assign`` (KMeans, per the project's
    description) for every K from 2 up to ``min(10, n_samples - 1)``, and
    each clustering is scored with ``sklearn.metrics.silhouette_score``.
    Note that, despite the function name, the selection criterion is the
    silhouette score and not an inertia "elbow".

    Parameters
    ----------
    X : numpy ndarray
        Pre-scaled data to train the clustering model with. Rows are
        treated as samples (``X.shape[0]`` is the sample count).

    Returns
    -------
    optimal_K : int
        The value of K with the greatest mean silhouette score. On ties the
        smallest such K is returned.

    Raises
    ------
    TypeError
        If ``X`` is not a numpy ndarray.
    ValueError
        If ``X`` has fewer than three rows.

    Examples
    --------
    >>> from sklearn.datasets import make_blobs
    >>> X, _ = make_blobs(n_samples=10, centers=3, n_features=2)
    >>> # `preprocess` is assumed to be the project's scaling helper,
    >>> # imported by the caller.
    >>> processed_data = preprocess(X)
    >>> optimal_K = find_elbow(processed_data)
    """
    # Reject non-array input early. TypeError is a subclass of Exception, so
    # callers that catch Exception keep working.
    if not isinstance(X, np.ndarray):
        raise TypeError(
            "Please provide a numpy ndarray as input. "
            f"Input type detected: {type(X)}"
        )

    # The silhouette score is only defined for 1 < n_labels < n_samples, so
    # at least three samples are required. The ndim guard keeps 0-d arrays
    # (which have no rows at all) on the same error path.
    if X.ndim < 1 or X.shape[0] < 3:
        raise ValueError(
            "Please provide a numpy ndarray with at least three rows"
        )

    # Largest K worth trying: capped at 10 and strictly below n_samples.
    max_clusters = min(10, X.shape[0] - 1)

    # Mean silhouette score for each candidate K.
    scores = {}
    for K in range(2, max_clusters + 1):
        # Only the second element of fit_assign's result (the per-sample
        # labels) is needed here.
        _, labels = fit_assign(X, K)
        scores[K] = silhouette_score(X, labels)

    # K with the greatest score; max() keeps the first (smallest) K on ties.
    return max(scores, key=scores.get)