In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from pylab import rcParams

In [None]:
# Load the dataset to be clustered; the path is relative to the working directory.
data = pd.read_csv("clust.csv")
# Preview the first 15 rows.
data.head(15)

In [None]:
# Min-max rescale every column onto the interval [1, 10].
# Keeping all values >= 1 matters because new_centroids takes np.log of the data.
data = (
    (data - data.min())
    .div(data.max() - data.min())
    .mul(9)
    .add(1)
)

In [None]:
# Summary statistics of the rescaled frame (sanity check of the new value range).
data.describe()

In [None]:
# Preview the first rows of the rescaled frame.
data.head()

In [None]:
def random_centroids(data, k, random_state=None):
    """Pick ``k`` random starting centroids from the observed feature values.

    Each coordinate of a centroid is drawn independently from its own column,
    so a centroid is generally not an actual row of ``data``; every coordinate
    is, however, a value that occurs in the corresponding column.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations in rows, features in columns.
    k : int
        Number of centroids to generate.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for reproducible draws. ``None`` keeps the
        previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        One row per feature and one column per centroid (labelled ``0..k-1``).
    """
    # One generator is shared by every draw: re-seeding each ``sample`` call
    # with the same integer would select the same row position every time and
    # make all centroids identical.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # ``.iloc[0]`` extracts the sampled scalar explicitly; calling ``float`` on
    # a one-element Series is deprecated in recent pandas releases.
    centroids = [
        data.apply(lambda column: float(column.sample(random_state=rng).iloc[0]))
        for _ in range(k)
    ]
    return pd.concat(centroids, axis=1)
# Draw two starting centroids and display them (one column per centroid).
centroids = random_centroids(data, 2)
centroids


In [None]:
def get_labels(data, centroids):
    """Return, for every row of ``data``, the label of its closest centroid.

    ``centroids`` holds one centroid per column (features along the index).
    Closeness is the Euclidean distance; the result is a Series aligned with
    the rows of ``data`` whose values are column labels of ``centroids``.
    """
    def distance_to(center):
        # Subtract the centroid feature-by-feature, then reduce across features.
        offsets = data.sub(center, axis="columns")
        return np.sqrt(offsets.pow(2).sum(axis=1))

    per_centroid = centroids.apply(distance_to)
    return per_centroid.idxmin(axis=1)

In [None]:
# Assign every row to its nearest centroid, then show how many rows each cluster received.
labels = get_labels(data, centroids)
labels.value_counts()

In [None]:
def new_centroids(data, labels, k):
    """Recompute each centroid as the geometric mean of its cluster members.

    Rows of ``data`` are grouped by ``labels``; for every group the
    per-feature geometric mean ``exp(mean(log(x)))`` is taken. The result has
    features along the index and one column per label present in ``labels``.
    ``k`` is accepted for interface symmetry but is not used.
    """
    def geometric_mean(cluster):
        return np.exp(np.log(cluster).mean())

    per_cluster = data.groupby(labels).apply(geometric_mean)
    # Transpose so the layout matches random_centroids: one column per centroid.
    return per_cluster.T

In [None]:
def plot_clusters(data, labels, centroids, iteration):
    """Draw the current clustering projected onto two principal components.

    A 2-component PCA is fitted on ``data`` and the same projection is applied
    to the centroids (transposed so that each centroid is a row). The previous
    cell output is cleared first, so repeated calls replace the earlier figure.
    """
    projector = PCA(n_components=2)
    points = projector.fit_transform(data)
    centers = projector.transform(centroids.T)

    point_x, point_y = points[:, 0], points[:, 1]
    center_x, center_y = centers[:, 0], centers[:, 1]

    # wait=True defers clearing until new output is ready.
    clear_output(wait=True)
    plt.title(f'Iteration {iteration}')
    plt.scatter(x=point_x, y=point_y, c=labels)
    plt.scatter(x=center_x, y=center_y, color='red', marker='*', label='centroid')
    plt.show()


In [None]:
# Hand-written k-means: alternate between assigning rows to the nearest
# centroid and recomputing the centroids until they stop changing or the
# iteration budget is used up.
max_iterations = 100
centroid_count = 2

centroids = random_centroids(data, centroid_count)
# Start from an empty frame so the first equality check fails and the loop
# body runs at least once.
old_centroids = pd.DataFrame()
iteration = 1

# `<=` because `iteration` starts at 1: with `<` only max_iterations - 1
# passes could ever run.
while iteration <= max_iterations and not centroids.equals(old_centroids):
    old_centroids = centroids

    labels = get_labels(data, centroids)
    centroids = new_centroids(data, labels, centroid_count)
    plot_clusters(data, labels, centroids, iteration)
    iteration += 1


In [None]:
# Fit one scikit-learn KMeans model for every k from 1 to 9 (fixed seed).
kmeans_per_k = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(data)
    for n_clusters in range(1, 10)
]

# Cluster assignment of every row under each fitted model.
predicted_labels = [fitted.predict(data) for fitted in kmeans_per_k]
# The k=1 entry is skipped, so silhouette_scores[0] belongs to k=2.
silhouette_scores = [
    silhouette_score(data, assignment) for assignment in predicted_labels[1:]
]

In [None]:
# Use a wide 16 x 5 (width, height) default figure size for the plots that follow.
rcParams['figure.figsize'] = 16, 5

In [None]:
# Silhouette score for k = 2..9.
# The format string only carries marker and line style ("o-"); the colour is
# given once via `color=`. Specifying it in both places ("bo-" plus `color=`)
# makes matplotlib warn that the colour is redundantly defined.
plt.plot(range(2, 10), silhouette_scores, "o-", color="blue", linewidth=3, markersize=8, label="Silhouette curve")
plt.xlabel("$k$", fontsize=14, family='Arial')
plt.ylabel("Silhouette score", fontsize=14, family="Arial")
plt.grid(which='major', color='#cccccc', linestyle="--")
plt.title('Silhouette curve for predicting the optimal number of clusters', family='Arial', fontsize=14)
# silhouette_scores[0] corresponds to k=2, hence the +2 offset (and k-2 below).
k = np.argmax(silhouette_scores) + 2

# Mark the best-scoring k with a vertical line and a highlighted point.
plt.axvline(x=k, linestyle='--', c="green", linewidth=3, label='Optimal number of clusters ({})'.format(k))
plt.scatter(k, silhouette_scores[k - 2], c='red', s=400)
plt.legend(shadow=True)
plt.show()

In [None]:
# Elbow method: within-cluster sum of squared errors (inertia) for k = 1..10.
sse = []
for k in range(1, 11):
    # Fixed seed, matching the silhouette sweep, so the curve is reproducible
    # from run to run.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data)
    sse.append(kmeans.inertia_)

# Plot the SSE as a function of k
plt.plot(range(1, 11), sse)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')
plt.show()
