In [37]:
# Clustering in scikit-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

In [38]:
# Parameters of the synthetic dataset; later cells read these names
random_state = 42  # seed passed to make_blobs and to KMeans
n_clusters = 3     # number of blob centers, also the K given to KMeans
n_features = 2     # two columns, so the points can be drawn on a plane
n_samples = 300    # total number of generated points

In [39]:
# Draw the synthetic blobs: X holds the points, y the blob each point was
# generated from (y is not used by the clustering cells in this notebook).
X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    random_state=random_state,
)

In [None]:
# Scatter the raw, unlabelled points (first feature vs. second feature)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], s=3)
ax.set_title("Generated Data")
plt.show()

In [None]:
# Fit k-means on X with the configured number of clusters and seed
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)
kmeans  # fit() returns the estimator itself, so the cell still displays it

In [42]:
# Pull the per-point cluster assignment and the fitted cluster centers
# out of the trained KMeans model
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Draw each KMeans cluster as its own series (so it gets a legend entry),
# then overlay the centroids as large red crosses.
fig, ax = plt.subplots()
for cluster_id in range(n_clusters):
    members = X[labels == cluster_id]  # rows assigned to this cluster
    ax.scatter(members[:, 0], members[:, 1], label=f'Cluster {cluster_id + 1}')
ax.scatter(centroids[:, 0], centroids[:, 1], s=200, c='red', marker='X', label='Centroids')
ax.set_title("KMeans Clustering")
ax.legend()
plt.show()

In [44]:
# Hierarchical (tree-based) clustering
from sklearn.cluster import AgglomerativeClustering 
from scipy.cluster.hierarchy import dendrogram, linkage 

In [45]:
# Agglomerative (bottom-up) clustering into 3 groups using Ward linkage
# on Euclidean distances.
cluster_model = AgglomerativeClustering(
    n_clusters=3,
    metric='euclidean',
    linkage='ward',
)
# NOTE: this rebinds `labels`, which until now held the KMeans assignments.
labels = cluster_model.fit(X).labels_

In [None]:
# Colour every point by the cluster id currently stored in `labels`
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=50)
ax.set_title("Clusters from Agglomerative Clustering")
plt.show()

In [47]:
# Build the full merge hierarchy with SciPy. Ward linkage merges, at each
# step, the pair of clusters that keeps within-cluster variance smallest.
# `linked` is the linkage matrix consumed by dendrogram() and fcluster() below.
linked = linkage(
    X,
    method='ward',
    metric='euclidean',
)

In [None]:
# Draw the merge tree; truncate_mode='level' with p=5 shows only the top
# levels of the hierarchy instead of every individual sample.
fig, ax = plt.subplots(figsize=(12, 8))
dendrogram(linked, p=5, truncate_mode='level', ax=ax)
ax.set(title="Dendrogram", xlabel="Sample Index", ylabel="Distance")
plt.show()

In [None]:
# Cut the hierarchy at a fixed height to obtain flat cluster labels
from scipy.cluster.hierarchy import fcluster

threshold = 80  # cut height on the dendrogram's distance axis
cluster_labels = fcluster(linked, threshold, criterion='distance')
print(cluster_labels[0:18])  # peek at the first 18 assignments

In [None]:
# Repeat the cut with a larger height (140); both `threshold` and
# `cluster_labels` are rebound to the new values.
threshold = 140
cluster_labels = fcluster(linked, threshold, criterion='distance')
print(cluster_labels[0:18])  # peek at the first 18 assignments

In [None]:
# Colour each point by the flat cluster it received at the current cut height.
# The title is built from `threshold` and from the labels actually plotted,
# so it cannot go stale when the threshold is edited and the cell re-run.
n_flat_clusters = np.unique(cluster_labels).size
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels, cmap='viridis', s=50)
plt.title(f"Threshold distance = {threshold}: {n_flat_clusters} cluster(s)")
plt.show()

In [52]:
# How many clusters do you need?
# Fit KMeans once per candidate K and record its inertia (the quantity
# KMeans minimises); the resulting curve is inspected for an "elbow" below.
# The seed comes from the shared `random_state` setting rather than a second
# hard-coded literal, so changing the seed in the configuration cell affects
# this sweep too. The comprehension keeps the per-K models out of the
# notebook namespace.
size = range(2, 10)  # candidate K values: 2..9
inertias = [
    KMeans(n_clusters=k, random_state=random_state).fit(X).inertia_
    for k in size
]

In [None]:
print(inertias)

In [None]:
# Choose the number of clusters from the elbow (scree-style) plot: look for
# the K after which inertia stops dropping sharply.
# For this dataset the elbow forms at K = 3.

fig, ax = plt.subplots(figsize=(6, 4))
pd.Series(inertias, index=size).plot(ax=ax, marker="o")  # mark each evaluated K
ax.set_title("Elbow Plot: Inertia vs Number of Clusters")
ax.set_xlabel("K")
ax.set_ylabel("Inertia")
plt.show()  # like the other plot cells; also avoids echoing the Text(...) repr

In [None]:
# What is the Silhouette Score?
# It is the mean silhouette coefficient over all samples: values lie in
# [-1, 1] and higher means tighter, better-separated clusters.
from sklearn.metrics import silhouette_score

# Scores whatever labelling `cluster_labels` currently holds; when the
# notebook is run top to bottom that is the fcluster result at threshold 140.
sil_score = silhouette_score(X, cluster_labels)
print("Average Silhouette Score: {:.3f}".format(sil_score))

In [None]:
cluster_labels = kmeans.fit_predict(X)
cluster_labels[:27]

In [58]:
# Find the optimal number of clusters with the silhouette score.
# For each candidate K a fresh KMeans model is fitted and its labelling is
# scored; the K with the highest mean silhouette is the best-separated one.
#
# The sweep deliberately uses its own loop variable and throw-away models:
# iterating with `n_clusters` and assigning to `kmeans` / `cluster_labels`
# would overwrite the configured cluster count and the fitted 3-cluster
# results that earlier cells depend on when they are re-run.
# The seed is taken from the shared `random_state` setting.
sil_scores = [
    silhouette_score(
        X,
        KMeans(n_clusters=k, random_state=random_state).fit_predict(X),
    )
    for k in range(2, 10)  # same K range the plotting cell puts on its x-axis
]

In [None]:
# Plot the silhouette score obtained for each candidate number of clusters
# (K = 2..9); the peak marks the preferred K.
fig, ax = plt.subplots()
ax.plot(range(2, 10), sil_scores, marker='o')
ax.set_title('Silhouette Score vs Number of Clusters')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette Score')
plt.show()