In [None]:
import matplotlib.pyplot as plt

# Fifteen sample points, written five values per row; the n-th entry of each
# list belongs to the same point.
x_coordinates = [
    80, 93, 86, 98, 86,
    9, 15, 3, 10, 20,
    44, 56, 49, 62, 44,
]
y_coordinates = [
    87, 96, 95, 92, 92,
    57, 49, 47, 59, 55,
    25, 2, 10, 24, 10,
]

# Quick visual check of the raw points before any clustering is applied.
plt.scatter(x=x_coordinates, y=y_coordinates)


1. Hierarchical clustering

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Fifteen sample points, written five values per row; the n-th entry of each
# list belongs to the same point.
x_coordinates = [
    80.1, 93.1, 86.6, 98.5, 86.4,
    9.5, 15.2, 3.4, 10.4, 20.3,
    44.2, 56.8, 49.2, 62.5, 44.0,
]
y_coordinates = [
    87.2, 96.1, 95.6, 92.4, 92.4,
    57.7, 49.4, 47.3, 59.1, 55.5,
    25.6, 2.1, 10.9, 24.1, 10.3,
]

# One row per point, one column per axis.
df = pd.DataFrame(
    list(zip(x_coordinates, y_coordinates)),
    columns=['x_coordinate', 'y_coordinate'],
)


In [None]:
# 1.1. Compute distance between intermediate clusters
# Cluster on the two coordinate columns only. This cell also writes
# `cluster_labels` back into `df`, so passing the whole frame to `linkage`
# would feed the previous labels in as a third feature whenever the cell is
# re-run. Selecting the columns explicitly keeps the cell idempotent.
coordinates = df[['x_coordinate', 'y_coordinate']]
Z = linkage(coordinates, 'ward')

# Cut the hierarchy so that at most 3 flat clusters remain.
df['cluster_labels'] = fcluster(Z, 3, criterion='maxclust')

# 1.2. Plot the points
sns.scatterplot(x='x_coordinate', y='y_coordinate',
                hue='cluster_labels', data=df)

2. K-Means clustering

In [None]:
from scipy.cluster.vq import kmeans, vq
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import random
# Seed NumPy's global generator rather than the standard-library `random`
# module. Per the SciPy documentation, `scipy.cluster.vq.kmeans` takes its
# randomness from NumPy's global RandomState when no explicit seed is given,
# so seeding `random` has no effect on the clustering. In addition,
# `random.seed` rejects a tuple with TypeError on Python 3.11+.
# NOTE(review): recent SciPy releases also accept a `seed=`/`rng=` keyword on
# `kmeans` itself; prefer that if the installed version supports it.
import numpy as np

np.random.seed([1000, 2000])

# Fifteen sample points; the n-th entry of each list belongs to the same point.
x_coordinates = [80.1, 93.1, 86.6, 98.5, 86.4, 9.5, 15.2, 3.4,
                 10.4, 20.3, 44.2, 56.8, 49.2, 62.5, 44.0]
y_coordinates = [87.2, 96.1, 95.6, 92.4, 92.4, 57.7, 49.4,
                 47.3, 59.1, 55.5, 25.6, 2.1, 10.9, 24.1, 10.3]

# One row per point, one column per axis.
df = pd.DataFrame({'x_coordinate': x_coordinates,
                   'y_coordinate': y_coordinates})

In [None]:
# Cluster on the two coordinate columns only. This cell also writes
# `cluster_labels` back into `df`, so passing the whole frame to `kmeans`/`vq`
# would feed the previous labels in as a third feature whenever the cell is
# re-run. Selecting the columns explicitly keeps the cell idempotent.
coordinates = df[['x_coordinate', 'y_coordinate']].to_numpy()

# `kmeans` returns (centroids, distortion) and `vq` returns (labels,
# distances). The unused halves get descriptive throwaway names instead of
# `_`, which IPython uses for the last cell output.
centroids, _distortion = kmeans(coordinates, 3)
cluster_labels, _distances = vq(coordinates, centroids)
df['cluster_labels'] = cluster_labels

sns.scatterplot(x='x_coordinate', y='y_coordinate',
                hue='cluster_labels', data=df)


K-Means 2.0.

In [None]:
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Synthetic data set: 200 samples generated around 3 centers. The fixed
# random_state makes the generated samples identical on every run.
features, true_labels = make_blobs(n_samples=200, centers=3,
                                   cluster_std=2.75, random_state=42)


In [None]:
# Fit the scaler on the raw features first, then apply it to those same
# features; `scaled_features` is what every K-Means cell below is fitted on.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)


In [None]:
# K-Means estimator configured for 3 clusters with random centroid
# initialisation and a fixed random_state for repeatable results.
# NOTE: this rebinds `kmeans`, the name an earlier cell imported as a function
# from scipy.cluster.vq; that earlier cell cannot be re-run after this one
# without re-importing.
kmeans = KMeans(n_clusters=3, init='random', n_init=10,
                max_iter=300, random_state=42)

# Fit on the standardized features; as the cell's last expression, the fitted
# estimator is also displayed.
kmeans.fit(scaled_features)


In [None]:
# Inertia of the currently bound `kmeans` model; the elbow-method cell further
# down records this same attribute under the name "SSE".
kmeans.inertia_


In [None]:
# Centroids of the fitted model. The model was fitted on `scaled_features`, so
# these coordinates are in the standardized space, not the original units.
kmeans.cluster_centers_


In [None]:
# Iteration count reported by the fitted estimator (the estimator was
# configured with max_iter=300).
kmeans.n_iter_


In [None]:
# First five predicted cluster labels on one line, first five generated labels
# on the next.
# NOTE(review): cluster ids are presumably arbitrary, so the two rows need not
# use the same numbers even where the grouping agrees; confirm before
# comparing them directly.
print(kmeans.labels_[:5], true_labels[:5], sep='\n')


Choosing the appropriate number of clusters


Elbow method

In [None]:
# Settings shared by every K-Means fit in this cell; the silhouette cell
# further down reuses them as well.
kmeans_kwargs = {
    'init': 'random',
    'n_init': 10,
    'max_iter': 300,
    'random_state': 42,
}

# Fit one model per candidate cluster count (1..10) and record its inertia,
# which this notebook plots as SSE.
# NOTE: the loop rebinds `kmeans`, so after this cell it refers to the model
# fitted with the last cluster count, not the 3-cluster model fitted earlier.
cluster_counts = range(1, 11)
sse = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)

plt.style.use('fivethirtyeight')
plt.plot(cluster_counts, sse, linewidth=3)
plt.xticks(cluster_counts)
plt.xlabel('Number of Clusters')
plt.ylabel('SSE')

# Locate the elbow of the decreasing, convex SSE curve.
k1 = KneeLocator(cluster_counts, sse, curve='convex', direction='decreasing')

# A bare `k1.elbow` in the middle of a cell displays nothing, so report the
# value explicitly. Only draw the marker when an elbow was actually found,
# because `plt.vlines` cannot be given x=None.
if k1.elbow is None:
    print('No elbow detected in the SSE curve')
else:
    print(f'Elbow detected at k = {k1.elbow}')
    plt.vlines(x=k1.elbow, ymin=0, ymax=max(sse), colors='red',
               linewidth=2.0, linestyles='dashed')



Silhouette coefficient

In [None]:
# Candidate cluster counts. The range starts at 2, not 1.
# NOTE(review): presumably because the silhouette score needs at least two
# clusters; confirm against the scikit-learn documentation.
cluster_counts = range(2, 11)

# One silhouette score per candidate count, each computed on a freshly fitted
# model. The loop rebinds `kmeans` on every pass.
silhouette_coefficients = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    score = silhouette_score(scaled_features, kmeans.labels_)
    silhouette_coefficients.append(score)

plt.style.use("fivethirtyeight")
plt.plot(cluster_counts, silhouette_coefficients, linewidth=3)
plt.xticks(cluster_counts)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")


Hierarchical clustering 2.0

In [None]:
import numpy as np

# Ten 2-D sample points, one [x, y] pair per row of the array.
X = np.array([
    [5, 3], [10, 15], [15, 12], [24, 10], [30, 30],
    [85, 70], [71, 80], [60, 78], [70, 55], [80, 91],
])


In [None]:
import matplotlib.pyplot as plt

# Point numbers 1..10, paired with the rows of X in order.
labels = range(1, 11)

plt.figure(figsize=(10, 7))
plt.subplots_adjust(bottom=0.1)

xs, ys = X[:, 0], X[:, 1]
plt.scatter(xs, ys, label='True Position')

# Write each point's number slightly up and to the left of its marker.
annotation_style = dict(xytext=(-3, 3), textcoords='offset points',
                        ha='right', va='bottom')
for label, x, y in zip(labels, xs, ys):
    plt.annotate(label, xy=(x, y), **annotation_style)
plt.show()


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Single-linkage hierarchy over the rows of X.
linked = linkage(X, method='single')

# Leaf labels 1..10, matching the point numbers used in the scatter plot.
labellist = range(1, 11)

plt.figure(figsize=(10, 7))
dendrogram_options = {
    'orientation': 'top',
    'labels': labellist,
    'distance_sort': 'descending',
    'show_leaf_counts': True,
}
dendrogram(linked, **dendrogram_options)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Ten 2-D sample points, one [x, y] pair per row of the array (the same values
# as the earlier definition of X in this notebook).
X = np.array([
    [5, 3], [10, 15], [15, 12], [24, 10], [30, 30],
    [85, 70], [71, 80], [60, 78], [70, 55], [80, 91],
])


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Two-cluster agglomerative model with Ward linkage.
# The `affinity='euclidean'` argument is intentionally omitted: scikit-learn
# deprecated `affinity` in 1.2 and removed it in 1.4 (in favor of `metric`),
# so passing it raises TypeError on current releases. Euclidean distance is
# the default, so leaving the argument out gives the same clustering on old
# and new versions alike.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')

# Fit only; the labels are read from the fitted estimator below.
cluster.fit(X)

print(cluster.labels_)


In [None]:
# Plot the rows of X (column 0 against column 1), colored by the label the
# fitted `cluster` model assigned to each row.
plt.scatter(X[:,0],X[:,1], c=cluster.labels_, cmap='rainbow')


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import os
from pathlib import Path

# Directory holding the shopping data set. The default is the original
# author's local checkout; set the CLUSTERING_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    'CLUSTERING_DATA_DIR',
    'C:\\Users\\TheAncientOwl\\Code\\data-analysis-tool\\server\\common\\test-data',
))

customer_data = pd.read_csv(DATA_DIR / 'hierarchical-clustering.shopping-data.csv')

# Preview the first rows to check the columns that were loaded.
customer_data.head()


In [None]:
# Keep only the columns at positions 3 and 4 (all rows) as a NumPy array;
# these two columns are the features clustered in the cells below.
# NOTE(review): which columns these are depends on the CSV layout — confirm
# against the `customer_data.head()` output.
data = customer_data.iloc[:, 3:5].values


In [None]:
import scipy.cluster.hierarchy as shc

plt.figure(figsize=(10, 7))
plt.title('Customer Dendograms')

# Build the Ward linkage first, then draw its dendrogram on the current
# figure; `dend` keeps the structure returned by the plotting call.
ward_linkage = shc.linkage(data, method='ward')
dend = shc.dendrogram(ward_linkage)


In [None]:
from sklearn.cluster import AgglomerativeClustering

# Five-cluster agglomerative model with Ward linkage.
# The `affinity='euclidean'` argument is intentionally omitted: scikit-learn
# deprecated `affinity` in 1.2 and removed it in 1.4 (in favor of `metric`),
# so passing it raises TypeError on current releases. Euclidean distance is
# the default, so leaving the argument out gives the same clustering on old
# and new versions alike.
cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')

# Fit and, as the cell's last expression, display the label of every row.
cluster.fit_predict(data)


In [None]:
plt.figure(figsize=(10, 7))

# Plot the two selected feature columns against each other, colored by the
# label the fitted `cluster` model assigned to each row.
first_feature, second_feature = data[:, 0], data[:, 1]
plt.scatter(first_feature, second_feature, c=cluster.labels_, cmap='rainbow')
