# Clustering using k-means

Author: Prof. Sandro Camargo <github.com/sandrocamargo>

Data Mining Course https://moodle.unipampa.edu.br/moodle/course/view.php?id=5213

This notebook demonstrates the basic concepts of clustering using the k-means algorithm.

In this notebook, we use the Iris dataset: <https://archive.ics.uci.edu/dataset/53/iris>

To open this script in your google colab environment, [click here](https://colab.research.google.com/github/Sandrocamargo/data-mining/blob/main/Python/md07_clustering.ipynb).

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# use seaborn plotting defaults
import seaborn as sns; sns.set()

# import kmeans
from sklearn.cluster import KMeans

from sklearn import datasets

# Load the iris bunch shipped with scikit-learn.
iris = datasets.load_iris()

# Assemble one table: the measurement columns followed by the class label.
column_names = iris['feature_names'] + ['target']
values = np.column_stack([iris['data'], iris['target']])
data = pd.DataFrame(data=values, columns=column_names)

# Summary statistics of every column (rendered as the cell output).
data.describe()

In [None]:
# Scatter plot of the raw observations.
# Only 2 of the 4 available attributes fit in a two-dimensional graph, so the
# first two columns (the sepal measurements) are shown here.
sepal_length = data.iloc[:, 0]
sepal_width = data.iloc[:, 1]

fig, ax = plt.subplots()
ax.scatter(sepal_length, sepal_width)
ax.set_xlabel("Sepal Length (cm)")
ax.set_ylabel("Sepal Width (cm)")
plt.show()

In [None]:
# Cluster on the measurement columns only: the 'target' column holds the true
# class label and must not be fed to an unsupervised algorithm, otherwise the
# clusters are partly driven by the answer we are trying to discover.
features = data.drop(columns='target')

# 3 clusters. A fixed seed and an explicit n_init keep the result reproducible
# across runs and across scikit-learn versions (the n_init default has changed).
est = KMeans(n_clusters=3, n_init=10, random_state=42)
y_kmeans = est.fit_predict(features)  # cluster index assigned to each row

# Observations coloured by assigned cluster, shown on the two sepal attributes.
plt.scatter(data.iloc[:, 0], data.iloc[:, 1], c=y_kmeans, s=20, cmap='rainbow')
# Centroid coordinates follow the column order of `features`.
plt.scatter(est.cluster_centers_[:, 0], est.cluster_centers_[:, 1], s=50, c='black', label='Centroids')
plt.xlabel("Sepal Length (cm)")
plt.ylabel("Sepal Width (cm)")
plt.legend()  # without this the 'Centroids' label is never displayed
plt.show()

In [None]:
# Scatter plot of the raw observations on 2 different attributes:
# columns 2 and 3 of the table (the petal measurements).
petal_length = data.iloc[:, 2]
petal_width = data.iloc[:, 3]

fig, ax = plt.subplots()
ax.scatter(petal_length, petal_width)
ax.set_xlabel("Petal Length (cm)")
ax.set_ylabel("Petal Width (cm)")
plt.show()

In [None]:
# Same clustering result as before, viewed on 2 different attributes
# (columns 2 and 3: the petal measurements).
# Relies on `est` and `y_kmeans` produced by the k-means cell above.
plt.scatter(data.iloc[:, 2], data.iloc[:, 3], c=y_kmeans, s=20, cmap='rainbow')
# Centroid columns 2 and 3 correspond to the same two attributes.
plt.scatter(est.cluster_centers_[:, 2], est.cluster_centers_[:, 3], s=50, c='black', label='Centroids')
plt.xlabel("Petal Length (cm)")
plt.ylabel("Petal Width (cm)")
plt.legend()  # without this the 'Centroids' label is never displayed
plt.show()

# Elbow Method

The elbow method is a graphical heuristic for choosing the number of clusters K in the k-means algorithm. The elbow graph shows the within-cluster sum of squares (WCSS) on the y-axis for each value of K on the x-axis. Because WCSS always decreases as K grows, the suggested K is not the minimum of the curve but the point where it bends (the "elbow"): beyond that point, adding more clusters yields only marginal reductions in WCSS.

In [None]:
# Fit on the measurement columns only: the 'target' column is the true class
# label and would bias the within-cluster sum of squares if included.
features = data.drop(columns='target')

# Candidate numbers of clusters; defined once so the loop and the plot agree.
k_values = range(2, 20)

# WCSS (KMeans.inertia_) obtained for each candidate k.
distorsions = []
for k in k_values:
    # Fixed seed and explicit n_init make the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(features)
    distorsions.append(kmeans.inertia_)

fig = plt.figure(figsize=(10, 5))
plt.plot(k_values, distorsions, marker='o')
plt.xticks(k_values)  # k is an integer, so show one tick per candidate value
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid(True)
plt.title('Elbow curve')
plt.show()