In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw Iris table from a CSV file located next to the notebook
# (relative path, so the kernel's working directory must contain Iris.csv).
data = pd.read_csv("Iris.csv")

In [4]:
# Print column names, non-null counts and dtypes to check for missing values
# and to see which columns are numeric before any preprocessing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [15]:
# Take an independent deep copy so later preprocessing never mutates the raw
# `data` frame, which is still needed for comparing against `Species`.
df = data.copy(deep=True)

In [16]:
# Summary statistics for the numeric columns, transposed so that each column
# of `df` becomes one row of the displayed table.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,150.0,75.5,43.445368,1.0,38.25,75.5,112.75,150.0
SepalLengthCm,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
SepalWidthCm,150.0,3.054,0.433594,2.0,2.8,3.0,3.3,4.4
PetalLengthCm,150.0,3.758667,1.76442,1.0,1.6,4.35,5.1,6.9
PetalWidthCm,150.0,1.198667,0.763161,0.1,0.3,1.3,1.8,2.5


In [17]:
# Keep only the four numeric measurements as clustering features: `Id` is just
# a row identifier and `Species` is the ground-truth label, which an
# unsupervised model must not see.
#
# The features are selected from the raw `data` frame instead of being dropped
# in place, so this cell is idempotent: re-running it (or running it after
# extra columns were added to `data`) always yields the same four columns and
# never raises KeyError for already-removed columns.
FEATURE_COLUMNS = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
df = data[FEATURE_COLUMNS].copy()

In [18]:
# Display the feature frame to confirm only the four measurement columns remain.
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Scaling

In [20]:
# Standardize every feature to zero mean and unit variance so that no single
# measurement dominates the Euclidean distances used by the clustering steps.
# Note: StandardScaler divides by the population standard deviation (ddof=0),
# so results differ slightly from (df - df.mean()) / df.std().

# Remember the labels: the scaler returns a bare NumPy array without them.
column_names = df.columns
index_names = df.index

s = StandardScaler()
df_ = s.fit_transform(df)

# Re-attach the original row/column labels so the scaled values stay aligned
# with the rows of `df` and `data`.
df_scaled = pd.DataFrame(df_, index=index_names, columns=column_names)

In [21]:
# Display the standardized features (same shape and labels as `df`).
df_scaled

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.263460,-1.341272,-1.312977
...,...,...,...,...
145,1.038005,-0.124958,0.819624,1.447956
146,0.553333,-1.281972,0.705893,0.922064
147,0.795669,-0.124958,0.819624,1.053537
148,0.432165,0.800654,0.933356,1.447956


## Clustering

### KMeans

In [22]:
from sklearn.cluster import KMeans

# K-means starts from randomly chosen centroids, so without a fixed seed the
# cluster ids (0/1/2) may be permuted, or the partition may differ, between
# runs. Pinning the seed makes Restart & Run All reproduce the same labels.
# NOTE: the ids themselves remain arbitrary; re-check any id -> species
# interpretation whenever the seed, the data or n_clusters changes.
RANDOM_STATE = 42

# Three clusters, matching the three species in the data set; n_init=10 runs
# the algorithm from ten initializations and keeps the best one.
k_means = KMeans(n_clusters=3, n_init=10, random_state=RANDOM_STATE)
k_means.fit(df_scaled)

In [23]:
# Cluster id assigned to each row, in the same order as the rows of `df_scaled`.
k_means.labels_

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0], dtype=int32)

### Hierarchical Clustering

In [24]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering into three groups, using Ward linkage
# on Euclidean distances between the standardized observations.
hc = AgglomerativeClustering(
    linkage="ward",
    metric="euclidean",
    n_clusters=3,
)

# Fit the model, then read the per-row cluster ids from the fitted estimator.
hc_cluster = hc.fit(df_scaled).labels_

In [25]:
# Hierarchical cluster id for each row, in the same order as `df_scaled`.
hc_cluster

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### Meaning of the cluster numbers (mapping cluster labels to species)

In [43]:
# Attach each row's k-means cluster id to the raw frame so it can be compared
# with the true `Species` column (rows are in the same order as `df_scaled`).
data['kmeans'] = k_means.labels_

In [32]:
# Inspect the rows that k-means put into cluster 2 to see which species that
# cluster id corresponds to.
data.loc[data["kmeans"].eq(2)]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,kmeans
0,1,5.1,3.5,1.4,0.2,Iris-setosa,2
1,2,4.9,3.0,1.4,0.2,Iris-setosa,2
2,3,4.7,3.2,1.3,0.2,Iris-setosa,2
3,4,4.6,3.1,1.5,0.2,Iris-setosa,2
4,5,5.0,3.6,1.4,0.2,Iris-setosa,2
5,6,5.4,3.9,1.7,0.4,Iris-setosa,2
6,7,4.6,3.4,1.4,0.3,Iris-setosa,2
7,8,5.0,3.4,1.5,0.2,Iris-setosa,2
8,9,4.4,2.9,1.4,0.2,Iris-setosa,2
9,10,4.9,3.1,1.5,0.1,Iris-setosa,2


In [44]:
# Translate the k-means cluster ids into species names.
#
# K-means ids are arbitrary and can be permuted between runs, so a hardcoded
# {id: species} dictionary silently mislabels clusters. Instead, name every
# cluster after the species that occurs most often among its members.
# Caveat: if the clustering is poor, two clusters may share the same majority
# species and therefore receive the same name.
kmeans_to_species = data.groupby("kmeans")["Species"].agg(
    lambda members: members.mode().iat[0]
)

# Assign the mapped column back explicitly. Calling an in-place method on
# `data.kmeans` operates on a temporary Series and does not update `data`
# under pandas Copy-on-Write.
data["kmeans"] = data["kmeans"].map(kmeans_to_species)

In [45]:
# Display the frame to compare the true `Species` with the k-means assignment.
data

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,kmeans
0,1,5.1,3.5,1.4,0.2,Iris-setosa,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa
...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica,Iris-versicolor
147,148,6.5,3.0,5.2,2.0,Iris-virginica,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica,Iris-virginica


In [46]:
# Attach each row's hierarchical cluster id to the raw frame, next to the
# `Species` and `kmeans` columns, for comparison.
data["hc_cluster"] = hc_cluster

In [53]:
# Inspect the rows in hierarchical cluster 2 to see which species dominates it.
data.loc[data["hc_cluster"].eq(2)]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,kmeans,hc_cluster
41,42,4.5,2.3,1.3,0.3,Iris-setosa,Iris-setosa,2
53,54,5.5,2.3,4.0,1.3,Iris-versicolor,Iris-versicolor,2
55,56,5.7,2.8,4.5,1.3,Iris-versicolor,Iris-versicolor,2
57,58,4.9,2.4,3.3,1.0,Iris-versicolor,Iris-versicolor,2
59,60,5.2,2.7,3.9,1.4,Iris-versicolor,Iris-versicolor,2
60,61,5.0,2.0,3.5,1.0,Iris-versicolor,Iris-versicolor,2
62,63,6.0,2.2,4.0,1.0,Iris-versicolor,Iris-versicolor,2
64,65,5.6,2.9,3.6,1.3,Iris-versicolor,Iris-versicolor,2
66,67,5.6,3.0,4.5,1.5,Iris-versicolor,Iris-versicolor,2
67,68,5.8,2.7,4.1,1.0,Iris-versicolor,Iris-versicolor,2


In [54]:
# Translate the hierarchical cluster ids into species names; the mapping was
# chosen by inspecting which species dominates each cluster.
#
# The result is assigned back to the column explicitly. Calling
# `replace(..., inplace=True)` on `data.hc_cluster` acts on a temporary Series
# and does not update `data` under pandas Copy-on-Write. Ids that are not in
# the mapping (e.g. names from a previous run of this cell) are left unchanged.
data["hc_cluster"] = data["hc_cluster"].replace({
    0: "Iris-virginica",
    1: "Iris-setosa",
    2: "Iris-versicolor",
})

In [55]:
# Display the frame with the true species and both cluster-derived labels side by side.
data

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,kmeans,hc_cluster
0,1,5.1,3.5,1.4,0.2,Iris-setosa,Iris-setosa,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa,Iris-setosa,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa,Iris-setosa
...,...,...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica,Iris-virginica,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica,Iris-versicolor,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica,Iris-virginica,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica,Iris-virginica,Iris-virginica


In [56]:
# Ground truth: number of rows per species.
data["Species"].value_counts()

Iris-versicolor    50
Iris-virginica     50
Iris-setosa        50
Name: Species, dtype: int64

In [57]:
# Rows per k-means-derived label; compare with the true species counts.
data["kmeans"].value_counts()

Iris-versicolor    53
Iris-setosa        50
Iris-virginica     47
Name: kmeans, dtype: int64

In [58]:
# Rows per hierarchical-clustering-derived label; compare with the true species counts.
data["hc_cluster"].value_counts()

Iris-virginica     71
Iris-setosa        49
Iris-versicolor    30
Name: hc_cluster, dtype: int64