In [87]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score 
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram,linkage

In [70]:
# Use a forward slash as the separator: it resolves on Windows, Linux and macOS,
# whereas a backslash only works on Windows and "\I" is an invalid string escape.
file_path = "data/Iris.csv"

data = pd.read_csv(file_path)

In [71]:
# `pd.read_csv` already returns a DataFrame, so re-wrapping it is unnecessary.
# Take an explicit copy instead, so the clean-up applied to `df` afterwards can
# never alias or alter the raw frame kept in `data`.
df = data.copy()

df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [72]:
# Print column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [73]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [74]:
# List the column labels exactly as they were loaded from the CSV.
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [75]:
# Normalise every column label to lower case so later cells can use one spelling.
df.columns = [label.lower() for label in df.columns]

In [76]:
# Show one row to confirm the column labels are now lower case.
df.head(1)

Unnamed: 0,id,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,1,5.1,3.5,1.4,0.2,Iris-setosa


In [77]:
# `id` appears to be a plain 1..150 row counter rather than a measurement, so it
# must not reach the clustering features. Rebinding `df` (instead of
# inplace=True) together with errors="ignore" keeps this cell safe to re-run
# once the column has already been removed.
df = df.drop(columns="id", errors="ignore")

In [78]:
# Show one row to confirm the `id` column is gone.
df.head(1)

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species
0,5.1,3.5,1.4,0.2,Iris-setosa


In [79]:
# Name the four measurement columns explicitly instead of taking "everything
# except species": further columns holding cluster labels get appended to `df`
# later on, and on a re-run of this cell they would otherwise be swept into the
# feature matrix and break the scaler.
feature_cols = ["sepallengthcm", "sepalwidthcm", "petallengthcm", "petalwidthcm"]
X = df[feature_cols]
y = df["species"]

# NOTE(review): none of the clustering cells visible here consume this split
# (they fit on the full X); it is kept in case a later cell relies on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [80]:
# Standardise the features before clustering: fit the scaler on X, then apply
# the learned transformation to the same data.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [81]:
# Fit a 3-cluster K-Means model (fixed seed, 10 restarts) on the scaled
# features and store each row's cluster id next to its measurements.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_scaled)
df['cluster_kmeans'] = kmeans.labels_

In [82]:
# Preview the frame with the new `cluster_kmeans` column.
df.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species,cluster_kmeans
0,5.1,3.5,1.4,0.2,Iris-setosa,1
1,4.9,3.0,1.4,0.2,Iris-setosa,1
2,4.7,3.2,1.3,0.2,Iris-setosa,1
3,4.6,3.1,1.5,0.2,Iris-setosa,1
4,5.0,3.6,1.4,0.2,Iris-setosa,1


In [83]:
# Compare each K-Means cluster against the true species labels.
pd.crosstab(index=df['cluster_kmeans'], columns=df['species'])


species,Iris-setosa,Iris-versicolor,Iris-virginica
cluster_kmeans,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,39,14
1,50,0,0
2,0,11,36


In [84]:

# Silhouette score of the K-Means assignment, computed on the scaled features.
kmeans_assignments = df['cluster_kmeans']
sil_score = silhouette_score(X_scaled, labels=kmeans_assignments)
print(f"Silhouette Score for KMeans: {sil_score:.3f}")

Silhouette Score for KMeans: 0.459


In [85]:
# Rename each K-Means cluster after the species that dominates it.
#
# The cluster ids K-Means hands out are arbitrary, so the id -> species mapping
# is derived from the cluster/species contingency table (row-wise majority)
# rather than typed in by hand. The whole column is replaced in one vectorised
# step, which avoids writing strings cell-by-cell into an integer column and
# does not depend on the frame having a 0..n-1 index.
#
# Re-running the cell is harmless: a cluster already named after its majority
# species maps to itself. Two clusters dominated by the same species will
# receive the same name.
kmeans_to_species = pd.crosstab(df['cluster_kmeans'], df['species']).idxmax(axis=1)
df['cluster_kmeans'] = df['cluster_kmeans'].map(kmeans_to_species)


  df.loc[i, 'cluster_kmeans'] = 'Iris-setosa'


In [86]:
# Preview the frame now that `cluster_kmeans` holds species names.
df.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species,cluster_kmeans
0,5.1,3.5,1.4,0.2,Iris-setosa,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa


Agglomerative Clustering

In [90]:
# Fit a 3-cluster agglomerative model on the scaled features and keep the
# per-row cluster ids both as an array and as a column of `df`.
hier = AgglomerativeClustering(n_clusters=3)
hier.fit(X_scaled)
hier_labels = hier.labels_
df['cluster_hierarchical'] = hier_labels



In [92]:
# Silhouette score of the agglomerative assignment, computed on the scaled features.
score_hier = silhouette_score(X_scaled, labels=hier_labels)
print(f"Silhouette Score for Hierarchical: {score_hier:.3f}")


Silhouette Score for Hierarchical: 0.446


In [94]:
# Contingency table: agglomerative cluster ids (rows) against true species (columns).
pd.crosstab(index=df['cluster_hierarchical'], columns=df['species'])


species,Iris-setosa,Iris-versicolor,Iris-virginica
cluster_hierarchical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,23,48
1,49,0,0
2,1,27,2


In [95]:
# Rename each agglomerative cluster after the species that dominates it.
#
# Cluster ids are arbitrary, so the id -> species mapping is derived from the
# cluster/species contingency table (row-wise majority) rather than typed in by
# hand. The whole column is replaced in one vectorised step, which avoids
# writing strings cell-by-cell into an integer column and does not depend on
# the frame having a 0..n-1 index.
#
# Re-running the cell is harmless: a cluster already named after its majority
# species maps to itself. Two clusters dominated by the same species will
# receive the same name.
hier_to_species = pd.crosstab(df['cluster_hierarchical'], df['species']).idxmax(axis=1)
df['cluster_hierarchical'] = df['cluster_hierarchical'].map(hier_to_species)

  df.loc[i, 'cluster_hierarchical'] = 'Iris-setosa'


In [96]:
# Preview the frame with both cluster columns next to the true species.
df.head()

Unnamed: 0,sepallengthcm,sepalwidthcm,petallengthcm,petalwidthcm,species,cluster_kmeans,cluster_hierarchical
0,5.1,3.5,1.4,0.2,Iris-setosa,Iris-setosa,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,Iris-setosa,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa,Iris-setosa,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa,Iris-setosa,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa,Iris-setosa,Iris-setosa
