# 1. Loading and Preprocessing

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the Iris dataset and wrap its measurements in a DataFrame whose
# columns carry the dataset's own feature names.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Clustering here is unsupervised: the species labels are never added to `df`,
# so the feature matrix below contains only the measurement columns.
X = df.to_numpy()

print(f"Dataset loaded with {X.shape[0]} samples and {X.shape[1]} features.")

# 2. Clustering Algorithm Implementation

A) KMeans Clustering

In [None]:
from sklearn.cluster import KMeans

# Fit KMeans with K=3 (one cluster per known species). The fixed random_state
# makes the centroid initialisation, and therefore the labels, reproducible.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
y_kmeans = kmeans.fit_predict(X)
centers = kmeans.cluster_centers_

# Plot only the first two feature columns; the model itself was fitted on all
# columns of X, so this is a 2-D view of the clustering.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
ax.scatter(
    centers[:, 0],
    centers[:, 1],
    c='red',
    s=200,
    alpha=0.75,
    marker='X',
    label='Centroids',
)
ax.set_title("KMeans Clustering (Sepal Length vs Width)")
ax.set_xlabel("Sepal Length")
ax.set_ylabel("Sepal Width")
ax.legend()
plt.show()

B) Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Build the agglomerative merge tree over all samples using Ward linkage.
linked = linkage(X, method='ward')

# Cut the tree into at most 3 flat clusters (to mirror the K=3 used for KMeans).
y_hierarchical = fcluster(linked, t=3, criterion='maxclust')

# Dendrogram, truncated to the last 12 merged clusters so it stays readable.
fig_dendro, ax_dendro = plt.subplots(figsize=(10, 5))
dendrogram(linked, truncate_mode='lastp', p=12, ax=ax_dendro)
ax_dendro.set_title("Hierarchical Clustering Dendrogram")
ax_dendro.set_xlabel("Cluster Size")
ax_dendro.set_ylabel("Distance")
plt.show()

# Scatter plot of the first two feature columns, coloured by flat-cluster label.
fig_scatter, ax_scatter = plt.subplots(figsize=(8, 5))
ax_scatter.scatter(X[:, 0], X[:, 1], c=y_hierarchical, s=50, cmap='plasma')
ax_scatter.set_title("Hierarchical Clustering (Sepal Length vs Width)")
ax_scatter.set_xlabel("Sepal Length")
ax_scatter.set_ylabel("Sepal Width")
plt.show()