In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [2]:
# Fetch Air Quality dataset
air_quality = fetch_ucirepo(id=360)
X = air_quality.data.features

In [3]:
# Clean -200 values
X = X.replace(-200, pd.NA)
X_cleaned = X.dropna(subset=['CO(GT)'])  # Drop rows missing CO(GT)

In [4]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cleaned)

ValueError: could not convert string to float: '3/10/2004'

In [5]:

# --- Hierarchical Clustering ---
# Compute linkage matrix (ward method minimizes variance)
Z = linkage(X_scaled, method='ward')

# Fit Agglomerative Clustering (choose number of clusters, e.g., 3)
n_clusters = 3
hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
clusters = hierarchical.fit_predict(X_scaled)

# Add cluster labels to DataFrame
X_cleaned = X_cleaned.copy()  # Avoid SettingWithCopyWarning
X_cleaned['Cluster'] = clusters

# --- Visualization 1: Dendrogram ---
plt.figure(figsize=(12, 6))
dendrogram(Z, truncate_mode='level', p=5, leaf_rotation=90, leaf_font_size=8)
plt.xlabel('Sample Index')
plt.ylabel('Distance (Ward)')
plt.title('Dendrogram of Hierarchical Clustering')
plt.show()

# --- Visualization 2: 2D Scatter Plot with PCA ---
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.5)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title(f'Hierarchical Clustering (k={n_clusters}) - PCA')
plt.colorbar(label='Cluster')
plt.grid(True)
plt.show()
print("Explained variance ratio:", pca.explained_variance_ratio_)

# --- Visualization 3: Horizontal Boxplots of Features by Cluster ---
# Select a subset of features for clarity
subset_cols = ['CO(GT)', 'NO2(GT)', 'T']
df_melted = X_cleaned.melt(id_vars=['Cluster'], value_vars=subset_cols, 
                           var_name='Feature', value_name='Value')

plt.figure(figsize=(10, 6))
sns.boxplot(y='Feature', x='Value', hue='Cluster', data=df_melted, orient='h', palette='viridis')
plt.xlabel('Value')
plt.ylabel('Feature')
plt.title('Horizontal Boxplots of Features by Cluster')
plt.legend(title='Cluster')
plt.show()

# --- Visualization 4: Cluster Size Bar Plot ---
plt.figure(figsize=(8, 4))
sns.countplot(x='Cluster', data=X_cleaned, palette='viridis')
plt.xlabel('Cluster')
plt.ylabel('Number of Samples')
plt.title('Cluster Size Distribution')
plt.show()

# --- Optional: Feature Means by Cluster ---
mean_by_cluster = X_cleaned.groupby('Cluster')[subset_cols].mean()
plt.figure(figsize=(10, 6))
mean_by_cluster.T.plot(kind='bar', colormap='viridis')
plt.xlabel('Feature')
plt.ylabel('Mean Value')
plt.title('Mean Feature Values by Cluster')
plt.legend(title='Cluster')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

NameError: name 'X_scaled' is not defined