In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import AgglomerativeClustering

# ---------------------------------------------------------------------------
# Load: read the cities table from a CSV in the current working directory.
# ---------------------------------------------------------------------------
data = pd.read_csv("Cities_r2.csv")

# ---------------------------------------------------------------------------
# Preprocess: when the 'city' column has object dtype, overwrite it in place
# with the integer codes produced by LabelEncoder. The encoded column is what
# the final scatter plot uses for its x-axis; the fitted encoder stays
# available as `le`.
# ---------------------------------------------------------------------------
if data['city'].dtype == 'object':
    le = LabelEncoder()
    city_codes = le.fit_transform(data['city'])
    data['city'] = city_codes

# ---------------------------------------------------------------------------
# Feature selection: cluster on one column only. Indexing with a list keeps
# X as a single-column DataFrame rather than a Series.
# ---------------------------------------------------------------------------
feature_columns = ['effective_literacy_rate_total']
X = data[feature_columns]

# ---------------------------------------------------------------------------
# Dendrogram: build the Ward linkage over X and draw the merge tree (one leaf
# per row of X) as a visual aid for picking the number of clusters.
# ---------------------------------------------------------------------------
plt.figure(figsize=(10, 5))
ward_links = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_links)
dendrogram_axes = plt.gca()
dendrogram_axes.set_title("Dendrogram for Hierarchical Clustering")
dendrogram_axes.set_xlabel("Cities")
dendrogram_axes.set_ylabel("Euclidean Distance")
plt.show()

# ---------------------------------------------------------------------------
# Clustering: agglomerative (Ward, Euclidean) into three groups, fitted on
# the raw, unscaled feature.
# ---------------------------------------------------------------------------
hc = AgglomerativeClustering(
    n_clusters=3,
    metric='euclidean',
    linkage='ward',
)
y_hc = hc.fit_predict(X)

# Attach each row's cluster label to the table and show the first rows.
data['Cluster'] = y_hc
preview = data.head()
print(preview)

# ---------------------------------------------------------------------------
# Visual check: encoded city on the x-axis, literacy rate on the y-axis,
# points coloured by their assigned cluster.
# ---------------------------------------------------------------------------
city_axis_values = data['city']
literacy_values = data['effective_literacy_rate_total']
plt.scatter(city_axis_values, literacy_values, c=data['Cluster'], cmap='rainbow')
cluster_axes = plt.gca()
cluster_axes.set_title("Hierarchical Clustering (without scaling)")
cluster_axes.set_xlabel("City")
cluster_axes.set_ylabel("Effective Literacy Rate Total")
plt.show()
