In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load the Boston housing data when this scikit-learn build still exposes
# `load_boston`; if importing it raises ImportError, use California Housing
# instead. Whichever dataset is loaded ends up bound to `boston`.
try:
    from sklearn.datasets import load_boston
    boston, dataset_name = load_boston(), "Boston"
except ImportError:
    from sklearn.datasets import fetch_california_housing
    boston, dataset_name = fetch_california_housing(), "California Housing"

# Feature matrix as a DataFrame, labelled with the dataset's own feature names.
X = pd.DataFrame(boston.data, columns=boston.feature_names)

# Standardise the features; the fitted scaler is kept so the cluster centres
# can later be mapped back to the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# --- Pick the number of clusters by silhouette score ------------------------
# Candidate cluster counts. The summary line printed below is derived from
# this range so the reported bounds cannot drift from what was evaluated.
k_values = range(2, 7)

silhouette_scores = []  # (k, silhouette) pairs, in evaluation order
fitted_models = {}      # k -> fitted KMeans, so the winner is not trained twice
for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)  # Explicitly set n_init
    labels = kmeans.fit_predict(X_scaled)
    # Silhouette is estimated on a seeded random subsample (sample_size=1000)
    # rather than on every row, to keep the pairwise-distance cost bounded.
    score = silhouette_score(X_scaled, labels, sample_size=1000, random_state=42)
    silhouette_scores.append((k, score))
    fitted_models[k] = kmeans

# max() returns the first maximal entry, so a tie resolves to the smallest k.
optimal_k = max(silhouette_scores, key=lambda x: x[1])[0]
print(f"Optimal number of clusters for {dataset_name}: {optimal_k}")
print(f"Silhouette scores for k={k_values[0]} to {k_values[-1]}: {silhouette_scores}")

# Reuse the estimator already fitted for optimal_k instead of fitting an
# identical model (same parameters, same seed) a second time. `labels_` holds
# the training-set assignments that fit_predict returned inside the loop.
kmeans_optimal = fitted_models[optimal_k]
labels_optimal = kmeans_optimal.labels_

# NOTE: this adds a 'cluster' column to X itself, so from here on X is no
# longer the pure feature matrix that `scaler` was fitted on.
X['cluster'] = labels_optimal

# Per-cluster feature averages, computed from the unscaled data.
cluster_means = X.groupby("cluster").mean()

# K-means centres live in standardised space; undo the scaling so they are
# expressed in the same units as `cluster_means`.
centers_in_original_units = scaler.inverse_transform(kmeans_optimal.cluster_centers_)
centroids = pd.DataFrame(centers_in_original_units, columns=boston.feature_names)

# Print each table under its heading: the means, the centroids, and their gap.
report_sections = (
    (f"\nMean values for each cluster in {dataset_name}:", cluster_means),
    (f"\nCentroid coordinates for {dataset_name}:", centroids),
    (f"\nDifference between cluster means and centroids in {dataset_name}:", cluster_means - centroids),
)
for heading, table in report_sections:
    print(heading)
    print(table)

Optimal number of clusters for California Housing: 2
Silhouette scores for k=2 to 6: [(2, 0.3368433874602563), (3, 0.3368433874602563), (4, 0.3026463930769773), (5, 0.32344236329026266), (6, 0.3232918404758393)]

Mean values for each cluster in California Housing:
           MedInc   HouseAge  AveRooms  AveBedrms   Population  AveOccup  \
cluster                                                                    
0        3.918104  28.412773  5.225159   1.075685  1532.241745  3.098100   
1        3.805276  28.952057  5.710036   1.125614  1278.279590  3.032817   

          Latitude   Longitude  
cluster                         
0        33.945736 -118.010096  
1        37.956526 -121.719940  

Centroid coordinates for California Housing:
     MedInc   HouseAge  AveRooms  AveBedrms   Population  AveOccup   Latitude  \
0  3.918418  28.414681  5.225372   1.075692  1532.198896  3.097956  33.945450   
1  3.804868  28.949303  5.709630   1.125593  1278.397166  3.033030  37.955996   

    Long