In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [2]:
df = pd.read_csv('cleaned_data.csv')


In [3]:
features = df.drop(columns=['ID'], errors='ignore')

In [4]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(features)


In [10]:
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(data_scaled)
kmeans_silhouette = silhouette_score(data_scaled, kmeans_labels)

In [11]:
print(f"K-Means Silhouette Score: {kmeans_silhouette:.3f}")

K-Means Silhouette Score: 0.045


In [16]:
from sklearn.metrics import silhouette_score
silhouette_scores_per_cluster = {}
for cluster in range(4):
    cluster_mask = kmeans_labels == cluster
    cluster_points = data_scaled[cluster_mask]
    if len(cluster_points) > 1 and len(set(kmeans_labels[cluster_mask])) > 1:  # Ensure valid silhouette calculation
        silhouette_scores_per_cluster[cluster] = silhouette_score(cluster_points, kmeans_labels[cluster_mask])
    else:
        silhouette_scores_per_cluster[cluster] = None  # Assign None for invalid clusters


In [18]:
print(f"K-Means Silhouette Score: {kmeans_silhouette:.3f}")
print("K-Means Silhouette Score per cluster:")
for cluster, score in silhouette_scores_per_cluster.items():
    if score is not None:
        print(f"  Cluster {cluster}: {score:.3f}")
    else:
        print(f"  Cluster {cluster}: Not enough points to calculate Silhouette Score")

K-Means Silhouette Score: 0.045
K-Means Silhouette Score per cluster:
  Cluster 0: Not enough points to calculate Silhouette Score
  Cluster 1: Not enough points to calculate Silhouette Score
  Cluster 2: Not enough points to calculate Silhouette Score
  Cluster 3: Not enough points to calculate Silhouette Score


from scipy.cluster.hierarchy import linkage, fcluster
linkage_matrix = linkage(data_scaled, method='ward')  # Precompute linkage matrix for speed
hierarchical_labels = fcluster(linkage_matrix, 4, criterion='maxclust')
hierarchical_silhouette = silhouette_score(data_scaled, hierarchical_labels)


In [None]:
dbscan = DBSCAN(eps=0.5, min_samples=10, metric='euclidean', n_jobs=-1)  # Faster execution with optimized params
dbscan_labels = dbscan.fit_predict(data_scaled)
dbscan_silhouette = silhouette_score(data_scaled, dbscan_labels) if len(set(dbscan_labels)) > 1 else None


In [None]:
print(f"Hierarchical Clustering Silhouette Score: {hierarchical_silhouette:.3f}")
print(f"DBSCAN Silhouette Score: {dbscan_silhouette:.3f}" if dbscan_silhouette else "DBSCAN clustering not valid")
