# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Read the raw records into a DataFrame.
file_path = 'diabetic_data.csv'  # Update the file path if needed
diabetic_data = pd.read_csv(file_path)

# Numeric columns used as the clustering features.
numerical_columns = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]
numerical_data = diabetic_data.loc[:, numerical_columns]

# Standardize every feature to zero mean / unit variance so that no single
# column dominates the Euclidean distances k-means relies on.
scaler = StandardScaler()
scaler.fit(numerical_data)
scaled_data = scaler.transform(numerical_data)

# Fit k-means for k = 1..10 and record each model's inertia (the
# within-cluster sum of squared distances) to draw the elbow curve.
#
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and emits a FutureWarning in 1.2/1.3), so leaving it
# implicit makes the curve depend on the installed version.
k_values = range(1, 11)
sse = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(scaled_data).inertia_
    for k in k_values
]

# Plot the elbow method: the "elbow" is the k after which adding clusters
# yields only marginal SSE reduction.
plt.figure(figsize=(8, 6))
plt.plot(k_values, sse, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.xticks(k_values)
plt.grid(True)
plt.show()

# Choose the optimal k (e.g., k=4 based on the elbow plot) and fit the final
# model on the standardized features.
#
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 (and emits a FutureWarning in 1.2/1.3), so leaving it
# implicit makes the resulting clusters depend on the installed version.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans.fit(scaled_data)
cluster_labels = kmeans.labels_

# Attach each row's cluster id to the original dataset; labels are in the
# same row order as the data the model was fitted on.
diabetic_data['Cluster'] = cluster_labels

# Project the standardized features onto their first two principal
# components so the clusters can be drawn in a plane.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)

# One scatter series per cluster, so each gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(10, 8))
for cluster_id in range(optimal_k):
    in_cluster = cluster_labels == cluster_id
    ax.scatter(
        pca_data[in_cluster, 0],
        pca_data[in_cluster, 1],
        label=f'Cluster {cluster_id}',
    )

# Centroids live in the scaled feature space; map them through the same
# fitted PCA before overlaying them on the projection.
centroids_pca = pca.transform(kmeans.cluster_centers_)
ax.scatter(
    centroids_pca[:, 0],
    centroids_pca[:, 1],
    s=200,
    c='black',
    marker='X',
    label='Centroids',
)

ax.set_title('K-means Clustering Visualization')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()
ax.grid(True)
plt.show()
