In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load the preprocessed dataset.
data = pd.read_csv('preprocessed_data.csv')

# Build the feature matrix for clustering.
# 'Name' is excluded from the features (a missing 'Name' column still raises,
# exactly as before). A 'Cluster' column is excluded as well when present: it
# can only be there if labels from an earlier run were written back into this
# CSV, and keeping it would feed the previous assignment to KMeans as a feature.
X = data.drop(columns='Name').drop(columns='Cluster', errors='ignore')

# Sweep candidate cluster counts and record two diagnostics per k:
#   inertia    - within-cluster sum of squared distances (elbow method)
#   silhouette - mean silhouette coefficient over all samples
inertia = []
silhouette = []
K = range(2, 10)  # k = 2..9; the silhouette score is undefined for a single cluster
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(X, kmeans.labels_))

# Plot the elbow graph (inertia vs. k).
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, inertia, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Inertia')
ax.set_title('The Elbow Method showing the optimal k')
plt.show()

# Plot the silhouette score graph (silhouette vs. k).
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(K, silhouette, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Silhouette Score')
ax.set_title('The Silhouette Method showing the optimal k')
plt.show()

In [None]:
# Fit the final KMeans model on the feature matrix.
# NOTE(review): n_clusters=4 is hard-coded; presumably it was read off the
# elbow / silhouette plots - confirm it still matches them.
kmeans = KMeans(n_clusters=4, random_state=42).fit(X)

# Assign every row of X to its nearest fitted centroid.
labels = kmeans.predict(X)

# Attach the assignments to the loaded frame as a 'Cluster' column.
data['Cluster'] = labels

# Show the frame, now including the cluster labels.
data

In [None]:
import seaborn as sns

# Visualize the clusters: pair plot of the frame's columns, coloured by the
# 'Cluster' label, with KDE curves on the diagonal.
sns.pairplot(data, hue='Cluster', palette='Dark2', diag_kind='kde')

# Interpret the clusters through per-cluster column means.
# numeric_only=True is needed because `data` still carries the 'Name' column
# (presumably text - it is excluded from the clustering features). pandas >= 2.0
# raises TypeError when asked to average a non-numeric column, whereas older
# versions silently dropped it; this keeps the old result on every version.
cluster_characteristics = data.groupby('Cluster').mean(numeric_only=True)
cluster_characteristics

In [None]:
# Save the data with cluster labels to a new CSV file.
# The output path is deliberately different from the input
# ('preprocessed_data.csv'): writing the labels back over the input would
# destroy the original data and put a 'Cluster' column into the file that the
# notebook reads on its next run.
data.to_csv('clustered_data.csv', index=False)