# K-means Clustering on the Palmer Penguins Dataset

This notebook walks through a complete K-means clustering workflow on the Palmer Penguins dataset. We inspect the data, standardize the numeric features, experiment with a trial number of clusters, use the elbow method with the `kneed` package to pick an optimal cluster count, and visualize the final clusters.

## 1. Imports and setup

In [None]:
# Plot figures inline and import the packages we will use.
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from kneed import KneeLocator

# Apply seaborn's global theme first, then set the default figure size on top of it.
sns.set_theme(context='talk', style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})


## 2. Load and inspect the Palmer Penguins data

In [None]:
# Load the penguins example table through seaborn and report its dimensions.
penguins = sns.load_dataset('penguins')
dataset_shape = penguins.shape
print(f'Dataset shape: {dataset_shape}')
penguins.head()


In [None]:
# Print column dtypes and non-null counts, then show how many values are missing per column
penguins.info()
missing_per_column = penguins.isna().sum()
missing_per_column


In [None]:
# Descriptive statistics (count, mean, spread, quartiles) for the numeric columns
numeric_summary = penguins.describe()
numeric_summary


## 3. Select numeric features and handle missing values

In [None]:
# The four body measurements used as clustering inputs
numeric_features = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Discard rows missing any of those measurements and renumber the remaining rows from 0
penguins_model = (
    penguins
    .dropna(subset=numeric_features)
    .reset_index(drop=True)
)
X = penguins_model[numeric_features]
print(f'Rows after dropping missing numeric values: {len(X)}')
X.head()


In [None]:
# Rescale the numeric features so that each one carries equal weight in the
# distance calculations; the fitted scaler is kept for later inverse transforms.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=numeric_features)
X_scaled_df.head()


## 4. Trial K-means clustering

In [None]:
# Exploratory fit with a hand-picked cluster count, before any tuning of k
trial_k = 3
kmeans_trial = KMeans(n_clusters=trial_k, n_init=20, random_state=42)
kmeans_trial.fit(X_scaled_df)

# Store each row's assigned cluster label alongside the original measurements
penguins_model['cluster_trial'] = kmeans_trial.labels_
trial_inertia = kmeans_trial.inertia_
print(f'Inertia for k={trial_k}: {trial_inertia:.2f}')

trial_cluster_sizes = penguins_model['cluster_trial'].value_counts().sort_index()
trial_cluster_sizes


In [None]:
# Scatter bill length against flipper length, coloured by the trial cluster labels
fig, ax = plt.subplots()
sns.scatterplot(
    data=penguins_model,
    x='bill_length_mm',
    y='flipper_length_mm',
    hue='cluster_trial',
    palette='tab10',
    ax=ax,
)
ax.set(title=f'Trial K-means clustering (k={trial_k})')
# Anchor the legend just outside the right edge of the axes
ax.legend(title='Cluster', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()


## 5. Elbow analysis over a range of k values

In [None]:
# Fit one K-means model per candidate k and record its inertia
# (within-cluster sum of squares) for the elbow analysis.
k_values = list(range(1, 11))
inertias = [
    KMeans(n_clusters=k, n_init=20, random_state=42).fit(X_scaled_df).inertia_
    for k in k_values
]

fig, ax = plt.subplots()
ax.plot(k_values, inertias, marker='o')
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia (within-cluster sum of squares)')
# Derive the range shown in the title from k_values so the two cannot drift apart.
ax.set_title(f'Elbow curve for k={k_values[0]}..{k_values[-1]}')
# Configure ticks and grid on the explicit Axes instead of pyplot's implicit current axes.
ax.set_xticks(k_values)
ax.grid(True, which='both', ls='--', linewidth=0.5)
plt.show()


In [None]:
# Let kneed locate the elbow of the inertia curve (declared convex and decreasing)
knee_locator = KneeLocator(k_values, inertias, curve='convex', direction='decreasing')
optimal_k = knee_locator.elbow
print(f'Optimal k according to KneeLocator: {optimal_k}')

# Fall back to the trial cluster count when no elbow is reported (elbow is None)
if optimal_k is None:
    best_k = trial_k
else:
    best_k = optimal_k
print(f'Using k={best_k} for the final model.')


## 6. Fit the final model and inspect clusters

In [None]:
# Refit K-means with the selected cluster count and attach the labels to the modelling frame
final_kmeans = KMeans(n_clusters=best_k, n_init=20, random_state=42)
final_kmeans.fit(X_scaled_df)
penguins_model['cluster'] = final_kmeans.labels_

# Number of penguins per cluster, ordered by cluster label
cluster_sizes = penguins_model['cluster'].value_counts().sort_index()
cluster_sizes


In [None]:
# Undo the standardization on the centroids so they read in the original measurement units
centroids_scaled = final_kmeans.cluster_centers_
centroids_original_scale = pd.DataFrame(
    data=scaler.inverse_transform(centroids_scaled),
    columns=numeric_features,
)
centroids_original_scale


## 7. Visualize the final clusters

In [None]:
# Project the standardized features onto two principal components for a 2D view
pca = PCA(n_components=2, random_state=42)
components = pca.fit_transform(X_scaled_df)

# Combine the projected coordinates with each row's cluster label and species
components_df = pd.DataFrame(components, columns=['PC1', 'PC2']).assign(
    cluster=penguins_model['cluster'],
    species=penguins_model['species'].reset_index(drop=True),
)

# Share of total variance retained by the two components, as a percentage
explained_var = pca.explained_variance_ratio_.sum() * 100

fig, ax = plt.subplots()
sns.scatterplot(
    data=components_df,
    x='PC1',
    y='PC2',
    hue='cluster',
    palette='tab10',
    ax=ax,
)
ax.set(
    title=f'Final K-means clustering visualized in PCA space (k={best_k})',
    xlabel='PC1',
    ylabel='PC2',
)
# Annotate in axes-fraction coordinates so the note stays in the lower-left corner
ax.text(0.02, 0.02, f'Explained variance: {explained_var:.1f}%', transform=ax.transAxes)
ax.legend(title='Cluster', loc='upper left', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()


In [None]:
# Cross-tabulate cluster assignments (rows) against the recorded species (columns)
pd.crosstab(
    index=penguins_model['cluster'],
    columns=penguins_model['species'],
)
