# Part A – Cluster Characterisation (Python, class-style)

This notebook follows the lecture style (missing values, imputation, simple visualisations) to explore the SMHS dataset and describe the four latent metabolic clusters.


In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer, KNNImputer

# Plot and display defaults used by every cell below.
sns.set(style="whitegrid")
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 140)

# Data location. Defaults to the original /mnt/data folder, but can be
# redirected without editing the notebook by setting SMHS_DATA_DIR.
DATA_DIR = Path(os.environ.get("SMHS_DATA_DIR", "/mnt/data"))
general_path = str(DATA_DIR / "SMHS_general-1.csv")
validation_path = str(DATA_DIR / "SMHS_validation_students-1.csv")

# Load both CSV files once; later cells read from `general` / `validation`.
general = pd.read_csv(general_path)
validation = pd.read_csv(validation_path)

# Quick sanity check: (rows, columns) of each frame.
general.shape, validation.shape

## Missingness overview and heatmap

In [None]:
# Column dtypes and non-null counts: a first look at where values are missing.
general.info()

In [None]:
# Fraction of missing entries per column, ordered from most to least incomplete.
missing_mask = general.isna()
missing_ratio = missing_mask.mean().sort_values(ascending=False)
missing_ratio

In [None]:
# Row-by-column map of missing cells (one colour = missing, the other = present).
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(general.isnull(), cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap – SMHS_general")
fig.tight_layout()
plt.show()

## SimpleImputer and KNNImputer demonstration

In [None]:
# Split columns by dtype so each group gets a suitable SimpleImputer strategy.
# NOTE(review): selection is purely dtype-based, so numeric label/ID columns
# (e.g. an integer-coded latent_cluster) are imputed as well; confirm this is intended.
numeric_features = general.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = general.select_dtypes(include=["object"]).columns.tolist()

# SimpleImputer for numeric: replace missing values with the column median.
# fit_transform returns a bare array, so the original index is re-attached to
# keep the imputed rows aligned with `general`.
num_imputer = SimpleImputer(strategy="median")
general_num_imp = pd.DataFrame(
    num_imputer.fit_transform(general[numeric_features]),
    columns=numeric_features,
    index=general.index,
)

# SimpleImputer for categorical: replace missing values with the most frequent level.
cat_imputer = SimpleImputer(strategy="most_frequent")
general_cat_imp = pd.DataFrame(
    cat_imputer.fit_transform(general[categorical_features]),
    columns=categorical_features,
    index=general.index,
)

# Confirm that no missing values remain in the first few numeric columns.
general_num_imp.isna().sum().head()

In [None]:
# KNN imputation on a small set of metabolic variables.
# NOTE(review): the variables are passed to KNNImputer on their raw scales, so
# larger-valued columns weigh more in the neighbour distance; consider
# standardising first if this output is used beyond the demonstration.
subset_vars = ["bmi", "fasting_glucose", "fasting_insulin", "triglycerides", "alt"]
knn_imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a bare array, so the original index is re-attached to
# keep the imputed rows aligned with `general`.
general_knn_imp = pd.DataFrame(
    knn_imputer.fit_transform(general[subset_vars]),
    columns=subset_vars,
    index=general.index,
)

# Summary statistics of the imputed variables (one row per variable).
general_knn_imp.describe().T

## Cluster-wise summaries and plots

In [None]:
# Treat the cluster label as categorical; later cells rely on its `.cat` accessor.
general["latent_cluster"] = general["latent_cluster"].astype("category")

# Continuous measurements summarised per cluster.
continuous_vars = [
    "age", "deprivation_index", "alcohol_units_week", "physical_activity_minutes",
    "diet_quality_score", "depression_score", "bmi", "waist_hip_ratio",
    "sbp", "dbp", "sleep_duration_hours", "sleep_variability_hours",
    "sleep_efficiency", "steps_per_day", "fasting_glucose", "fasting_insulin",
    "triglycerides", "hdl", "ldl", "crp", "alt", "metabolic_pressure"
]

# Mean of each variable within each cluster (missing values are skipped by mean()).
# observed=True is passed explicitly: grouping on a categorical key without it
# raises a FutureWarning on recent pandas. The categories are derived from this
# same column, so the result is the same as with the old default.
group_means = (
    general.groupby("latent_cluster", observed=True)[continuous_vars]
    .mean()
    .round(2)
)
group_means

In [None]:
# Shortlist of metabolic markers used for the cluster comparison and the boxplots.
key_vars = ["bmi", "fasting_glucose", "fasting_insulin", "triglycerides", "hdl", "alt", "metabolic_pressure"]

# Per-cluster means of the shortlist. observed=True is explicit so that grouping
# on a categorical key does not raise a FutureWarning on recent pandas.
key_means = (
    general.groupby("latent_cluster", observed=True)[key_vars]
    .mean()
    .round(2)
)
key_means

In [None]:
# One boxplot panel per key variable, with one box per latent cluster.
# The grid is sized from len(key_vars) so that editing the list never overflows
# the axes array; with the current seven variables this is the same 2x4, 16x8 figure.
n_cols = 4
n_rows = max(1, -(-len(key_vars) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
axes = axes.flatten()

# astype("category") is a no-op when the column is already categorical and
# keeps this cell working if it is run before the cast in the summary cell.
clusters = general["latent_cluster"].astype("category").cat.categories

for ax, var in zip(axes, key_vars):
    # Drop missing values per cluster; boxplot cannot handle NaN entries.
    data = [general.loc[general["latent_cluster"] == c, var].dropna() for c in clusters]
    ax.boxplot(data)
    ax.set_title(var)
    ax.set_xticks(range(1, len(clusters) + 1))
    ax.set_xticklabels(clusters)
    ax.set_xlabel("latent_cluster")

# Remove the unused panels at the end of the grid.
for unused_ax in axes[len(key_vars):]:
    fig.delaxes(unused_ax)

fig.suptitle("Key metabolic variables by latent cluster", y=1.02)
fig.tight_layout()
plt.show()

In [None]:
# For each categorical variable, show how its levels are distributed within
# every cluster (each row of the table sums to 1).
categorical_vars = ["sex", "ethnicity", "smoking_status", "shift_worker", "family_history_diabetes"]
for column in categorical_vars:
    print(f"\n=== {column} (row-wise proportions) ===")
    row_proportions = pd.crosstab(general["latent_cluster"], general[column], normalize="index")
    display(row_proportions.round(2))

In [None]:
# Variables shown in the standardised cluster-profile heatmap.
heatmap_vars = [
    "bmi", "waist_hip_ratio", "sbp", "dbp",
    "fasting_glucose", "fasting_insulin",
    "triglycerides", "hdl", "ldl",
    "crp", "alt", "metabolic_pressure"
]

# Per-cluster means. observed=True is explicit so that grouping on a
# categorical key does not raise a FutureWarning on recent pandas.
cluster_means = general.groupby("latent_cluster", observed=True)[heatmap_vars].mean()

# Standardise each variable ACROSS the cluster means (column-wise), so a cell
# shows how far a cluster's mean lies from the average of the cluster means,
# in units of the between-cluster standard deviation.
cluster_means_z = (cluster_means - cluster_means.mean()) / cluster_means.std()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(cluster_means_z, annot=True, fmt=".2f", cmap="coolwarm", center=0, ax=ax)
ax.set_title("Standardised cluster means (z-scores)")
fig.tight_layout()
plt.show()

### Interpretation outline

- Cluster 1: metabolically healthy profile (normal BMI, low glucose/insulin, favourable lipids, low ALT).
- Cluster 2: obesity-driven profile resembling mild obesity-related diabetes (MOD) (highest BMI, moderately elevated insulin, triglycerides and ALT).
- Cluster 3: insulin-resistant profile resembling non-alcoholic fatty liver disease (NAFLD) (high BMI, very high insulin, high triglycerides, low HDL, highest ALT and metabolic pressure).
- Cluster 4: insulin-deficient profile resembling severe insulin-deficient diabetes (SIDD) (normal BMI, highest glucose, lowest insulin, high family history of diabetes).
