In [1]:
import numpy as np, pandas as pd

def impute_median(A):
    """Fill NaN entries of a 2-D float array with their column's median.

    The median of each column is computed ignoring NaNs. The array is
    modified in place and the same object is returned.

    A column that contains no observed value at all has no median;
    ``np.nanmedian`` would return NaN for it (and emit a RuntimeWarning),
    leaving the NaNs in place. Such columns are filled with 0.0 instead so
    the result is always NaN-free.

    Args:
        A: 2-D floating-point ``numpy.ndarray``; mutated in place.

    Returns:
        ``A`` itself, with every NaN replaced.
    """
    missing = np.isnan(A)
    if not missing.any():
        return A

    # Columns made up entirely of NaNs are excluded from the median
    # computation and keep the 0.0 fallback.
    empty_cols = missing.all(axis=0)
    med = np.zeros(A.shape[1], dtype=float)
    if not empty_cols.all():
        med[~empty_cols] = np.nanmedian(A[:, ~empty_cols], axis=0)

    rows, cols = np.nonzero(missing)
    A[rows, cols] = med[cols]
    return A

def standardize(A):
    """Centre each column on its mean and scale it by its sample std.

    The standard deviation uses ``ddof=1``. A column whose spread is zero
    is only centred (divisor 1.0), which yields a column of zeros rather
    than a division by zero. The same divisor is used when the sample
    standard deviation is undefined, i.e. when there are fewer than two
    rows, so the output is not silently turned into NaNs.

    Args:
        A: ``numpy.ndarray`` with observations along axis 0. Not modified.

    Returns:
        A new array of the same shape holding ``(A - mean) / std``.
    """
    mu = A.mean(axis=0)
    if A.shape[0] > 1:
        sd = A.std(axis=0, ddof=1)
    else:
        # With ddof=1 the std of fewer than two rows is NaN; skip it.
        sd = np.ones_like(mu, dtype=float)
    # np.where (rather than item assignment) also works when sd is a scalar.
    sd = np.where((sd == 0) | ~np.isfinite(sd), 1.0, sd)
    return (A - mu) / sd

def pca_reduce(A, k):
    """Project ``A`` onto its first ``k`` singular directions via thin SVD.

    No centring is done here; the input is decomposed as given.

    Args:
        A: 2-D array to decompose.
        k: Number of leading components to keep.

    Returns:
        A tuple ``(Z, components)``: ``Z`` holds the first ``k`` left
        singular vectors scaled by their singular values, and
        ``components`` holds the first ``k`` rows of ``V^T``.
    """
    left, singular_values, right_t = np.linalg.svd(A, full_matrices=False)
    leading = slice(None, k)
    scores = left[:, leading] * singular_values[leading]
    components = right_t[leading]
    return scores, components

def _nearest_center(A, C):
    """Return, for each row of A, the index of the closest row of C."""
    d2 = ((A[:, None, :] - C[None, :, :]) ** 2).sum(axis=2)
    return d2.argmin(axis=1)


def kmeans(A, k, iters=50, seed=0):
    """Cluster the rows of ``A`` into ``k`` groups with Lloyd's algorithm.

    Initial centres are ``k`` distinct rows of ``A`` drawn with a generator
    seeded by ``seed``. Each iteration assigns every row to its nearest
    centre (squared Euclidean distance) and moves each centre to the mean
    of its rows; a centre with no rows keeps its previous position. The
    loop stops early once the centres no longer move (``np.allclose``).

    The returned labels always correspond to the returned centres: if the
    iteration budget runs out before convergence (including ``iters=0``),
    one final assignment step is done against the final centres.

    Args:
        A: 2-D array, one observation per row.
        k: Number of clusters; must not exceed the number of rows.
        iters: Maximum number of update iterations.
        seed: Seed for ``np.random.default_rng``.

    Returns:
        A tuple ``(labels, C)`` of per-row cluster indices and the
        ``k`` cluster centres.
    """
    rng = np.random.default_rng(seed)
    idx = rng.choice(A.shape[0], size=k, replace=False)
    C = A[idx]
    for _ in range(iters):
        labels = _nearest_center(A, C)
        C_new = np.vstack([
            A[labels == i].mean(axis=0) if np.any(labels == i) else C[i]
            for i in range(k)
        ])
        if np.allclose(C, C_new):
            break
        C = C_new
    else:
        # Budget exhausted (or iters == 0): the centres moved after the
        # last assignment, so re-assign to keep labels and centres in sync.
        labels = _nearest_center(A, C)
    return labels, C

# Load the dataset: try each Excel engine in turn and, if neither can read
# the file, fall back to a CSV file with the same base name.
path = "dementia_dataset (2).xls"
for engine in ("xlrd", "openpyxl"):
    try:
        df = pd.read_excel(path, engine=engine)
    except Exception:
        continue
    break
else:
    df = pd.read_csv(path.replace(".xls", ".csv"))

# Keep only the numeric columns and convert them to a float matrix.
num = df.select_dtypes(include=["number"])
A = num.to_numpy(dtype=float)

# Preprocess: fill missing values, then standardise the columns.
A = standardize(impute_median(A))

# Reduce to 3 components and cluster the reduced rows into 3 groups.
Z, comps = pca_reduce(A, k=3)
labels, centers = kmeans(Z, k=3, iters=100, seed=42)

# Report the shape of the reduced data and the size of each cluster.
counts = np.bincount(labels, minlength=3)
print(Z.shape, counts.tolist(), sep="\n")


(373, 3)
[116, 159, 98]
