# Processing Pipeline

Load primary clean data source:

In [2]:
import os
import pandas as pd

# input location: ./data resolved to an absolute path from the current working directory
DATA_DIR = os.path.abspath(os.path.join(".", "data"))
IN_PATH  = os.path.join(DATA_DIR, "clean.csv")

# load clean.csv; every later cell reads its inputs from this frame
df_clean = pd.read_csv(IN_PATH)

# short key -> full column header in clean.csv
# keys ending in _W / _M are the women's / men's variant of the same indicator
COLS = {
    "AGE45":   "Population Age 45+ (%)",
    "OVER_W":  "Women Overweight or Obese (%) [BMI>=25.0]",
    "OVER_M":  "Men Overweight or Obese (%) [BMI>=25.0]",
    "UNDER_W": "Women Underweight (%) [BMI<18.5]",
    "UNDER_M": "Men Underweight (%) [BMI<18.5]",
    "HTN_W":   "Women Hypertensive (%) [SBP>=140 mmHg OR DBP>=90mmHg]",
    "HTN_M":   "Men Hypertensive (%) [SBP>=140 mmHg OR DBP>=90mmHg]",
    "GLU_W":   "Women High Glucose [BG>140mg/dl OR medicated]",
    "GLU_M":   "Men High Glucose [BG>140mg/dl OR medicated]",
    "WAIST_W": "Women Substantially Increased Risk of Metabolic Complications (%) [waist>88cm]",
    "WAIST_M": "Men Substantially Increased Risk of Metabolic Complications (%) [waist>102cm]",
}

# identifier columns copied into the output/audit frames and used as the final sort key
ID_COLS = ["District", "State/UT"]

Quick completeness check:

In [3]:
# Every column the pipeline reads from clean.csv: the identifiers, the mapped
# indicators, and the wealth-quintile column that the priority-score step
# reads directly by name. Checking it here makes a missing column fail early
# with a clear message instead of a KeyError near the end of the notebook.
REQUIRED_COLS = ID_COLS + list(COLS.values()) + ["Population in Highest Wealth Quintile (%)"]

missing = [c for c in REQUIRED_COLS if c not in df_clean.columns]

# data status
if missing:
    raise ValueError(f"Missing required column(s): {missing}")

print("All columns present.")

All columns present.


Compute $z$-scores according to $z(x) = \frac{x - \mu_x}{\sigma_x}$, where $\mu_x$ and $\sigma_x$ are the mean and standard deviation of indicator $x$ over the national distribution (all districts):

In [4]:
import numpy as np


def zscore(series:pd.Series) -> pd.Series:
	"""Standardise a series after filling gaps with its mean.

	Missing entries are replaced by the mean of the observed values, then the
	filled series is centred on that mean and divided by its population
	standard deviation (ddof=0). When the deviation is zero or undefined
	(constant or all-missing input) a series of zeros on the same index is
	returned. The input series is left untouched.
	"""

	center = series.mean(skipna=True)
	filled = series.fillna(center)          # fillna returns a new object
	spread = filled.std(skipna=True, ddof=0)

	has_spread = not (pd.isna(spread) or spread == 0)

	if has_spread:
		return (filled - center) / spread

	# degenerate column: no variation to standardise against
	return pd.Series(np.zeros(len(filled)), index=filled.index)


# standardise every mapped indicator column, keyed by its short name
z = {}
for short_name, column in COLS.items():
	z[short_name] = zscore(df_clean[column])


def _sex_avg(stem:str) -> pd.Series:
	"""Mean of the women's (_W) and men's (_M) z-scores for one indicator."""

	return 0.5 * (z[stem + "_W"] + z[stem + "_M"])


# the age indicator has no per-sex split
AGE45_z = z["AGE45"]

# one sex-averaged standardised signal per district and indicator
OVER_z  = _sex_avg("OVER")
UNDER_z = _sex_avg("UNDER")
WAIST_z = _sex_avg("WAIST")
HTN_z   = _sex_avg("HTN")
GLU_z   = _sex_avg("GLU")

Construct derived indices (weightings reflect prior studies):

In [5]:
# each index is a fixed-weight blend of the sex-averaged z-scores computed above
# NOTE(review): IR / DEF presumably stand for insulin resistance / insulin deficiency — confirm
IR_index  = 0.45*WAIST_z + 0.35*OVER_z + 0.20*HTN_z
# rises with underweight, falls with overweight and waist risk
DEF_index = 0.50*UNDER_z - 0.25*OVER_z - 0.25*WAIST_z
# high-glucose and age-45+ signals are used as-is
GLY_index = GLU_z
AGE_index = AGE45_z

Compute subtype scoring using linear combinations of indices:

In [6]:
def softmax_rows(mat:np.ndarray) -> np.ndarray:
    """Apply softmax independently to each row of a 2-D array.

    The row maximum is subtracted before exponentiating so large inputs do
    not overflow; this shift leaves the result unchanged. Every output row
    is non-negative and sums to 1.
    """

    row_peak = np.max(mat, axis=1, keepdims=True)
    weights  = np.exp(mat - row_peak)
    totals   = np.sum(weights, axis=1, keepdims=True)

    return weights / totals


# raw (unnormalised) score per subtype: a fixed-weight linear mix of the derived indices
# NOTE(review): subtype acronyms presumably follow the T2D cluster naming of the referenced studies — confirm
SIDD_raw  = 0.50*GLY_index + 0.30*DEF_index - 0.20*AGE_index
SIRD_raw  = 0.60*IR_index + 0.40*GLY_index
CIRDD_raw = 0.40*GLY_index + 0.30*IR_index + 0.30*DEF_index
MOD_raw   = 0.40*OVER_z - 0.30*IR_index - 0.30*GLY_index
MARD_raw  = 0.60*AGE_index - 0.20*GLY_index - 0.10*IR_index - 0.10*OVER_z

# stack to one row per input row and one column per subtype, in the order
# SIDD, SIRD, CIRDD, MOD, MARD; softmax makes each row sum to 1, then scale to percent
raw  = np.vstack([SIDD_raw, SIRD_raw, CIRDD_raw, MOD_raw, MARD_raw]).T
soft = softmax_rows(raw) * 100

Calibrate by blending with a prior (national averages from Indian T2D clustering studies):

In [7]:
from dataclasses import dataclass


@dataclass
class Prior:
	"""Prior share of each subtype, overridable per instance.

	The fields are annotated so that ``@dataclass`` actually registers them:
	without annotations the decorator sees no fields, and ``Prior(SIDD=...)``
	raises ``TypeError`` while ``repr``/``==`` ignore the values. The defaults
	remain readable as class attributes (``Prior.SIDD``), and ``Prior()``
	behaves as before.
	"""

	SIDD:  float = 0.25
	SIRD:  float = 0.30
	CIRDD: float = 0.08
	MOD:   float = 0.02
	MARD:  float = 0.35

	def as_array(self) -> np.ndarray:
		"""Return the shares as a float array normalised to sum to 1.

		Order is SIDD, SIRD, CIRDD, MOD, MARD.
		"""

		a = np.array([self.SIDD, self.SIRD, self.CIRDD, self.MOD, self.MARD], float)

		# renormalise so custom shares need not add up to exactly 1
		return a / a.sum()


# NOTE(review): despite its name, LAMBDA_PRIOR is the weight applied to the
# data-driven softmax shares; the prior receives 1 - LAMBDA_PRIOR (0.9 here).
# Confirm this direction is intended.
LAMBDA_PRIOR = 0.1

prior = Prior()
# convex blend of per-row softmax percentages with the prior percentages;
# [None,:] broadcasts the 5-element prior across all rows
pct   = LAMBDA_PRIOR*soft + (1.0 - LAMBDA_PRIOR)*(prior.as_array()[None,:]*100.0)

Construct output frame:

In [8]:
# start from the identifier columns only
main = df_clean[ID_COLS].copy()

# pct is (n_districts, 5): column j holds the blended share of the j-th subtype
PCT_COLUMNS = ("SIDD_pct", "SIRD_pct", "CIRDD_pct", "MOD_pct", "MARD_pct")

for j, pct_column in enumerate(PCT_COLUMNS):
    main[pct_column] = pct[:, j]

Construct audit frame:

In [9]:
# intermediate quantities kept alongside the identifiers for inspection
audit = df_clean[ID_COLS].copy()

audit_columns = {
    # sex-averaged z-scores
    "OVER_z":    OVER_z,
    "WAIST_z":   WAIST_z,
    "HTN_z":     HTN_z,
    "GLU_z":     GLU_z,
    "UNDERWT_z": UNDER_z,
    "AGE45_z":   AGE45_z,
    # derived indices
    "IR_index":  IR_index,
    "DEF_index": DEF_index,
    "GLY_index": GLY_index,
    "AGE_index": AGE_index,
    # raw subtype scores (before softmax)
    "SIDD_raw":  SIDD_raw,
    "SIRD_raw":  SIRD_raw,
    "CIRDD_raw": CIRDD_raw,
    "MOD_raw":   MOD_raw,
    "MARD_raw":  MARD_raw,
}

# insertion order of the dict fixes the column order of the frame
for column_name, values in audit_columns.items():
    audit[column_name] = values

Compute priority scores:

In [10]:
def z_norm(series:pd.Series) -> pd.Series:
	"""Centre a series on its mean and scale by its population std (ddof=0).

	Missing values are skipped when computing the statistics and are not
	imputed, so they remain NaN in the result. If the standard deviation is
	zero or undefined, a series of zeros on the same index is returned.
	"""

	center = series.mean(skipna=True)
	spread = series.std(skipna=True, ddof=0)

	usable = not (pd.isna(spread) or spread == 0)

	if usable:
		return (series - center) / spread

	# constant or all-missing input: nothing to scale by
	return pd.Series(np.zeros(len(series)), index=series.index)


# choose proxies for inputs
# NOTE: z_norm does not impute, so a missing value in an input column stays NaN
# and carries through into both scores below
diabetes_norm  = z_norm(audit["GLY_index"])
obesity_norm   =  z_norm(audit["OVER_z"])
affluence_norm = z_norm(df_clean["Population in Highest Wealth Quintile (%)"])

# subtype shares (already percentages in main), normalize them
sird_norm = z_norm(main["SIRD_pct"])
mod_norm  = z_norm(main["MOD_pct"])

# Compute scores: weighted sums of the normalized inputs (weights total 1.0 in
# each formula), scaled by 100
main["Priority_Score"]              = (0.5*diabetes_norm + 0.3*obesity_norm + 0.2*affluence_norm) * 100
main["GLP1_Focused_Priority_Score"] = (0.4*diabetes_norm + 0.3*obesity_norm + 0.2*affluence_norm + 0.05*sird_norm + 0.05*mod_norm) * 100

Output CSV result:

In [11]:
import os

# results go to ./out, resolved to an absolute path from the working directory
OUT_DIR  = os.path.abspath(os.path.join(".", "out"))
OUT_PATH = os.path.join(OUT_DIR, "subphenotypes.csv")

# make sure the target folder is there before writing
os.makedirs(OUT_DIR, exist_ok=True)

# order rows by the identifier columns, then write without the index column
main_sorted = main.sort_values(ID_COLS)
main_sorted.to_csv(OUT_PATH, index=False)