In [2]:
import os, glob, pandas as pd


# Locate the women's biomarker extract: try the expected survey folder first,
# then fall back to searching the whole NFHS tree for a file of that name.
candidate_dir = r"C:\Users\Lion\NFHS\2019-2021\Individual Recode-IAIR7EDT"
bm_filename = "nfhs5_women_biomarkers.parquet"

bm_path = os.path.join(candidate_dir, bm_filename)
if not os.path.exists(bm_path):
    base_dir = r"C:\Users\Lion\NFHS"
    # Walk lazily and stop at the first case-insensitive filename match; only
    # the first hit is ever used, so there is no need to scan the rest.
    bm_path = next(
        (
            os.path.join(root, f)
            for root, _, files in os.walk(base_dir)
            for f in files
            if f.lower() == bm_filename
        ),
        None,
    )

print("Using bm_path:", bm_path)
# Raise explicitly rather than `assert`: assertions are stripped when Python
# runs with -O, which would let a missing file slip through to read_parquet.
if bm_path is None or not os.path.exists(bm_path):
    raise FileNotFoundError(
        "Could not find nfhs5_women_biomarkers.parquet. If needed, re-run Code_6 to recreate it."
    )

m = pd.read_parquet(bm_path)


Using bm_path: C:\Users\Lion\NFHS\2019-2021\Individual Recode-IAIR7EDT\nfhs5_women_biomarkers.parquet


In [6]:
import pandas as pd, numpy as np, itertools


# Re-read the biomarker extract so this cell always starts from the unfiltered data.
m = pd.read_parquet(bm_path)
# Row weights: v005 coerced to numeric (unparseable -> NaN) and divided by 1,000,000.
# NOTE(review): presumably the usual DHS sample-weight scaling — confirm.
w = pd.to_numeric(m["v005"], errors="coerce").div(1_000_000)

import numpy as np, os, yaml


import warnings

# Default cut-offs used when deriving the *_abn flags. Entries may be
# overridden by the "base" set of an optional thresholds.yml in the working
# directory.
cfg = dict(bmi_abn_ge=23.0, bp_sbp_ge=140, bp_dbp_ge=90,
           glucose_abn_ge=200, waist_f_ge_cm=80, hb_f_lt_gdl=12.0)


def load_threshold_overrides(path="thresholds.yml", set_name="base"):
    """Return the entries of the threshold set called `set_name` in `path`.

    The file is expected to hold a top-level ``threshold_sets`` list whose
    items are mappings with a ``name`` key.

    Returns an empty dict when the file does not exist. Loading stays
    best-effort: if the file cannot be read or does not have the expected
    structure, a warning is issued (instead of failing silently) and an
    empty dict is returned so the defaults remain in effect.
    """
    if not os.path.exists(path):
        return {}
    try:
        # Context manager so the file handle is always closed.
        with open(path, "r", encoding="utf-8") as fh:
            sets = yaml.safe_load(fh)["threshold_sets"]
        return dict(next(s for s in sets if s["name"] == set_name))
    except Exception as exc:
        warnings.warn(
            f"Ignoring {path}: could not load threshold set {set_name!r} ({exc!r}); "
            "using built-in defaults."
        )
        return {}


cfg.update(load_threshold_overrides())

# Coerce the measurement columns to numbers; unparseable values become NaN.
measure_cols = ["bmi","sbp_clean","dbp_clean","glucose_mgdl","waist_cm","hb_gdl"]
for col in measure_cols:
    if col not in m:
        continue
    m[col] = pd.to_numeric(m[col], errors="coerce")

# Glucose flag. When both shb53 and shb54 are present, rows where both are >= 8
# use the 126 cut-off and all other rows use 220; otherwise the configured
# single cut-off applies.
# NOTE(review): shb53/shb54 presumably record hours since last food/drink —
# confirm. The 126/220 cut-offs are hard-coded and ignore cfg["glucose_abn_ge"].
glucose = m["glucose_mgdl"]
if {"shb53","shb54"} <= set(m.columns):
    hrs_a = pd.to_numeric(m["shb53"], errors="coerce")
    hrs_b = pd.to_numeric(m["shb54"], errors="coerce")
    fast = (hrs_a >= 8) & (hrs_b >= 8)
    m["GLU_abn"] = np.where(fast, glucose >= 126, glucose >= 220)
else:
    m["GLU_abn"] = glucose >= cfg["glucose_abn_ge"]

# Remaining flags are plain threshold comparisons (NaN compares as False).
m["BMI_abn"]   = m["bmi"].ge(cfg["bmi_abn_ge"])
m["BP_abn"]    = m["sbp_clean"].ge(cfg["bp_sbp_ge"]) | m["dbp_clean"].ge(cfg["bp_dbp_ge"])
m["WAIST_abn"] = m["waist_cm"].ge(cfg["waist_f_ge_cm"])
m["HGB_abn"]   = m["hb_gdl"].lt(cfg["hb_f_lt_gdl"])

# Sanity check: report which flag columns now exist.
print({flag: (flag in m.columns) for flag in ["BMI_abn","BP_abn","GLU_abn","WAIST_abn","HGB_abn"]})

# Keep only rows with every measurement present, and filter the weights
# identically so `m` and `w` stay row-aligned.
present_all5 = m[measure_cols].notna().all(axis=1)
m = m[present_all5].copy()
w = w[present_all5]
den_w = w.sum()

# Plain numpy views of the flags and weights for the combination masks.
names = ["BMI","BP","GLU","WAIST","HGB"]
flags = {name: m[f"{name}_abn"].astype(bool).to_numpy() for name in names}
weights = w.to_numpy()


def mask_for_combo(include):
    """Return a boolean row mask for the exact abnormality pattern `include`.

    A row is selected when every flag named in `include` is True and every
    other flag listed in the notebook-level `names` is False. Reads the
    notebook-level `flags`, `names` and `weights` (the latter only for its
    length).
    """
    msk = np.ones(len(weights), dtype=bool)
    for k in names:
        wanted = k in include
        # Equality with the wanted state keeps the flag when it should be set
        # and keeps its negation when it should be clear.
        msk &= (flags[k] == wanted)
    return msk

# Enumerate every exact combination of the flags (from none to all of them)
# and record its weighted count and its share of the total weight `den_w`.
all_combos = itertools.chain.from_iterable(
    itertools.combinations(names, size) for size in range(len(names) + 1)
)
rows = []
for members in all_combos:
    in_combo = mask_for_combo(set(members))
    weighted_n = float(weights[in_combo].sum())
    rows.append((
        "+".join(members) or "NONE",   # the empty combination is labelled NONE
        len(members),
        weighted_n,
        float(weighted_n / den_w * 100.0),
    ))

ccm = pd.DataFrame(rows, columns=["combo","#abnormal","w_count","w_pct"])
ccm = ccm.sort_values("w_pct", ascending=False, ignore_index=True)

# Running total over the sorted table.
# NOTE(review): the cumulative share is accumulated over every row, including
# the NONE row, before the filters below are applied — confirm that is intended.
ccm["cum_pct"] = ccm["w_pct"].cumsum()
keep = ccm["w_pct"].ge(1.0) & ccm["combo"].ne("NONE")
selected = ccm[keep].copy()
selected_85 = selected[selected["cum_pct"].le(85.0)].copy()

# Persist both the full table and the selection for the later cells.
ccm.to_csv("ccm_all_combos_weighted.csv", index=False)
selected_85.to_csv("ccm_selected_weighted.csv", index=False)

print("=== Selected CCMs (>=1% and cumulative <=85%) ===")
print(selected_85[["combo","#abnormal","w_pct","cum_pct"]].to_string(index=False))
print("\nSaved: ccm_all_combos_weighted.csv  and  ccm_selected_weighted.csv")


{'BMI_abn': True, 'BP_abn': True, 'GLU_abn': True, 'WAIST_abn': True, 'HGB_abn': True}
=== Selected CCMs (>=1% and cumulative <=85%) ===
        combo  #abnormal     w_pct   cum_pct
          HGB          1 29.225623 29.225623
BMI+WAIST+HGB          3 12.891116 60.370979
    BMI+WAIST          2 10.708253 71.079231
    WAIST+HGB          2  6.269547 77.348778
        WAIST          1  4.540381 81.889159

Saved: ccm_all_combos_weighted.csv  and  ccm_selected_weighted.csv


In [7]:
import pandas as pd, numpy as np

# Fail early if an upstream cell did not create the flag or measurement columns.
needed = ["BMI_abn","BP_abn","GLU_abn","WAIST_abn","HGB_abn","bmi","sbp_clean","dbp_clean","glucose_mgdl","waist_cm","hb_gdl"]
missing = [col for col in needed if col not in m.columns]
assert not missing, f"Missing columns: {missing}"


# Rows with every measurement present.
present_all5 = ~m[["bmi","sbp_clean","dbp_clean","glucose_mgdl","waist_cm","hb_gdl"]].isna().any(axis=1)
# "Healthy" = complete measurements and none of the five flags set.
healthy = (
    present_all5
    & ~m["BMI_abn"] & ~m["BP_abn"] & ~m["GLU_abn"] & ~m["WAIST_abn"] & ~m["HGB_abn"]
)


# Derive the education label when the data does not already carry one.
if "edu_cat" not in m.columns:
    if "v106" not in m.columns:
        # Fail with a clear message instead of an AttributeError on None.
        raise KeyError("Cannot build 'edu_cat': column 'v106' is missing from the data.")
    edu_map = {0:"No education", 1:"Primary", 2:"Secondary", 3:"Higher"}
    # Coerce to numeric before mapping so codes stored as strings or floats
    # still match the integer keys; anything unmapped becomes "Unknown".
    m["edu_cat"] = pd.to_numeric(m["v106"], errors="coerce").map(edu_map).fillna("Unknown")

# Display order of the education groups ("Unknown" last).
edu_order = ["No education","Primary","Secondary","Higher","Unknown"]


# Combos chosen earlier, read back from the CSV written by the selection step.
sel = list(pd.read_csv("ccm_selected_weighted.csv")["combo"])


# Plain numpy views of the five flags, keyed by short name.
names = ["BMI","BP","GLU","WAIST","HGB"]
flags = {name: m[f"{name}_abn"].astype(bool).to_numpy() for name in names}
# Row weights: v005 coerced to numeric and divided by 1,000,000.
weights = (pd.to_numeric(m["v005"], errors="coerce") / 1_000_000).to_numpy()

def mask_for_combo(include_set):
    """Boolean row mask for rows whose set of True flags is exactly `include_set`.

    Flags named in `include_set` must be True; every other flag in the
    notebook-level `names` must be False. Uses the notebook-level `flags`,
    `names` and `weights` (the latter only for the mask length).
    """
    # Start from an all-True vector so the result has one entry per weight.
    selectors = [np.ones(len(weights), dtype=bool)]
    selectors.extend(flags[k] if k in include_set else ~flags[k] for k in names)
    return np.logical_and.reduce(selectors)

# Weighted share of each selected combo within every education group, next to
# the all-women share of that combo and the group's healthy share.
present = present_all5.to_numpy()
healthy_arr = healthy.to_numpy()
den_all = present
den_all_w = float(weights[den_all].sum())

# Group masks, denominators and healthy shares do not depend on the combo, so
# compute them once. A dedicated name is used for the group denominator so the
# notebook-level `den_w` (total weight) is not overwritten.
group_stats = {}
for g in edu_order:
    gm = (m["edu_cat"] == g).to_numpy()
    group_den_w = float(weights[gm & present].sum())
    if group_den_w == 0:
        healthy_w = np.nan   # empty group: share is undefined
    else:
        healthy_w = float(weights[gm & healthy_arr].sum()/group_den_w*100.0)
    group_stats[g] = (gm, group_den_w, healthy_w)

rows = []
for combo in sel:
    include = set(combo.split("+"))
    combo_mask = present & mask_for_combo(include)
    # Same value for every group: the combo's share among all women.
    ctrl_all = float(weights[combo_mask].sum()/den_all_w*100.0)

    for g in edu_order:
        gm, group_den_w, healthy_w = group_stats[g]
        if group_den_w == 0:
            combo_w = np.nan
        else:
            combo_w = float(weights[gm & combo_mask].sum()/group_den_w*100.0)

        rows.append([g, combo, round(combo_w,2), round(ctrl_all,2), round(healthy_w,2)])

df_edu_ccm = pd.DataFrame(rows, columns=["group","combo","combo_w%","control_all_women_w%","healthy_w%"])
df_edu_ccm.to_csv("ccm_by_education.csv", index=False)
print("Saved ccm_by_education.csv")
df_edu_ccm.head(12)


Saved ccm_by_education.csv


Unnamed: 0,group,combo,combo_w%,control_all_women_w%,healthy_w%
0,No education,HGB,29.75,29.23,16.23
1,Primary,HGB,28.36,29.23,15.57
2,Secondary,HGB,31.14,29.23,19.56
3,Higher,HGB,22.79,29.23,19.02
4,Unknown,HGB,,29.23,
5,No education,BMI+WAIST+HGB,11.29,12.89,16.23
6,Primary,BMI+WAIST+HGB,13.78,12.89,15.57
7,Secondary,BMI+WAIST+HGB,12.71,12.89,19.56
8,Higher,BMI+WAIST+HGB,15.15,12.89,19.02
9,Unknown,BMI+WAIST+HGB,,12.89,


In [8]:
import numpy as np
import pandas as pd
import os


# Wealth labels in display order; codes 1..5 map onto them in that order.
wealth_order = ['Poorest','Poorer','Middle','Richer','Richest']
wealth_map = dict(enumerate(wealth_order, start=1))
# Coerce v190 to numeric first; anything that does not map becomes 'Unknown'.
wealth_codes = pd.to_numeric(m['v190'], errors='coerce')
m['wealth_cat'] = wealth_codes.map(wealth_map).fillna('Unknown')


def wshare(mask, denom_mask, weights=None):
    """Weighted percentage of the `denom_mask` rows that also satisfy `mask`.

    Parameters
    ----------
    mask : boolean Series (or array) marking the rows counted in the numerator.
    denom_mask : boolean Series marking the rows that form the denominator.
    weights : Series of row weights, optional. Defaults to the notebook-level
        `w`, which keeps existing two-argument calls working while letting the
        function be used (and checked) without relying on hidden state.

    Returns
    -------
    float
        100 * sum(weights where mask and denom_mask) / sum(weights where
        denom_mask), or NaN when the denominator weight is not positive.
    """
    if weights is None:
        weights = w
    # Rows outside the denominator contribute zero to both sums.
    num = (mask.astype(int) * weights).where(denom_mask, 0).sum()
    den = weights.where(denom_mask, 0).sum()
    return float(num/den*100) if den>0 else np.nan


# Rows with every measurement present.
present_all5 = ~m[['bmi','sbp_clean','dbp_clean','glucose_mgdl','waist_cm','hb_gdl']].isna().any(axis=1)
# Rows with none of the five flags set.
healthy      = ~m['BMI_abn'] & ~m['BP_abn'] & ~m['GLU_abn'] & ~m['WAIST_abn'] & ~m['HGB_abn']

def exact_combo_mask(combo_set):
    """Mask of rows whose set of True flags is exactly `combo_set`.

    Flags named in `combo_set` (any of 'BMI', 'BP', 'GLU', 'WAIST', 'HGB')
    must be True and the remaining ones False; the result is additionally
    restricted to rows in the notebook-level `present_all5`. Reads the
    notebook-level frame `m`.
    """
    keys = ('BMI', 'BP', 'GLU', 'WAIST', 'HGB')
    # One condition per flag: the column itself when required, otherwise its negation.
    per_flag = [
        m[f'{key}_abn'] if key in combo_set else ~m[f'{key}_abn']
        for key in keys
    ]
    msk = np.ones(len(m), dtype=bool)
    for condition in per_flag:
        msk &= condition
    return msk & present_all5


# For every selected combo: its weighted share within each wealth group, the
# share among all women with complete measurements, and the group's healthy share.
rows = []
wealth_groups = wealth_order + ['Unknown']
for combo in selected_85['combo']:
    exact = exact_combo_mask(set(combo.split('+')))
    # Identical for every group: the combo's share in the whole sample.
    allwomen_pct = wshare(exact, present_all5)

    for g in wealth_groups:
        in_group = m['wealth_cat'].eq(g)
        group_denom = present_all5 & in_group
        rows.append({
            'group': g,
            'combo': combo,
            'combo_w%': wshare(exact & in_group, group_denom),
            'control_all_women_w%': allwomen_pct,
            'healthy_w%': wshare(healthy & in_group, group_denom),
        })

tbl_wealth = pd.DataFrame(rows)
os.makedirs('figs', exist_ok=True)
tbl_wealth.to_csv('ccm_by_wealth.csv', index=False)
print('Saved: ccm_by_wealth.csv')
tbl_wealth.head(12)


Saved: ccm_by_wealth.csv


Unnamed: 0,group,combo,combo_w%,control_all_women_w%,healthy_w%
0,Poorest,HGB,42.3066,29.225623,21.351067
1,Poorer,HGB,34.802589,29.225623,20.571134
2,Middle,HGB,28.761751,29.225623,18.444424
3,Richer,HGB,23.4307,29.225623,16.589004
4,Richest,HGB,17.396926,29.225623,14.398055
5,Unknown,HGB,,29.225623,
6,Poorest,BMI+WAIST+HGB,7.00303,12.891116,21.351067
7,Poorer,BMI+WAIST+HGB,10.011339,12.891116,20.571134
8,Middle,BMI+WAIST+HGB,12.791374,12.891116,18.444424
9,Richer,BMI+WAIST+HGB,15.674526,12.891116,16.589004


In [9]:
import numpy as np
import matplotlib.pyplot as plt
import os

# One grouped-bar figure per combo in `tbl_wealth`: the combo's share within
# each wealth group, the all-women share of that combo, and the group's
# healthy share.
wealth_order = ['Poorest','Poorer','Middle','Richer','Richest']
os.makedirs('figs', exist_ok=True)

# Bar positions and width are the same for every figure.
x = np.arange(len(wealth_order))
width = 0.28

for combo in tbl_wealth['combo'].unique():
    # Reindexing orders the groups and drops the 'Unknown' rows.
    df = (tbl_wealth[tbl_wealth['combo']==combo]
            .set_index('group')
            .reindex(wealth_order)
            .reset_index())

    fig, ax = plt.subplots(figsize=(8,4))
    ax.bar(x - width, df['combo_w%'],              width, label='Combo')
    ax.bar(x,         df['control_all_women_w%'],  width, label='All women (control)')
    ax.bar(x + width, df['healthy_w%'],            width, label='Healthy (all 5 normal)')

    ax.set_xticks(x)
    ax.set_xticklabels(wealth_order)
    # The bars include the healthy share, so the axis is a share of women,
    # not a share of abnormal results.
    ax.set_ylabel('% of women (weighted)')
    ax.set_title(f'{combo} vs Wealth (with controls)')
    ax.legend()
    fig.tight_layout()

    out = f'figs/wealth_{combo.replace("+","_")}_with_controls.png'
    # Save and close this figure explicitly rather than via pyplot's
    # implicit "current figure".
    fig.savefig(out, dpi=200)
    plt.close(fig)
    print('Saved:', out)


Saved: figs/wealth_HGB_with_controls.png
Saved: figs/wealth_BMI_WAIST_HGB_with_controls.png
Saved: figs/wealth_BMI_WAIST_with_controls.png
Saved: figs/wealth_WAIST_HGB_with_controls.png
Saved: figs/wealth_WAIST_with_controls.png


In [10]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, os


# One grouped-bar figure per combo in ccm_by_education.csv: the combo's share
# within each education group, the all-women share of that combo, and the
# group's healthy share.
tbl_edu = pd.read_csv("ccm_by_education.csv")


edu_order = ["No education","Primary","Secondary","Higher"]
os.makedirs("figs", exist_ok=True)

# Bar positions and width are the same for every figure.
x = np.arange(len(edu_order))
width = 0.28

for combo in tbl_edu["combo"].unique():
    # Drop the 'Unknown' rows and put the groups in display order.
    df = (tbl_edu[tbl_edu["combo"]==combo]
            .query("group != 'Unknown'")
            .set_index("group")
            .reindex(edu_order)
            .reset_index())

    fig, ax = plt.subplots(figsize=(8,4))
    ax.bar(x - width, df["combo_w%"],             width, label="Combo")
    ax.bar(x,         df["control_all_women_w%"], width, label="All women (control)")
    ax.bar(x + width, df["healthy_w%"],           width, label="Healthy (all 5 normal)")

    ax.set_xticks(x)
    ax.set_xticklabels(edu_order)
    # The bars include the healthy share, so the axis is a share of women,
    # not a share of abnormal results.
    ax.set_ylabel("% of women (weighted)")
    ax.set_title(f"{combo} vs Education (with controls)")
    ax.legend()
    fig.tight_layout()

    out = f'figs/education_{combo.replace("+","_")}_with_controls.png'
    # Save and close this figure explicitly rather than via pyplot's
    # implicit "current figure".
    fig.savefig(out, dpi=200)
    plt.close(fig)
    print("Saved:", out)


Saved: figs/education_HGB_with_controls.png
Saved: figs/education_BMI_WAIST_HGB_with_controls.png
Saved: figs/education_BMI_WAIST_with_controls.png
Saved: figs/education_WAIST_HGB_with_controls.png
Saved: figs/education_WAIST_with_controls.png
