In [1]:
import pandas as pd

# Read the Stata file into a DataFrame. convert_categoricals=False keeps coded
# columns as their raw stored values instead of turning them into labelled
# categoricals, which the numeric comparisons in later cells rely on.
df = pd.read_stata(
    r"D:\NFHS\DataBase-Files\2019-2021\Individual Recode-IAIR7EDT\IAIR7EFL.DTA",
    convert_categoricals=False,
)


In [2]:
# Keep only rows whose measurements fall inside the accepted ranges.
# Comparisons against missing values evaluate to False, so a row that is
# missing any of these four columns is dropped as well.
# NOTE(review): the upper bounds presumably exclude special/flag codes --
# confirm against the recode manual.
rows_in_range = (
    (df['v445'] < 6000)                 # labelled BMI in `thresholds` (stored x100)
    & df['sb29s'].between(50, 300)      # labelled Systolic_BP; both bounds inclusive
    & df['sb18s'].between(50, 300)      # labelled Glucose; both bounds inclusive
    & (df['sb74'] < 300)                # labelled Waist
)

# One combined mask selects the rows in a single pass instead of materialising
# four successive filtered copies of the whole frame.
df = df.loc[rows_in_range]

# Re-bind to an explicit copy so the cells that add columns afterwards work on
# an independent frame. pandas also recommends frame.copy() to de-fragment a
# frame before inserting columns.
# NOTE(review): the echoed insert lines in the recorded outputs further down
# look like pandas warnings raised on those inserts -- confirm they go away.
df = df.copy()

In [3]:
# Cut-offs per dataset column, keyed by column name. 'low_cutoff' and
# 'high_cutoff' are optional: an entry lists only the side(s) treated as abnormal.
# NOTE(review): in the recorded outputs further down, 'Waist' is flagged on more
# than 95% of rows and 'Hemoglobin' on only 68 of 638817 -- confirm each column
# code and its stored units against the recode manual.
thresholds = {
    # BMI is stored x100, so 1850 means 18.5 and 2500 means 25.0
    'v445': {'label': 'BMI', 'unit': '', 'low_cutoff': 1850, 'high_cutoff': 2500},
    # Systolic blood pressure
    'sb29s': {'label': 'Systolic_BP', 'unit': 'mmHg', 'low_cutoff': 90, 'high_cutoff': 130},
    # Glucose
    'sb18s': {'label': 'Glucose', 'unit': 'mg/dL', 'low_cutoff': 70, 'high_cutoff': 140},
    # Waist circumference: low values are not a problem, so only an upper cut-off
    'sb74': {'label': 'Waist', 'unit': 'cm', 'high_cutoff': 80},
    # Hemoglobin: only low hemoglobin is abnormal, so only a lower cut-off
    'sb15': {'label': 'Hemoglobin', 'unit': 'g/dL', 'low_cutoff': 12.0},
}


In [4]:
# For every measurement listed in `thresholds`, add a boolean
# "<label>_abnormal" column that is True when the value is below 'low_cutoff'
# or above 'high_cutoff' (whichever of the two the entry defines).
# Missing values compare False on both sides, so they are flagged as not abnormal.
for var, info in thresholds.items():
    label = info['label']
    measurement = df[var]

    # A side without a cut-off contributes a plain False to the OR below.
    too_low = measurement < info['low_cutoff'] if 'low_cutoff' in info else False
    too_high = measurement > info['high_cutoff'] if 'high_cutoff' in info else False

    df[f'{label}_abnormal'] = too_low | too_high



  df[f'{label}_abnormal'] = False
  df[f'{label}_abnormal'] = False
  df[f'{label}_abnormal'] = False
  df[f'{label}_abnormal'] = False
  df[f'{label}_abnormal'] = False


In [5]:
# A row is "all normal" only when none of the five abnormality flags is set.
flag_columns = [
    'BMI_abnormal',
    'Systolic_BP_abnormal',
    'Glucose_abnormal',
    'Waist_abnormal',
    'Hemoglobin_abnormal',
]
df['All_Normal'] = ~df[flag_columns].any(axis=1)

# Headline figure: how many rows have no flag at all, and what share of the
# filtered frame that is.
total = len(df)
n_normal = df['All_Normal'].sum()
percent_normal = (n_normal / total) * 100

print(f"Totally normal women: {n_normal} out of {total} ({percent_normal:.2f}%)")

Totally normal women: 9386 out of 638817 (1.47%)


  df['All_Normal'] = ~(df['BMI_abnormal'] | df['Systolic_BP_abnormal'] | df['Glucose_abnormal'] | df['Waist_abnormal'] | df['Hemoglobin_abnormal'])


In [6]:
# abnormalities/woman: number of abnormality flags set on each row (0 to 5)
counted_flags = [
    'BMI_abnormal',
    'Systolic_BP_abnormal',
    'Glucose_abnormal',
    'Waist_abnormal',
    'Hemoglobin_abnormal',
]
# Each boolean flag is cast to int and the five columns are added element-wise.
df['abnormality_count'] = sum(df[flag].astype(int) for flag in counted_flags)

# How many rows have 0, 1, 2, ... flags, ordered by the number of flags
df['abnormality_count'].value_counts().sort_index()


  df['abnormality_count'] = (


abnormality_count
0      9386
1    335309
2    237895
3     39088
4     17137
5         2
Name: count, dtype: int64

In [7]:
# Names of the per-measurement flag columns, in the order `thresholds` lists them
ab_cols = [entry['label'] + '_abnormal' for entry in thresholds.values()]

# Sub-frame holding only those flag columns
ab_df = df.loc[:, ab_cols]


In [8]:
# Count every distinct combination of the flag columns, most frequent first
combo_counts = ab_df.value_counts()
combo_counts = combo_counts.sort_values(ascending=False)

# 10 most common combinations of abnormalities
combo_counts.head(10)


BMI_abnormal  Systolic_BP_abnormal  Glucose_abnormal  Waist_abnormal  Hemoglobin_abnormal
False         False                 False             True            False                  328595
True          False                 False             True            False                  209771
False         True                  False             True            False                   22341
                                    True              True            False                   17534
True          True                  False             True            False                   17222
                                    True              True            False                   17131
False         False                 False             False           False                    9386
True          False                 False             False           False                    6028
False         False                 True              True            False                    4911
True      

In [9]:
# Sanity check on the hemoglobin flag, since abnormal values appear to be rare
# in this dataset.
# NOTE(review): this tally only shows how the flag is distributed. Missing
# values compare False against the cut-off and so count as not abnormal, and
# the column's stored units are not checked here -- confirm both before
# treating the flag as correct.
hb_flag_counts = df['Hemoglobin_abnormal'].value_counts()
print(hb_flag_counts)

Hemoglobin_abnormal
False    638749
True         68
Name: count, dtype: int64
