In [7]:
import os
import pandas as pd 
import numpy as np
from pathlib import Path
from pandas.io.stata import StataReader

In [8]:
current_dir = Path.cwd()
parent_dir  = current_dir.parent 
data_raw_dir = os.path.join(parent_dir, 'data\\raw')
data_processed_dir = os.path.join(parent_dir, 'data\\processed')
pre_kids_recode = os.path.join(data_raw_dir, 'nfhs4\\IAKR74DT\\IAKR74FL.DTA')
post_kids_recode = os.path.join(data_raw_dir, 'nfhs5\\IAKR7EDT\\IAKR7EFL.DTA')

In [9]:
outcomes_of_interest = ['hw70', # Height/Age standard deviation
                         'hw71', # Weight/Age standard deviation
                         'hw72' # Weight/Height standard deviation
                        ]

In [10]:
input_path = pre_kids_recode
output_file = os.path.join(data_processed_dir, "pre_kids_recode.csv")

covariates_of_interest = ['v005', # women's individual sample weight
                          'v006', # month of interview
                          'v007', # year of interview
                          'v012', # respondent's age
                          'v024', # state
                          'v025', # urban/rural
                          'v130', # religion
                          'v136', # household size
                          'v155', # literacy
                          'v191', # wealth index score
                          'sdistri', # district
                          's116', # caste
                          'b4' # sex of child
                        ] 

# Define columns to keep
cols_to_keep = covariates_of_interest + outcomes_of_interest
 
# Read in chunks
chunksize = 50000  

# Loop over chunks and write to CSV
for i, chunk in enumerate(pd.read_stata(input_path, chunksize=chunksize, columns=cols_to_keep, convert_categoricals=False)):
    # For first chunk, write header; for others, append without header
    chunk.to_csv(output_file, mode='a', index=False, header=(i==0))
    print(f"Processed chunk {i+1}")

Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6


In [11]:
input_path = post_kids_recode
output_file = os.path.join(data_processed_dir, "post_kids_recode.csv")

covariates_of_interest = ['v005', # women's individual sample weight
                          'v006', # month of interview
                          'v007', # year of interview
                          'v012', # respondent's age
                          'v024', # state
                          'v025', # urban/rural
                          'v130', # religion
                          'v136', # household size
                          'v155', # literacy
                          'v191', # wealth index score
                          'sdist', # district
                          's116', # caste
                          'b4' # sex of child
                        ] 

# Define columns to keep
cols_to_keep = covariates_of_interest + outcomes_of_interest
 
# Read in chunks
chunksize = 50000  

# Loop over chunks and write to CSV
for i, chunk in enumerate(pd.read_stata(input_path, chunksize=chunksize, columns=cols_to_keep, convert_categoricals=False)):
    # For first chunk, write header; for others, append without header
    chunk.to_csv(output_file, mode='a', index=False, header=(i==0))
    print(f"Processed chunk {i+1}")

Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5


In [12]:
df_pre = pd.read_csv(os.path.join(data_processed_dir, "pre_kids_recode.csv"))
df_post = pd.read_csv(os.path.join(data_processed_dir, "post_kids_recode.csv"))

In [13]:
invalid = [9996, 9997, 9998, 9999]

cols = ['hw70', 'hw71', 'hw72']

# Drop rows where any of these columns contains an invalid code
nrows = len(df_pre)
df_pre = df_pre[~df_pre[cols].isin(invalid).any(axis=1)]
df_pre = df_pre.dropna()
print(f'{nrows - len(df_pre)} rows dropped. Remaining rows: {len(df_pre)}')

# Create outcome variables
df_pre['stunting'] = (df_pre['hw70'] < -200).astype(int)
df_pre['underweight'] = (df_pre['hw71'] < -200).astype(int)   
df_pre['wasting'] = (df_pre['hw72'] < -200).astype(int)

# Normalize variable with implied decimals
df_pre['weight'] = df_pre['v005'] / 1000000
df_pre['wealth_score'] = df_pre['v191'] / 100000

# Map labels
with StataReader(pre_kids_recode) as reader:
    labels = reader.value_labels()

state_labels = labels['V024']
dist_labels  = labels['SDISTRI']
religion_labels = labels['V130']
caste_labels = labels['S116']
rural_urban = labels['V025']

data = df_pre.copy()
data["state_name"] = data["v024"].map(state_labels)
data["district_name"] = data["sdistri"].map(dist_labels)
data["religion"] = data["v130"].map(religion_labels)
data["caste"] = data["s116"].map(caste_labels)
data["rural_urban"] = data["v025"].map(rural_urban)

# Filter to Hindu subset and create caste groups
df_pre.rename(columns={'sdistri': 'sdist'}, inplace=True)
df_pre = data[data['religion'] == 'hindu'] 

df_pre = df_pre[
    (df_pre['caste'] != "don't know") &
    (df_pre['caste'].notna())
]

# Define SC/ST flag
df_pre['scst'] = df_pre['caste'].isin(['schedule caste', 'schedule tribe']).astype(int)

# Create literate binary variable 
df_pre['literate'] = df_pre['v155'].isin([1, 2]).astype(int)

# Create child sex binary variable
df_pre['male'] = (df_pre['b4'] == 1).astype(int)

# Create unique district identifier
df_pre['state_district'] = df_pre['state_name'] + ' - ' + df_pre['district_name']

# Define aggregation dictionary
agg_dict = {
    # Outcomes
    'stunting': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    'underweight': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    'wasting': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    
    # Z-scores
    'hw70': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    'hw71': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    'hw72': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    
    # Controls
    'v012': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),  # age
    'v136': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),  # household size
    'wealth_score': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),
    'literate': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),  # literacy
    'male': lambda x: np.average(x, weights=df_pre.loc[x.index, 'weight']),  # child sex
    
    # Calculate urban proportion (v025: 1=urban, 2=rural)
    'v025': lambda x: np.average((x == 1).astype(int), weights=df_pre.loc[x.index, 'weight']),
    
    # Sample sizes
    'weight': ['sum', 'count'],  # population weighted, sample size
    
    # Keep identifiers
    'state_name': 'first',
    'district_name': 'first',
    'v006': 'first',
    'v007': 'first'
}

agg_df_pre = df_pre.groupby(['state_district', 'scst']).agg(agg_dict).reset_index()

# Flatten multi-level column names
agg_df_pre.columns = ['_'.join(col).strip('_') for col in agg_df_pre.columns.values]

# Rename for clarity
agg_df_pre.rename(columns={
    'stunting_<lambda>': 'stunting_rate',
    'underweight_<lambda>': 'underweight_rate',
    'wasting_<lambda>': 'wasting_rate',
    'hw70_<lambda>': 'mean_haz',
    'hw71_<lambda>': 'mean_waz',
    'hw72_<lambda>': 'mean_whz',
    'v012_<lambda>': 'mean_age',
    'v136_<lambda>': 'mean_hhsize',
    'wealth_score_<lambda>': 'mean_wealth',
    'literate_<lambda>': 'prop_literate',
    'male_<lambda>': 'prop_male',
    'v025_<lambda>': 'prop_urban',
    'weight_sum': 'pop_weighted',
    'weight_count': 'n_obs',
    'state_name_first': 'state_name',
    'district_name_first': 'district_name',
    'v006_first': 'survey_month',
    'v007_first': 'survey_year'
}, inplace=True)

agg_df_pre['post'] = 0

agg_df_pre.head()

43054 rows dropped. Remaining rows: 216573


Unnamed: 0,state_district,scst,stunting_rate,underweight_rate,wasting_rate,mean_haz,mean_waz,mean_whz,mean_age,mean_hhsize,...,prop_literate,prop_male,prop_urban,pop_weighted,n_obs,state_name,district_name,survey_month,survey_year,post
0,andaman and nicobar islands - nicobars,0,0.111132,0.219049,0.217346,-95.245354,-93.520945,-56.140022,29.986716,4.688916,...,1.0,0.491122,0.0,0.152062,25,andaman and nicobar islands,nicobars,4,2015,0
1,andaman and nicobar islands - nicobars,1,0.73531,0.310181,0.310181,-218.762679,-210.68509,-147.443325,30.200668,5.84939,...,0.697988,0.673751,0.0,0.080785,7,andaman and nicobar islands,nicobars,4,2015,0
2,andaman and nicobar islands - north & middle a...,0,0.325614,0.288541,0.28302,-97.509599,-134.577906,-110.033944,26.083474,5.366299,...,0.849221,0.521283,0.020257,8.731972,125,andaman and nicobar islands,north & middle andaman,4,2015,0
3,andaman and nicobar islands - north & middle a...,1,0.232934,0.44287,0.232934,-135.320227,-156.145693,-112.698189,27.872493,4.059338,...,1.0,0.209936,0.0,0.330634,5,andaman and nicobar islands,north & middle andaman,4,2015,0
4,andaman and nicobar islands - south andaman,0,0.240917,0.203632,0.17807,-110.429313,-112.25659,-69.914855,27.671771,5.371021,...,0.85809,0.532831,0.604525,18.282817,109,andaman and nicobar islands,south andaman,7,2015,0


In [14]:
invalid = [9996, 9997, 9998, 9999]

cols = ['hw70', 'hw71', 'hw72']

# Drop rows where any of these columns contains an invalid code
nrows = len(df_post)
df_post = df_post[~df_post[cols].isin(invalid).any(axis=1)]
df_post = df_post.dropna()
print(f'{nrows - len(df_post)} rows dropped. Remaining rows: {len(df_post)}')

# Create outcome variables
df_post['stunting'] = (df_post['hw70'] < -200).astype(int)
df_post['underweight'] = (df_post['hw71'] < -200).astype(int)   
df_post['wasting'] = (df_post['hw72'] < -200).astype(int)

# Normalize variable with implied decimals
df_post['weight'] = df_post['v005'] / 1000000
df_post['wealth_score'] = df_post['v191'] / 100000

# Map labels
with StataReader(post_kids_recode) as reader:
    labels = reader.value_labels()

state_labels = labels['V024']
dist_labels  = labels['SDIST']
religion_labels = labels['V130']
caste_labels = labels['S116']
rural_urban = labels['V025']

data = df_post.copy()
data["state_name"] = data["v024"].map(state_labels)
data["district_name"] = data["sdist"].map(dist_labels)
data["religion"] = data["v130"].map(religion_labels)
data["caste"] = data["s116"].map(caste_labels)
data["rural_urban"] = data["v025"].map(rural_urban)

# Filter to Hindu subset and create caste groups
df_post = data[data['religion'] == 'hindu'] 

df_post = df_post[
    (df_post['caste'] != "don't know") &
    (df_post['caste'].notna())
]

# Define SC/ST flag
df_post['scst'] = df_post['caste'].isin(['schedule caste', 'schedule tribe']).astype(int)

# Create literate binary variable 
df_post['literate'] = df_post['v155'].isin([1, 2]).astype(int)

# Create child sex binary variable
df_post['male'] = (df_post['b4'] == 1).astype(int)

# Create unique district identifier
df_post['state_district'] = df_post['state_name'] + ' - ' + df_post['district_name']

# Define aggregation dictionary
agg_dict = {
    # Outcomes
    'stunting': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    'underweight': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    'wasting': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    
    # Z-scores
    'hw70': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    'hw71': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    'hw72': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    
    # Controls
    'v012': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),  # age
    'v136': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),  # household size
    'wealth_score': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),
    'literate': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),  # literacy
    'male': lambda x: np.average(x, weights=df_post.loc[x.index, 'weight']),  # child sex
    
    # Calculate urban proportion (v025: 1=urban, 2=rural)
    'v025': lambda x: np.average((x == 1).astype(int), weights=df_post.loc[x.index, 'weight']),
    
    # Sample sizes
    'weight': ['sum', 'count'],  # population weighted, sample size
    
    # Keep identifiers
    'state_name': 'first',
    'district_name': 'first',
    'v006': 'first',
    'v007': 'first'
}

agg_df_post = df_post.groupby(['state_district', 'scst']).agg(agg_dict).reset_index()

# Flatten multi-level column names
agg_df_post.columns = ['_'.join(col).strip('_') for col in agg_df_post.columns.values]

# Rename for clarity
agg_df_post.rename(columns={
    'stunting_<lambda>': 'stunting_rate',
    'underweight_<lambda>': 'underweight_rate',
    'wasting_<lambda>': 'wasting_rate',
    'hw70_<lambda>': 'mean_haz',
    'hw71_<lambda>': 'mean_waz',
    'hw72_<lambda>': 'mean_whz',
    'v012_<lambda>': 'mean_age',
    'v136_<lambda>': 'mean_hhsize',
    'wealth_score_<lambda>': 'mean_wealth',
    'literate_<lambda>': 'prop_literate',
    'male_<lambda>': 'prop_male',
    'v025_<lambda>': 'prop_urban',
    'weight_sum': 'pop_weighted',
    'weight_count': 'n_obs',
    'state_name_first': 'state_name',
    'district_name_first': 'district_name',
    'v006_first': 'survey_month',
    'v007_first': 'survey_year'
}, inplace=True)

agg_df_post['post'] = 1

agg_df_post.head()

43343 rows dropped. Remaining rows: 189577


Unnamed: 0,state_district,scst,stunting_rate,underweight_rate,wasting_rate,mean_haz,mean_waz,mean_whz,mean_age,mean_hhsize,...,prop_literate,prop_male,prop_urban,pop_weighted,n_obs,state_name,district_name,survey_month,survey_year,post
0,andaman & nicobar islands - nicobars,0,0.486101,0.590923,0.386869,-169.862766,-194.052737,-115.44526,28.137633,4.856669,...,0.873661,0.704288,0.0,0.157782,10,andaman & nicobar islands,nicobars,1,2020,1
1,andaman & nicobar islands - north & middle an...,0,0.228903,0.330817,0.277325,-95.402469,-141.545154,-118.837583,27.564489,5.247096,...,0.864164,0.501569,0.0,5.641546,73,andaman & nicobar islands,north & middle andaman,10,2019,1
2,andaman & nicobar islands - south andaman,0,0.098122,0.122324,0.096141,-85.982027,-75.787405,-36.618326,27.413979,5.070069,...,0.870729,0.478257,0.68968,11.998584,77,andaman & nicobar islands,south andaman,12,2019,1
3,andaman & nicobar islands - south andaman,1,0.339246,0.0,0.0,-42.735946,-20.995503,19.243816,33.035473,5.643018,...,1.0,1.0,1.0,0.455835,3,andaman & nicobar islands,south andaman,10,2019,1
4,andhra pradesh - anantapur,0,0.364716,0.377471,0.170152,-149.146415,-161.610643,-103.129328,25.886587,5.64288,...,0.694073,0.51971,0.280431,320.522589,119,andhra pradesh,anantapur,10,2019,1


In [15]:
districts_nfhs4 = set(agg_df_pre['state_district'])
districts_nfhs5 = set(agg_df_post['state_district'])

valid_districts  = districts_nfhs4 & districts_nfhs5
print(f"\nDistricts in both surveys: {len(valid_districts )}")



Districts in both surveys: 573


In [16]:
agg_df_pre = agg_df_pre[agg_df_pre['state_district'].isin(valid_districts)]
agg_df_post = agg_df_post[agg_df_post['state_district'].isin(valid_districts)]
full_df = pd.concat([agg_df_pre, agg_df_post], ignore_index=True)

In [17]:
full_df['survey_date'] = pd.to_datetime(
    full_df['survey_year'].astype(str) + '-' + full_df['survey_month'].astype(str),
    format='%Y-%m'
)

# Identify Phase 2 based on survey date
full_df['phase2_temp'] = (full_df['survey_date'] > '2020-04').astype(int)

# Create district-level phase2 mapping
# For each district, determine if it's a Phase 2 district
district_phase_mapping = full_df.groupby('state_district')['phase2_temp'].max().to_dict()

# Apply this mapping to all rows (including NFHS-4)
full_df['phase2'] = full_df['state_district'].map(district_phase_mapping)
full_df = full_df.drop('phase2_temp', axis=1)

# Calculate months since COVID
covid_start = pd.to_datetime('2020-03-01')

full_df['months_since_covid'] = (
    (full_df['survey_date'].dt.year - covid_start.year) * 12 + 
    (full_df['survey_date'].dt.month - covid_start.month)
).clip(lower=0)
full_df = full_df.drop('survey_date', axis=1)

In [18]:
full_df.head()

Unnamed: 0,state_district,scst,stunting_rate,underweight_rate,wasting_rate,mean_haz,mean_waz,mean_whz,mean_age,mean_hhsize,...,prop_urban,pop_weighted,n_obs,state_name,district_name,survey_month,survey_year,post,phase2,months_since_covid
0,andhra pradesh - anantapur,0,0.42442,0.373835,0.142843,-156.453107,-163.093026,-105.822097,25.765874,5.954224,...,0.315953,386.610788,132,andhra pradesh,anantapur,7,2015,0,0,0
1,andhra pradesh - anantapur,1,0.402882,0.442281,0.169819,-169.162711,-185.365409,-127.838352,26.311161,5.083751,...,0.267077,133.414648,46,andhra pradesh,anantapur,7,2015,0,0,0
2,andhra pradesh - chittoor,0,0.300597,0.29089,0.170514,-101.365416,-126.320578,-101.284249,24.976525,6.072965,...,0.358394,382.116796,118,andhra pradesh,chittoor,6,2015,0,0,0
3,andhra pradesh - chittoor,1,0.388943,0.431217,0.192149,-165.868714,-174.765534,-114.718253,25.762278,6.087762,...,0.179219,174.797475,54,andhra pradesh,chittoor,6,2015,0,0,0
4,andhra pradesh - east godavari,0,0.261943,0.227783,0.137352,-109.833649,-121.023186,-84.910702,24.114466,5.067943,...,0.241745,472.713753,118,andhra pradesh,east godavari,5,2015,0,0,0


In [19]:
full_df.to_csv(os.path.join(data_processed_dir, "panel_dataset.csv"), index=False)