In [None]:
import os
import re
import sys
import json
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer

import warnings
# Installs a process-wide "ignore" filter with no category, message or module
# restriction, so every warning raised after this cell runs is suppressed.
# NOTE(review): this also hides warnings emitted by the libraries imported
# above; consider narrowing the filter to specific categories once the
# noisy ones are known.
warnings.filterwarnings("ignore")

# ===================== User-tunable settings =====================

# Input CSV files, given as paths relative to the working directory.
# More files can be appended, e.g. "/path/to/IHDS_II_individual.csv"
# or "/path/to/IHDS_II_household.csv".
CSV_PATHS = [
    "states_datasets/data_subset_Uttar Pradesh 09.csv",
    "states_datasets/data_subset_Gujarat 24.csv",
    "states_datasets/data_subset_Maharashtra 27.csv",
]

# Identifier columns, ordered from the broadest (state) to the narrowest
# (person). Presumably used as join keys when several files are combined
# -- the consumer is not visible in this cell; TODO confirm.
MERGE_KEYS = [
    "STATEID",
    "DISTID",
    "PSUID",
    "HHID",
    "IDPERSON",
]

# Candidate outcome columns, in the order they are listed here.
# NOTE(review): whether the first available one is picked is decided by code
# outside this cell -- confirm.
TARGET_CANDIDATES = [
    "INCOMEPC",
    "INCOME",
    "INCEARN",
]

# Numeric tuning constants. Their exact semantics live in the code that reads
# them (not shown in this cell); the names suggest PCA / correlation-pruning
# / categorical-cardinality limits -- TODO confirm against the consumers.
PC1_KEEP_THRESHOLD = 0.60
CUMVAR_TARGET = 0.70
MAX_COMPONENTS = 2
INTRA_BLOCK_CORR_DROP = 0.90
MAX_CATS = 20



In [4]:
# ----- Thematic column blocks: each list groups column codes by topic -----

# Codes RO3 through RO10.
Basic_info_list = [
    "RO3", "RO4", "RO5", "RO6",
    "RO7", "RO8", "RO9", "RO10",
]

# Farm-related codes (FM* plus FMHOURS and WKFARM).
Farm = [
    "FM1", "FM36Y", "FM37", "FM38",
    "FMHOURS", "WKFARM", "FM39AY",
]

# Animal-work codes (AN*).
Animal_work = [
    "AN1", "AN5Y", "AN6", "AN7Y",
]

# Non-farm codes (NF*/NFBN* plus NFDAYS and NFHOURS).
Non_farm = [
    "NF1", "NFBN1", "NF9", "NF12", "NF13", "NF15Y",
    "NFBN21", "NF29", "NF32", "NF33", "NF35Y",
    "NFBN41", "NF49", "NF52", "NF53",
    "NFDAYS", "NFHOURS", "NF55Y",
]


In [None]:
# Trimmed farm block: four of the codes that also appear in Farm
# (FM37, FM38 and FMHOURS are left out).
# NOTE(review): Regression_list in this notebook references Farm, not
# Farm_edit -- confirm which of the two is meant to be used.
Farm_edit = [
    "FM1",
    "FM36Y",
    "WKFARM",
    "FM39AY",
]

In [5]:
# Eight IN11S<k>/IN13S<k> pairs (k = 1..8) followed by IN18 through IN24.
Income = [
    "IN11S1", "IN13S1",
    "IN11S2", "IN13S2",
    "IN11S3", "IN13S3",
    "IN11S4", "IN13S4",
    "IN11S5", "IN13S5",
    "IN11S6", "IN13S6",
    "IN11S7", "IN13S7",
    "IN11S8", "IN13S8",
    "IN18", "IN19", "IN20", "IN21", "IN22", "IN23", "IN24",
]

# Education codes; EDUC7 and EDUNDER1 sit between ED6 and ED7 in this order.
Education = [
    "ED2", "ED3", "ED4", "ED5", "ED6",
    "EDUC7", "EDUNDER1",
    "ED7", "ED8", "ED9", "ED10", "ED11", "ED12", "ED13",
]

# MM* codes.
Technology = [
    "MM7Y", "MM8", "MM9",
    "MM12Y", "MM13", "MM14",
]

# TA* codes; TA8-TA10 each come as an A/B pair.
Teacher_and_school = [
    "TA3", "TA4", "TA5", "TA6",
    "TA8A", "TA8B",
    "TA9A", "TA9B",
    "TA10A", "TA10B",
]


In [6]:
# CS3, CS3Y, CS4-CS13 and CS16-CS28 (CS14/CS15 are not listed).
College_school = [
    "CS3", "CS3Y",
    "CS4", "CS5", "CS6", "CS7", "CS8", "CS9", "CS10", "CS11", "CS12", "CS13",
    "CS16", "CS17", "CS18", "CS19", "CS20", "CS21", "CS22",
    "CS23", "CS24", "CS25", "CS26", "CS27", "CS28",
]

# CH2, CH3, CH4A/B, CH5-CH20 and CH22 (CH21 is not listed).
Child_and_school = [
    "CH2", "CH3",
    "CH4A", "CH4B",
    "CH5", "CH6", "CH7", "CH8", "CH9", "CH10", "CH11", "CH12",
    "CH13", "CH14", "CH15", "CH16", "CH17", "CH18", "CH19", "CH20",
    "CH22",
]

# SM3-SM12, the SM14/SM15 A/B pairs, then SM16-SM22 (SM13 is not listed).
Short_term_Morbidity = [
    "SM3", "SM4", "SM5", "SM6", "SM7", "SM8", "SM9", "SM10", "SM11", "SM12",
    "SM14A", "SM14B",
    "SM15A", "SM15B",
    "SM16", "SM17", "SM18", "SM19", "SM20", "SM21", "SM22",
]

# MB3-MB19, the MB21/MB22 A/B pairs, then MB23-MB29 (MB20 is not listed).
Major_Morbidity = [
    "MB3", "MB4", "MB5", "MB6", "MB7", "MB8", "MB9", "MB10", "MB11",
    "MB12", "MB13", "MB14", "MB15", "MB16", "MB17", "MB18", "MB19",
    "MB21A", "MB21B",
    "MB22A", "MB22B",
    "MB23", "MB24", "MB25", "MB26", "MB27", "MB28", "MB29",
]


In [7]:
# AD3 through AD9.
Activity_difficulty = [
    "AD3", "AD4", "AD5", "AD6",
    "AD7", "AD8", "AD9",
]

# TO3 through TO6.
Tobacco_and_other = [
    "TO3", "TO4", "TO5", "TO6",
]

# AP2, AP3 and AP5-AP9 (AP4 is not listed).
Anthropometry = [
    "AP2", "AP3",
    "AP5", "AP6", "AP7", "AP8", "AP9",
]

# Single-code block.
Eligible_women = [
    "EW3Y",
]


In [8]:
# Location / price-level columns. Listed in Control_blocks under "Urban".
Urban = [
    "URBAN2011", "URBAN4_2011",
    "METRO", "METRO6",
    "POVLINE2005", "POVLINE2012",
    "DEFLATOR",
]

# Household composition, head-of-household age, worker counts and household
# education columns. Listed in Control_blocks under "Household_details".
Household_details = [
    "NPERSONS", "EWELIGIBLE", "EWQELIGIBLE",
    "MHEADAGE", "FHEADAGE",
    "NADULTM", "NADULTF",
    "NCHILDM", "NCHILDF",
    "NTEENM", "NTEENF",
    "NELDERM", "NELDERF",
    "NMARRIEDM", "NMARRIEDF",
    "NWKNONAG", "NWKAGLAB", "NWKSALARY", "NWKBUSINESS",
    "NWKFARM", "NWKANIMAL",
    "NWKNREGA", "NWKNREGA4", "NWKNONNREGA", "NWKANY5",
    "NNR",
    "HHEDUC", "HHEDUCM", "HHEDUCF",
]

# Listed in Control_blocks under "Caste_and_Religion".
Caste_and_Religion = ["ID11", "ID13", "GROUPS"]

# Business-related NF* codes. `Business` is the correctly spelled name;
# `Buiseness` (the original, misspelled identifier) is kept as an alias bound
# to the very same list object so existing references keep working.
Business = ["NF5", "NF25", "NF45"]
Buiseness = Business


In [9]:
# Household consumption, asset and income columns.
# NOTE(review): this list contains "INCOMEPC", "INCOME" and "INCEARN", which
# are also the entries of TARGET_CANDIDATES -- keep that in mind before using
# this block as predictors.
Household_financial = [
    "COTOTAL", "COPC",
    "ASSETS", "ASSETS2005",
    "INCCROP", "INCAGPROP", "INCANIMAL", "INCAG",
    "INCBUS", "INCOTHER", "INCEARN", "INCBENEFITS", "INCREMIT",
    "INCOME", "INCOMEPC",
    "RSUNEARN",
]

# WK* participation codes.
Work_participation = [
    "WKANIMAL", "WKBUSINESS", "WKNREGA",
    "WKDAYS", "WKHOURS", "WKANY5",
]

# WS* codes plus the WK*/WSEARN* columns for the AGLAB, NONAG, SALARY and
# NREGA groups.
WorkSpace = [
    "WS3NM", "WS4", "WS5",
    "WS7", "WS7MONTHS",
    "WS8", "WS8YEAR",
    "WS9",
    "WS10", "WS10ANNUAL",
    "WSEARN", "WSEARNHOURLY",
    "WS11", "WS11MEALS", "WS11HOUSE", "WS11MEALSRS", "WS11HOUSERS",
    "WS12", "WS13", "WS14", "WS15",
    "WS7AGLAB", "WS8AGLAB", "WSEARNAGLAB", "WKAGLAB",
    "WS7NONAG", "WS8NONAG", "WSEARNNONAG", "WKNONAG",
    "WS7SALARY", "WS8SALARY", "WSEARNSALARY", "WKSALARY",
    "WS7NREGA", "WS8NREGA", "WSEARNNREGA",
]

# Earnings columns only.
# NOTE(review): every entry except "WSEARNANNUAL" also appears in WorkSpace,
# and both lists are included in Regression_list, so those columns show up in
# two blocks. "WSEARNANNUAL" appears nowhere else in this notebook section
# (WorkSpace has "WS10ANNUAL") -- confirm the column name against the data.
WSEARN = [
    "WSEARNAGLAB",
    "WSEARNNONAG",
    "WSEARNSALARY",
    "WSEARNNREGA",
    "WSEARNANNUAL",
    "WSEARN",
]


In [10]:
# INC* household income columns for the NONAG, AGLAB, SALARY, NREGA and
# NONNREGA groups.
Income_Household = [
    "INCNONAG",
    "INCAGLAB",
    "INCSALARY",
    "INCNREGA",
    "INCNONNREGA",
]

# MG* codes followed by the *5 and *1 year / count / months triplets.
Migrants_data = [
    "MG4", "MG5", "MG6", "MG7", "MG8", "MG9NM", "MG10", "MG11",
    "MGYEAR5", "NMIG5", "MGMONTHS5",
    "MGYEAR1", "NMIG1", "MGMONTHS1",
]



In [11]:
# Ordered (block name, column list) pairs built from the thematic lists
# defined in the earlier cells. Presumably iterated by the modelling code
# further down -- the consumer is not visible here; TODO confirm.
# NOTE(review): "Farm" points at Farm (not Farm_edit), and WorkSpace and
# WSEARN share several column codes.
Regression_list = [
    ("Farm", Farm),
    ("Animal_work", Animal_work),
    ("Non_farm", Non_farm),
    ("Education", Education),
    ("Technology", Technology),
    ("Teacher_and_school", Teacher_and_school),
    ("College_school", College_school),
    ("Child_and_school", Child_and_school),
    ("Short_term_Morbidity", Short_term_Morbidity),
    ("Major_Morbidity", Major_Morbidity),
    ("Activity_difficulty", Activity_difficulty),
    ("Tobacco_and_other", Tobacco_and_other),
    ("Anthropometry", Anthropometry),
    ("WorkSpace", WorkSpace),
    ("WSEARN", WSEARN),
    ("Income_Household", Income_Household),
]

# Ordered (block name, column list) pairs for the control-variable blocks.
Control_blocks = [
    ("Basic_info", Basic_info_list),
    ("Household_details", Household_details),
    ("Caste_and_Religion", Caste_and_Religion),
    ("Urban", Urban),
]
