In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn  as sns
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col
from statsmodels.api import stats
from linearmodels.panel import PanelOLS


In [2]:
# Read the Excel workbook, then keep only the rows flagged att_missing == 0
# (rows with a missing attribute are removed).
df = pd.read_excel("UYdata.xlsx")
df = df[df["att_missing"].eq(0)]

In [3]:
# Fill missing distance from the sub-district headquarters with the Google
# distance.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column: that form is a chained in-place
# operation on an intermediate object, which pandas deprecates and which does
# not update `df` once copy-on-write is active.
df = df.assign(
    subdistricthqdist_c=df["subdistricthqdist_c"].fillna(df["google_km"])
)

# Table 4

Set of Household Controls:
1. household size: hhnum_b
2. dummy for household head’s education above primary level: edu_hhhead_b 
3. Household head’s occupation: occu_b
4. primary cook’s age: age_pc_b
5. dummy for primary cook’s education above primary level: pc_edu_b
6. dummy for non-Hindu household: hindu_b
7. household caste: hh_caste_b
8. household wealth index: assets_index
   
Village level controls include
1. dummies for presence of pvt. primary school: education_b
2. access to health sub-centres: healthstatus_b 
3. all weather road: road_b
4. proportion of irrigated land: irrigation_b
5. distance to block headquarters: subdistricthqdist_c (missing values filled from google_km)

Sub-district fixed effects (FE): tehsil_e <br>
Dummy for whether treatment assigned: treatment012 <br>
Dummy for whether treatment assigned to H: treatment_h <br>
Dummy for whether treatment assigned to H+S: treatment_hs <br>
Villages added to C from H/H+S: noncompliance_village

## Cleaning data

In [4]:
# Select every dependent and independent variable used for Table 4.
# The column order is significant: the regression cells slice this frame
# positionally with iloc, so the groups below must stay in this sequence.
# ("noncompliance_village" is currently not included.)
all_controls = df[
    # cluster identifier
    ["villagecode_str"]
    # household (HH) controls
    + ["hhnum_b", "edu_hhhead_b", "occu_b", "age_pc_b",
       "pc_edu_b", "hindu_b", "hh_caste_b", "assets_index"]
    # village controls
    + ["education_b", "healthstatus_b", "road_b", "irrigation_b",
       "subdistricthqdist_c"]
    # totrefills_omc_b2 and the treatment dummies
    + ["totrefills_omc_b2", "treatment012", "treatment_h", "treatment_hs"]
    # dependent variable
    + ["totrefills_omc_e2"]
    # fixed effects
    + ["tehsil_e"]
]

In [5]:
# The total LPG refills are reported only for those households that could be
# matched with OMC sales records (N = 2729). Therefore drop all observations
# whose totrefills_omc_b2 is missing.
# dropna(subset=...) replaces the hand-rolled index/apply(np.isnan) lookup: it
# selects rows by mask rather than by index label and also works for
# non-float dtypes.
all_controls = all_controls.dropna(subset=["totrefills_omc_b2"])

In [7]:
# function for fixed effects model
def fixed_effects_model(endog = None,exog = None,data= None, 
                        group = None, cluster = None):
    '''
    Fit a fixed effects (within) regression with cluster-robust errors.

    The dependent variable and every regressor are demeaned within each
    level of `group`, then an OLS without a constant is fitted on the
    demeaned values. Call summary() on the returned object to see the fit.

    endog: (string) column name of the dependent variable in `data`
    exog: (list or pandas Index) column names of all independent variables
    data: Pandas dataframe containing the endog, exog, group and cluster
          columns
    group: (string) the column name to apply fixed effects on
    cluster: (string) the column name to cluster se on. If same as group,
            specify again

    Returns the fitted statsmodels results object.

    NOTE(review): the fit is a plain OLS on demeaned data; no degrees of
    freedom adjustment for the absorbed group effects is made here.
    '''
    # Demean only the columns that enter the model. Averaging the whole frame
    # would also try to average non-numeric columns such as the cluster id,
    # which fails on recent pandas versions.
    model_cols = [endog, *exog]

    # transform("mean") returns the group mean for every row in the original
    # row order and with the original index, so the demeaned frame stays
    # aligned row by row with data[cluster] used for clustering below.
    group_means = data.groupby(group)[model_cols].transform("mean")
    demeaned_val = data[model_cols] - group_means

    FE =  sm.OLS(
                    endog= demeaned_val[endog],
                    exog= demeaned_val[exog],
                    # `missing` is the documented OLS keyword; "none" performs
                    # no NaN checking or dropping.
                    missing = "none"
                ).fit(cov_type='cluster',
                        cov_kwds={'groups': data[cluster]}
                     )

    return FE

## Regressions Table 4: Impact of information campaign on annual LPG refill consumption

In [8]:
# Non fixed effects, overall treatment.
# iloc[:, 1:-4] skips the first column (the cluster id) and the last four
# columns of all_controls, leaving the household and village controls,
# totrefills_omc_b2 and treatment012 as regressors.
lpg_refill_nonfe_o =  sm.OLS(
    endog= all_controls["totrefills_omc_e2"],
    exog= sm.add_constant(all_controls.iloc[:,1:-4]),
    # `drop_missing` is not an OLS parameter; the documented keyword is
    # `missing`, and "none" performs no NaN checking or dropping.
    missing = "none"
).fit(cov_type='cluster', cov_kwds={'groups': all_controls["villagecode_str"]})

# variable of interest: treatment012, totrefills_omc_b2

In [9]:
# Non fixed effects, individual treatments.
# iloc[:, 1:-2] skips the first column (the cluster id) and the last two
# columns of all_controls; treatment012 is then removed so that only the
# separate treatment_h / treatment_hs dummies enter the regression.
lpg_refill_nonfe_i =  sm.OLS(
    endog= all_controls["totrefills_omc_e2"],
    exog= sm.add_constant(all_controls.iloc[:,1:-2].drop("treatment012", axis=1)),
    # `drop_missing` is not an OLS parameter; the documented keyword is
    # `missing`, and "none" performs no NaN checking or dropping.
    missing = "none"
).fit(cov_type='cluster', cov_kwds={'groups': all_controls["villagecode_str"]})

# variable of interest: treatment_h, treatment_hs, totrefills_omc_b2

In [10]:
# Fixed effects model, overall treatment: the regressors are every column of
# all_controls except the first one and the last four; effects are applied on
# tehsil_e and standard errors are clustered on villagecode_str.
lpg_refill_fe_o = fixed_effects_model(
    data=all_controls,
    endog="totrefills_omc_e2",
    exog=all_controls.columns[1:-4],
    group="tehsil_e",
    cluster="villagecode_str",
)

In [11]:
# Fixed effects model, individual treatments: the regressors are every column
# of all_controls except the first one and the last two, with treatment012
# removed; effects are applied on tehsil_e and standard errors are clustered
# on villagecode_str.
lpg_refill_fe_i = fixed_effects_model(
    data=all_controls,
    endog="totrefills_omc_e2",
    exog=all_controls.columns[1:-2].drop("treatment012"),
    group="tehsil_e",
    cluster="villagecode_str",
)

In [20]:
# Extra footer row: the number of observations of each model, as an integer.
info_dict = {'No. observations': lambda x: f"{int(x.nobs):d}"}

# Side-by-side comparison of the four Table 4 regressions. Only the regressors
# named in regressor_order are shown (drop_omitted=True); the remaining
# controls are left out of the printed table.
results_table = summary_col(
    results=[
        lpg_refill_nonfe_o,
        lpg_refill_nonfe_i,
        lpg_refill_fe_o,
        lpg_refill_fe_i,
    ],
    model_names=[
        "Non FE Overall",
        "Non FE Ind",
        "FE Overall",
        "FE Ind",
    ],
    regressor_order=[
        "treatment012",
        "treatment_h",
        "treatment_hs",
        "totrefills_omc_b2",
    ],
    drop_omitted=True,
    stars=True,
    float_format='%0.3f',
    info_dict=info_dict,
)

In [21]:
# Show the Table 4 comparison (the four models side by side) as plain text.
print(results_table)


                  Non FE Overall Non FE Ind FE Overall  FE Ind 
---------------------------------------------------------------
treatment012      0.069                     0.095              
                  (0.100)                   (0.092)            
treatment_h                      0.007                 0.015   
                                 (0.115)               (0.112) 
treatment_hs                     0.134                 0.179   
                                 (0.121)               (0.109) 
totrefills_omc_b2 0.804***       0.803***   0.801***   0.800***
                  (0.018)        (0.018)    (0.018)    (0.018) 
R-squared         0.593          0.593      0.591      0.592   
R-squared Adj.    0.591          0.591      0.589      0.589   
No. observations  2729           2729       2729       2729    
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


# Table 5

In [None]:
# Variables for Table 5: the same column set and order as used for Table 4,
# taken from df. ("noncompliance_village" is currently not included.)
tabel_5_controls = df[
    # cluster identifier
    ["villagecode_str"]
    # household (HH) controls
    + ["hhnum_b", "edu_hhhead_b", "occu_b", "age_pc_b",
       "pc_edu_b", "hindu_b", "hh_caste_b", "assets_index"]
    # village controls
    + ["education_b", "healthstatus_b", "road_b", "irrigation_b",
       "subdistricthqdist_c"]
    # totrefills_omc_b2 and the treatment dummies
    + ["totrefills_omc_b2", "treatment012", "treatment_h", "treatment_hs"]
    # totrefills_omc_e2
    + ["totrefills_omc_e2"]
    # fixed effects
    + ["tehsil_e"]
]

In [17]:
# All household and village controls, joined with " + " into a single string.
control = " + ".join([
    # household controls
    "hhnum_b",
    "edu_hhhead_b",
    "occu_b",
    "age_pc_b",
    "pc_edu_b",
    "hindu_b",
    "hh_caste_b",
    "assets_index",
    # village controls
    "education_b",
    "healthstatus_b",
    "road_b",
    "irrigation_b",
    "subdistricthqdist_c",
])
