In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import statsmodels.formula.api as smf


In [2]:
nsw_lalonde = pd.read_stata("./data/nsw.dta")
nsw_dw = pd.read_stata("./data/nsw_dw.dta")

# loading comparision groups data
psid_1 = pd.read_stata("./data/psid_controls.dta")
psid_2 = pd.read_stata("./data/psid_controls2.dta")
psid_3 = pd.read_stata("./data/psid_controls3.dta")

cps_1 = pd.read_stata("./data/cps_controls.dta")
cps_2 = pd.read_stata("./data/cps_controls2.dta")
cps_3 = pd.read_stata("./data/cps_controls3.dta")

Lalonde's dataset is `nsw_lalonde`

DW subset of Lalonde dataset is `nsw_dw`

Non-experimental estimates of treatment effects come from `psid_1` and `cps_1`. Therefore they are the control groups needed for comparision

the 2 and 3 versions of psid and cps are subsets that resemble treatment group in terms of single preintervention characteristics

In [3]:
# checking the columns of all the data
data_list = [nsw_lalonde, nsw_dw, psid_1, psid_2, psid_3, cps_1, cps_2, cps_3]
for i in data_list:
    print("\n")
    print("###_______________________###") 
    print(i.columns.values)
    print(f"Rows: {i.shape[0]}, columns: {i.shape[1]}")



###_______________________###
['data_id' 'treat' 'age' 'education' 'black' 'hispanic' 'married'
 'nodegree' 're75' 're78']
Rows: 722, columns: 10


###_______________________###
['data_id' 'treat' 'age' 'education' 'black' 'hispanic' 'married'
 'nodegree' 're74' 're75' 're78']
Rows: 445, columns: 11


###_______________________###
['data_id' 'treat' 'age' 'education' 'black' 'hispanic' 'married'
 'nodegree' 're74' 're75' 're78']
Rows: 2490, columns: 11


###_______________________###
['data_id' 'treat' 'age' 'education' 'black' 'hispanic' 'married'
 'nodegree' 're74' 're75' 're78']
Rows: 253, columns: 11


###_______________________###
['data_id' 'treat' 'age' 'education' 'black' 'hispanic' 'married'
 'nodegree' 're74' 're75' 're78']
Rows: 128, columns: 11


###_______________________###
['data_id' 'treat' 'age' 'education' 'black' 'hispanic' 'married'
 'nodegree' 're74' 're75' 're78']
Rows: 15992, columns: 11


###_______________________###
['data_id' 'treat' 'age' 'education' 'blac

# Table 1 Sample Means of Characteristics in NSW and Comparision Samples

In [4]:
# function to calculate mean, ste and layer them one after the other
def make_mean_std_table(data, treat = None):

    def mean_std_table(data, treat, mean_or_sde):
        sample_std_func = lambda x: x.std()/ np.sqrt(x.count())

        nsw_dw_treat = nsw_dw.loc[nsw_dw.treat == 1]
        merged = data.append(nsw_dw_treat)

        for char in data.columns[2:]:
             
            if treat == None:
                reg = sm.OLS(merged[char], sm.add_constant(merged.treat)).fit()

                if mean_or_sde == "mean":
                    yield {
                        "cg": reg.params[0]
                    }
                elif mean_or_sde == "sde":
                    yield {
                        "cg": reg.bse[1]
                    }

            else:
            # calculate mean and sample standard devaition of mean for each group
                # means = data.groupby(treat)[char].mean()
                # sample_std_of_mean = data.groupby(treat)[char].apply(sample_std_func)

                model = smf.ols(f"{char} ~ C({treat}) -1", data=data).fit()
                means = model.params
                sample_std_of_mean = model.bse

                if mean_or_sde == "mean":
                    yield {
                        "Control":  means[0], 
                        "Treated":  means[1], 
                    }
                elif mean_or_sde == "sde":
                    yield {
                        "Control":  sample_std_of_mean[0], 
                        "Treated":  sample_std_of_mean[1], 
                    }

    xx = mean_std_table(data, treat, "mean")
    yy = mean_std_table(data, treat, "sde")
    new_df = pd.DataFrame(xx, index= data.columns[2:])
    new_df_2 = pd.DataFrame(yy, index= data.columns[2:])

    def std_below_mean(mean_df, sde_df):
    
    # function combines the mean and std dataframes.
    # first entry is the mean df and second is the std df 
    # Make sure the names of the covariates are in mean_df.index 

    # returns another dataframe with std below means 
    
        mean_df = mean_df.transpose()
        sde_df = sde_df.transpose()
        # create empty dataframe with same column names and twice as many rows
        new_df = pd.DataFrame({}, 
                            columns=mean_df.columns, 
                            index=[i  for i in  range(0,len(mean_df)*2)])
        # collect all covariate names in the list
        covariate_names = []
        # in the new database put the mean and standard dev values one after the other
        for index_12  in range(0, len(mean_df)):
            new_df.iloc[index_12*2] = mean_df.iloc[index_12]  
            new_df.iloc[(index_12*2)+1] = sde_df.iloc[index_12]
            covariate_names.append(mean_df.index[index_12])  
            covariate_names.append("_")  
        # add covariates names to the datatframe. and there we have it
        new_df.insert(0, "covariates", covariate_names, True)

        return new_df
    
    return std_below_mean(new_df, new_df_2)


In [5]:
# mean and standard deviation of individual datasets
table_1_mean_std_lst = []
for index, d  in enumerate(data_list):
    if index <=1: 
        table_1_mean_std_lst.append(make_mean_std_table(d, treat="treat"))
    elif index >1: 
        table_1_mean_std_lst.append(make_mean_std_table(d, treat=None))

In [8]:
# merging the dataset
table_1 = pd.concat(table_1_mean_std_lst, 
                    keys=["lalonde", "DW", "psid-1", "psid-2", "psid-2",
                            "cps_1", "cps_2", "cps_3"])
table_1.drop("re78", axis = 1)

Unnamed: 0,Unnamed: 1,covariates,age,education,black,hispanic,married,nodegree,re75,re74
lalonde,0,Control,24.447059,10.188235,0.8,0.112941,0.157647,0.814118,3026.682756,
lalonde,1,_,0.3216,0.082623,0.01941,0.0149,0.017898,0.02003,245.912842,
lalonde,2,Treated,24.626263,10.380471,0.801347,0.094276,0.16835,0.73064,3066.098191,
lalonde,3,_,0.384709,0.098837,0.023218,0.017824,0.02141,0.02396,294.169369,
DW,0,Control,25.053846,10.088462,0.826923,0.107692,0.153846,0.834615,1266.909015,2107.026651
DW,1,_,0.440218,0.110988,0.023138,0.017514,0.023243,0.025369,195.466036,333.010296
DW,2,Treated,25.816216,10.345946,0.843243,0.059459,0.189189,0.708108,1532.055313,2095.573693
DW,3,_,0.521878,0.131576,0.027431,0.020763,0.027555,0.030075,231.724587,394.783025
psid-1,0,cg,34.850602,12.116867,0.250602,0.03253,0.866265,0.305221,19063.337668,19428.745805
psid-1,1,_,0.780966,0.230202,0.032696,0.013882,0.026236,0.035074,1001.910615,990.693126


# Table 2 Earnings Comparisions 

## Panel-A

For panel A We need to compare lalonde treatment to lalonde control, psid, and cps. Unadjusted is just the difference and Adjusted is a regression with controls

In [114]:
## Panel A
# iterate over all the datasets. 
# first one compare lanodel treat with control
# from second onwards compare lanodel treat with psid and cps

# the following function does just that but is interchangable between lanodel and DW dataset
def panel_A_treat_eff(main, iden ,growth = "yes"):
    '''
    main -> is the dataframe that we are comparing other to
    iden -> is the identification of main. either "l" or "dw"
    growth option: yes or no
    If yes calculate 1975-1978 earnings diff
    If no calculate 1978 earnings diff
    '''
    comparion_groups = [main, psid_1, psid_2, psid_3, cps_1, cps_2, cps_3]
    controls = "age + age**2 + education + nodegree + black + hispanic"
    if growth == "no":
    

        for index, set in enumerate(comparion_groups):
            if index == 0:
                model_u = smf.ols(formula="re78 ~ 1 + treat", data = main).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + " + controls, data = main).fit()
            
            else:
                x = set.drop(columns="re74", inplace = False)
                y = main.loc[main.treat == 1].append(x, ignore_index=True)

                model_u = smf.ols(formula="re78 ~ 1 + treat", data = y).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + " + controls, data = y).fit()

            yield {
                    f"Unadjusted_{iden}_78": f"{model_u.params[1].round(2)} ({model_u.bse[1].round(2)})",
                    f"Adjusted_{iden}_78": f"{model_a.params[1].round(2)} ({model_a.bse[1].round(2)})",
                    
            }
    
    elif growth == "yes":
        for index, set in enumerate(comparion_groups):
            if index == 0:
                model_u = smf.ols(formula="re78 ~ 1 + treat + re75", data = main).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + re75 +" + controls, data = main).fit()
            
            else:
                x = set.drop(columns="re74", inplace = False)
                y = main.loc[main.treat == 1].append(x, ignore_index=True)

                model_u = smf.ols(formula="re78 ~ 1 + treat + re75", data = y).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + re75 +" + controls, data = y).fit()

            yield {
                    f"Unadjusted_{iden}_dif": f"{model_u.params[1].round(2)} ({model_u.bse[1].round(2)})",
                    f"Adjusted_{iden}_dif": f"{model_a.params[1].round(2)} ({model_a.bse[1].round(2)})",
                    
            }

In [115]:
# convert the dict into a dataframe
df_A_78 = pd.DataFrame(panel_A_treat_eff(nsw_lalonde , "l" ,growth = 'no'))
df_A_dif = pd.DataFrame(panel_A_treat_eff(nsw_lalonde, "l", growth = 'yes'))

panel_a = pd.concat([df_A_78, df_A_dif], axis = 1)
# panel_a["group"] = ["nsw_lalonde", "psid_1", "psid_2", "psid_3", "cps_1", "cps_2", "cps_3"]
panel_a

Unnamed: 0,Unadjusted_l_78,Adjusted_l_78,Unadjusted_l_dif,Adjusted_l_dif
0,886.3 (472.09),799.47 (471.75),878.78 (466.71),807.34 (467.5)
1,-15577.57 (913.33),-8939.86 (980.09),-2380.08 (680.27),-2336.95 (737.58)
2,-4019.6 (781.4),-3327.3 (957.16),-1363.82 (729.04),-1468.54 (888.68)
3,697.06 (759.8),68.98 (969.41),628.91 (757.04),-2.43 (968.34)
4,-8870.31 (562.48),-4556.08 (588.15),-1543.49 (425.69),-1095.37 (450.41)
5,-4194.76 (533.0),-1592.78 (627.18),-1648.94 (458.63),-774.89 (548.06)
6,-1007.82 (539.35),250.33 (675.42),-1204.45 (531.91),-121.33 (671.78)


## Panel-B

For panel B We need to compare DW treatment to DW control, psid, and cps. Unadjusted is just the difference and Adjusted is a regression with controls

In [116]:
## Panel B
# iterate over all the datasets. 
# first one compare nsw_dw treat with control
# from second onwards compare nsw_dw treat with psid and cps. No need to do it again.

# we already have the function in place. Just need to call it

# convert the dict into a dataframe
df_B_78 = pd.DataFrame(panel_A_treat_eff(nsw_dw , "dw" ,growth = 'no'))
df_B_dif = pd.DataFrame(panel_A_treat_eff(nsw_dw, "dw", growth = 'yes'))

panel_b = pd.concat([df_B_78, df_B_dif], axis = 1)
# panel_a["group"] = ["nsw_lalonde", "psid_1", "psid_2", "psid_3", "cps_1", "cps_2", "cps_3"]
panel_b


Unnamed: 0,Unadjusted_dw_78,Adjusted_dw_78,Unadjusted_dw_dif,Adjusted_dw_dif
0,1794.34 (632.85),1676.08 (636.71),1750.15 (632.09),1632.54 (636.66)
1,-15204.78 (1154.61),-8453.47 (1174.28),-581.83 (841.26),-419.17 (877.79)
2,-3646.81 (959.7),-2568.82 (1117.67),720.5 (886.35),676.18 (1018.73)
3,1069.85 (899.62),690.84 (1117.3),1369.83 (896.97),902.98 (1113.72)
4,-8497.52 (712.02),-4285.48 (727.89),-77.71 (536.6),547.03 (556.57)
5,-3821.97 (670.6),-1216.91 (755.05),-262.97 (573.65),858.09 (656.7)
6,-635.03 (657.14),829.26 (810.47),-90.8 (641.4),1083.71 (789.37)


## Panel - C

In [125]:
# have to make slight changes to the function in order to acconomodate RE74 in the regressions
def panel_C_treat_eff(growth = "yes"):
    '''
    growth option: yes or no
    If yes calculate 1975-1978 earnings diff
    If no calculate 1978 earnings diff
    '''
    comparion_groups = [nsw_dw, psid_1, psid_2, psid_3, cps_1, cps_2, cps_3]
    controls = "age + age**2 + education + nodegree + black + hispanic + re74"
    if growth == "no":
    

        for index, set in enumerate(comparion_groups):
            if index == 0:
                model_u = smf.ols(formula="re78 ~ 1 + treat + re74", data = nsw_dw).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + " + controls, data = nsw_dw).fit()
            
            else:
                y = nsw_dw.loc[nsw_dw.treat == 1].append(set, ignore_index=True)

                model_u = smf.ols(formula="re78 ~ 1 + treat + re74", data = y).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + " + controls, data = y).fit()

            yield {
                    "Unadjusted_dw_78": f"{model_u.params[1].round(2)} ({model_u.bse[1].round(2)})",
                    "Adjusted_dw_78": f"{model_a.params[1].round(2)} ({model_a.bse[1].round(2)})",
                    
            }
    
    elif growth == "yes":
        for index, set in enumerate(comparion_groups):
            if index == 0:
                model_u = smf.ols(formula="re78 ~ 1 + treat + re75 + re74", data = nsw_dw).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + re75 +" + controls, data = nsw_dw).fit()
            
            else:
                y = nsw_dw.loc[nsw_dw.treat == 1].append(set, ignore_index=True)

                model_u = smf.ols(formula="re78 ~ 1 + treat + re75 + re74", data = y).fit()
                model_a = smf.ols(formula="re78 ~ 1 + treat + re75 +" + controls, data = y).fit()

            yield {
                    "Unadjusted_dw_dif": f"{model_u.params[1].round(2)} ({model_u.bse[1].round(2)})",
                    "Adjusted_dw_dif": f"{model_a.params[1].round(2)} ({model_a.bse[1].round(2)})",
                    
            }

In [126]:
df_C_78 = pd.DataFrame(panel_C_treat_eff(growth = 'no'))
df_C_dif = pd.DataFrame(panel_C_treat_eff(growth = 'yes'))

panel_c = pd.concat([df_C_78, df_C_dif], axis = 1)

panel_c

Unnamed: 0,Unadjusted_dw_78,Adjusted_dw_78,Unadjusted_dw_dif,Adjusted_dw_dif
0,1795.55 (631.21),1691.01 (635.26),1772.6 (632.61),1673.48 (637.71)
1,-1457.56 (893.32),-1038.1 (927.63),219.5 (829.01),105.07 (863.26)
2,970.93 (954.86),497.88 (1062.22),1725.35 (905.3),1301.65 (1016.59)
3,2238.88 (905.95),1493.73 (1108.8),2228.0 (907.13),1491.57 (1110.3)
4,-832.51 (554.12),-47.73 (571.42),169.05 (530.17),684.29 (546.86)
5,-314.04 (581.36),916.12 (662.44),50.14 (566.99),1173.76 (645.02)
6,811.11 (637.54),1517.04 (779.48),798.13 (635.69),1500.52 (776.58)


In [129]:
## Table 2
table_2 = pd.concat([panel_a, panel_b, panel_c], axis = 1)
# renaming index
table_2.index = ["nsw", "psid 1", "psid 2", "psid 3", "cps 1", "cps 2", "cps 3"]
table_2


Unnamed: 0,Unadjusted_l_78,Adjusted_l_78,Unadjusted_l_dif,Adjusted_l_dif,Unadjusted_dw_78,Adjusted_dw_78,Unadjusted_dw_dif,Adjusted_dw_dif,Unadjusted_dw_78.1,Adjusted_dw_78.1,Unadjusted_dw_dif.1,Adjusted_dw_dif.1
nsw,886.3 (472.09),799.47 (471.75),878.78 (466.71),807.34 (467.5),1794.34 (632.85),1676.08 (636.71),1750.15 (632.09),1632.54 (636.66),1795.55 (631.21),1691.01 (635.26),1772.6 (632.61),1673.48 (637.71)
psid 1,-15577.57 (913.33),-8939.86 (980.09),-2380.08 (680.27),-2336.95 (737.58),-15204.78 (1154.61),-8453.47 (1174.28),-581.83 (841.26),-419.17 (877.79),-1457.56 (893.32),-1038.1 (927.63),219.5 (829.01),105.07 (863.26)
psid 2,-4019.6 (781.4),-3327.3 (957.16),-1363.82 (729.04),-1468.54 (888.68),-3646.81 (959.7),-2568.82 (1117.67),720.5 (886.35),676.18 (1018.73),970.93 (954.86),497.88 (1062.22),1725.35 (905.3),1301.65 (1016.59)
psid 3,697.06 (759.8),68.98 (969.41),628.91 (757.04),-2.43 (968.34),1069.85 (899.62),690.84 (1117.3),1369.83 (896.97),902.98 (1113.72),2238.88 (905.95),1493.73 (1108.8),2228.0 (907.13),1491.57 (1110.3)
cps 1,-8870.31 (562.48),-4556.08 (588.15),-1543.49 (425.69),-1095.37 (450.41),-8497.52 (712.02),-4285.48 (727.89),-77.71 (536.6),547.03 (556.57),-832.51 (554.12),-47.73 (571.42),169.05 (530.17),684.29 (546.86)
cps 2,-4194.76 (533.0),-1592.78 (627.18),-1648.94 (458.63),-774.89 (548.06),-3821.97 (670.6),-1216.91 (755.05),-262.97 (573.65),858.09 (656.7),-314.04 (581.36),916.12 (662.44),50.14 (566.99),1173.76 (645.02)
cps 3,-1007.82 (539.35),250.33 (675.42),-1204.45 (531.91),-121.33 (671.78),-635.03 (657.14),829.26 (810.47),-90.8 (641.4),1083.71 (789.37),811.11 (637.54),1517.04 (779.48),798.13 (635.69),1500.52 (776.58)
