# Paper Notes (Angrist & Evans, AER 1998)

Hard to draw causal inference on fertility->labor suply since fertility is both dependent & exogenous. Robus negative correlations are found historically, though

Angrist & Evans use a IVreg strategy. Parents prefer mixed siblings; same sex siblings increase likelihood of parents having another child. IV is dummy(first two children same sex). Previous literature tended to use first born twins as instruments; here we can compare estimates (note third born is older in sex-mix instrument)

In [1]:
# Import statistical libraries

import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from numba import jit

In [110]:
# Import Data

df = pd.read_stata('c:/users/matt/desktop/Projet Empirique/data_angrist_evans.dta')

# Data cleanup

In [111]:
# Clean NA values on children dummy variables
df['firstborn_male'] = df['firstborn_male'].fillna(0).astype(int)
df['firstborn_fem'] = df['firstborn_fem'].fillna(0).astype(int)
df['secondborn_male'] = df['secondborn_male'].fillna(0).astype(int)
df['secondborn_fem'] = df['secondborn_fem'].fillna(0).astype(int)


# Clean nonsensical data
df = df[df['secondborn_male'] == (1-df['secondborn_fem'])]
df = df[df['firstborn_male'] == (1-df['firstborn_fem'])]



# Create dummy variables for two boys/girls
df['two_boys'] = ((df['firstborn_male'] == 1) & (df['secondborn_male'] == 1)).astype(int)
df['two_girls'] = ((df['firstborn_fem'] == 1) & (df['secondborn_fem'] == 1)).astype(int)

# dummy variable for different gendered kids
df['one_boy_one_girl'] = (((df['firstborn_male'] == 1) & (df['secondborn_fem'] == 1)) | \
                         ((df['firstborn_fem'] == 1) & (df['secondborn_male'] == 1))).astype(int)
    
####################
# The Instrument   #
####################
    
# Same sex instrument
df['same_sex'] = (df['one_boy_one_girl'] == False).astype(int)


# Change to integer valued data
df['nchild'] = df['nchild'].replace('9+', 9).astype(int)
df['eldch'] = df['eldch'].replace('Less than 1 year old', 1).astype(int)
df['age'] = df['age'].astype(int)

# more than two kids dummy
df['>2'] = (df['nchild'] > 2).astype(int)

# Worked for pay binary
df['worked_for_pay'] = (df['wkswork2'] != 'N/A')
    
# Age @ First Birth
df['AAFB'] = (df['age'] - df['eldch']).astype(int)


# Race dummies
df['black'] = (df['race'] == 'Black/Negro').astype(int)
df['other_race'] = (df['race'] == 'Other race, nec').astype(int)

# Mean based variable of weeks worked
df['wkswork'] = \
df['wkswork2'].replace('N/A', 0)\
              .replace('1-13 weeks', 6.5)\
              .replace('14-26 weeks', 20.5)\
              .replace('27-39 weeks', 33.5)\
              .replace('40-47 weeks', 43.5)\
              .replace('48-49 weeks', 48.5)\
              .replace('50-52 weeks', 51)
# Same as above for hours worked   
df['hrswork'] = \
df['hrswork2'].replace('N/A', 0)\
              .replace('1-14 hours', 7)\
              .replace('15-29 hours', 22.5)\
              .replace('30-34 hours', 32.5)\
              .replace('35-39 hours', 37.5)\
              .replace('40 hours', 48.5)\
              .replace('41-48 hours', 45.5)\
              .replace('49-59 hours', 55)\
              .replace('60+ hours', 65)              
# Mean based variable of years of schooling
df['yrsshool'] = \
df['educ'].replace('N/A or no schooling', 0)\
              .replace('Nursery school to grade 4', 3)\
              .replace('Grade 5, 6, 7, or 8', 7)\
              .replace('Grade 9', 9)\
              .replace('Grade 10', 10)\
              .replace('Grade 11', 11)\
              .replace('Grade 12', 12)\
              .replace('1 year of college', 13)\
              .replace('2 years of college', 14)\
              .replace('4 years of college', 16)\
              .replace('5+ years of college', 18.5)

# dataset subsampled on married women
married = df[df['marst']  == "Married, spouse present"].append(df[df['marst']  == "Married, spouse absent"])

# THE PROBLEM

In [91]:
df.columns

Index(['year', 'serial', 'subsamp', 'hhwt', 'region', 'statefip', 'urban',
       'city', 'cntygp98', 'farm', 'mortgage', 'valueh', 'perwt', 'slwt',
       'slrec', 'famsize', 'nchild', 'nchlt5', 'eldch', 'yngch', 'nsibs',
       'sex', 'age', 'marst', 'birthyr', 'chborn', 'race', 'hispan', 'bpl',
       'bpld', 'mbpl', 'mbpld', 'fbpl', 'fbpld', 'school', 'higrade',
       'higraded', 'educ', 'educd', 'empstat', 'empstatd', 'labforce', 'occ',
       'occ1990', 'ind', 'ind1990', 'classwkr', 'wkswork2', 'hrswork2',
       'inctot', 'ftotinc', 'incwage', 'disabwrk', 'firstborn_male',
       'firstborn_fem', 'secondborn_male', 'secondborn_fem', 'two_boys',
       'two_girls', 'one_boy_one_girl', 'same_sex', '>2', 'worked_for_pay',
       'AAFB', 'black', 'other_race', 'wkswork', 'hrswork', 'yrsshool'],
      dtype='object')

# number 1

In [92]:
nchild = df['nchild'].astype(int)
avg_first_boy = df['firstborn_male']
avg_sec_boy = df['secondborn_male']
two_boys = df['two_boys']
two_girls = df['two_girls']
same_sex = df['same_sex']
age = df['age'].astype(int)
AAFB = (df['age'].astype(int) - df['eldch'])
worked_for_pay = df['worked_for_pay']
weeks_worked = df['wkswork']
hours_worked = df['hrswork']
wage_income = df['incwage']
family_income = df['ftotinc']
non_wife_income = (df['ftotinc'] - df['incwage'])


print('-----ALL WOMEN-------')
print('n_child %.4f' %nchild.mean(), "(%.4f)" %nchild.std())
print('avg_first_boy %.4f' %avg_first_boy.mean(), "(%.4f)" %avg_first_boy.std())
print('avg_sec_boy %.4f' %avg_sec_boy.mean(), "(%.4f)" %avg_sec_boy.std())
print('two_boys %.4f' %two_boys.mean(), "(%.4f)" %two_boys.std())
print('two_girls %.4f' %two_girls.mean(), "(%.4f)" %two_girls.std())
print('same_sex %.4f' %same_sex.mean(), "(%.4f)" %same_sex.std())
print('age %.4f' %age.mean(), "(%.4f)" %age.std())
print('age @ first birth %.4f' %AAFB.mean(), "(%.4f)" %AAFB.std())
print('worked_for_pay %.4f' %worked_for_pay.mean(), "(%.4f)" %worked_for_pay.std())
print('wksworked %.4f' %weeks_worked.mean(), "(%.4f)" %weeks_worked.std())
print('hrsworked %.4f' %hours_worked.mean(), "(%.4f)" %hours_worked.std())
print('wage income %.4f' %wage_income.mean(), "(%.4f)" %wage_income.std())
print('family income %.4f' % family_income.mean(), "(%.4f)" %family_income.std())
print('non wife income %.4f' % non_wife_income.mean(), "(%.4f)" %non_wife_income.std())

-----ALL WOMEN-------
n_child 2.5635 (0.8246)
avg_first_boy 0.5168 (0.4997)
avg_sec_boy 0.5164 (0.4997)
two_boys 0.2684 (0.4431)
two_girls 0.2352 (0.4241)
same_sex 0.5036 (0.5000)
age 30.4116 (3.4837)
age @ first birth 21.6739 (3.8616)
worked_for_pay 0.6679 (0.4710)
wksworked 26.0169 (22.5353)
hrsworked 20.6631 (21.7496)
wage income 7292.0777 (10293.7682)
family income 36077.2282 (27634.2942)
non wife income 28785.1505 (25385.3384)


In [93]:
nchild = married['nchild'].astype(int)
avg_first_boy = married['firstborn_male']
avg_sec_boy = married['secondborn_male']
two_boys = married['two_boys']
two_girls = married['two_girls']
same_sex = married['same_sex']
age = married['age'].astype(int)
AAFB = (married['age'].astype(int) - married['eldch'])
worked_for_pay = married['worked_for_pay']
weeks_worked = married['wkswork']
hours_worked = married['hrswork']
wage_income = married['incwage']
family_income = married['ftotinc']
non_wife_income = (married['ftotinc'] - married['incwage'])


print('------MARRIED WOMEN--------')
print('n_child %.4f' %nchild.mean(), "(%.4f)" %nchild.std())
print('avg_first_boy %.4f' %avg_first_boy.mean(), "(%.4f)" %avg_first_boy.std())
print('avg_sec_boy %.4f' %avg_sec_boy.mean(), "(%.4f)" %avg_sec_boy.std())
print('two_boys %.4f' %two_boys.mean(), "(%.4f)" %two_boys.std())
print('two_girls %.4f' %two_girls.mean(), "(%.4f)" %two_girls.std())
print('same_sex %.4f' %same_sex.mean(), "(%.4f)" %same_sex.std())
print('age %.4f' %age.mean(), "(%.4f)" %age.std())
print('age @ first birth %.4f' %AAFB.mean(), "(%.4f)" %AAFB.std())
print('worked_for_pay %.4f' %worked_for_pay.mean(), "(%.4f)" %worked_for_pay.std())
print('wksworked %.4f' %weeks_worked.mean(), "(%.4f)" %weeks_worked.std())
print('hrsworked %.4f' %hours_worked.mean(), "(%.4f)" %hours_worked.std())
print('wage income %.4f' %wage_income.mean(), "(%.4f)" %wage_income.std())
print('family income %.4f' % family_income.mean(), "(%.4f)" %family_income.std())
print('non wife income %.4f' % non_wife_income.mean(), "(%.4f)" %non_wife_income.std())

------MARRIED WOMEN--------
n_child 2.4984 (0.7894)
avg_first_boy 0.5186 (0.4997)
avg_sec_boy 0.5167 (0.4997)
two_boys 0.2692 (0.4435)
two_girls 0.2339 (0.4233)
same_sex 0.5031 (0.5000)
age 30.4693 (3.4596)
age @ first birth 21.9151 (3.8462)
worked_for_pay 0.6775 (0.4675)
wksworked 26.4958 (22.4849)
hrsworked 21.0321 (21.7158)
wage income 7465.0125 (10420.7064)
family income 38350.9613 (27518.9143)
non wife income 30885.9489 (25336.9734)


# Number 2

In [94]:
one_one = df[df['one_boy_one_girl'] == True]['>2']

two_boys_had_other =  df[df['two_boys'] == 1]['>2']

two_girls_had_other =  df[df['two_girls'] == 1]['>2']
    
print("fraction of one of each that had other child %.4f" % one_one.mean(), ' (%.4f)' %one_one.std())
print("fraction of two boys that had other child %.4f" % two_boys_had_other.mean(),  ' (%.4f)' %two_boys_had_other.std())
print("fraction of two girls that had other child %.4f" % two_girls_had_other.mean(),  ' (%.4f)' %two_girls_had_other.std())
print("Difference %.4f" % (df[df['two_girls'] == 1]['>2'].mean() - one_one.mean()),  ' (%.4f)' %
                             (df[df['two_girls'] == 1]['>2'] - one_one).std())

fraction of one of each that had other child 0.3793  (0.4852)
fraction of two boys that had other child 0.4331  (0.4955)
fraction of two girls that had other child 0.4410  (0.4965)
Difference 0.0617  (nan)


# Number 3

In [95]:
diff_sex = df[df['one_boy_one_girl'] == True]
same_sex = df[df['same_sex'] == True]

age_diff = (same_sex['age'].astype(int).mean() - diff_sex['age'].astype(int).mean())
AAFB = (same_sex['age'].astype(int) - same_sex['eldch']).mean() - \
       (diff_sex['age'].astype(int) - diff_sex['eldch']).mean()
yrs = same_sex['yrsshool'].astype(int).mean() - diff_sex['yrsshool'].astype(int).mean()
black = ( len(same_sex[same_sex['race'] == 'Black/Negro']) / len(df) ) - \
        ( len(diff_sex[diff_sex['race'] == 'Black/Negro']) / len(df) )
white = ( len(same_sex[same_sex['race'] == 'White']) / len(df) ) - \
        ( len(diff_sex[diff_sex['race'] == 'White']) / len(df) )
other = ( len(same_sex[same_sex['race'] == 'Other race, nec']) / len(df) ) - \
        ( len(diff_sex[diff_sex['race'] == 'Other race, nec']) / len(df) )

print("-------Differences in means------")
print("Age %.4f" % age_diff)
print("Age @ first birth %.4f" % AAFB)
print("Black %.4f" % black)
print("White %.4f"% white)
print('Other race %.4f' % other)
print('years edu %.4f'% yrs)

-------Differences in means------
Age -0.0127
Age @ first birth -0.0644
Black 0.0015
White 0.0025
Other race 0.0020
years edu -0.0319


Instrument critique: effect is LATE on people whos likelihood of getting an additional child is increased by having children with same sex first two children. 

Possible other instruments: twins (as before), infant with disability (grave mental or physical handicap), possible change in technology in discovering infant with disability, etc.

# Number 4

In [96]:
@jit(nopython=True)
def safeln(x):
    ret = np.zeros(len(x))
    for i in range(len(x)):
        if x[i] <= 0:
            continue
        else:
            ret[i] = np.log(x[i])
    return ret


# The two X variables
waldxn =  same_sex['nchild'].mean() - diff_sex['nchild'].mean()
waldx = same_sex['>2'].mean() - diff_sex['>2'].mean()


wfp = same_sex['worked_for_pay'].mean() - diff_sex['worked_for_pay'].mean()

wwork = same_sex['wkswork'].mean() - diff_sex['wkswork'].mean()

hwork = same_sex['hrswork'].mean() - diff_sex['hrswork'].mean()

linc = same_sex['inctot'].mean() - diff_sex['inctot'].mean()

lnfam = safeln(np.array(same_sex['ftotinc'])).mean() - safeln(np.array(diff_sex['ftotinc'])).mean()


print("Number of children diff %.4f" %waldxn)
print(">2 children diff %.4f" %waldx)
print("----Difference-------------------Wald >2 Child----------Wald # Child-")
print("Worked for Pay %.4f" %wfp, "            %.4f"%(wfp/waldx), "             %.4f"%(wfp/waldxn))
print("Weeks worked   %.4f" %wwork, "            %.4f"%(wwork/waldx), "             %.4f"%(wwork/waldxn))
print("Hours worked   %.4f" %hwork, "            %.4f"%(hwork/waldx), "             %.4f"%(hwork/waldxn))
print("total income   %.3f" %linc, "            %.1f"%(linc/waldx), "             %.1f"%(linc/waldxn))
print("ln fam income  %.4f" %lnfam, "            %.4f"%(lnfam/waldx), "             %.4f"%(lnfam/waldxn))

Number of children diff 0.0746
>2 children diff 0.0575
----Difference-------------------Wald >2 Child----------Wald # Child-
Worked for Pay 0.0030             0.0526              0.0405
Weeks worked   0.1788             3.1122              2.3974
Hours worked   -0.1048             -1.8232              -1.4045
total income   57.985             1009.2              777.4
ln fam income  -0.0027             -0.0464              -0.0358


# Number 5

**Table 6**

In [97]:
X1 = sm.add_constant(df['same_sex'])
t61 = sm.OLS(df['>2'], X1).fit()

X2 = sm.add_constant(df[['firstborn_male', 'secondborn_male', 'age',\
                         'same_sex', 'AAFB', 'black', 'other_race']])
t62 = sm.OLS(df['>2'], X2).fit()

X3 = sm.add_constant(df[['firstborn_male','two_boys', 'two_girls',\
                          'age', 'AAFB', 'black', 'other_race']])
t63 = sm.OLS(df['>2'], X3).fit()

X4 = sm.add_constant(married['same_sex'])
t64 = sm.OLS(married['>2'], X4).fit()

X5 = sm.add_constant(married[['firstborn_male', 'secondborn_male', 'age',\
                              'same_sex', 'AAFB', 'black', 'other_race']])
t65 = sm.OLS(married['>2'], X5).fit()

X6 = sm.add_constant(married[['firstborn_male','two_boys', 'two_girls',\
                              'age', 'AAFB', 'black', 'other_race']])
t66 = sm.OLS(married['>2'], X6).fit()

# Number 6

**Table 7**

In [98]:
X = sm.add_constant(df[['firstborn_male', 'secondborn_male', 'age',\
        'AAFB', 'black', 'other_race']])

lninc = safeln(np.array(df['ftotinc']))

# table 7 (1) estimates

X1 = X
X1['>2'] = df['>2']
t711 = sm.OLS(df['worked_for_pay'], X1).fit()
t712 = sm.OLS(df['wkswork'], X1).fit()
t713 = sm.OLS(df['hrswork'], X1).fit()
t714 = sm.OLS(df['inctot'], X1).fit()
t715 = sm.OLS(lninc, X1).fit()

# table 7 (2) estimates

X1 = X
X1['>2'] = t62.predict()
t721 = sm.OLS(df['worked_for_pay'], X1).fit()
t722 = sm.OLS(df['wkswork'], X1).fit()
t723 = sm.OLS(df['hrswork'], X1).fit()
t724 = sm.OLS(df['inctot'], X1).fit()
t725 = sm.OLS(lninc, X1).fit()

# table 7 (3) estimates

X1 = X
X1['>2'] = t63.predict()
t731 = sm.OLS(df['worked_for_pay'], X1).fit()
t732 = sm.OLS(df['wkswork'], X1).fit()
t733 = sm.OLS(df['hrswork'], X1).fit()
t734 = sm.OLS(df['inctot'], X1).fit()
t735 = sm.OLS(lninc, X1).fit()

In [99]:
X = sm.add_constant(married[['firstborn_male', 'secondborn_male', 'age',\
        'AAFB', 'black', 'other_race']])

lninc = safeln(np.array(married['ftotinc']))

# table 7 (4) estimates

X1 = X
X1['>2'] = df['>2']
t741 = sm.OLS(married['worked_for_pay'], X1).fit()
t742 = sm.OLS(married['wkswork'], X1).fit()
t743 = sm.OLS(married['hrswork'], X1).fit()
t744 = sm.OLS(married['inctot'], X1).fit()
t745 = sm.OLS(lninc, X1).fit()

# table 7 (5) estimates

X1 = X
X1['>2'] = t65.predict()
t751 = sm.OLS(married['worked_for_pay'], X1).fit()
t752 = sm.OLS(married['wkswork'], X1).fit()
t753 = sm.OLS(married['hrswork'], X1).fit()
t754 = sm.OLS(married['inctot'], X1).fit()
t755 = sm.OLS(lninc, X1).fit()

# table 7 (6) estimates

X1 = X
X1['>2'] = t66.predict()
t761 = sm.OLS(married['worked_for_pay'], X1).fit()
t762 = sm.OLS(married['wkswork'], X1).fit()
t763 = sm.OLS(married['hrswork'], X1).fit()
t764 = sm.OLS(married['inctot'], X1).fit()
t765 = sm.OLS(lninc, X1).fit()

In [109]:
t731.summary()

0,1,2,3
Dep. Variable:,worked_for_pay,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,70.24
Date:,"Mon, 03 Apr 2017",Prob (F-statistic):,1.8599999999999998e-101
Time:,23:21:56,Log-Likelihood:,-29262.0
No. Observations:,44305,AIC:,58540.0
Df Residuals:,44297,BIC:,58610.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.3615,0.047,7.619,0.000,0.269 0.455
firstborn_male,0.0014,0.005,0.309,0.758,-0.008 0.010
secondborn_male,-0.0033,0.004,-0.751,0.453,-0.012 0.005
age,0.0115,0.003,4.588,0.000,0.007 0.016
AAFB,-0.0029,0.004,-0.739,0.460,-0.011 0.005
black,0.0078,0.016,0.505,0.614,-0.023 0.038
other_race,-0.1101,0.014,-7.741,0.000,-0.138 -0.082
>2,0.0590,0.082,0.721,0.471,-0.101 0.219

0,1,2,3
Omnibus:,2988.148,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7560.138
Skew:,-0.702,Prob(JB):,0.0
Kurtosis:,1.543,Cond. No.,1590.0


# Number 7