In [1]:
# Dehejia and Wahba 1999 data prep
import pandas as pd
import numpy as np

# Build the working frame in one pass: read the Stata file, keep rows with
# positive 1978 earnings (`re78`), expose those earnings as `income_78`,
# rename `ed` to `edu_year`, and keep only the three columns used later.
# The index is not reset, so it still carries the row labels of the raw file.
df = (
    pd.read_stata(r'cps1re74.dta')
    .loc[lambda raw: raw['re78'] > 0]
    .assign(income_78=lambda kept: kept['re78'])
    .rename(columns={'ed': 'edu_year'})
    .loc[:, ['income_78', 'edu_year', 'age']]
)
df

Unnamed: 0,income_78,edu_year,age
0,9930.045898,11,37
1,3595.894043,9,22
2,24909.449219,12,30
3,7506.145996,11,27
4,289.789886,8,33
...,...,...,...
16172,2757.437988,12,22
16173,6895.071777,12,20
16174,4221.865234,12,37
16175,13671.929688,9,47


In [2]:
# Lookup table from an education label to the years of schooling it covers.
# The labels are listed once, lowest to highest, and that same order defines
# the ordered categorical so that comparisons and sorting follow attainment.
level_names = ['None', 'Elementary', 'Secondary', 'Post-Secondary', 'Higher Edu']
level_years = [[0], np.arange(1, 7), np.arange(7, 13), np.arange(13, 17), np.arange(17, 23)]
edu_level = pd.DataFrame({
    'edu_level': pd.Categorical(level_names, categories=level_names, ordered=True),
    'edu_year_int': level_years,
})
edu_level

Unnamed: 0,edu_level,edu_year_int
0,,[0]
1,Elementary,"[1, 2, 3, 4, 5, 6]"
2,Secondary,"[7, 8, 9, 10, 11, 12]"
3,Post-Secondary,"[13, 14, 15, 16]"
4,Higher Edu,"[17, 18, 19, 20, 21, 22]"


In [3]:
# Attach the ordered `edu_level` label to every row by matching on years of
# education. The lookup is expanded to one row per year before merging.
edu_lookup = (
    edu_level
    .explode('edu_year_int')
    .rename(columns={'edu_year_int': 'edu_year'})
    # explode() leaves an object-dtype column; cast it to the dtype of the
    # key in `df` so both merge keys have the same numeric type.
    .astype({'edu_year': df['edu_year'].dtype})
)
df = (
    df
    # If this cell is re-run, `df` already has an `edu_level` column; drop it
    # first so the merge does not produce `edu_level_x` / `edu_level_y`.
    .drop(columns=['edu_level'], errors='ignore')
    # many_to_one: each year must appear once in the lookup, otherwise the
    # merge would silently duplicate rows of `df`.
    .merge(edu_lookup, how='left', on='edu_year', validate='many_to_one')
)
df

Unnamed: 0,income_78,edu_year,age,edu_level
0,9930.045898,11,37,Secondary
1,3595.894043,9,22,Secondary
2,24909.449219,12,30,Secondary
3,7506.145996,11,27,Secondary
4,289.789886,8,33,Secondary
...,...,...,...,...
13955,2757.437988,12,22,Secondary
13956,6895.071777,12,20,Secondary
13957,4221.865234,12,37,Secondary
13958,13671.929688,9,47,Secondary


In [4]:
# CPI conversion: read the tables on the Minneapolis Fed CPI page and keep
# the first one, which is then displayed below.
cpi_page_url = 'https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-'
cpi_page_tables = pd.read_html(cpi_page_url)
df_cpi_table = cpi_page_tables[0]
df_cpi_table

Unnamed: 0,Year,Annual Average CPI(-U),Annual Percent Change (rate of inflation)
0,1913,9.9,
1,1914,10.0,1.3%
2,1915,10.1,0.9%
3,1916,10.9,7.7%
4,1917,12.8,17.8%
...,...,...,...
106,2019,255.7,1.8%
107,2020,258.8,1.2%
108,2021,271.0,4.7%
109,2022,292.7,8.0%


In [5]:
# Show the CPI rows for 1978 and 2023.
years_to_show = [1978, 2023]
df_cpi_table[df_cpi_table['Year'].isin(years_to_show)]

Unnamed: 0,Year,Annual Average CPI(-U),Annual Percent Change (rate of inflation)
65,1978,65.2,7.6%
110,2023,304.7,4.1%


In [6]:
# Factor that converts 1978 dollars to 2023 dollars: the 2023 CPI value
# divided by the 1978 CPI value. The CPI column is taken by position (the
# second column of the table), exactly as before.
cpi_column = df_cpi_table.columns[1]
cpi_for_year = {}
for cpi_year in (1978, 2023):
    # One .loc[mask, column] lookup instead of chained indexing.
    year_rows = df_cpi_table.loc[df_cpi_table['Year'] == cpi_year, cpi_column]
    if year_rows.empty:
        # The table is scraped from a live page; name the missing year
        # rather than failing with an opaque IndexError from `.values[0]`.
        raise LookupError(f'CPI table has no row for year {cpi_year}')
    # First matching row, same as the original `.values[0]`.
    cpi_for_year[cpi_year] = year_rows.iloc[0]
cpi_conversion_78_to_2023 = cpi_for_year[2023] / cpi_for_year[1978]
cpi_conversion_78_to_2023

4.673312883435583

In [7]:
# Scale 1978 income by the CPI conversion factor, then add both the scaled
# income and its natural logarithm as new columns.
income_scaled = df['income_78'] * cpi_conversion_78_to_2023
df = df.assign(
    income_at_23=income_scaled,
    ln_income_at_23=np.log(income_scaled),
)
df

Unnamed: 0,income_78,edu_year,age,edu_level,income_at_23,ln_income_at_23
0,9930.045898,11,37,Secondary,46406.210938,10.745189
1,3595.894043,9,22,Secondary,16804.736328,9.729416
2,24909.449219,12,30,Secondary,116409.640625,11.664870
3,7506.145996,11,27,Secondary,35078.566406,10.465345
4,289.789886,8,33,Secondary,1354.278687,7.211024
...,...,...,...,...,...,...
13955,2757.437988,12,22,Secondary,12886.370117,9.463925
13956,6895.071777,12,20,Secondary,32222.826172,10.380430
13957,4221.865234,12,37,Secondary,19730.095703,9.889900
13958,13671.929688,9,47,Secondary,63893.203125,11.064968


In [8]:
# Write the prepared frame to CSV, Excel and pickle files, in that order.
# Paths are relative, so the files land in the current working directory.
# NOTE(review): the CSV stores labels as plain text; a label such as 'None'
# may be parsed as missing by readers with default NA handling — confirm
# how downstream notebooks load this file.
export_targets = (
    (df.to_csv, 'data_cps78_income.csv'),
    (df.to_excel, 'data_cps78_income.xlsx'),
    (df.to_pickle, 'data_cps78_income.pkl'),
)
for write_frame, target_path in export_targets:
    write_frame(target_path)

In [9]:
# Display the current working directory (the recorded output is a directory path).
# NOTE(review): `pwd` is not defined in any visible cell — presumably it is
# resolved by IPython's %pwd magic; confirm it still works on a fresh kernel.
# The recorded output contains an absolute local path; consider clearing it
# before sharing the notebook.
pwd()

'c:\\Users\\SeanJ\\Repositories\\Teaching_YU_DS_basic_KR\\data\\Dehejia_and_Wahba_1999'