In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
import matplotlib.pyplot as plt


def ols_alt_spec(spec, data, show_res=True):
    """Fit an OLS regression described by a patsy formula, with HC0 covariance.

    Parameters
    ----------
    spec : str
        Patsy formula string, e.g. ``'ln_income ~ edu_year'``.
    data : DataFrame-like
        Data passed to ``patsy.dmatrices`` for building the design matrices.
    show_res : bool, default True
        When truthy, print the summary of the fitted model.

    Returns
    -------
    The results object produced by ``sm.OLS(...).fit(cov_type='HC0')``.
    """
    # Build the response and regressor matrices as DataFrames.
    endog, exog = patsy.dmatrices(spec, data=data, return_type='dataframe')
    fitted = sm.OLS(endog, exog).fit(cov_type='HC0')
    if show_res:
        print(fitted.summary())
    return fitted

# Read the income CSV directly from the GitHub-hosted raw file.
DATA_URL = "https://raw.githubusercontent.com/SeanJSLee/Teaching_YU_DS_basic_KR/main/data/KOSIS_houshold_panel_survey/data_income_kor.csv"
df_hps = pd.read_csv(DATA_URL)
# Preview the first three rows.
df_hps.head(3)

## 연령그룹에 따라 추정해보기

In [None]:
# Respondents in their 20s: rows whose age equals one of 20, 21, ..., 29.
print('20대 OLS')
mask_20s = df_hps['age'].isin(range(20, 30))
ols_alt_spec(spec='ln_income ~ edu_year', data=df_hps.loc[mask_20s])

In [None]:
# Respondents in their 30s: rows whose age equals one of 30, 31, ..., 39.
print('30대 OLS')
mask_30s = df_hps['age'].isin(range(30, 40))
ols_alt_spec(spec='ln_income ~ edu_year', data=df_hps.loc[mask_30s])

In [None]:
# Respondents in their 40s: rows whose age equals one of 40, 41, ..., 49.
print('40대 OLS')
mask_40s = df_hps['age'].isin(range(40, 50))
ols_alt_spec(spec='ln_income ~ edu_year', data=df_hps.loc[mask_40s])

In [None]:
# Respondents in their 50s: rows whose age equals one of 50, 51, ..., 59.
print('50대 OLS')
mask_50s = df_hps['age'].isin(range(50, 60))
ols_alt_spec(spec='ln_income ~ edu_year', data=df_hps.loc[mask_50s])

### Alternative specification
- 임금과 연관된 다른 변수를 추가해보기: 경력을 근사할 수 있는 나이(age)

In [None]:
# Formulas to compare, keyed by the name each fitted result is stored under.
spec_formulas = {
    'original': 'ln_income ~ edu_year',
    'alt_age': 'ln_income ~ edu_year + age',
    'alt_age_sq': 'ln_income ~ edu_year + age + np.power(age,2)',
}

result_spec = {}
for position, (spec_name, formula) in enumerate(spec_formulas.items()):
    # Separate consecutive summaries with a run of blank lines.
    if position > 0:
        print('\n\n\n\n')
    result_spec[spec_name] = ols_alt_spec(spec=formula, data=df_hps)

In [None]:
# For every stored fit, print the square root of its `mse_resid`, rounded to 4 decimals.
for spec_name, fit in result_spec.items():
    rmse = fit.mse_resid ** .5
    print(spec_name, 'RMSE=', round(rmse, 4))

In [None]:


# Prediction grid: education years 0..21, with age held at the sample mean.
mean_age = df_hps['age'].mean()
df_predict = pd.DataFrame({
    'Intercept': 1,
    'edu_year': list(range(0, 22)),
    'age': mean_age,
    'np.power(age, 2)': mean_age ** 2,
}).reindex(columns=result_spec['alt_age_sq'].params.index)  # same column order as the fitted parameters
df_predict


In [None]:
# Single-panel figure (sharex has no effect with one Axes, so it is omitted).
fig, ax = plt.subplots(figsize=(8, 8))

# Actual values: one faint marker per observation.
ax.scatter(df_hps['edu_year'], df_hps['ln_income'],
           alpha=0.1, s=5, marker='D', c='b', label='Actual')

# Mean of ln_income within each edu_year value.
CEF_income_edu = df_hps.groupby('edu_year')['ln_income'].mean()
ax.plot(CEF_income_edu, linestyle='-', marker='o', c='b', label='CEF')

# Fitted lines for each stored specification. Every artist carries its own
# label so the legend does not depend on the order in which artists were added.
fit_labels = {'original': 'Fitted', 'alt_age': '+age', 'alt_age_sq': '+age+$age^2$'}
for model in result_spec.keys():
    fit = result_spec[model]
    # Pass only the columns this specification was fitted with, in its own order.
    fitted_values = fit.predict(df_predict[fit.params.index])
    # Plot against edu_year explicitly instead of relying on the row index.
    ax.plot(df_predict['edu_year'], fitted_values, label=fit_labels.get(model, model))

# Reference lines at 12 (HS) and 16 (PS) years of education; different
# linestyles keep the two legend entries distinguishable.
for edu_year, tag, dash in ((12, 'HS', '--'), (16, 'PS', ':')):
    ax.axvline(edu_year, color='0.5', linestyle=dash, label=tag)

ax.set_xlabel('edu_year')
ax.set_ylabel('ln_income')
ax.legend()
plt.show()