In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
import matplotlib.pyplot as plt

# Source CSV for the analysis. Judging by the URL path this is household panel
# survey income data; the notebook below relies on the columns `ln_income`,
# `edu_year`, `ksic` and `ksco`.
url_income_csv = "https://raw.githubusercontent.com/SeanJSLee/Teaching_YU_DS_basic_KR/main/data/KOSIS_houshold_panel_survey/data_income_kor.csv"
df_hps = pd.read_csv(url_income_csv)

# Show the first three rows as the cell output (quick sanity check of the load).
df_hps.head(3)

In [None]:
def ols_alt_spec(spec, data, show_res=True):
    """Fit an OLS regression from a patsy formula with HC0 robust covariance.

    Parameters
    ----------
    spec : str
        Patsy formula describing the model, e.g. ``'ln_income ~ edu_year'``.
    data : DataFrame-like
        Data handed to ``patsy.dmatrices`` to build the design matrices
        (callers in this notebook pass pandas DataFrame slices).
    show_res : bool, default True
        When true, print the regression summary table.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit(cov_type='HC0')``.
    """
    # Build the response / regressor matrices as DataFrames so that the
    # fitted parameters keep their column names (e.g. params['edu_year']).
    endog, exog = patsy.dmatrices(spec, data=data, return_type='dataframe')

    # HC0 requests heteroskedasticity-robust standard errors.
    fitted = sm.OLS(endog, exog).fit(cov_type='HC0')

    if show_res:
        print(fitted.summary())
    return fitted

In [None]:
# 한국표준산업분류
# 'A' : '농업, 임업 및 어업(01~03)',
# 'B' : '광 업(05~08)',
# 'C' : '제 조 업(10~34)',
# 'D' : '전기, 가스, 증기 및 공기 조절 공급업(35)',
# 'E' : '수도, 하수 및 폐기물 처리, 원료 재생업(36~39)',
# 'F' : '건 설 업(41~42)',
# 'G' : '도매 및 소매업(45~47)',
# 'H' : '운수 및 창고업(49~52)',
# 'I' : '숙박 및 음식점업(55~56)',
# 'J' : '정보통신업(58~63)',
# 'K' : '금융 및 보험업(64~66)',
# 'L' : '부동산업(68)',
# 'M' : '전문, 과학 및 기술 서비스업(70~73)',
# 'N' : '사업시설 관리, 사업 지원 및 임대 서비스업(74~76)',
# 'O' : '공공 행정, 국방 및 사회보장 행정(84)',
# 'P' : '교육 서비스업(85)',
# 'Q' : '보건업 및 사회복지 서비스업(86~87)',
# 'R' : '예술, 스포츠 및 여가관련 서비스업(90~91)',
# 'S' : '협회 및 단체, 수리 및 기타 개인 서비스업(94~96)',
# 'T' : '가구 내 고용활동 및 달리 분류되지 않은 자가 소비 생산활동(97~98)',
# 'U' : '국제 및 외국기관(99)'


# English names of the industry sections, keyed by the single-letter code
# matched against the `ksic` column of df_hps. The numbers in parentheses are
# the division ranges copied from the classification table.
dict_ksic = {
    'A': 'Agriculture, Forestry and Fishing (01~03)',
    'B': 'Mining (05~08)',
    'C': 'Manufacturing (10~34)',
    'D': 'Electricity, Gas, Steam and Air Conditioning Supply (35)',
    'E': 'Water Supply; Sewerage, Waste Management and Remediation Activities (36~39)',
    'F': 'Construction (41~42)',
    'G': 'Wholesale and Retail Trade (45~47)',
    'H': 'Transportation and Storage (49~52)',
    'I': 'Accommodation and Food Service Activities (55~56)',
    'J': 'Information and Communication (58~63)',
    'K': 'Financial and Insurance Activities (64~66)',
    'L': 'Real Estate Activities (68)',
    'M': 'Professional, Scientific and Technical Services (70~73)',
    'N': 'Administrative and Support Service Activities; Rental and Leasing Activities (74~76)',
    'O': 'Public Administration and Defence; Compulsory Social Security (84)',
    'P': 'Education (85)',
    'Q': 'Human Health and Social Work Activities (86~87)',
    'R': 'Arts, Entertainment and Recreation (90~91)',
    'S': 'Other Service Activities; Activities of Membership Organizations; Repair of Computers and Personal and Household Goods (94~96)',
    'T': 'Activities of Households as Employers; Undifferentiated Goods- and Services-Producing Activities of Households for Own Use (97~98)',
    'U': 'Activities of Extraterritorial Organizations and Bodies (99)',
}

# Estimate ln_income ~ edu_year separately within each industry section and
# report the education slope. Fitted results are kept in res_ksic by code.
res_ksic = {}
for ksic, ksic_name in dict_ksic.items():
    rows_in_section = df_hps['ksic'] == ksic
    fit_section = ols_alt_spec(
        spec='ln_income ~ edu_year',
        data=df_hps.loc[rows_in_section],
        show_res=False,
    )
    res_ksic[ksic] = fit_section
    beta_edu_year = round(fit_section.params['edu_year'], 3)
    print(ksic, ksic_name, '\nbeta_edu_year =', beta_edu_year, '\n')

In [None]:
# Distribution of ln_income within each KSIC industry section, one panel per
# section on a shared x-axis, with three dashed reference lines per panel.
#
# The reference lines are computed from the FULL sample (not the section
# subset), so they are identical in every panel: compute them once here
# instead of once per subplot.
ref_means = [
    # (line color, legend label, x position)
    ('k', 'mean: all', df_hps['ln_income'].mean()),
    ('r', 'mean: edu_year = 12', df_hps.loc[df_hps['edu_year'] == 12, 'ln_income'].mean()),
    ('m', 'mean: edu_year = 16', df_hps.loc[df_hps['edu_year'] == 16, 'ln_income'].mean()),
]

fig, axe = plt.subplots(len(dict_ksic), sharex=True, figsize=(5,40))
for ax, (ksic, ksic_name) in zip(axe, dict_ksic.items()):
    # Single .loc[rows, column] lookup instead of chained indexing.
    # No label on the histogram itself: the panel title already names the
    # section, and a label would add a stray entry to the legend below.
    ax.hist(df_hps.loc[df_hps['ksic'] == ksic, 'ln_income'], bins=30, density=True)
    # Long section names are cut at 40 characters to fit the panel width.
    ax.set_title(ksic + ': ' + ksic_name[:40])
    for line_color, line_label, x_pos in ref_means:
        ax.axvline(x_pos, color=line_color, linestyle='--', label=line_label)

# The reference lines mean the same thing in every panel, so one legend on the
# first panel is enough to make the figure readable on its own.
axe[0].legend(loc='best', fontsize='small')
# sharex=True hides tick labels on all but the last panel; label that axis.
axe[-1].set_xlabel('ln_income')
fig.tight_layout()
plt.show()

In [None]:
# 한국표준직업분류
# '1': '관리자',
# '2': '전문가 및 관련 종사자',
# '3': '사무 종사자',
# '4': '서비스 종사자',
# '5': '판매 종사자',
# '6': '농림ㆍ어업 숙련 종사자',
# '7': '기능원 및 관련 기능 종사자',
# '8': '장치ㆍ기계 조작 및 조립 종사자',
# '9': '단순노무 종사자',
# 'A': '군 인'

# English names of the occupation major groups, keyed by the one-character
# code matched against the `ksco` column of df_hps.
dict_ksco = {
    '1' : 'Managers',
    '2' : 'Professionals and related workers',
    '3' : 'Office workers',
    '4' : 'Service workers',
    '5' : 'Sales workers',
    '6' : 'Agriculture, forestry, and fishery skilled workers',
    '7' : 'Craft and related trades workers',
    '8' : 'Plant and machine operators and assemblers',
    '9' : 'Elementary occupations',
    'A' : 'Military personnel',
}

# Estimate ln_income ~ edu_year separately within each occupation group and
# report the education slope. Fitted results are kept in res_ksco by code.
res_ksco = {}
for ksco, ksco_name in dict_ksco.items():
    rows_in_group = df_hps['ksco'] == ksco
    fit_group = ols_alt_spec(
        spec='ln_income ~ edu_year',
        data=df_hps.loc[rows_in_group],
        show_res=False,
    )
    res_ksco[ksco] = fit_group
    beta_edu_year = round(fit_group.params['edu_year'], 3)
    print(ksco, ksco_name, '\nbeta_edu_year =', beta_edu_year, '\n')

In [None]:
# Distribution of ln_income within each KSCO occupation group, one panel per
# group on a shared x-axis, with three dashed reference lines per panel.
#
# The reference lines are computed from the FULL sample (not the group
# subset), so they are identical in every panel: compute them once here
# instead of once per subplot.
ref_means = [
    # (line color, legend label, x position)
    ('k', 'mean: all', df_hps['ln_income'].mean()),
    ('r', 'mean: edu_year = 12', df_hps.loc[df_hps['edu_year'] == 12, 'ln_income'].mean()),
    ('m', 'mean: edu_year = 16', df_hps.loc[df_hps['edu_year'] == 16, 'ln_income'].mean()),
]

fig, axe = plt.subplots(len(dict_ksco), sharex=True, figsize=(5,40))
for ax, (ksco, ksco_name) in zip(axe, dict_ksco.items()):
    # Single .loc[rows, column] lookup instead of chained indexing.
    # No label on the histogram itself: the panel title already names the
    # group, and a label would add a stray entry to the legend below.
    ax.hist(df_hps.loc[df_hps['ksco'] == ksco, 'ln_income'], bins=30, density=True)
    ax.set_title(ksco + ': ' + ksco_name)
    for line_color, line_label, x_pos in ref_means:
        ax.axvline(x_pos, color=line_color, linestyle='--', label=line_label)

# The reference lines mean the same thing in every panel, so one legend on the
# first panel is enough to make the figure readable on its own.
axe[0].legend(loc='best', fontsize='small')
# sharex=True hides tick labels on all but the last panel; label that axis.
axe[-1].set_xlabel('ln_income')
fig.tight_layout()
plt.show()