In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import os

In [95]:
# Main project folder. The default below is the original author's local path;
# set the PS2_CODE_DIR environment variable before starting the kernel to run
# the notebook from another location without editing this cell.
main = os.environ.get(
    'PS2_CODE_DIR',
    r'C:\Users\Do Thu An\OneDrive\Desktop\Dynamic Macroeconomics\Problem sets\Dynamic-Macroeconomics\PS2_Code',
)
# Make the project folder the current working directory so that any relative
# path resolves against it.
os.chdir(main)

# Folder holding the survey CSV files that the loading cells read from
data_path = os.path.join(main, 'Data Files', 'VHLSS 2008 Data')

In [96]:
# Read the muc123a file from the data folder
muc123a = pd.read_csv(os.path.join(data_path, 'muc123a.csv'))

# Household size: the largest member id (matv) found within each household,
# where a household is identified by the five key columns below
household_id = ['tinh', 'huyen', 'xa', 'diaban', 'hoso']
muc123a['hsize'] = muc123a.groupby(household_id)['matv'].transform('max')

# Row filter: m1ac3 == 1, m1ac2 == 1 and m1ac5 >= 25. Per the original
# author's note, together these select male household heads aged 25 or older.
keep_rows = muc123a['m1ac3'].eq(1) & muc123a['m1ac2'].eq(1) & muc123a['m1ac5'].ge(25)
muc123a = muc123a.loc[keep_rows]

# Working frame: household keys plus the member-level columns used later
columns123a = household_id + ['matv', 'hsize', 'm1ac2', 'm1ac3', 'm1ac5']
df = muc123a.loc[:, columns123a].copy()

In [97]:
# Load the income file and keep only the identifier and income columns.
muc4a = pd.read_csv(os.path.join(data_path, 'muc4a.csv'))
columns4a = ['tinh', 'huyen', 'xa', 'diaban', 'hoso', 'matv', 'm4ac11', 'm4ac12f', 'm4ac21', 'm4ac22f', 'm4ac25']
# .copy() turns the column subset into an independent frame, so adding
# 'indi_income' below does not trigger pandas' SettingWithCopyWarning.
muc4a = muc4a[columns4a].copy()

# Individual income: row-wise sum of the five income columns
# (sum(axis=1) skips missing values, so a row with no data sums to 0).
muc4a['indi_income'] = muc4a[['m4ac11', 'm4ac12f', 'm4ac21', 'm4ac22f', 'm4ac25']].sum(axis=1)

# Household income: total of the individual incomes within each household.
hh_income = (
    muc4a.groupby(['tinh', 'huyen', 'xa', 'diaban', 'hoso'])['indi_income']
    .sum()
    .reset_index()
    .rename(columns={'indi_income': 'HH_Income'})
)

# Attach the household total to every individual row; remaining NaNs become 0.
muc4a = muc4a.merge(hh_income, on=['tinh', 'huyen', 'xa', 'diaban', 'hoso'], how='left').fillna(0)

# Bring the income columns onto the working frame, matching on household keys
# and member id. Rows of df without a match in muc4a keep NaN in these columns.
df = df.merge(muc4a, on=['tinh', 'huyen', 'xa', 'diaban', 'hoso', 'matv'], how='left')

In [98]:
# Function to process and merge expenditure data
def merge_expenditure(df, file_name, columns, expense_cols, new_col_name, data_dir=None):
    """Left-merge one expenditure file's household totals onto ``df``.

    The CSV is read, restricted to ``columns``, summed per household
    (grouping on tinh/huyen/xa/diaban/hoso) and a row-wise total of
    ``expense_cols`` is stored in ``new_col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; must contain the five household key columns.
    file_name : str
        Name of the CSV file inside the data folder.
    columns : list of str
        Columns to keep from the CSV; must include the household key
        columns and every entry of ``expense_cols``.
    expense_cols : list of str
        Expenditure columns to sum within each household.
    new_col_name : str
        Name of the column that receives the sum across ``expense_cols``.
    data_dir : str, optional
        Folder containing ``file_name``. When omitted, the notebook-level
        ``data_path`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the per-household sums of ``expense_cols`` and
        ``new_col_name`` appended. Households absent from the file get NaN
        in the added columns because the merge is a left join.
    """
    household_keys = ['tinh', 'huyen', 'xa', 'diaban', 'hoso']
    # Fall back to the notebook-level data folder when none is supplied.
    folder = data_path if data_dir is None else data_dir
    raw = pd.read_csv(os.path.join(folder, file_name))[columns]

    # One row per household after the groupby, so no de-duplication is needed
    # before merging.
    exp = raw.groupby(household_keys)[expense_cols].sum().reset_index()
    exp[new_col_name] = exp[expense_cols].sum(axis=1)
    return df.merge(exp, on=household_keys, how='left')

# Merge all expenditure files.
# Each entry is (CSV file, expenditure columns to sum, name of the household total).
hh_key_cols = ['tinh', 'huyen', 'xa', 'diaban', 'hoso']
expenditure_files = [
    ('muc5a1.csv', ['m5a1c2b', 'm5a1c3b'], 'HH_exp1'),
    ('muc5a2.csv', ['m5a2c6', 'm5a2c10'], 'HH_exp2'),
    ('muc5b1.csv', ['m5b1c4', 'm5b1c5'], 'HH_exp3'),
    ('muc5b2.csv', ['m5b2c2', 'm5b2c3'], 'HH_exp4'),
    ('muc5b3.csv', ['m5b3c2'], 'HH_exp5'),
]
# The files are merged one after another, in the order listed above.
for exp_file, exp_cols, total_name in expenditure_files:
    df = merge_expenditure(df, exp_file, hh_key_cols + exp_cols, exp_cols, total_name)

# Merge housing expenditure
muc7 = pd.read_csv(os.path.join(data_path, 'muc7.csv'))
columns7 = ['tinh', 'huyen', 'xa', 'diaban', 'hoso', 'm7c15', 'm7c32', 'm7c36', 'm7c39']
# .copy() makes the column subset an independent frame, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
muc7 = muc7[columns7].copy()

# HH_exp6 is the row-wise sum of m7c32, m7c36 and m7c39
# (sum(axis=1) skips missing values).
muc7['HH_exp6'] = muc7[['m7c32', 'm7c36', 'm7c39']].sum(axis=1)

# Replace the remaining missing values with 0; done by reassignment rather
# than in place so the frame is never modified through a view.
muc7 = muc7.fillna(0)

# Left-merge on the household keys. indicator=True adds a '_merge' column
# recording whether each row of df found a match in muc7.
df = df.merge(muc7, on=['tinh', 'huyen', 'xa', 'diaban', 'hoso'], how='left', indicator=True)

# Total household income: HH_Income (summed from the income file) plus m7c15
# (rent / land-leasing income, per the original author's note)
df['HH_income'] = df['HH_Income'].add(df['m7c15'])

# Income per household member: total household income divided by household size
df['HH_income_avr'] = df['HH_income'].div(df['hsize'])

# Total household consumption expenditure: HH_exp1 through HH_exp6, added left
# to right; a missing value in any component makes the total missing
df['HH_consumption'] = (
    df['HH_exp1']
    .add(df['HH_exp2'])
    .add(df['HH_exp3'])
    .add(df['HH_exp4'])
    .add(df['HH_exp5'])
    .add(df['HH_exp6'])
)

# Show the assembled frame as a final check
df

Unnamed: 0,tinh,huyen,xa,diaban,hoso,matv,hsize,m1ac2,m1ac3,m1ac5,...,HH_exp5,m7c15,m7c32,m7c36,m7c39,HH_exp6,_merge,HH_income,HH_income_avr,HH_consumption
0,101,1,3.0,1,14,1,3,1,1,64,...,8150.0,0.0,720.0,3000.0,108.0,3828.0,both,56468.0,18822.666667,94038.0
1,101,1,3.0,1,15,1,2,1,1,61,...,7850.0,0.0,1000.0,5000.0,72.0,6072.0,both,42351.0,21175.500000,93917.6
2,101,1,9.0,19,15,1,2,1,1,50,...,760.0,0.0,180.0,1440.0,72.0,1692.0,both,0.0,0.000000,35919.6
3,101,1,9.0,19,20,1,3,1,1,50,...,640.0,0.0,180.0,1440.0,120.0,1740.0,both,47203.0,15734.333333,46195.6
4,101,1,15.0,50,13,1,4,1,1,35,...,1800.0,0.0,840.0,3600.0,60.0,4500.0,both,163382.0,40845.500000,97436.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6892,823,13,12.0,25,15,1,6,1,1,78,...,0.0,0.0,0.0,840.0,0.0,840.0,both,27677.0,4612.833333,16410.9
6893,823,13,12.0,25,19,1,4,1,1,32,...,0.0,0.0,0.0,600.0,0.0,600.0,both,0.0,0.000000,14952.3
6894,823,13,17.0,1,13,1,3,1,1,57,...,70.0,0.0,0.0,0.0,0.0,0.0,both,0.0,0.000000,10990.7
6895,823,13,17.0,1,14,1,5,1,1,45,...,300.0,0.0,0.0,0.0,0.0,0.0,both,0.0,0.000000,17261.0


In [99]:
# Print the column index of df to see every column present at this point
print(df.columns)

Index(['tinh', 'huyen', 'xa', 'diaban', 'hoso', 'matv', 'hsize', 'm1ac2',
       'm1ac3', 'm1ac5', 'm4ac11', 'm4ac12f', 'm4ac21', 'm4ac22f', 'm4ac25',
       'indi_income', 'HH_Income', 'm5a1c2b', 'm5a1c3b', 'HH_exp1', 'm5a2c6',
       'm5a2c10', 'HH_exp2', 'm5b1c4', 'm5b1c5', 'HH_exp3', 'm5b2c2', 'm5b2c3',
       'HH_exp4', 'm5b3c2', 'HH_exp5', 'm7c15', 'm7c32', 'm7c36', 'm7c39',
       'HH_exp6', '_merge', 'HH_income', 'HH_income_avr', 'HH_consumption'],
      dtype='object')


In [100]:
# Mask every non-positive income (zero AND negative) before taking the log, so
# np.log only receives strictly positive values. Masked entries become NaN and
# are skipped by the group mean below.
df['log_income'] = np.log(df['HH_income_avr'].where(df['HH_income_avr'] > 0))

# Compute average log income per age group (m1ac5 is the age column)
avg_log_income = df.groupby('m1ac5')['log_income'].mean().reset_index()

# Exponentiate to get G_t
avg_log_income['Gt'] = np.exp(avg_log_income['log_income'])

# Display the result. DataFrame.info() prints its summary itself and returns
# None, so it is called directly; wrapping it in print() emitted a stray "None".
avg_log_income.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   m1ac5       72 non-null     int64  
 1   log_income  67 non-null     float64
 2   Gt          67 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 1.8 KB
None


In [103]:
# Columns carried into the analysis frame: household keys, member-level
# columns, the six expenditure totals and the income / consumption aggregates
analysis_columns = [
    'tinh', 'huyen', 'xa', 'diaban', 'hoso', 'matv', 'hsize',
    'm1ac2', 'm1ac3', 'm1ac5',
    'HH_exp1', 'HH_exp2', 'HH_exp3', 'HH_exp4', 'HH_exp5', 'HH_exp6',
    'HH_income', 'HH_income_avr', 'HH_consumption',
]

# Build an independent frame with those columns, exposing 'm1ac5' as 'age'
data = df[analysis_columns].rename(columns={'m1ac5': 'age'})

# Show the resulting frame to check it
data

Unnamed: 0,tinh,huyen,xa,diaban,hoso,matv,hsize,m1ac2,m1ac3,age,HH_exp1,HH_exp2,HH_exp3,HH_exp4,HH_exp5,HH_exp6,HH_income,HH_income_avr,HH_consumption
0,101,1,3.0,1,14,1,3,1,1,64,4619.3,36904.7,21586.0,18950.0,8150.0,3828.0,56468.0,18822.666667,94038.0
1,101,1,3.0,1,15,1,2,1,1,61,3530.7,38741.9,21488.0,16235.0,7850.0,6072.0,42351.0,21175.500000,93917.6
2,101,1,9.0,19,15,1,2,1,1,50,1555.2,17770.4,7012.0,7130.0,760.0,1692.0,0.0,0.000000,35919.6
3,101,1,9.0,19,20,1,3,1,1,50,1993.9,22501.7,9280.0,10040.0,640.0,1740.0,47203.0,15734.333333,46195.6
4,101,1,15.0,50,13,1,4,1,1,35,5259.8,51339.0,17447.0,17091.0,1800.0,4500.0,163382.0,40845.500000,97436.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6892,823,13,12.0,25,15,1,6,1,1,78,316.6,13047.3,1816.0,391.0,0.0,840.0,27677.0,4612.833333,16410.9
6893,823,13,12.0,25,19,1,4,1,1,32,655.6,10056.7,2670.0,970.0,0.0,600.0,0.0,0.000000,14952.3
6894,823,13,17.0,1,13,1,3,1,1,57,529.0,8405.7,1631.0,355.0,70.0,0.0,0.0,0.000000,10990.7
6895,823,13,17.0,1,14,1,5,1,1,45,482.0,13583.0,2136.0,760.0,300.0,0.0,0.0,0.000000,17261.0


In [106]:
# Mask every non-positive income (zero AND negative) before taking the log, so
# np.log only receives strictly positive values. Masked entries become NaN and
# are skipped by the group mean below.
data['log_income'] = np.log(data['HH_income_avr'].where(data['HH_income_avr'] > 0))

# Compute average log income per age group
avg_log_income = data.groupby('age')['log_income'].mean().reset_index()

# Exponentiate to get G_t
avg_log_income['Gt'] = np.exp(avg_log_income['log_income'])

# Ages with no positive-income observation have NaN in both derived columns
avg_log_income

Unnamed: 0,age,log_income,Gt
0,25,8.022570,3049.002515
1,26,7.931950,2784.853355
2,27,7.976383,2911.380461
3,28,7.748141,2317.259554
4,29,8.107539,3319.399540
...,...,...,...
67,92,9.806862,18157.914012
68,93,,
69,94,,
70,95,,
