In [68]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import os
import chardet

In [69]:
# Project root folder. Defaults to the author's local checkout; set the
# PS2_CODE_DIR environment variable to run the notebook on another machine
# without editing this cell.
main = os.environ.get(
    'PS2_CODE_DIR',
    r'C:\Users\Do Thu An\OneDrive\Desktop\Dynamic Macroeconomics\Problem sets\Dynamic-Macroeconomics\PS2_Code',
)
# Make the project root the current working directory so relative paths used
# later resolve against it (os.chdir raises if the folder does not exist).
os.chdir(main)
# Folder that holds the ES panel data CSV file(s)
data_path = os.path.join(main, 'Data Files', 'ES Panel Data')

In [70]:
# Collect the CSV files in the data folder. Sorted so the listing (and any
# later positional use of file_list) is stable: os.listdir returns entries in
# arbitrary order.
file_list = sorted(f for f in os.listdir(data_path) if f.endswith('.csv'))

# Report what was found, labelling the folder by its actual name so the message
# always matches the directory that was listed.
folder_name = os.path.basename(os.path.normpath(data_path))
print(f'Found {len(file_list)} CSV files in the {folder_name} folder.')
for i, file_name in enumerate(file_list, 1):
    print(f'{i}: {file_name}')

Found 1 CSV files in the VHLSS 2008 Data folder.
1: ES_Vietnam_2005_2009_2015.csv


In [71]:
# Load the ES panel data (file is read as Latin-1).
# low_memory=False makes pandas infer each column's dtype from the whole file
# in a single pass instead of chunk by chunk; chunked inference is what yields
# mixed-type columns and the accompanying DtypeWarning on wide files like this.
df = pd.read_csv(
    os.path.join(data_path, 'ES_Vietnam_2005_2009_2015.csv'),
    encoding='latin1',
    low_memory=False,
)
# Summary of the frame: row count, column count, dtype breakdown, memory use
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Columns: 977 entries, idstd2015 to _2005_q92c5
dtypes: float64(909), int64(16), object(52)
memory usage: 23.8+ MB


  df = pd.read_csv(os.path.join(data_path, 'ES_Vietnam_2005_2009_2015.csv'), encoding='latin1')


In [None]:
# Keep only rows where both 'a6a' (firm size) and 'l1' are present
df = df.dropna(subset=['a6a', 'l1'])

# Median of 'l1' (permanent, full-time workers at the end of the last fiscal
# year), computed on the rows that remain after the drop above
l1_median = df['l1'].median()

# Collapse 'a6a' into two classes with a single column-wise test (same result
# as a row-by-row apply, without materialising every 977-column row):
#   2 (Large) if 'a6a' is 2 or 3, OR 'l1' is strictly greater than the median
#   1 (Small) otherwise
is_large = df['a6a'].isin([2, 3]) | (df['l1'] > l1_median)
df['a6a'] = is_large.map({True: 2, False: 1})

# Number of firms in each of the two size classes
df['a6a'].value_counts()

a6a
2    1446
1     603
Name: count, dtype: int64

In [73]:
# Firm ID columns (2015, 2009 and 2005 identifiers) plus the size class 'a6a'
col_id = ['idstd2015', 'id2015', 'idstd2009', 'id2009', 'idstd2005', 'id2005', 'a6a']

# Cost inputs of 2015
# d2: Total annual sales for all products and services (VND)
# c9b: Annual losses due to power outages
# d1a3: % of sales (d2) represented by main activity or product
# d10: Losses due to theft as % of the value of the products
# d11: Losses due to breakage or spoilage as % of the value of the products
# h8 (column '_2015_h8'): Cost of formal research and development activities
# i2b: Total annual cost of security
# i4b: Total annual value of losses due to theft, robbery, vandalism
# n5a: Purchase of new or used machinery, vehicles, and equipment
# n5b: Purchase of lands & buildings
# n2a: Total annual cost of labor
# n2b: Total annual cost of electricity
# n2i: Total annual cost of sales (for retails)
col_15 = ['c9b', 'd1a3', 'd2', 'd10', 'd11', '_2015_h8', 'i2b', 'i4b','n2a', 'n2b', 'n2i','n5a', 'n5b']

# Cost inputs of 2009
# c9b: Annual losses due to power outages
# TODO: placeholder only - not filled in and not used by the selection below
col_09 = ['']

# Keep the requested columns that exist in the dataset, and say which ones
# (if any) are missing instead of dropping them silently
requested_cols = col_id + col_15
missing_cols = [col for col in requested_cols if col not in df.columns]
if missing_cols:
    print(f'Columns not found in the dataset and skipped: {missing_cols}')
columns = [col for col in requested_cols if col in df.columns]

# .copy() gives an independent frame, so later column assignments do not act on
# a slice of the wide frame (avoids SettingWithCopyWarning)
df = df[columns].copy()
df

Unnamed: 0,idstd2015,id2015,idstd2009,id2009,idstd2005,id2005,a6a,c9b,d1a3,d2,d10,d11,_2015_h8,i2b,i4b,n2a,n2b,n2i,n5a,n5b
1,599613.0,,466100.0,4.0,60775.0,,2,,100.0,2.750000e+10,0.0,0.0,,5.400000e+07,,2.900000e+09,2.800000e+08,,2.000000e+08,0.0
2,599613.0,10697.0,,,,,2,,100.0,6.230000e+11,1.0,1.0,,,,6.800000e+09,9.000000e+08,,,
4,599614.0,,466101.0,6.0,60785.0,,2,,80.0,9.000000e+10,0.0,0.0,,6.000000e+08,,9.900000e+09,1.170000e+09,,,
5,599614.0,10698.0,,,,,2,,95.0,9.800000e+10,0.0,0.0,1.200000e+09,1.000000e+09,,1.100000e+10,1.200000e+09,,2.500000e+09,0.0
7,599615.0,,466103.0,11.0,60782.0,,2,,70.0,1.500000e+11,0.0,0.0,,4.000000e+07,,2.700000e+09,1.500000e+08,,1.000000e+09,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2401,,,467148.0,3387.0,,,1,,90.0,2.400000e+10,0.0,0.0,,3.600000e+07,24000000.0,6.000000e+08,2.040000e+07,,5.000000e+08,0.0
2402,,,467149.0,3400.0,,,1,,85.0,3.200000e+09,,,,1.200000e+07,,1.600000e+08,2.200000e+07,2.200000e+09,0.000000e+00,100000000.0
2403,,,467150.0,3408.0,,,2,,60.0,1.200000e+12,0.0,0.0,,1.200000e+09,,2.400000e+11,5.800000e+11,,3.600000e+10,0.0
2404,,,467151.0,3414.0,,,2,,100.0,2.360000e+11,0.0,0.0,,1.000000e+07,,4.991000e+09,1.224000e+09,,2.200000e+09,0.0
