In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import os
import chardet

In [58]:
# Define main project folder
main = r'C:\Users\Do Thu An\OneDrive\Desktop\Dynamic Macroeconomics\Problem sets\Dynamic-Macroeconomics\PS2_Code'
# Set project folder as current working directory
os.chdir(main)
# Define data file path
data_path = os.path.join(main, 'Data Files', 'ES Panel Data')

In [59]:
# Get list of all CSV files
file_list = [f for f in os.listdir(data_path) if f.endswith('.csv')]

# Display the number of files found
print(f'Found {len(file_list)} CSV files in the VHLSS 2008 Data folder.')
for i, file_name in enumerate(file_list, 1):
    print(f'{i}: {file_name}')

Found 1 CSV files in the VHLSS 2008 Data folder.
1: ES_Vietnam_2005_2009_2015.csv


In [60]:
# Load ES Panel data
df = pd.read_csv(os.path.join(data_path, 'ES_Vietnam_2005_2009_2015.csv'), encoding='latin1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Columns: 977 entries, idstd2015 to _2005_q92c5
dtypes: float64(909), int64(16), object(52)
memory usage: 23.8+ MB


  df = pd.read_csv(os.path.join(data_path, 'ES_Vietnam_2005_2009_2015.csv'), encoding='latin1')


In [61]:
# Calculate the median of the number of employees (l1)
l1_median = df['l1'].median()

# Re-categorize the firm size (a6a) into Small (1) and Large (2)
df['firm_size'] = df.apply(
    lambda row: 2 if (row['a6a'] in [2, 3] or row['l1'] > l1_median) else 1, axis=1
)

# Label small as 1, large as 2
# - 1 is small
# - 2 is large (medium + large or larger than median of employees)

# Print the value counts of the new firm size
print(df['firm_size'].value_counts())

firm_size
2    2161
1    1038
Name: count, dtype: int64


In [62]:
# Select Firms ID
col_id = ['idstd2015', 'id2015', 'idstd2005', 'id2005', 'firm_size'] 

#************************************************ YEAR 2015 ************************************************
# Select the variable cost of 2015
## d2: total annual sales for all products and services (VND)
# c9b: Annual losses due to power outages, d1a3: % of sales (d2) represented by main activity or product
# d10: Losses due to theft as % of the value of the products, d11: Losses due to breakage or spoilage as % of the value of the products
# h8: Cost of formal research and development activities, i2b: Total annual cost of security
# i4b: Total annual value of losses due to theft, robbery, vadalism 
# n2a: Total annual cost of labor, n2b: Total annual cost of electricity, n2i: Total annual cost of sales (for retails)
col_15A = ['c9b', 'd1a3', 'd2', 'd10', 'd11', 'i2b', 'i4b','n2a', 'n2b', 'n2i']

# Select production input of 2015 (Labor, Capital, Investment)
# LABOR - l1: Permanent, full-time workers end of last fiscal year, f2: Typical hours of operation in a week
# INVESTMENT - n5a: purchase of new or used machinery, vehicles, and equipment, n5b: lands & buildings, h8: Cost of formal research and development activities (innovation activities)
col_15B = ['n5a', 'n5b', '_2015_h8']

# Select financing variables OF 2015 (SECTION K. FINANCE)
# k1c: % Purchased on credit (loans), k2c: % Sold on credit (receivables)
col_15C = ['k1c', 'k2c']

#************************************************ YEAR 2005 ************************************************
# Select the variable cost of 2005 (Section II: Productivity)
# q86a1: Total sales, q86a2: Total cost
# q86a3: Total purchases of raw materials and intermediate goods, q86a4: Total cost of labor
# q86a5: Depreciation, q86a6: Rent on land and buildings, q86a8: Rent on machinery, equipment, and vehicles
# q86a9: Interest charges, q86a10: Energy cost, q86a11: Taxes
col_05A = ['_2005_q86a1', '_2005_q86a2','_2005_q86a3', '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', 
           '_2005_q86a8', '_2005_q86a9', '_2005_q86a10', '_2005_q86a11']

# Select production input of 2005

# CAPITAL (ASSETS) - q91a1: Total fixed assets, q91a7: Total current assets (need to exclude receivables (q91a12))  
# INVESTMENT - q87a: net profits (after tax) in 2004, q87d: % of the establishment’s net profits (after tax) were reinvested in the establishment
col_05B = ['_2005_q91a1', '_2005_q91a7', '_2005_q91a12', '_2005_q87a', '_2005_q87d']

# Select financing variables of 2005 (SECTION III. PRODUCTIVITY)
# q91a12: Receivables

# Check if all selected columns exist in the dataset
columns = [col for col in col_id + col_15A + col_15B + col_15C + col_05A + col_05B if col in df.columns]

df = df[columns]
df.fillna(0, inplace = True)
# Replace all occurrences of -9 with 0 in the entire DataFrame
df.replace(-9, 0, inplace=True)
df.columns

Index(['idstd2015', 'id2015', 'idstd2005', 'id2005', 'firm_size', 'c9b',
       'd1a3', 'd2', 'd10', 'd11', 'i2b', 'i4b', 'n2a', 'n2b', 'n2i', 'n5a',
       'n5b', '_2015_h8', 'k1c', 'k2c', '_2005_q86a1', '_2005_q86a2',
       '_2005_q86a3', '_2005_q86a4', '_2005_q86a5', '_2005_q86a6',
       '_2005_q86a8', '_2005_q86a9', '_2005_q86a10', '_2005_q86a11',
       '_2005_q91a1', '_2005_q91a7', '_2005_q91a12', '_2005_q87a',
       '_2005_q87d'],
      dtype='object')

In [63]:
# Calculate the costs based on the percentage of sales (d1a3, d10, d11) and total sales (d2)
df['2015_cost_main_activity'] = df['d1a3'] * df['d2'] / 100  # Main activity cost (percentage of sales)
df['2015_cost_theft'] = df['d10'] * df['d2'] / 100  # Losses due to theft
df['2015_cost_spoilage'] = df['d11'] * df['d2'] / 100  # Losses due to spoilage

# Calculate the finance based on purchases made on credit (k1c)
df['2015_total_vrcost'] = df['n5a'] + df['n5b'] + df['_2015_h8']  # Total variable cost

# Finance based on credit purchases
df['2015_finance_on_credit'] = (df['k1c'] / 100) * df['2015_total_vrcost']

# Finance based on sales on credit (receivables)
df['2015_finance_from_receivables'] = (df['k2c'] / 100) * df['2015_total_vrcost']
df


Unnamed: 0,idstd2015,id2015,idstd2005,id2005,firm_size,c9b,d1a3,d2,d10,d11,...,_2005_q91a7,_2005_q91a12,_2005_q87a,_2005_q87d,2015_cost_main_activity,2015_cost_theft,2015_cost_spoilage,2015_total_vrcost,2015_finance_on_credit,2015_finance_from_receivables
0,599613.0,0.0,60775.0,10597.0,2,0.0,40.0,2.860000e+10,0.0,0.0,...,7431.00000,1114.000,1750.0,100.0,1.144000e+10,0.000000e+00,0.000000e+00,5.300000e+08,0.0,5.300000e+08
1,599613.0,0.0,60775.0,0.0,2,0.0,100.0,2.750000e+10,0.0,0.0,...,0.00000,0.000,0.0,0.0,2.750000e+10,0.000000e+00,0.000000e+00,2.000000e+08,200000000.0,2.000000e+08
2,599613.0,10697.0,0.0,0.0,2,0.0,100.0,6.230000e+11,1.0,1.0,...,0.00000,0.000,0.0,0.0,6.230000e+11,6.230000e+09,6.230000e+09,0.000000e+00,0.0,0.000000e+00
3,599614.0,0.0,60785.0,10619.0,2,0.0,100.0,6.732800e+10,0.0,0.2,...,30195.00000,570.000,1728.0,0.0,6.732800e+10,0.000000e+00,1.346560e+08,0.000000e+00,0.0,0.000000e+00
4,599614.0,0.0,60785.0,0.0,2,0.0,80.0,9.000000e+10,0.0,0.0,...,0.00000,0.000,0.0,0.0,7.200000e+10,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3194,0.0,0.0,60687.0,9467.0,1,0.0,8.0,2.700000e+09,0.0,0.0,...,3009.00000,473.000,94.0,100.0,2.160000e+08,0.000000e+00,0.000000e+00,1.900000e+08,95000000.0,9.500000e+07
3195,0.0,0.0,60619.0,8246.0,2,0.0,50.0,1.062000e+09,0.0,0.0,...,630.98297,161.215,0.0,0.0,5.310000e+08,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00
3196,0.0,0.0,61059.0,2764.0,1,0.0,100.0,3.700000e+09,0.0,0.0,...,660.00000,100.000,200.0,20.0,3.700000e+09,0.000000e+00,0.000000e+00,4.000000e+08,0.0,0.000000e+00
3197,0.0,0.0,60332.0,3285.0,2,0.0,100.0,5.073000e+09,0.0,1.0,...,1135.00000,0.000,290.0,30.0,5.073000e+09,0.000000e+00,5.073000e+07,2.000000e+08,140000000.0,1.400000e+08


In [64]:
# Calculate the total value of reinvestment in the establishment
df['2005_total_reinvestment'] = df['_2005_q87a'] * (df['_2005_q87d'] / 100)

# Calculate total assets excluding receivables
df['2005_total_assets'] = df['_2005_q91a1'] - df['_2005_q91a12']

# Label 'q91a12' as receivables separately
df['2005_receivables'] = df['_2005_q91a12']

# Select the relevant columns for the cost calculation
cost_col05 = ['_2005_q86a3', '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', '_2005_q86a8', '_2005_q86a9', '_2005_q86a10', '_2005_q86a11']

# Calculate the total cost by summing up the selected columns
df['2005_total_cost'] = df[cost_col05].sum(axis=1)

In [65]:
df

Unnamed: 0,idstd2015,id2015,idstd2005,id2005,firm_size,c9b,d1a3,d2,d10,d11,...,2015_cost_main_activity,2015_cost_theft,2015_cost_spoilage,2015_total_vrcost,2015_finance_on_credit,2015_finance_from_receivables,2005_total_reinvestment,2005_total_assets,2005_receivables,2005_total_cost
0,599613.0,0.0,60775.0,10597.0,2,0.0,40.0,2.860000e+10,0.0,0.0,...,1.144000e+10,0.000000e+00,0.000000e+00,5.300000e+08,0.0,5.300000e+08,1750.0,39317.00000,1114.000,26653.0
1,599613.0,0.0,60775.0,0.0,2,0.0,100.0,2.750000e+10,0.0,0.0,...,2.750000e+10,0.000000e+00,0.000000e+00,2.000000e+08,200000000.0,2.000000e+08,0.0,0.00000,0.000,0.0
2,599613.0,10697.0,0.0,0.0,2,0.0,100.0,6.230000e+11,1.0,1.0,...,6.230000e+11,6.230000e+09,6.230000e+09,0.000000e+00,0.0,0.000000e+00,0.0,0.00000,0.000,0.0
3,599614.0,0.0,60785.0,10619.0,2,0.0,100.0,6.732800e+10,0.0,0.2,...,6.732800e+10,0.000000e+00,1.346560e+08,0.000000e+00,0.0,0.000000e+00,0.0,68522.00000,570.000,58578.0
4,599614.0,0.0,60785.0,0.0,2,0.0,80.0,9.000000e+10,0.0,0.0,...,7.200000e+10,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,0.00000,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3194,0.0,0.0,60687.0,9467.0,1,0.0,8.0,2.700000e+09,0.0,0.0,...,2.160000e+08,0.000000e+00,0.000000e+00,1.900000e+08,95000000.0,9.500000e+07,94.0,4871.00000,473.000,2564.0
3195,0.0,0.0,60619.0,8246.0,2,0.0,50.0,1.062000e+09,0.0,0.0,...,5.310000e+08,0.000000e+00,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.0,469.76797,161.215,1114.4
3196,0.0,0.0,61059.0,2764.0,1,0.0,100.0,3.700000e+09,0.0,0.0,...,3.700000e+09,0.000000e+00,0.000000e+00,4.000000e+08,0.0,0.000000e+00,40.0,2160.00000,100.000,3487.0
3197,0.0,0.0,60332.0,3285.0,2,0.0,100.0,5.073000e+09,0.0,1.0,...,5.073000e+09,0.000000e+00,5.073000e+07,2.000000e+08,140000000.0,1.400000e+08,87.0,2227.00000,0.000,4560.0


In [66]:
# Split the data into small and large firms
df_small = df[df['firm_size'] == 1]  # Small firms (labeled as 1)
df_large = df[df['firm_size'] == 2]  # Large firms (labeled as 2)

path = r'C:\Users\Do Thu An\OneDrive\Desktop\Dynamic Macroeconomics\Problem sets\Dynamic-Macroeconomics\PS2_Code\Firms Modelling'

# Save the small and large firms DataFrames as CSV files
df_small.to_csv(f"{path}\\small_firms.csv", index=False)
df_large.to_csv(f"{path}\\large_firms.csv", index=False)