In [215]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import os
import chardet

In [216]:
# Define main project folder
main = r'C:\Users\Do Thu An\OneDrive\Desktop\Dynamic Macroeconomics\Problem sets\Dynamic-Macroeconomics\PS2_Code'
# Set project folder as current working directory
os.chdir(main)
# Define data file path
data_path = os.path.join(main, 'Data Files', 'ES Panel Data')

In [217]:
# Get list of all CSV files
file_list = [f for f in os.listdir(data_path) if f.endswith('.csv')]

# Display the number of files found
print(f'Found {len(file_list)} CSV files in the VHLSS 2008 Data folder.')
for i, file_name in enumerate(file_list, 1):
    print(f'{i}: {file_name}')

Found 1 CSV files in the VHLSS 2008 Data folder.
1: ES_Vietnam_2005_2009_2015.csv


In [218]:
# Load ES Panel data
df = pd.read_csv(os.path.join(data_path, 'ES_Vietnam_2005_2009_2015.csv'), encoding='latin1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Columns: 977 entries, idstd2015 to _2005_q92c5
dtypes: float64(909), int64(16), object(52)
memory usage: 23.8+ MB


  df = pd.read_csv(os.path.join(data_path, 'ES_Vietnam_2005_2009_2015.csv'), encoding='latin1')


In [219]:
df['a6a'].value_counts(dropna=False)

# Drop rows where 'a6a' has NaN or 0 values
df = df.dropna(subset=['a6a'])  # Drop NaN values in 'a6a'
df = df[df['a6a'] != 0]  # Drop rows where 'a6a' is 0

df['a6a'].value_counts(dropna=False)

a6a
2.0    738
1.0    684
3.0    620
Name: count, dtype: int64

In [220]:
# Calculate the median of the number of employees (l1)
l1_median = df['l1'].median()
print(l1_median)

# Re-categorize the firm size (a6a) into Small (1) and Large (2)
df['firm_size'] = df.apply(
    lambda row: 2 if (row['a6a'] in [2, 3] and row['l1'] > l1_median) else 1, axis=1
)

# Label small as 1, large as 2
# - 1 is small
# - 2 is large (medium + large or larger than median of employees)

# Print the value counts of the new firm size
print(df['firm_size'].value_counts())

37.0
firm_size
1    1106
2     936
Name: count, dtype: int64


In [221]:
# Select Firms ID
col_id = ['idstd2015', 'id2015', 'idstd2005', 'id2005', 'firm_size'] 

#************************************************ YEAR 2015 ************************************************
# Select the variable cost of 2015
## d2: total annual sales for all products and services (VND)
# c9b: Annual losses due to power outages, d1a3: % of sales (d2) represented by main activity or product
# d10: Losses due to theft as % of the value of the products, d11: Losses due to breakage or spoilage as % of the value of the products
# h8: Cost of formal research and development activities, i2b: Total annual cost of security
# n2a: Total annual cost of labor, n2b: Total annual cost of electricity, n2i: Total annual cost of sales (for retails)
# n2e: Total annual cost of raw materials and intermediate goods used in production
# nef: Total annual cost of fuel
col_15A = ['c9b', 'd1a3', 'd2', 'd10', 'd11', 'i2b', 'n2a', 'n2b', 'n2i']

# Select production input of 2015 (Labor, Capital, Investment)
# LABOR - l1: Permanent, full-time workers end of last fiscal year, f2: Typical hours of operation in a week
# INVESTMENT - n5a: purchase of new or used machinery, vehicles, and equipment, n5b: lands & buildings, h8: Cost of formal research and development activities (innovation activities)
col_15B = ['n5a', 'n5b', '_2015_h8']

# Select financing variables OF 2015 (SECTION K. FINANCE)
# k1c: % Purchased on credit (loans), k2c: % Sold on credit (receivables)
col_15C = ['k1c', 'k2c']

#************************************************ YEAR 2005 ************************************************
# Select the variable cost of 2005 (Section II: Productivity)
# q86a1: Total sales, q86a2: Total cost
# q86a3: Total purchases of raw materials and intermediate goods, q86a4: Total cost of labor
# q86a5: Depreciation, q86a6: Rent on land and buildings, q86a8: Rent on machinery, equipment, and vehicles
# q86a9: Interest charges, q86a10: Energy cost, q86a11: Taxes
col_05A = ['_2005_q86a1', '_2005_q86a2','_2005_q86a3', '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', 
           '_2005_q86a8', '_2005_q86a9', '_2005_q86a10', '_2005_q86a11']

# Select production input of 2005

# CAPITAL (ASSETS) - q91a1: Total fixed assets, q91a7: Total current assets (need to exclude receivables (q91a12))  
# INVESTMENT - q87a: net profits (after tax) in 2004, q87d: % of the establishment’s net profits (after tax) were reinvested in the establishment
col_05B = ['_2005_q91a1', '_2005_q91a7', '_2005_q91a12', '_2005_q87a', '_2005_q87d']

# Select financing variables of 2005 (SECTION III. PRODUCTIVITY)
# q91a12: Receivables

# Check if all selected columns exist in the dataset
columns = [col for col in col_id + col_15A + col_15B + col_15C + col_05A + col_05B if col in df.columns]

df = df[columns]
df.fillna(0, inplace = True)
# Replace all occurrences of -9 with 0 in the entire DataFrame
df.replace(-9, 0, inplace=True)
df.columns

Index(['idstd2015', 'id2015', 'idstd2005', 'id2005', 'firm_size', 'c9b',
       'd1a3', 'd2', 'd10', 'd11', 'i2b', 'n2a', 'n2b', 'n2i', 'n5a', 'n5b',
       '_2015_h8', 'k1c', 'k2c', '_2005_q86a1', '_2005_q86a2', '_2005_q86a3',
       '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', '_2005_q86a8',
       '_2005_q86a9', '_2005_q86a10', '_2005_q86a11', '_2005_q91a1',
       '_2005_q91a7', '_2005_q91a12', '_2005_q87a', '_2005_q87d'],
      dtype='object')

In [222]:
# Calculate the costs based on the percentage of sales (d1a3, d10, d11) and total sales (d2)
df['2015_cost_main_activity'] = df['d1a3'] * df['d2'] / 100  # Main activity cost (percentage of sales)
#df['2015_cost_spoilage'] = df['d11'] * df['d2'] / 100  # Losses due to spoilage

# Calculate the finance based on purchases made on credit (k1c)
#df.rename(columns={'n5b': '2015_total_vrcost'}, inplace=True) # Total variable cost
df['2015_total_vrcost'] = df['n5b']
df['2015_investment_cost'] = df['n5a'] + df['_2015_h8'] # Investment 

# Finance based on credit purchases
df['2015_finance_on_credit'] = (df['k1c'] / 100) * df['2015_total_vrcost']

# Finance based on sales on credit (receivables)
df['2015_finance_from_receivables'] = (df['k2c'] / 100) * df['2015_total_vrcost']
df


Unnamed: 0,idstd2015,id2015,idstd2005,id2005,firm_size,c9b,d1a3,d2,d10,d11,...,_2005_q91a1,_2005_q91a7,_2005_q91a12,_2005_q87a,_2005_q87d,2015_cost_main_activity,2015_total_vrcost,2015_investment_cost,2015_finance_on_credit,2015_finance_from_receivables
1,599613.0,0.0,60775.0,0.0,2,0.0,100.0,2.750000e+10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.750000e+10,0.0,2.000000e+08,0.0,0.0
2,599613.0,10697.0,0.0,0.0,2,0.0,100.0,6.230000e+11,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,6.230000e+11,0.0,0.000000e+00,0.0,0.0
4,599614.0,0.0,60785.0,0.0,2,0.0,80.0,9.000000e+10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.200000e+10,0.0,0.000000e+00,0.0,0.0
5,599614.0,10698.0,0.0,0.0,2,0.0,95.0,9.800000e+10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,9.310000e+10,0.0,3.700000e+09,0.0,0.0
7,599615.0,0.0,60782.0,0.0,2,0.0,70.0,1.500000e+11,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.050000e+11,0.0,1.000000e+09,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2401,0.0,0.0,0.0,0.0,1,0.0,90.0,2.400000e+10,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.160000e+10,0.0,5.000000e+08,0.0,0.0
2402,0.0,0.0,0.0,0.0,1,0.0,85.0,3.200000e+09,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.720000e+09,100000000.0,0.000000e+00,0.0,0.0
2403,0.0,0.0,0.0,0.0,2,0.0,60.0,1.200000e+12,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.200000e+11,0.0,3.600000e+10,0.0,0.0
2404,0.0,0.0,0.0,0.0,2,0.0,100.0,2.360000e+11,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.360000e+11,0.0,2.200000e+09,0.0,0.0


In [223]:
# Calculate the total value of reinvestment in the establishment
df['2005_total_reinvestment'] = df['_2005_q87a'] * (df['_2005_q87d'] / 100)

# Calculate total assets excluding receivables
df['2005_total_assets'] = df['_2005_q91a1'] - df['_2005_q91a12']

# Label 'q91a12' as receivables separately
df.rename(columns={'_2005_q91a12': '2005_receivables'}, inplace=True)

# Select the relevant columns for the cost calculation
cost_col05 = ['_2005_q86a3', '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', '_2005_q86a8', '_2005_q86a9', '_2005_q86a10', '_2005_q86a11']

# Calculate the total cost by summing up the selected columns
df['2005_total_cost'] = df[cost_col05].sum(axis=1)

In [224]:
# Price of inputs in 2015
average_electricity_2015 = 1606.19  # VND per kWh
average_wage_2015 = 5233000 * 12          # VND per year
ppi_2015 = 132.350                        # Producer Price Index
average_rental_price_2015 = 1026000 * 12  # VND/m²/year
average_rnd_2015 = 0.346 * 100000000 + 0.654 * 500000000  # VND

# Price of inputs in 2005
average_wage_2005 = 2000000 * 12          # VND per year
average_rental_price_2005 = 347600 * 12   # Average office rent in Ho Chi Minh in grade A in 2005 (VND/m2)
average_gasoline_price_2005 = 15000       # VND/lit (Oil + Gas)


# Define the function to calculate the input quanitity: xt = wt/price
def compute_quantities(df):
    return pd.DataFrame({
        # Year 2015
        '2015_labor_quantity': df['n2a'] / average_wage_2015,
        '2015_electricity_quantity': df['n2b'] / average_electricity_2015,
        '2015_production_quantity': df['n2i'] / ppi_2015,
        '2015_land_building_quantity': df['n5b']/ average_rental_price_2015,

        # Year 2005
        '2005_labor_quantity': df['_2005_q86a4'] / average_wage_2005,
        '2015_gasoline_quantity': df['_2005_q86a10'] / average_gasoline_price_2005,
        '2005_land_building_quantity': df['_2005_q86a6']/ average_rental_price_2005,
    })

In [None]:
quantity_df = compute_quantities(df)
df = pd.concat([df, quantity_df], axis=1)
df.columns

Index(['idstd2015', 'id2015', 'idstd2005', 'id2005', 'firm_size', 'c9b',
       'd1a3', 'd2', 'd10', 'd11', 'i2b', 'n2a', 'n2b', 'n2i', 'n5a', 'n5b',
       '_2015_h8', 'k1c', 'k2c', '_2005_q86a1', '_2005_q86a2', '_2005_q86a3',
       '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', '_2005_q86a8',
       '_2005_q86a9', '_2005_q86a10', '_2005_q86a11', '_2005_q91a1',
       '_2005_q91a7', '2005_receivables', '_2005_q87a', '_2005_q87d',
       '2015_cost_main_activity', '2015_total_vrcost', '2015_investment_cost',
       '2015_finance_on_credit', '2015_finance_from_receivables',
       '2005_total_reinvestment', '2005_total_assets', '2005_total_cost',
       '2015_labor_quantity', '2015_electricity_quantity',
       '2015_production_quantity', '2015_land_building_quantity',
       '2005_labor_quantity', '2015_gasoline_quantity',
       '2005_land_building_quantity'],
      dtype='object')

In [226]:
# Split the data into small and large firms
df_small = df[df['firm_size'] == 1]  # Small firms (labeled as 1)
df_large = df[df['firm_size'] == 2]  # Large firms (labeled as 2)

In [227]:
df_small.columns

Index(['idstd2015', 'id2015', 'idstd2005', 'id2005', 'firm_size', 'c9b',
       'd1a3', 'd2', 'd10', 'd11', 'i2b', 'n2a', 'n2b', 'n2i', 'n5a', 'n5b',
       '_2015_h8', 'k1c', 'k2c', '_2005_q86a1', '_2005_q86a2', '_2005_q86a3',
       '_2005_q86a4', '_2005_q86a5', '_2005_q86a6', '_2005_q86a8',
       '_2005_q86a9', '_2005_q86a10', '_2005_q86a11', '_2005_q91a1',
       '_2005_q91a7', '2005_receivables', '_2005_q87a', '_2005_q87d',
       '2015_cost_main_activity', '2015_total_vrcost', '2015_investment_cost',
       '2015_finance_on_credit', '2015_finance_from_receivables',
       '2005_total_reinvestment', '2005_total_assets', '2005_total_cost',
       '2015_labor_quantity', '2015_electricity_quantity',
       '2015_production_quantity', '2015_land_building_quantity',
       '2005_labor_quantity', '2015_gasoline_quantity',
       '2005_land_building_quantity'],
      dtype='object')

In [228]:
# Calculate the total number of firms in each dataset
num_small_firms = len(df_small)
num_large_firms = len(df_large)

# Calculate the total cost for small firms
df_small['2015_total_cost'] = df_small[['n2a', 'n2b', 'n2i', 'n5b']].sum(axis=1)
df_small['2005_total_cost'] = df_small[['_2005_q86a4', '_2005_q86a10', '_2005_q86a6']].sum(axis=1)

# Calculate the averages for small firms by dividing by the total number of small firms
small_firm_averages = {
    'avg_2015_investment_cost': df_small['2015_investment_cost'].sum() / num_small_firms,
    'avg_2015_total_cost': df_small['2015_total_cost'].sum() / num_small_firms,
    'avg_2005_total_cost': df_small['2005_total_cost'].sum() / num_small_firms,
    'avg_2005_total_reinvestment': df_small['2005_total_reinvestment'].sum() / num_small_firms
}

# Calculate the total cost for large firms
df_large['2015_total_cost'] = df_large[['n2a', 'n2b', 'n2i', 'n5b']].sum(axis=1)
df_large['2005_total_cost'] = df_large[['_2005_q86a4', '_2005_q86a10', '_2005_q86a6']].sum(axis=1)

# Calculate the averages for large firms by dividing by the total number of large firms
large_firm_averages = {
    'avg_2015_investment_cost': df_large['2015_investment_cost'].sum() / num_large_firms,
    'avg_2015_total_cost': df_large['2015_total_cost'].sum() / num_large_firms,
    'avg_2005_total_cost': df_large['2005_total_cost'].sum() / num_large_firms,
    'avg_2005_total_reinvestment': df_large['2005_total_reinvestment'].sum() / num_large_firms
}

small_firm_averages, large_firm_averages


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['2015_total_cost'] = df_small[['n2a', 'n2b', 'n2i', 'n5b']].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['2005_total_cost'] = df_small[['_2005_q86a4', '_2005_q86a10', '_2005_q86a6']].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_large['2015_total_cost

({'avg_2015_investment_cost': 1682513813472.586,
  'avg_2015_total_cost': 5018182121.686256,
  'avg_2005_total_cost': 0.0,
  'avg_2005_total_reinvestment': 0.0},
 {'avg_2015_investment_cost': 698929309344.418,
  'avg_2015_total_cost': 32978123552.785255,
  'avg_2005_total_cost': 0.0,
  'avg_2005_total_reinvestment': 0.0})

In [231]:
# Print the average values for small firms
print(f"Average 2015 Investment Cost for Small Firms: {small_firm_averages['avg_2015_investment_cost']:.2f}")
print(f"Average Total Cost of 2015 for Small Firms: {small_firm_averages['avg_2015_total_cost']:.2f}")
print(f"Average Total Cost of 2005 for Small Firms: {small_firm_averages['avg_2005_total_cost']:.2f}")
print(f"Average Total Reinvestment in 2005 for Small Firms: {small_firm_averages['avg_2005_total_reinvestment']:.2f}")

print("-" * 60)

# Print the average values for large firms
print(f"Average 2015 Investment Cost for Large Firms: {large_firm_averages['avg_2015_investment_cost']:.2f}")
print(f"Average Total Cost of 2015 for Large Firms: {large_firm_averages['avg_2015_total_cost']:.2f}")
print(f"Average Total Cost of 2005 for Large Firms: {large_firm_averages['avg_2005_total_cost']:.2f}")
print(f"Average Total Reinvestment in 2005 for Large Firms: {large_firm_averages['avg_2005_total_reinvestment']:.2f}")


Average 2015 Investment Cost for Small Firms: 1682513813472.59
Average Total Cost of 2015 for Small Firms: 5018182121.69
Average Total Cost of 2005 for Small Firms: 0.00
Average Total Reinvestment in 2005 for Small Firms: 0.00
------------------------------------------------------------
Average 2015 Investment Cost for Large Firms: 698929309344.42
Average Total Cost of 2015 for Large Firms: 32978123552.79
Average Total Cost of 2005 for Large Firms: 0.00
Average Total Reinvestment in 2005 for Large Firms: 0.00


In [232]:
# Compute the average of 2015 and 2005 costs/investments for both small and large firms by dividing by 2
small_firm_averages_combined = {
    'avg_combined_2015_investment_cost': (small_firm_averages['avg_2015_investment_cost'] + small_firm_averages['avg_2005_total_reinvestment']) / 2,
    'avg_combined_2015_total_cost': (small_firm_averages['avg_2015_total_cost'] + small_firm_averages['avg_2005_total_cost']) / 2,
}

large_firm_averages_combined = {
    'avg_combined_2015_investment_cost': (large_firm_averages['avg_2015_investment_cost'] + large_firm_averages['avg_2005_total_reinvestment']) / 2,
    'avg_combined_2015_total_cost': (large_firm_averages['avg_2015_total_cost'] + large_firm_averages['avg_2005_total_cost']) / 2,
}

# Print the average results for small firms
print(f"Average Combined 2015 Investment Cost for Small Firms: {small_firm_averages_combined['avg_combined_2015_investment_cost']:.2f}")
print(f"Average Combined Total Cost of 2015 for Small Firms: {small_firm_averages_combined['avg_combined_2015_total_cost']:.2f}")

print("-" * 60)

# Print the average results for large firms
print(f"Average Combined 2015 Investment Cost for Large Firms: {large_firm_averages_combined['avg_combined_2015_investment_cost']:.2f}")
print(f"Average Combined Total Cost of 2015 for Large Firms: {large_firm_averages_combined['avg_combined_2015_total_cost']:.2f}")


Average Combined 2015 Investment Cost for Small Firms: 841256906736.29
Average Combined Total Cost of 2015 for Small Firms: 2509091060.84
------------------------------------------------------------
Average Combined 2015 Investment Cost for Large Firms: 349464654672.21
Average Combined Total Cost of 2015 for Large Firms: 16489061776.39


In [229]:
path = r'C:\Users\Do Thu An\OneDrive\Desktop\Dynamic Macroeconomics\Problem sets\Dynamic-Macroeconomics\PS2_Code\Firms Modelling'
# Save the small and large firms DataFrames as CSV files
df_small.to_csv(f"{path}\\small_firms.csv", index=False)
df_large.to_csv(f"{path}\\large_firms.csv", index=False)