Library

In [1]:
from utils.utils_ml_train import *
from utils.utils_dl_train.dl_training_utils import *

Raw data

In [2]:
df_roa = pd.read_csv('../../data/data_for_modelling/df_roa.csv')
roa_input, roa_test = input_test_split(df_roa)

Feature engineer

- Added features:
    + oea (operating efficiency)
    + loss
    + liq (liquidity)
    + d/e (debt/equity)
    + industry
- Besides, some features are dropped, such as: 'invest_nav_lag1', 'long_receive_lag1',
       'long_liability_lag1', 'other_long_asset_lag1', 'cwip_lag1', ...
- The reason is because experiment shows the chosen features to give best performance
- Also note that the names of added features have no suffix "lag_1" despite them being lagged in the original set (roa_addon_data) already

In [3]:
df_addon = pd.read_csv('../../data/data_addon/roa_addon_data.csv')

# Input set
roa_input = roa_input.merge(df_addon, on=['company', 'year'], how='left')
roa_input['oea'] = roa_input['expense_lag1']/roa_input['tot_asset']
roa_input = roa_input[['company', 'year', 'roa', 'in_stock_lag1', 'industry', 'other_fund_lag1', 'for_own_lag1',
'roa_lag1', 'loss', 'liq', 'oea', 'gov_own_lag1', 'd/e', 'equity']]
roa_input['industry'] = roa_input['industry'].fillna('Dầu khí')

# Test set
roa_test = roa_test.merge(df_addon, on=['company', 'year'], how='left')
roa_test['oea'] = roa_test['expense_lag1']/roa_test['tot_asset']
roa_test = roa_test[['company', 'year', 'roa', 'in_stock_lag1', 'industry', 'other_fund_lag1', 'for_own_lag1',
'roa_lag1', 'loss', 'liq', 'oea', 'gov_own_lag1', 'd/e', 'equity']]

Data Standardization

- Control absolute ROA to be under 2, eliminating firm-year with abnormally high ROA (outlier removal)
- Compress ROA distribution using hyperbolic arcsinh (due to negative values) to better model's performance through decreasing data variance (data transformation)

In [None]:
# Input set
roa_input = roa_input[abs(roa_input['roa'])<=2]
roa_input.loc[:, 'roa'] = roa_input['roa'].apply(lambda x: np.arcsinh(x))
roa_input.dropna(inplace=True)

# Test set
all_comp = roa_input['company'].unique().tolist()
roa_test = roa_test[roa_test['company'].isin(all_comp)]
roa_test.loc[:, 'roa'] = roa_test['roa'].apply(lambda x: np.arcsinh(x))
roa_test.dropna(inplace=True)

Modelling

In [5]:
all_features = ['in_stock_lag1', 'industry', 'other_fund_lag1','for_own_lag1','roa_lag1','loss',
'liq','oea','gov_own_lag1', 'equity', 'd/e']

In [9]:
tensorflow.random.set_seed(42)
model = LSTM(
    roa_input,
    roa_test,
    'roa',
    all_features,
    'roa'

)
model.result_summary()

[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step


Unnamed: 0,R2,MAE,SMAPE
Train,0.473185,0.033163,0.679338
Val,0.367163,0.040811,0.777233
Test,0.383567,0.035079,0.733562


Conclusion:
- Generally, the best model discovered is not good
- This can be explained by the nature of ROA, which has highly skewed and high-kurtosis distribution
- However, high consistency is seen between val and test sets
- So the model is acceptable, and should be improved in the future using more advanced architecture (transformer, for example) or better suited params with LSTM.

Best params:

LSTM architecture:
- Dense(16)
- LSTM(32, return_sequences=True)
- LSTM(64,  return_sequences=True)
- LSTM(64, return_sequences=False)
- Dense(16)
