Library

In [1]:
from src.training.deep_ml.dl_training_utils import *
from src.training.normal_ml import *

Data

In [2]:
# Enrich the original ROE table with the add-on columns.
df_roe = pd.read_csv('../../data/data_for_modelling/df_roe.csv')
df_add = pd.read_csv('../../data/data_addon/roe_add_data.csv')

# Left join on (company, year) keeps every row of the original table;
# then keep only the identifiers, the target and the model features.
df_roe_used = df_roe.merge(df_add, how='left', on=['company', 'year'])[[
    # id + target
    'company', 'year', 'roe',
    # already available lagged features
    'long_receive_lag1', 'other_fund_lag1', 'roe_lag1',
    'long_invest_lag1', 'in_stock_lag1',
    # from added data
    'asset/equity', 'cash&equi_to_asset', 'a/w',
    'size', 'asset/lia', 'industry',
]]

Split data

In [3]:
# Split -> input + test set
# input_test_split is not defined in this notebook; presumably it is provided by
# one of the star imports in the first cell. Its result is unpacked into the
# modelling input set and the held-out test set.
# NOTE(review): the split criterion (by year, by company, random, ...) is not
# visible from here -- confirm against the helper's implementation.
roe_input, roe_test = input_test_split(df_roe_used)

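Standardize data (same procedure as for ROA): filter outliers in the input set, apply arcsinh to `roe`, and drop rows with missing values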

In [4]:
# Clean both sets with filter -> assign -> dropna chains. Each step returns a
# new frame, so nothing is written into a filtered slice (no
# SettingWithCopyWarning) and no in-place mutation is needed.
# NOTE: the names roe_input / roe_test are reused on purpose because later
# cells read them. Re-running this cell alone would apply arcsinh a second
# time; re-run the split cell first.

# Input set: keep rows with |roe| <= 2 (rows with a missing roe fail the
# comparison and are dropped too), arcsinh-transform the target, then drop
# every row that still has a missing value in any column.
roe_input = (
    roe_input.loc[roe_input['roe'].abs() <= 2]
    .assign(roe=lambda frame: np.arcsinh(frame['roe']))
    .dropna()
)

# Test set: keep only companies that are still present in the cleaned input
# set, apply the same arcsinh transform to the target and drop incomplete rows.
# The |roe| <= 2 outlier filter is deliberately NOT applied here, matching the
# original behaviour.
all_comp = roe_input['company'].unique().tolist()
roe_test = (
    roe_test.loc[roe_test['company'].isin(all_comp)]
    .assign(roe=lambda frame: np.arcsinh(frame['roe']))
    .dropna()
)

Modelling

In [5]:
# Feature columns fed to the model, in the same order as they were selected
# when the modelling frame was built.
all_features = [
    # lagged features
    'long_receive_lag1',
    'other_fund_lag1',
    'roe_lag1',
    'long_invest_lag1',
    'in_stock_lag1',
    # ratio features
    'asset/equity',
    'cash&equi_to_asset',
    'a/w',
    # remaining features
    'size',
    'asset/lia',
    'industry',
]

In [6]:
# Fix TensorFlow's global seed before the model object is created.
tensorflow.random.set_seed(42)

# LSTM here is the project's wrapper (it receives the input/test frames), not a
# bare Keras layer. Arguments are passed positionally and unchanged.
# NOTE(review): 'roe' is passed twice (3rd and 5th argument) -- presumably once
# as the target column and once as a run/dataset label; confirm against the
# wrapper's signature.
model = LSTM(roe_input, roe_test, 'roe', all_features, 'roe')

# Last expression of the cell: its value is rendered as the cell output.
model.result_summary()

313/313 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
98/98 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step
47/47 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step


Unnamed: 0,R2,MAE,SMAPE
Train,0.327536,0.072281,0.686126
Val,0.261835,0.082567,0.772567
Test,0.17267,0.084573,0.741663


Conclusion:
- The ROE dataset loses about 5% of its rows during cleaning, which leaves it considerably smaller than the ROA dataset (ROE has more outliers and a higher variance in the original data).
- For that reason, the model predicting ROE performs worse than the ROA model: it has less data to learn from.

Best params:
LSTM architecture:
- LSTM 16
- LSTM 16
- LSTM 32
- Dense 32