# Load data + libs

In [11]:
# Import libs
from utils import *
from utils.ml_hyperpars import lrg, xgb_class, \
    r2, rmse, mape, \
    accuracy, roc_auc, precision, recall, \
    classification_report

# Load data
df_revenue = pd.read_csv('data_for_modelling/df_revenue.csv')
df_ebitda = pd.read_csv('data_for_modelling/df_ebitda.csv')
df_roa = pd.read_csv('data_for_modelling/df_roa.csv')
df_roe = pd.read_csv('data_for_modelling/df_roe.csv')
df_roic = pd.read_csv('data_for_modelling/df_roic.csv')
df_roce = pd.read_csv('data_for_modelling/df_roce.csv')
df_value = pd.read_csv('data_for_modelling/df_value_ad.csv')

# Modelling

Data

In [12]:
# Make input - test set
revenue_input, revenue_test = input_test_split(df_revenue)
roa_input, roa_test = input_test_split(df_roa)
roe_input, roe_test = input_test_split(df_roe)
roic_input, roic_test = input_test_split(df_roic)
roce_input, roce_test = input_test_split(df_roce)

value_input, value_test = input_test_split(df_value)
ebitda_input, ebitda_test = input_test_split(df_ebitda)

Modelling

In [9]:
# Revenue

# Hyper-param + model choice
revenue_obj = InputData(revenue_input, 'company', 'year', 'revenue', reg=True)
display(revenue_obj.optimal_param(n_splits=3, test_size=2))

# Train val
X_train, y_train, X_val, y_val = train_val_split(revenue_input, 'revenue')
model = lrg(fit_intercept=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print('----------- TRAIN RESULTS -----------')
print(f'R2: {r2(y_pred, y_val)}')
print(f'RMSE: {rmse(y_pred, y_val)}')
print(f'MAPE: {mape(y_pred, y_val)}')
# Test
print('----------- TEST RESULTS -----------')
X_test = revenue_test.drop(['company', 'revenue', 'year'], axis=1)
y_test = revenue_test['revenue']
y_pred_test = model.predict(X_test)
print(f'R2: {r2(y_pred_test, y_test)}')
print(f'RMSE: {rmse(y_pred_test, y_test)}')
print(f'MAPE: {mape(y_pred_test, y_test)}')


Processing XG_reg ...
Processing Linear_reg ...


Unnamed: 0,algo_used,params,mean_test_r2,mean_test_mape,mean_test_rmse
7,Linear_reg,{'fit_intercept': False},0.914456,0.012553,0.569842
6,Linear_reg,{'fit_intercept': True},0.914036,0.012749,0.571233
0,XG_reg,"{'learning_rate': 0.05, 'max_depth': 8, 'n_est...",0.907331,0.013537,0.59305
4,XG_reg,"{'learning_rate': 0.03, 'max_depth': 8, 'n_est...",0.906996,0.013553,0.594216
5,XG_reg,"{'learning_rate': 0.03, 'max_depth': 8, 'n_est...",0.906342,0.013689,0.596333
1,XG_reg,"{'learning_rate': 0.05, 'max_depth': 8, 'n_est...",0.90625,0.013743,0.596654
2,XG_reg,"{'learning_rate': 0.05, 'max_depth': 8, 'n_est...",0.905204,0.013921,0.599998
3,XG_reg,"{'learning_rate': 0.03, 'max_depth': 8, 'n_est...",0.902434,0.014009,0.608339


----------- TRAIN RESULTS -----------
R2: 0.9044978091299306
RMSE: 0.601895303996183
MAPE: 0.012731403722221023
----------- TEST RESULTS -----------
R2: 0.9215238827698281
RMSE: 0.5478324216147226
MAPE: 0.010842666730429688


In [42]:
# Value_add

# Hyper-param + model choice
value_obj = InputData(value_input, 'company', 'year', 'value_add', reg=False)
display(value_obj.optimal_param(n_splits=3, test_size=2))

# Train val
X_train, y_train, X_val, y_val = train_val_split(value_input, 'value_add')
model = xgb_class(n_estimators= 200, learning_rate=0.02, max_depth=10,
                subsample=0.5, scale_pos_weight=0.223
                  )
model.fit(X_train, y_train)
y_pred = model.predict(X_val)


print('----------- TRAIN RESULTS -----------')
print(f'Accuracy: {accuracy(y_pred, y_val)}')
print(f'Precision: {precision(y_pred, y_val)}')
print(f'Recall: {recall(y_pred, y_val)}')
print(f'ROC_AUC: {roc_auc(y_pred, y_val)}')
# Test
print('----------- TEST RESULTS -----------')
X_test = value_test.drop(['company', 'value_add', 'year'], axis=1)
y_test = value_test['value_add']
y_pred_test = model.predict(X_test)
print(f'Accuracy: {accuracy(y_pred_test, y_test)}')
print(f'Precision: {precision(y_pred_test, y_test)}')
print(f'Recall: {recall(y_pred_test, y_test)}')
print(f'ROC_AUC: {roc_auc(y_pred_test, y_test)}')

print(classification_report(y_val, y_pred, digits=3))

Processing XG_class ...
Processing KNN_class ...


Unnamed: 0,algo_used,params,mean_test_accuracy,mean_test_roc_auc,mean_test_precision,mean_test_recall
20,XG_class,"{'learning_rate': 0.03, 'max_depth': 10, 'n_es...",0.777431,0.777431,0.911312,0.92563
2,XG_class,"{'learning_rate': 0.05, 'max_depth': 5, 'n_est...",0.776972,0.776972,0.911659,0.921493
14,XG_class,"{'learning_rate': 0.03, 'max_depth': 5, 'n_est...",0.776823,0.776823,0.911153,0.925006
5,XG_class,"{'learning_rate': 0.05, 'max_depth': 5, 'n_est...",0.776787,0.776787,0.912213,0.916147
23,XG_class,"{'learning_rate': 0.03, 'max_depth': 10, 'n_es...",0.776778,0.776778,0.91153,0.921616
17,XG_class,"{'learning_rate': 0.03, 'max_depth': 5, 'n_est...",0.776737,0.776737,0.911577,0.920922
10,XG_class,"{'learning_rate': 0.05, 'max_depth': 10, 'n_es...",0.776724,0.776724,0.912996,0.910286
7,XG_class,"{'learning_rate': 0.05, 'max_depth': 10, 'n_es...",0.776717,0.776717,0.912807,0.911305
8,XG_class,"{'learning_rate': 0.05, 'max_depth': 10, 'n_es...",0.776492,0.776492,0.911416,0.921626
22,XG_class,"{'learning_rate': 0.03, 'max_depth': 10, 'n_es...",0.776426,0.776426,0.912523,0.913152


----------- TRAIN RESULTS -----------
Accuracy: 0.8234170524029945
Precision: 0.9417391304347826
Recall: 0.8819218241042345
ROC_AUC: 0.8234170524029943
----------- TEST RESULTS -----------
Accuracy: 0.8013917363191925
Precision: 0.901541095890411
Recall: 0.9410187667560321
ROC_AUC: 0.8013917363191926
              precision    recall  f1-score   support

           0      0.765     0.601     0.673       363
           1      0.882     0.942     0.911      1150

    accuracy                          0.860      1513
   macro avg      0.823     0.771     0.792      1513
weighted avg      0.854     0.860     0.854      1513



In [52]:
# Ebitda

# # Hyper-param + model choice
# ebitda_obj = InputData(ebitda_input, 'company', 'year', 'ebitda', reg=False)
# display(ebitda_obj.optimal_param(n_splits=3, test_size=2))

# Train val
X_train, y_train, X_val, y_val = train_val_split(ebitda_input, 'ebitda')
model = xgb_class(n_estimators= 300, learning_rate=0.03, max_depth=15,
                subsample=0.7, scale_pos_weight= 0.15134
                  )
model.fit(X_train, y_train)
y_pred = model.predict(X_val)


print('----------- TRAIN RESULTS -----------')
print(f'Accuracy: {accuracy(y_pred, y_val)}')
print(f'Precision: {precision(y_pred, y_val)}')
print(f'Recall: {recall(y_pred, y_val)}')
print(f'ROC_AUC: {roc_auc(y_pred, y_val)}')
# Test
print('----------- TEST RESULTS -----------')
X_test = ebitda_test.drop(['company', 'ebitda', 'year'], axis=1)
y_test = ebitda_test['ebitda']
y_pred_test = model.predict(X_test)
print(f'Accuracy: {accuracy(y_pred_test, y_test)}')
print(f'Precision: {precision(y_pred_test, y_test)}')
print(f'Recall: {recall(y_pred_test, y_test)}')
print(f'ROC_AUC: {roc_auc(y_pred_test, y_test)}')

print(classification_report(y_val, y_pred, digits=3))

----------- TRAIN RESULTS -----------
Accuracy: 0.7966333967046895
Precision: 0.9388544891640866
Recall: 0.9224334600760457
ROC_AUC: 0.7966333967046895
----------- TEST RESULTS -----------
Accuracy: 0.7469932043722134
Precision: 0.9034749034749034
Recall: 0.9535452322738386
ROC_AUC: 0.7469932043722134
              precision    recall  f1-score   support

           0      0.671     0.612     0.640       263
           1      0.922     0.939     0.931      1292

    accuracy                          0.884      1555
   macro avg      0.797     0.776     0.785      1555
weighted avg      0.880     0.884     0.881      1555

