# Load data + libs

In [1]:
# Import libs
from utils_dta_processing import *
from utils_ml_train import *
from utils_ml_train.hyperparams_ML import lrg, xgb_class,\
    r2, rmse, mape, \
    accuracy, roc_auc, precision, recall, \
    classification_report
from utils_ml_train.model_training_utils import MinMaxScaler

# Load data
df_revenue = pd.read_csv('../data_for_modelling/df_revenue.csv')
df_ebitda = pd.read_csv('../data_for_modelling/df_ebitda.csv')
df_roa = pd.read_csv('../data_for_modelling/df_roa.csv')
df_roe = pd.read_csv('../data_for_modelling/df_roe.csv')
df_roic = pd.read_csv('../data_for_modelling/df_roic.csv')
df_roce = pd.read_csv('../data_for_modelling/df_roce.csv')
df_value = pd.read_csv('../data_for_modelling/df_value_ad.csv')

# Modelling

Data

In [2]:
# Make input - test set
revenue_input, revenue_test = input_test_split(df_revenue)
roa_input, roa_test = input_test_split(df_roa)
roe_input, roe_test = input_test_split(df_roe)
roic_input, roic_test = input_test_split(df_roic)
roce_input, roce_test = input_test_split(df_roce)

value_input, value_test = input_test_split(df_value)
ebitda_input, ebitda_test = input_test_split(df_ebitda)

Modelling

In [3]:
# Revenue

# Hyper-param + model choice
revenue_obj = InputData(revenue_input, 'company', 'year', 'revenue', reg=True)
display(revenue_obj.optimal_param(n_splits=3, test_size=2))

# Train val
X_train, y_train, X_val, y_val = train_val_split(revenue_input, 'revenue')
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

model = lrg(fit_intercept=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print('----------- TRAIN RESULTS -----------')
print(f'R2: {r2(y_pred, y_val)}')
print(f'RMSE: {rmse(y_pred, y_val)}')
print(f'MAPE: {mape(y_pred, y_val)}')
# Test
print('----------- TEST RESULTS -----------')
X_test = revenue_test.drop(['company', 'revenue', 'year'], axis=1)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
y_test = revenue_test['revenue']
y_pred_test = model.predict(X_test)
print(f'R2: {r2(y_pred_test, y_test)}')
print(f'RMSE: {rmse(y_pred_test, y_test)}')
print(f'MAPE: {mape(y_pred_test, y_test)}')


Processing XG_reg ...
Processing Linear_reg ...


Unnamed: 0,algo_used,params,mean_test_r2,mean_test_mape,mean_test_rmse
6,Linear_reg,{'algo__fit_intercept': True},0.910469,0.013154,0.582818
4,XG_reg,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.900262,0.014152,0.615011
5,XG_reg,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.899502,0.014274,0.617466
0,XG_reg,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.898469,0.014209,0.620249
1,XG_reg,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.897024,0.014434,0.624892
2,XG_reg,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.895921,0.014617,0.628318
3,XG_reg,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.894602,0.014661,0.631641
7,Linear_reg,{'algo__fit_intercept': False},-2.075128,0.088564,3.334669


----------- TRAIN RESULTS -----------
R2: 0.904497818031815
RMSE: 0.6018953024549542
MAPE: 0.012731402864655031
----------- TEST RESULTS -----------
R2: 0.9215239040111723
RMSE: 0.5478324466353163
MAPE: 0.010842668618786534


In [3]:
# Value_add (xgb no need scaler)

# Hyper-param + model choice
value_obj = InputData(value_input, 'company', 'year', 'value_add', reg=False)
display(value_obj.optimal_param(n_splits=3, test_size=2))

# Train val
X_train, y_train, X_val, y_val = train_val_split(value_input, 'value_add')

model = xgb_class(n_estimators= 200, learning_rate=0.02, max_depth=10,
                subsample=0.5, scale_pos_weight=0.223
                  )
model.fit(X_train, y_train)
y_pred = model.predict(X_val)


print('----------- TRAIN RESULTS -----------')
print(f'Accuracy: {accuracy(y_pred, y_val)}')
print(f'Precision: {precision(y_pred, y_val)}')
print(f'Recall: {recall(y_pred, y_val)}')
print(f'ROC_AUC: {roc_auc(y_pred, y_val)}')
# Test
print('----------- TEST RESULTS -----------')
X_test = value_test.drop(['company', 'value_add', 'year'], axis=1)
y_test = value_test['value_add']
y_pred_test = model.predict(X_test)
print(f'Accuracy: {accuracy(y_pred_test, y_test)}')
print(f'Precision: {precision(y_pred_test, y_test)}')
print(f'Recall: {recall(y_pred_test, y_test)}')
print(f'ROC_AUC: {roc_auc(y_pred_test, y_test)}')

print(classification_report(y_val, y_pred, digits=3))

Processing XG_class ...
Processing KNN_class ...


Unnamed: 0,algo_used,params,mean_test_accuracy,mean_test_roc_auc,mean_test_precision,mean_test_recall
26,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.778344,0.778344,0.914021,0.908605
2,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.778,0.778,0.912273,0.920541
27,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.777777,0.777777,0.912127,0.920445
32,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.777365,0.777365,0.911438,0.924496
22,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.777364,0.777364,0.911504,0.923902
1,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.776897,0.776897,0.913648,0.906102
37,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.776721,0.776721,0.911467,0.922125
6,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.776577,0.776577,0.913899,0.903446
7,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.77649,0.77649,0.912252,0.916037
36,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.776408,0.776408,0.91256,0.913176


----------- TRAIN RESULTS -----------
Accuracy: 0.8234170524029945
Precision: 0.9417391304347826
Recall: 0.8819218241042345
ROC_AUC: 0.8234170524029943
----------- TEST RESULTS -----------
Accuracy: 0.8013917363191925
Precision: 0.901541095890411
Recall: 0.9410187667560321
ROC_AUC: 0.8013917363191926
              precision    recall  f1-score   support

           0      0.765     0.601     0.673       363
           1      0.882     0.942     0.911      1150

    accuracy                          0.860      1513
   macro avg      0.823     0.771     0.792      1513
weighted avg      0.854     0.860     0.854      1513



In [None]:
# Ebitda (xgb no need scaler)

# # Hyper-param + model choice
# ebitda_obj = InputData(ebitda_input, 'company', 'year', 'ebitda', reg=False)
# display(ebitda_obj.optimal_param(n_splits=3, test_size=2))

# Train val
X_train, y_train, X_val, y_val = train_val_split(ebitda_input, 'ebitda')
model = xgb_class(n_estimators= 300, learning_rate=0.03, max_depth=15,
                subsample=0.7, scale_pos_weight= 0.15134
                  )

model.fit(X_train, y_train)
y_pred = model.predict(X_val)


print('----------- TRAIN RESULTS -----------')
print(f'Accuracy: {accuracy(y_pred, y_val)}')
print(f'Precision: {precision(y_pred, y_val)}')
print(f'Recall: {recall(y_pred, y_val)}')
print(f'ROC_AUC: {roc_auc(y_pred, y_val)}')
# Test
print('----------- TEST RESULTS -----------')
X_test = ebitda_test.drop(['company', 'ebitda', 'year'], axis=1)
y_test = ebitda_test['ebitda']
y_pred_test = model.predict(X_test)
print(f'Accuracy: {accuracy(y_pred_test, y_test)}')
print(f'Precision: {precision(y_pred_test, y_test)}')
print(f'Recall: {recall(y_pred_test, y_test)}')
print(f'ROC_AUC: {roc_auc(y_pred_test, y_test)}')

print(classification_report(y_val, y_pred, digits=3))