Library

In [2]:
from sklearn.metrics import classification_report
from src.utils import *
from src.training.normal_ml import *

Data

In [3]:
# Load the EBITDA modelling table and split it into the modelling ("input")
# part and the test part using the project helper `input_test_split`.
ebitda_csv_path = '../../data/data_for_modelling/df_ebitda.csv'
df_ebitda = pd.read_csv(ebitda_csv_path)
ebitda_input, ebitda_test = input_test_split(df_ebitda)

Modelling

In [4]:
# Hyper-parameter search: wrap the input frame for the classification task
# (reg=False) and show the ranking table returned by `optimal_param`.
ebitda_obj = InputData(ebitda_input, 'company', 'year', 'ebitda', reg=False)
param_ranking = ebitda_obj.optimal_param(n_splits=3, test_size=2)
display(param_ranking)

Processing XG_class ...
Processing KNN_class ...


Unnamed: 0,algo_used,params,mean_test_accuracy,mean_test_roc_auc,mean_test_precision,mean_test_recall
0,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.783938,0.783938,0.945564,0.887456
1,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.783912,0.783912,0.942128,0.919981
20,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.783908,0.783908,0.944955,0.893153
25,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.782727,0.782727,0.945113,0.887911
5,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.78214,0.78214,0.944506,0.891647
26,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.78149,0.78149,0.941668,0.91656
30,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.781264,0.781264,0.942863,0.901681
10,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.781235,0.781235,0.942697,0.905248
2,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.780961,0.780961,0.940148,0.93169
21,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.780534,0.780534,0.940775,0.923588


In [5]:
# Main model
# Fit the tuned XGBoost classifier on the training split, then report the same
# set of metrics on the validation split and on the test set.

# Hyper-parameters of the final model, kept in one place so they are easy to
# audit. Class 1 is the majority class in this data, so scale_pos_weight < 1
# reduces the weight of the positive (majority) class.
XGB_PARAMS = dict(
    n_estimators=300,
    learning_rate=0.03,
    max_depth=15,
    subsample=0.7,
    scale_pos_weight=0.15134,
)


def _print_metrics(title, y_true, y_hat):
    """Print accuracy, precision, recall and ROC AUC for one split under a banner.

    Parameters
    ----------
    title : str
        Text shown inside the banner line (e.g. 'TEST RESULTS').
    y_true :
        Ground-truth labels of the split.
    y_hat :
        Hard class predictions for the same rows.
    """
    print(f'----------- {title} -----------')
    # NOTE(review): in the recorded output this "Accuracy" equals the macro-average
    # recall of the classification report, so the project helper `accuracy` looks
    # like balanced accuracy rather than plain accuracy -- confirm in src.utils.
    print(f'Accuracy: {accuracy(y_true, y_hat)}')
    print(f'Precision: {precision(y_true, y_hat)}')
    print(f'Recall: {recall(y_true, y_hat)}')
    # NOTE(review): ROC AUC is computed from hard 0/1 predictions here, which is
    # why it matches the accuracy figure above. Consider passing predicted
    # probabilities instead if the `roc_auc` helper supports scores -- confirm.
    print(f'ROC_AUC: {roc_auc(y_true, y_hat)}')


# Train / validation split
X_train, y_train, X_val, y_val = train_val_split(ebitda_input, 'ebitda')
model = xgb_class(**XGB_PARAMS)
model.fit(X_train, y_train)

# Validation metrics. This banner used to say "TRAIN RESULTS" although the
# metrics are computed on the validation split.
y_pred = model.predict(X_val)
_print_metrics('VALIDATION RESULTS', y_val, y_pred)

# Test
X_test = ebitda_test.drop(['company', 'ebitda', 'year'], axis=1)
y_test = ebitda_test['ebitda']
y_pred_test = model.predict(X_test)
_print_metrics('TEST RESULTS', y_test, y_pred_test)

# The per-class report is computed on the VALIDATION split; label it so that it
# is not mistaken for test output, which is printed immediately above it.
print('----------- VALIDATION CLASSIFICATION REPORT -----------')
print(classification_report(y_val, y_pred, digits=3))

----------- TRAIN RESULTS -----------
Accuracy: 0.7755108947721574
Precision: 0.9224334600760457
Recall: 0.9388544891640866
ROC_AUC: 0.7755108947721574
----------- TEST RESULTS -----------
Accuracy: 0.8120315693845106
Precision: 0.9535452322738386
Recall: 0.9034749034749034
ROC_AUC: 0.8120315693845105
              precision    recall  f1-score   support

           0      0.671     0.612     0.640       263
           1      0.922     0.939     0.931      1292

    accuracy                          0.884      1555
   macro avg      0.797     0.776     0.785      1555
weighted avg      0.880     0.884     0.881      1555



In [6]:
# Class balance of the target column over the full data set.
df_ebitda.loc[:, 'ebitda'].value_counts()

ebitda
1    12799
0     1945
Name: count, dtype: int64

Conclusion:
- Balanced accuracy, precision, and recall are consistent between the validation and test sets, and are fairly high.
- However, class 1 performs far better than class 0 in terms of precision and recall (validation set: 0.922 / 0.939 for class 1 vs. 0.671 / 0.612 for class 0).
- This gap is largely due to the class imbalance of roughly 1:7 (class 0 : class 1).
- Given this imbalance, the result for class 0 is not strong, but it is acceptable.

Best params:
- XGBoost classifier (`xgb_class`):
    + n_estimators = 300
    + learning_rate = 0.03
    + max_depth = 15
    + subsample = 0.7
    + scale_pos_weight = 0.15134