Library

In [1]:
from sklearn.metrics import classification_report
from utils.utils_dta_processing import *
from utils.utils_ml_train import *

Data

In [2]:
# Load the modelling dataset, then carve it into the part used for
# model selection/training ("input") and a held-out test part.
# NOTE(review): input_test_split is a project helper; how it splits
# (by year, by company, randomly) is not visible here - confirm.
value_csv_path = '../../data/data_for_modelling/df_value_ad.csv'
df_value = pd.read_csv(value_csv_path)
value_input, value_test = input_test_split(df_value)

Modelling

In [3]:
# Hyper-parameter search: rank candidate classifiers / parameter sets.
# NOTE(review): InputData and optimal_param are project helpers. The
# positional arguments look like (frame, entity column, time column,
# target column) and reg=False presumably selects classification -
# confirm against utils_ml_train.
value_obj = InputData(
    value_input,
    'company',
    'year',
    'value_add',
    reg=False,
)

# Keep the ranking in a variable so it can be inspected again without
# re-running the search, then render it as a rich table.
param_ranking = value_obj.optimal_param(n_splits=3, test_size=2)
display(param_ranking)

Processing XG_class ...
Processing KNN_class ...


Unnamed: 0,algo_used,params,mean_test_accuracy,mean_test_roc_auc,mean_test_precision,mean_test_recall
26,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.778344,0.778344,0.914021,0.908605
2,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.778,0.778,0.912273,0.920541
27,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.777777,0.777777,0.912127,0.920445
32,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.777365,0.777365,0.911438,0.924496
22,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.777364,0.777364,0.911504,0.923902
1,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.776897,0.776897,0.913648,0.906102
37,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.776721,0.776721,0.911467,0.922125
6,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.776577,0.776577,0.913899,0.903446
7,XG_class,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.77649,0.77649,0.912252,0.916037
36,XG_class,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.776408,0.776408,0.91256,0.913176


In [4]:
# Fit the chosen XGBoost classifier on the training split, then report
# metrics on the validation split and on the held-out test set.
X_train, y_train, X_val, y_val = train_val_split(value_input, 'value_add')

# NOTE(review): scale_pos_weight=0.223 is close to count(class 0) / count(class 1)
# on the full dataset (2639 / 11696 ~= 0.226), i.e. it presumably down-weights the
# majority positive class - confirm how the value was derived.
# NOTE(review): the top rows of the parameter search output list learning rates of
# 0.03 / 0.05, while 0.02 is used here - confirm this is intentional.
# NOTE(review): subsample=0.5 makes training stochastic; no seed is passed here, so
# results may vary between runs unless xgb_class fixes one internally - confirm.
model = xgb_class(
    n_estimators=200,
    learning_rate=0.02,
    max_depth=10,
    subsample=0.5,
    scale_pos_weight=0.223,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# Validation metrics. The banner used to read "TRAIN RESULTS", but every metric
# below is computed on (y_val, y_pred), i.e. on the validation split.
# NOTE(review): roc_auc receives hard class predictions rather than scores; in the
# recorded output it is identical to the value printed as "Accuracy", which in turn
# matches the macro-average recall of the classification report (so the `accuracy`
# helper appears to be balanced accuracy) - confirm in utils_ml_train.
print('----------- VALIDATION RESULTS -----------')
print(f'Accuracy: {accuracy(y_val, y_pred)}')
print(f'Precision: {precision(y_val, y_pred)}')
print(f'Recall: {recall(y_val, y_pred)}')
print(f'ROC_AUC: {roc_auc(y_val, y_pred)}')
# Per-class breakdown for the validation split; printed here (not after the test
# block) so it cannot be mistaken for a test-set report.
print(classification_report(y_val, y_pred, digits=3))

# Test metrics on the held-out set: drop the identifier, target and year columns
# so that only the remaining columns are passed to the model as features.
print('----------- TEST RESULTS -----------')
X_test = value_test.drop(['company', 'value_add', 'year'], axis=1)
y_test = value_test['value_add']
y_pred_test = model.predict(X_test)
print(f'Accuracy: {accuracy(y_test, y_pred_test)}')
print(f'Precision: {precision(y_test, y_pred_test)}')
print(f'Recall: {recall(y_test, y_pred_test)}')
print(f'ROC_AUC: {roc_auc(y_test, y_pred_test)}')

----------- TRAIN RESULTS -----------
Accuracy: 0.7711450473110553
Precision: 0.8819218241042345
Recall: 0.9417391304347826
ROC_AUC: 0.7711450473110553
----------- TEST RESULTS -----------
Accuracy: 0.8373684860895354
Precision: 0.9410187667560321
Recall: 0.901541095890411
ROC_AUC: 0.8373684860895354
              precision    recall  f1-score   support

           0      0.765     0.601     0.673       363
           1      0.882     0.942     0.911      1150

    accuracy                          0.860      1513
   macro avg      0.823     0.771     0.792      1513
weighted avg      0.854     0.860     0.854      1513



In [5]:
# Class balance of the target column in the frame loaded from the CSV.
df_value.loc[:, 'value_add'].value_counts()

value_add
1    11696
0     2639
Name: count, dtype: int64

Conclusion:
- precision and recall of both val and test set are consistent
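- precision and recall across classes are reasonably close despite the class imbalance of around 1:4.4 (2,639 vs 11,696 samples) for class 0 versus class 1; class 0 recall on the validation set (0.601) is the weakest figure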
- Note that the balanced accuracy (reported as "Accuracy") is quite high for both the validation and test sets, signaling the effectiveness of this model

Best params:
- XGBoost:
    + n_estimators= 200
    + learning_rate=0.02
    + max_depth=10
    + subsample=0.5
    + scale_pos_weight=0.223