Library

In [1]:
from utils.utils_dta_processing import *
from utils.utils_ml_train import *

Modelling

In [2]:
# Load the revenue modelling table, then separate it into the part used for
# model building and the held-out test part.
# NOTE(review): the split rule lives in utils' input_test_split and is not
# visible in this notebook — confirm how the test portion is chosen.
df_revenue = pd.read_csv(
    filepath_or_buffer='../../data/data_for_modelling/df_revenue.csv',
)
revenue_input, revenue_test = input_test_split(df_revenue)

In [3]:
# Inspect the column names of the loaded table: the `*_lag1` feature columns,
# the `company` / `year` identifiers, and the `revenue` target used below.
df_revenue.columns

Index(['revenue_lag1', 'in_stock_lag1', 'invest_nav_lag1', 'long_receive_lag1',
       'long_liability_lag1', 'other_long_asset_lag1', 'cwip_lag1',
       'other_short_asset_lag1', 'long_invest_lag1', 'other_fund_lag1',
       'gov_own_lag1', 'for_own_lag1', 'expense_lag1', 'company', 'year',
       'revenue'],
      dtype='object')

In [3]:
# Hyper-parameter selection: wrap the modelling data in the project's
# InputData helper ('company' / 'year' identify rows, 'revenue' is the target,
# regression mode) and show the ranked search results.
# NOTE(review): the meaning of n_splits / test_size is defined in utils'
# optimal_param — presumably 3 time-based folds with 2 periods each; confirm.
revenue_obj = InputData(revenue_input, 'company', 'year', 'revenue', reg=True)
param_search_results = revenue_obj.optimal_param(n_splits=3, test_size=2)
display(param_search_results)

Processing XG_reg ...
Processing Linear_reg ...


Unnamed: 0,algo_used,params,mean_test_r2,mean_test_mape,mean_test_rmse
6,Linear_reg,{'algo__fit_intercept': True},0.910469,0.013154,0.582818
4,XG_reg,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.900262,0.014152,0.615011
5,XG_reg,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.899502,0.014274,0.617466
0,XG_reg,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.898469,0.014209,0.620249
1,XG_reg,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.897024,0.014434,0.624892
2,XG_reg,"{'algo__learning_rate': 0.05, 'algo__max_depth...",0.895921,0.014617,0.628318
3,XG_reg,"{'algo__learning_rate': 0.03, 'algo__max_depth...",0.894602,0.014661,0.631641
7,Linear_reg,{'algo__fit_intercept': False},-2.075128,0.088564,3.334669


In [4]:
# Main model
# Fit the selected linear regression (fit_intercept=True) on the training
# split, then report R2 / RMSE / MAPE on the validation split and on the
# held-out test set.
# NOTE(review): `train_val_split`, `MinMaxScaler`, `lrg`, `r2`, `rmse` and
# `mape` arrive through the star imports at the top of the notebook;
# `lrg` is presumably an alias for scikit-learn's LinearRegression — confirm.

# Train / validation split with 'revenue' as the target column.
X_train, y_train, X_val, y_val = train_val_split(revenue_input, 'revenue')

# Fit the scaler on the training features only and reuse the fitted scaler
# for validation and test, so no information from those sets leaks in.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

model = lrg(fit_intercept=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# These metrics compare y_val with predictions on X_val, i.e. they are
# validation results (the previous header mislabelled them as train results).
print('----------- VALIDATION RESULTS -----------')
print(f'R2: {r2(y_val, y_pred)}')
print(f'RMSE: {rmse(y_val, y_pred)}')
print(f'MAPE: {mape(y_val, y_pred)}')

# Test: score the same fitted scaler + model on the held-out test set.
print('----------- TEST RESULTS -----------')
# Drop the identifier columns and the target to obtain the feature matrix.
X_test = revenue_test.drop(['company', 'revenue', 'year'], axis=1)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
y_test = revenue_test['revenue']
y_pred_test = model.predict(X_test)
print(f'R2: {r2(y_test, y_pred_test)}')
print(f'RMSE: {rmse(y_test, y_pred_test)}')
print(f'MAPE: {mape(y_test, y_pred_test)}')

----------- TRAIN RESULTS -----------
R2: 0.9119685094011432
RMSE: 0.6018953024549542
MAPE: 0.01303415435402573
----------- TEST RESULTS -----------
R2: 0.9277712919094564
RMSE: 0.5478324466353163
MAPE: 0.010833115763457267


Conclusion:
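- Linear Regression is the most suitable model: it achieves a high R2 together with low RMSE and MAPE, and these metrics are consistent across cross-validation, validation and test sets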
- Moreover, it can predict values outside the range seen during training, which is a limitation of tree-based ensemble methods such as XGBoost

Model of choice:
- Linear Regression with `fit_intercept=True`