In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor

from sktime.performance_metrics.forecasting import (
    mean_absolute_percentage_error,
    mean_absolute_error,
)

In [2]:
# Load the train/test tables and discard the 'Unnamed: 0' column that the CSVs carry.
train_data = pd.read_csv('train_data_with_latent_feat.csv').drop(columns=['Unnamed: 0'])
test_data = pd.read_csv('test_data_with_latent_feat.csv').drop(columns=['Unnamed: 0'])

# Snapshot of the test columns as loaded, taken before any later cell adds columns.
test_data_cols = test_data.columns.tolist()


In [3]:
# Smallest RV column index used as a model input: features start at RV28,
# while the models in this notebook are fitted against RV0.
# NOTE(review): presumably RV{i} is the value observed i days before arrival,
# which would make this the forecast horizon in days -- confirm with the data prep.
PREDICTION_HORIZON = 28
# Not referenced by the visible cells; kept so existing users of the name still work.
TARGET_TYPE = 'RV'

# Exclusive upper bound of the RV column index (the last feature is RV99).
RV_INDEX_STOP = 100
# Number of latent feature columns (latent_0 .. latent_10).
N_LATENT_FEATS = 11

ONLY_LATENT = [f'latent_{i}' for i in range(N_LATENT_FEATS)]
FEAT_COLS_WITHOUT_LATENT = [f'RV{i}' for i in range(PREDICTION_HORIZON, RV_INDEX_STOP)]
# Composed from the two lists above so the combined feature set cannot drift out
# of sync with its parts; `+` builds a new list, so the three stay independent.
FEAT_COLS_WITH_LATENT = FEAT_COLS_WITHOUT_LATENT + ONLY_LATENT

In [4]:
# Show the latest arrival date of the training set, then of the test set.
for frame in (train_data, test_data):
    print(frame['arrival_date'].max())

2016-08-31
2017-08-31


In [5]:
# All three variants share the same estimator settings (quantile objective with
# alpha=0.5) and the same target column; they differ only in the feature set.
_median_params = {'objective': 'quantile', 'alpha': 0.5}
_target = train_data['RV0']

# RV features plus latent features.
model_with_latent = LGBMRegressor(**_median_params).fit(
    X=train_data[FEAT_COLS_WITH_LATENT], y=_target
)

# RV features only.
model_without_latent = LGBMRegressor(**_median_params).fit(
    X=train_data[FEAT_COLS_WITHOUT_LATENT], y=_target
)

# Latent features only.
model_only_latent = LGBMRegressor(**_median_params).fit(
    X=train_data[ONLY_LATENT], y=_target
)

# Keep the last fitted estimator as the cell's displayed value.
model_only_latent

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11033
[LightGBM] [Info] Number of data points in the train set: 428, number of used features: 83
[LightGBM] [Info] Start training from score 2785.665039
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9667
[LightGBM] [Info] Number of data points in the train set: 428, number of used features: 72
[LightGBM] [Info] Start training from score 2785.665039
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1366
[LightGBM] [Info] Number of data points in the train set: 428, number of used features: 11
[LightGBM] [Info] Start tr

In [6]:
# Each prediction column is paired with the feature set its model was fitted on
# and with the fitted model itself.
variants = {
    'pred_with_latent': (FEAT_COLS_WITH_LATENT, model_with_latent),
    'pred_without_latent': (FEAT_COLS_WITHOUT_LATENT, model_without_latent),
    'pred_only_latent': (ONLY_LATENT, model_only_latent),
}
pred_col_names = list(variants)
variant_feat_cols = [cols for cols, _ in variants.values()]
models = [fitted for _, fitted in variants.values()]

# Score the test set once per variant, storing the result under its column name.
for pred_col_name, (feat_cols, model) in variants.items():
    test_data[pred_col_name] = model.predict(test_data[feat_cols])

# Put the predictions first, followed by the columns the test set was loaded with.
test_data = test_data[pred_col_names + test_data_cols]
test_data

Unnamed: 0,pred_with_latent,pred_without_latent,pred_only_latent,arrival_date,RV0,RV1,RV2,RV3,RV4,RV5,...,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10
0,5886.427530,5692.908221,6656.258754,2016-09-01,6255.86,6102.86,6102.86,6102.86,5871.86,5871.86,...,6384.6200,6238.1390,6076.1910,7056.6724,7229.5610,3266.0227,0.000000,5318.2990,9454.3310,1071.08690
1,3685.434626,3688.139857,3633.714400,2016-09-02,4366.89,4189.89,4028.89,3839.29,3578.29,3578.29,...,3825.1190,3292.8730,3914.0752,3685.4539,4078.9597,1767.5865,0.000000,3364.7874,5111.4540,0.00000
2,5113.417441,5007.293263,5402.331163,2016-09-03,5232.95,4696.95,4195.45,4195.45,4195.45,4077.45,...,4248.6694,5774.3210,5086.8690,5573.8950,5923.1520,2343.4746,0.000000,5385.0493,6699.8257,251.05945
3,3058.460007,3210.111696,3073.099911,2016-09-04,3275.52,3098.52,3098.52,3098.52,3098.52,3098.52,...,3670.9072,2895.4832,3486.7617,3452.4897,3720.1640,1701.8142,33.998240,2797.8684,4913.4810,171.76420
4,4846.179917,5468.311759,4891.565343,2016-09-05,5278.86,4898.86,4717.86,4717.86,4549.36,4549.36,...,6222.8716,4932.7974,5779.9890,5941.7954,6302.7446,2917.0334,33.407284,4598.5664,8474.8955,526.49590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,9096.761715,9011.093958,9319.249257,2017-08-27,9816.47,8576.47,8576.47,8576.47,8346.47,8136.47,...,8314.0580,10692.8120,10751.9810,9809.1630,11353.0560,4103.0073,0.000000,11533.2890,11883.5500,0.00000
361,7878.533929,8429.770181,8327.936968,2017-08-28,8957.56,8570.76,8358.26,8358.26,8122.46,8122.46,...,5504.2427,9557.5060,7052.6030,8660.1870,9033.2900,3381.2400,0.000000,8471.3210,9843.6360,871.90080
362,3583.264026,3504.213248,3671.672278,2017-08-29,3512.50,3141.40,2937.40,2937.40,2937.40,2740.90,...,2155.5347,3311.6810,2480.9023,3169.7083,3252.3838,1274.7631,0.000000,2828.7031,3708.4830,483.87018
363,2515.488294,2557.318972,2613.855600,2017-08-30,4837.43,4137.88,4137.88,4137.88,3920.88,3920.88,...,1476.9889,2861.0483,2033.5083,2525.5957,2619.8901,958.3636,0.000000,2532.8994,2757.7024,243.35753


In [7]:
# Symmetric MAPE of every prediction column against the RV0 column of the test set.
y_true = test_data['RV0']
for pred_col in pred_col_names:
    smape = mean_absolute_percentage_error(
        y_true=y_true,
        y_pred=test_data[pred_col],
        symmetric=True,
    )
    print(f'SMAPE {pred_col}:', smape)



SMAPE pred_with_latent: 0.21134488730743214
SMAPE pred_without_latent: 0.21460461533441866
SMAPE pred_only_latent: 0.22260842905491615
