In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor



In [13]:
df_train = pd.read_csv('dataset/train_feature_selected.csv')
df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])

df_test = pd.read_csv('dataset/test_feature_selected.csv')
df_test['timestamp'] = pd.to_datetime(df_test['timestamp'])

df_test_ori = pd.read_csv('dataset/test.csv')

## Modeling

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492157 entries, 0 to 492156
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   item_id                         492157 non-null  object        
 1   timestamp                       492157 non-null  datetime64[ns]
 2   rerata_kecepatan                492157 non-null  float64       
 3   mean_rerata_kecepatan_mingguan  492157 non-null  float64       
 4   mean_rerata_kecepatan_harian    492157 non-null  float64       
 5   hour                            492157 non-null  int64         
 6   2_per_3_maxspeed                492157 non-null  float64       
 7   highway                         492157 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 30.0+ MB


In [4]:
train_data = TimeSeriesDataFrame.from_data_frame(
    df_train,
    id_column="item_id",
    timestamp_column="timestamp"
)

In [5]:
test_data = TimeSeriesDataFrame.from_data_frame(
    df_test,
    id_column="item_id",
    timestamp_column="timestamp"
)

In [6]:
predictor = TimeSeriesPredictor(
    prediction_length=200,
    path="models/autogluon-model",
    target="rerata_kecepatan",
    eval_metric="sMAPE",
    #random_seed = 42,
    #known_covariates_names=["mean_rerata_kecepatan_mingguan",'mean_rerata_kecepatan_harian','hour','2_per_3_maxspeed','highway'],
    #ignore_time_index=True
)



In [7]:
predictor.fit(
    train_data,
    presets="best_quality",
    time_limit=900
)

TimeSeriesPredictor.fit() called
Setting presets to: best_quality
Fitting with arguments:
{'enable_ensemble': True,
 'evaluation_metric': 'sMAPE',
 'excluded_model_types': None,
 'hyperparameter_tune_kwargs': {'num_trials': 3,
                                'scheduler': 'local',
                                'searcher': 'auto'},
 'hyperparameters': 'best_quality',
 'num_val_windows': 1,
 'prediction_length': 200,
 'random_seed': None,
 'target': 'rerata_kecepatan',
 'time_limit': 900,
 'verbosity': 2}
Provided training data set with 492157 rows, 934 items (item = single time series). Average time series length is 526.9. Data frequency is 'H'.
AutoGluon will save models to models/autogluon-model/
AutoGluon will gauge predictive performance using evaluation metric: 'sMAPE'
	This metric's sign has been flipped to adhere to being 'higher is better'. The reported score can be multiplied by -1 to get the metric value.

Provided dataset contains following columns:
	target:           'rerat

  0%|          | 0/3 [00:00<?, ?it/s]

	Trained 1 models while tuning DeepAR.
	-0.1051       = Validation score (-sMAPE)
	313.31  s     = Total tuning time
Hyperparameter tuning model: TemporalFusionTransformer. Tuning model for up to 89.98s of the 899.81s remaining.
	-0.1065       = Validation score (-sMAPE)
	94.12   s     = Training runtime
	6.54    s     = Validation (prediction) runtime
Hyperparameter tuning model: PatchTST. Tuning model for up to 89.98s of the 899.81s remaining.
	-0.1027       = Validation score (-sMAPE)
	91.11   s     = Training runtime
	5.60    s     = Validation (prediction) runtime
Hyperparameter tuning model: DirectTabular. Tuning model for up to 89.98s of the 899.81s remaining.
	-0.1343       = Validation score (-sMAPE)
	37.87   s     = Training runtime
	6.32    s     = Validation (prediction) runtime
Hyperparameter tuning model: AutoARIMA. Tuning model for up to 89.98s of the 899.81s remaining.
	Time limit exceeded... Skipping AutoARIMA.
Fitting simple weighted ensemble.
	-0.0947       = Validat

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7f0125b08940>

In [8]:
predictions = predictor.predict(train_data)
predictions.head()

Global seed set to 123
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
691007296_21390008_1425033102,2020-02-23 00:00:00,30.437643,18.457239,26.252857,26.727321,28.206328,30.444317,30.81577,32.178851,34.089194,35.780968
691007296_21390008_1425033102,2020-02-23 01:00:00,33.59609,22.177115,29.701334,30.0342,31.518755,33.587934,33.710014,35.009722,36.811776,38.372879
691007296_21390008_1425033102,2020-02-23 02:00:00,35.275684,23.268634,30.975517,31.470889,33.135713,35.290791,35.375353,36.779628,38.692134,40.577981
691007296_21390008_1425033102,2020-02-23 03:00:00,36.122512,23.87065,31.545243,32.183112,33.871314,36.063343,36.147152,37.598173,39.584198,41.726273
691007296_21390008_1425033102,2020-02-23 04:00:00,37.054773,24.813203,32.331919,33.041976,34.803475,37.012028,37.033927,38.450829,40.466863,42.775359


### Predict Test Data

In [9]:
predictions = pd.DataFrame(predictions)

In [10]:
predictions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
691007296_21390008_1425033102,2020-02-23 00:00:00,30.437643,18.457239,26.252857,26.727321,28.206328,30.444317,30.81577,32.178851,34.089194,35.780968
691007296_21390008_1425033102,2020-02-23 01:00:00,33.59609,22.177115,29.701334,30.0342,31.518755,33.587934,33.710014,35.009722,36.811776,38.372879
691007296_21390008_1425033102,2020-02-23 02:00:00,35.275684,23.268634,30.975517,31.470889,33.135713,35.290791,35.375353,36.779628,38.692134,40.577981
691007296_21390008_1425033102,2020-02-23 03:00:00,36.122512,23.87065,31.545243,32.183112,33.871314,36.063343,36.147152,37.598173,39.584198,41.726273
691007296_21390008_1425033102,2020-02-23 04:00:00,37.054773,24.813203,32.331919,33.041976,34.803475,37.012028,37.033927,38.450829,40.466863,42.775359


### Create Submission file

In [12]:
df_test.head()

Unnamed: 0,item_id,timestamp,mean_rerata_kecepatan_mingguan,mean_rerata_kecepatan_harian,hour,2_per_3_maxspeed,highway
0,4004732_32046542_6454026544,2020-02-23,43.125,43.848238,0,19.98,1
1,182210371_1314925464_1314925496,2020-02-23,38.126333,42.54681,0,19.98,1
2,22932408_1482086782_26481020,2020-02-23,37.112,38.242619,0,13.32,0
3,182210371_3892883_267337489,2020-02-23,44.518333,46.145143,0,19.98,1
4,66924592_266041030_2592978110,2020-02-23,27.969333,31.625048,0,19.98,1


In [15]:
submission = pd.DataFrame()
submission['id'] = df_test_ori['id']
submission['rerata_kecepatan'] = 0

for i in submission['id']:
    time = df_test['timestamp'][i]
    series_id = df_test['item_id'][i]
    result = predictions.loc[(series_id, time)]['mean']
    submission.loc[i, 'rerata_kecepatan'] = result
    

In [16]:
submission.head()

Unnamed: 0,id,rerata_kecepatan
0,0,44.74647
1,1,44.069065
2,2,38.742769
3,3,46.424478
4,4,34.749656


In [21]:
submission.to_csv(r'submission/submission_new_2.csv', index=False)