In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sktime.forecasting.model_selection import temporal_train_test_split

from sklearn.pipeline import Pipeline
from sktime.transformations.compose import ColumnwiseTransformer
from sklearn.preprocessing import StandardScaler
from sktime.transformations.series.impute import Imputer
from sktime.transformations.series.difference import Differencer
from sktime.transformations.series.boxcox import LogTransformer
from sktime.transformations.series.adapt import TabularToSeriesAdaptor

from sktime.transformations.compose import ColumnEnsembleTransformer
from sktime.transformations.compose import FeatureUnion
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.transformations.series.fourier import FourierFeatures

from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
pd.options.display.max_columns = None

In [3]:
df = pd.read_csv('daily-website-visitors.csv')[["Date", "Page.Loads", "Unique.Visits", "First.Time.Visits", "Returning.Visits"]]
df = df.rename(columns={"Date": 'ds', 
                        "Page.Loads": 'y1',
                        "Unique.Visits": 'y2',
                        "First.Time.Visits": 'y3',
                        "Returning.Visits": 'y4'
                       })

In [4]:
targets = ['y1', 'y2', 'y3', 'y4']

In [5]:
df['ds'] = pd.to_datetime(df.ds, yearfirst = True)

for target in targets : 
    df[target] = df[target].str.replace(',','.').astype("float")

df = df.set_index('ds')

In [6]:
y_train, y_test = temporal_train_test_split(df, test_size=365)
index_train, index_test = y_train.index, y_test.index

In [7]:
def inverse_transform(pipeline, X_transformed):
    # Iterate over the steps in reverse order
    for name, transformer in reversed(pipeline.steps):
        X_transformed = transformer.inverse_transform(X_transformed)
    return X_transformed

In [8]:
transform = Pipeline(
    steps=[
        ("imputer", ColumnwiseTransformer(Imputer(method="drift"))),
        ("log", ColumnwiseTransformer(LogTransformer(offset=1))),
#        ("differencer", ColumnwiseTransformer(Differencer(lags=1))),
        ("scale", ColumnwiseTransformer(TabularToSeriesAdaptor(StandardScaler()))),
    ]
)

window_summarizer = WindowSummarizer(
    lag_feature={
        "lag": list(range(1, 15)),
        "mean": [[2, 7], [1, 14]],
        "std": [[2, 7], [1, 14]],
    }
)

multi_window_summarizer = ColumnEnsembleTransformer(
    transformers = [(f'window_summarizer_{target}', window_summarizer, target) for target in targets]
)

fourier_featurizer = FourierFeatures(sp_list=[365.25, 7], fourier_terms_list=[10, 2], freq='D')

featurizer = FeatureUnion([multi_window_summarizer, fourier_featurizer])

In [9]:
y_train_transform = transform.fit_transform(y_train)
y_full_transform = y_train_transform.reindex(y_train_transform.index.union(y_test.index))
X_full = featurizer.fit_transform(y_full_transform)
y_full_transform = pd.melt(y_full_transform, var_name = 'Target_type', value_vars=targets, value_name='y', ignore_index=False).set_index('Target_type', append=True)

In [10]:
lag_features = [f"ColumnEnsembleTransformer__{j}_lag_{i}" for i in window_summarizer.lag_feature["lag"] for j in targets]
mean_features = [f"ColumnEnsembleTransformer__{j}_mean_{start}_{end}" for start, end in window_summarizer.lag_feature["mean"] for j in targets]
std_features = [f"ColumnEnsembleTransformer__{j}_std_{start}_{end}" for start, end in window_summarizer.lag_feature["std"] for j in targets]
fourier_features = [f"FourierFeatures__{fn}_{per}_{i}" for per, freq in zip(fourier_featurizer.sp_list, fourier_featurizer.fourier_terms_list) for i in range(1, freq + 1) for fn in ("cos", "sin")]

X_full = X_full.reset_index().merge(pd.DataFrame(targets, columns=['Target_Type']), how='cross').set_index('ds')
X_full = X_full[['Target_Type'] + lag_features + mean_features + std_features + fourier_features]
X_train, X_test = X_full.loc[index_train], X_full.loc[index_test]
y_train_transform_stack = y_full_transform.loc[index_train]

In [13]:
X_train.shape, y_train_transform_stack.shape

((7208, 97), (7208, 1))

In [17]:
model = CatBoostRegressor(loss_function="RMSE", has_time=True)
model.fit(X_train, y_train_transform_stack, cat_features=['Target_Type'])

Learning rate set to 0.055939
0:	learn: 0.9802405	total: 183ms	remaining: 3m 2s
1:	learn: 0.9606242	total: 218ms	remaining: 1m 48s
2:	learn: 0.9416700	total: 254ms	remaining: 1m 24s
3:	learn: 0.9239075	total: 290ms	remaining: 1m 12s
4:	learn: 0.9087663	total: 326ms	remaining: 1m 4s
5:	learn: 0.8963997	total: 362ms	remaining: 60s
6:	learn: 0.8811053	total: 398ms	remaining: 56.4s
7:	learn: 0.8666470	total: 435ms	remaining: 54s
8:	learn: 0.8539763	total: 473ms	remaining: 52.1s
9:	learn: 0.8416062	total: 510ms	remaining: 50.5s
10:	learn: 0.8329166	total: 547ms	remaining: 49.2s
11:	learn: 0.8215908	total: 583ms	remaining: 48s
12:	learn: 0.8118846	total: 617ms	remaining: 46.9s
13:	learn: 0.8031053	total: 652ms	remaining: 45.9s
14:	learn: 0.7943581	total: 688ms	remaining: 45.1s
15:	learn: 0.7862565	total: 724ms	remaining: 44.5s
16:	learn: 0.7787638	total: 757ms	remaining: 43.8s
17:	learn: 0.7727437	total: 790ms	remaining: 43.1s
18:	learn: 0.7666196	total: 823ms	remaining: 42.5s
19:	learn: 0.7

160:	learn: 0.4997686	total: 5.77s	remaining: 30.1s
161:	learn: 0.4987310	total: 5.82s	remaining: 30.1s
162:	learn: 0.4978770	total: 5.86s	remaining: 30.1s
163:	learn: 0.4963243	total: 5.89s	remaining: 30s
164:	learn: 0.4954476	total: 5.93s	remaining: 30s
165:	learn: 0.4948141	total: 5.98s	remaining: 30s
166:	learn: 0.4939827	total: 6.02s	remaining: 30s
167:	learn: 0.4926626	total: 6.07s	remaining: 30s
168:	learn: 0.4921509	total: 6.1s	remaining: 30s
169:	learn: 0.4913901	total: 6.14s	remaining: 30s
170:	learn: 0.4907625	total: 6.18s	remaining: 30s
171:	learn: 0.4901014	total: 6.22s	remaining: 29.9s
172:	learn: 0.4887825	total: 6.26s	remaining: 29.9s
173:	learn: 0.4883443	total: 6.3s	remaining: 29.9s
174:	learn: 0.4878920	total: 6.34s	remaining: 29.9s
175:	learn: 0.4874328	total: 6.38s	remaining: 29.9s
176:	learn: 0.4855165	total: 6.42s	remaining: 29.9s
177:	learn: 0.4848343	total: 6.46s	remaining: 29.8s
178:	learn: 0.4843072	total: 6.5s	remaining: 29.8s
179:	learn: 0.4837643	total: 6.

320:	learn: 0.3969694	total: 11.6s	remaining: 24.5s
321:	learn: 0.3962564	total: 11.6s	remaining: 24.5s
322:	learn: 0.3958154	total: 11.7s	remaining: 24.5s
323:	learn: 0.3955230	total: 11.7s	remaining: 24.4s
324:	learn: 0.3951300	total: 11.7s	remaining: 24.4s
325:	learn: 0.3940564	total: 11.8s	remaining: 24.4s
326:	learn: 0.3936988	total: 11.8s	remaining: 24.3s
327:	learn: 0.3934770	total: 11.9s	remaining: 24.3s
328:	learn: 0.3928299	total: 11.9s	remaining: 24.3s
329:	learn: 0.3925899	total: 11.9s	remaining: 24.2s
330:	learn: 0.3922330	total: 12s	remaining: 24.2s
331:	learn: 0.3918946	total: 12s	remaining: 24.2s
332:	learn: 0.3917247	total: 12s	remaining: 24.1s
333:	learn: 0.3912139	total: 12.1s	remaining: 24.1s
334:	learn: 0.3905047	total: 12.1s	remaining: 24.1s
335:	learn: 0.3902386	total: 12.2s	remaining: 24s
336:	learn: 0.3899208	total: 12.2s	remaining: 24s
337:	learn: 0.3895553	total: 12.2s	remaining: 23.9s
338:	learn: 0.3891081	total: 12.3s	remaining: 23.9s
339:	learn: 0.3888948	

479:	learn: 0.3340530	total: 17.4s	remaining: 18.8s
480:	learn: 0.3335600	total: 17.4s	remaining: 18.8s
481:	learn: 0.3333871	total: 17.4s	remaining: 18.7s
482:	learn: 0.3331772	total: 17.5s	remaining: 18.7s
483:	learn: 0.3330689	total: 17.5s	remaining: 18.7s
484:	learn: 0.3325967	total: 17.5s	remaining: 18.6s
485:	learn: 0.3323839	total: 17.6s	remaining: 18.6s
486:	learn: 0.3317910	total: 17.6s	remaining: 18.5s
487:	learn: 0.3314823	total: 17.6s	remaining: 18.5s
488:	learn: 0.3313597	total: 17.7s	remaining: 18.5s
489:	learn: 0.3310133	total: 17.7s	remaining: 18.4s
490:	learn: 0.3303369	total: 17.8s	remaining: 18.4s
491:	learn: 0.3302444	total: 17.8s	remaining: 18.4s
492:	learn: 0.3291474	total: 17.8s	remaining: 18.3s
493:	learn: 0.3289917	total: 17.9s	remaining: 18.3s
494:	learn: 0.3288580	total: 17.9s	remaining: 18.3s
495:	learn: 0.3284350	total: 17.9s	remaining: 18.2s
496:	learn: 0.3279118	total: 18s	remaining: 18.2s
497:	learn: 0.3277191	total: 18s	remaining: 18.2s
498:	learn: 0.32

643:	learn: 0.2878702	total: 23.1s	remaining: 12.8s
644:	learn: 0.2877468	total: 23.1s	remaining: 12.7s
645:	learn: 0.2876972	total: 23.2s	remaining: 12.7s
646:	learn: 0.2872103	total: 23.2s	remaining: 12.7s
647:	learn: 0.2870692	total: 23.2s	remaining: 12.6s
648:	learn: 0.2868661	total: 23.3s	remaining: 12.6s
649:	learn: 0.2866905	total: 23.3s	remaining: 12.5s
650:	learn: 0.2866360	total: 23.3s	remaining: 12.5s
651:	learn: 0.2865264	total: 23.4s	remaining: 12.5s
652:	learn: 0.2862180	total: 23.4s	remaining: 12.4s
653:	learn: 0.2860422	total: 23.4s	remaining: 12.4s
654:	learn: 0.2854914	total: 23.5s	remaining: 12.4s
655:	learn: 0.2852184	total: 23.5s	remaining: 12.3s
656:	learn: 0.2851320	total: 23.5s	remaining: 12.3s
657:	learn: 0.2850147	total: 23.6s	remaining: 12.2s
658:	learn: 0.2847625	total: 23.6s	remaining: 12.2s
659:	learn: 0.2846826	total: 23.6s	remaining: 12.2s
660:	learn: 0.2845954	total: 23.7s	remaining: 12.1s
661:	learn: 0.2844443	total: 23.7s	remaining: 12.1s
662:	learn: 

805:	learn: 0.2544050	total: 28.7s	remaining: 6.9s
806:	learn: 0.2542974	total: 28.7s	remaining: 6.87s
807:	learn: 0.2542207	total: 28.8s	remaining: 6.83s
808:	learn: 0.2541271	total: 28.8s	remaining: 6.8s
809:	learn: 0.2539737	total: 28.8s	remaining: 6.76s
810:	learn: 0.2538247	total: 28.9s	remaining: 6.72s
811:	learn: 0.2537840	total: 28.9s	remaining: 6.69s
812:	learn: 0.2536932	total: 28.9s	remaining: 6.65s
813:	learn: 0.2535213	total: 29s	remaining: 6.62s
814:	learn: 0.2534083	total: 29s	remaining: 6.58s
815:	learn: 0.2533406	total: 29s	remaining: 6.55s
816:	learn: 0.2532374	total: 29.1s	remaining: 6.51s
817:	learn: 0.2531557	total: 29.1s	remaining: 6.48s
818:	learn: 0.2530215	total: 29.2s	remaining: 6.44s
819:	learn: 0.2529660	total: 29.2s	remaining: 6.41s
820:	learn: 0.2528749	total: 29.2s	remaining: 6.37s
821:	learn: 0.2528047	total: 29.3s	remaining: 6.34s
822:	learn: 0.2526172	total: 29.3s	remaining: 6.3s
823:	learn: 0.2524663	total: 29.3s	remaining: 6.26s
824:	learn: 0.2524091

966:	learn: 0.2308526	total: 34.1s	remaining: 1.16s
967:	learn: 0.2307419	total: 34.2s	remaining: 1.13s
968:	learn: 0.2306051	total: 34.2s	remaining: 1.09s
969:	learn: 0.2305296	total: 34.2s	remaining: 1.06s
970:	learn: 0.2304176	total: 34.3s	remaining: 1.02s
971:	learn: 0.2300476	total: 34.3s	remaining: 988ms
972:	learn: 0.2299890	total: 34.3s	remaining: 953ms
973:	learn: 0.2299422	total: 34.4s	remaining: 917ms
974:	learn: 0.2298385	total: 34.4s	remaining: 882ms
975:	learn: 0.2297142	total: 34.4s	remaining: 847ms
976:	learn: 0.2296077	total: 34.5s	remaining: 811ms
977:	learn: 0.2295014	total: 34.5s	remaining: 776ms
978:	learn: 0.2294418	total: 34.5s	remaining: 741ms
979:	learn: 0.2293887	total: 34.6s	remaining: 705ms
980:	learn: 0.2292254	total: 34.6s	remaining: 670ms
981:	learn: 0.2290981	total: 34.6s	remaining: 635ms
982:	learn: 0.2290453	total: 34.7s	remaining: 600ms
983:	learn: 0.2289938	total: 34.7s	remaining: 564ms
984:	learn: 0.2289589	total: 34.7s	remaining: 529ms
985:	learn: 

<catboost.core.CatBoostRegressor at 0x182f413a7d0>

In [None]:
for idx in index_test:
    y_tmp = y_full_transform.loc[idx-pd.DateOffset(14+1):idx]
    a = featurizer.transform(y_tmp).loc[idx, lag_features+mean_features+std_features]
    X_test.loc[idx, lag_features+mean_features+std_features] = a
    y_full_transform.loc[idx] = model.predict(X_test.loc[idx])
    
y_pred = inverse_transform(transform, y_full_transform.loc[index_test])

In [None]:
fig, (ax0, ax1) = plt.subplots(nrows=1,ncols=2)

ax0.plot(y_train.index, y_train.y, label='train')
ax0.plot(y_test.index, y_test.y, label='true')
ax0.plot(y_pred.index, y_pred.y, label='pred')
ax0.legend()

ax1.plot(y_pred.index, y_pred.y, label='pred')
ax1.plot(y_test.index, y_test.y, label='true')
ax1.legend()

plt.subplots_adjust(left=-1, bottom=-1, right=2, top=0.1, wspace=0.25, hspace=0.5)

In [None]:
print("MSE:", mean_squared_error(y_test, y_pred))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))

In [None]:
#shape