In [1]:
import collections
import statistics
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pmdarima as pm
import seaborn as sns
import statsmodels
from fbprophet import Prophet
from statsmodels.tsa.arima.model import ARIMAResults
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

import hts
from hts import HTSRegressor
from hts.hierarchy import HierarchyTree

warnings.filterwarnings('ignore')

In [2]:
# Load the raw workbook: skip the first two rows, read columns A-D only,
# treat no row as a header and use the first column as the index.
data = pd.read_excel(
    './data - lisbon e resto de pt.xlsx',
    sheet_name='Sheet1',
    skiprows=2,
    usecols="A:D",
    header=None,
    index_col=0,
)

In [3]:
# The row labelled "Years" holds the real column names: use it as the header,
# then drop the frame's first index label so only observations remain.
first_row = data.loc["Years", :]
data.columns = first_row
data.drop(index=data.index[[0]], inplace=True)

In [4]:
# Normalise every column label to lower case.
data.columns = list(map(str.lower, data.columns))

In [5]:
# Lookup table: full NUTS II zone name -> short code.
zone_dict = dict([
    ("Área Metropolitana de Lisboa", "LIS"),
    ("Resto de PT", "R_PT"),
])

# Swap each full zone name for its code (names missing from zone_dict map to NaN).
zone_codes = data["zone"].map(zone_dict)
data["zone"] = zone_codes

In [6]:
# Move the date index into a regular column and call that column "date".
# rename() is used instead of writing into `data.columns.values`: a pandas
# Index is immutable, and mutating its backing array in place is unsupported
# and can leave label lookups out of sync with the displayed names.
data = data.reset_index()
data = data.rename(columns={data.columns[0]: "date"})
data

Unnamed: 0,date,country,zone,nº of nights
0,2010-12-31,Portugal,LIS,0.0
1,2010-12-31,Portugal,R_PT,21911.83
2,2011-12-31,Portugal,LIS,0.0
3,2011-12-31,Portugal,R_PT,23449.847
4,2012-12-31,Portugal,LIS,0.0
5,2012-12-31,Portugal,R_PT,23567.587
6,2013-12-31,Portugal,LIS,10040.808
7,2013-12-31,Portugal,R_PT,17312.124
8,2014-12-31,Portugal,LIS,12279.422
9,2014-12-31,Portugal,R_PT,36393.35


## Ground Up Example
Make custom forecasts and reconcile them.

In [7]:
# Columns that identify the levels of the hierarchy.
level_names = ['country', 'zone']
# Levels to include in the hierarchy structure.
hier = [['country'], ['zone']]

# Build the wide frame of individual series to forecast, together with the
# summing matrix and its labels.
hierarchy_args = dict(
    level_names=level_names,
    hierarchy=hier,
    date_colname='date',
    val_colname='nº of nights',
)
wide_df, sum_mat, sum_mat_labels = hts.functions.get_hierarchichal_df(data, **hierarchy_args)

In [8]:
# Put the wide frame on a regular year-end ('A') index.
# The index is derived from the dates actually present rather than from a
# hard-coded end date and period count, so it keeps matching the data when
# years are added or removed. A year missing inside the range shows up as an
# explicit NaN row.
observed_dates = pd.to_datetime(wide_df.index)
annual_index = pd.date_range(start=observed_dates.min(),
                             end=observed_dates.max(),
                             freq='A')

wide_df = wide_df.copy()
wide_df.index = observed_dates
wide_df = wide_df.reindex(annual_index)
wide_df

country_zone,Portugal _LIS,Portugal _R_PT,total,Portugal,LIS,R_PT
2010-12-31,0.0,21911.83,21911.83,21911.83,0.0,21911.83
2011-12-31,0.0,23449.847,23449.847,23449.847,0.0,23449.847
2012-12-31,0.0,23567.587,23567.587,23567.587,0.0,23567.587
2013-12-31,10040.808,17312.124,27352.932,27352.932,10040.808,17312.124
2014-12-31,12279.422,36393.35,48672.772,48672.772,12279.422,36393.35
2015-12-31,13468.659,39560.489,53029.148,53029.148,13468.659,39560.489
2016-12-31,14800.346,44273.949,59074.295,59074.295,14800.346,44273.949
2017-12-31,16695.206,48636.101,65331.307,65331.307,16695.206,48636.101
2018-12-31,17516.975,50104.521,67621.496,67621.496,17516.975,50104.521
2019-12-31,18639.062,51473.511,70112.573,70112.573,18639.062,51473.511


In [9]:
# Inspect the first index label of the reindexed frame.
wide_df.index[0]

Timestamp('2010-12-31 00:00:00', freq='A-DEC')

In [10]:
# Naive base forecast: the arithmetic mean of each series, stored in a
# single-row frame labelled 'forecast'.
column_means = {col: statistics.mean(wide_df[col]) for col in wide_df.columns}
forecasts = pd.DataFrame(column_means, index=['forecast'], columns=wide_df.columns)
forecasts

country_zone,Portugal _LIS,Portugal _R_PT,total,Portugal,LIS,R_PT
forecast,9881.348364,34293.386727,44174.735091,44174.735091,9881.348364,34293.386727


In [11]:
# Collect the forecasts in an ordered dictionary for the reconciliation step.
# Entries follow the label order of the summing matrix; each value is a
# one-column ('yhat') frame.
pred_dict = collections.OrderedDict(
    (label, pd.DataFrame(data=forecasts[label].values, columns=['yhat']))
    for label in sum_mat_labels
)

In [12]:
# Reconcile the base forecasts using OLS optimal reconciliation.
revised = hts.functions.optimal_combination(
    pred_dict,
    sum_mat,
    method='OLS',
    mse={},
)

# Present the reconciled values in the same wide layout as the base forecasts.
revised_forecasts = pd.DataFrame(
    data=revised[:, :],
    index=forecasts.index,
    columns=sum_mat_labels,
)

In [13]:
# Display the reconciled forecasts.
revised_forecasts

Unnamed: 0,total,R_PT,LIS,Portugal,Portugal _LIS,Portugal _R_PT
forecast,44174.735091,34293.386727,9881.348364,44174.735091,9881.348364,34293.386727


## Reconcile Pre-Computed Forecasts Example

In [14]:
# The bottom-level key "country_zone" is not created by any visible cell of
# this notebook (presumably hts.functions.get_hierarchichal_df adds it to
# `data` in place -- confirm). Build it here when absent so this section does
# not depend on that side effect.
if "country_zone" not in data.columns:
    data = data.assign(country_zone=data["country"].str.cat(data["zone"], sep="_"))

# Bottom level: one column per country_zone series.
data_bottom_level = data.pivot(index="date", columns="country_zone", values="nº of nights")

# Middle level: one column per country. Only the value column is aggregated,
# so the string columns are never summed.
data_middle_level = (
    data.groupby(["date", "country"])["nº of nights"]
    .sum()
    .unstack("country")
)

# Top level: a single "total" column.
data_total = data.groupby("date")["nº of nights"].sum().to_frame(name="total")


In [15]:
# Assemble one wide frame: bottom-level columns first, then the middle level,
# then the total.
hierarchy_data = (
    data_bottom_level
    .join(data_middle_level)
    .join(data_total)
)
# Convert the index labels to datetimes.
hierarchy_data.index = pd.to_datetime(hierarchy_data.index)

print(f"Number of time series at the bottom level: {data_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {data_middle_level.shape[1]}")

hierarchy_data

Number of time series at the bottom level: 2
Number of time series at the middle level: 1


Unnamed: 0_level_0,Portugal _LIS,Portugal _R_PT,Portugal,total
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-31,0.0,21911.83,21911.83,21911.83
2011-12-31,0.0,23449.847,23449.847,23449.847
2012-12-31,0.0,23567.587,23567.587,23567.587
2013-12-31,10040.808,17312.124,27352.932,27352.932
2014-12-31,12279.422,36393.35,48672.772,48672.772
2015-12-31,13468.659,39560.489,53029.148,53029.148
2016-12-31,14800.346,44273.949,59074.295,59074.295
2017-12-31,16695.206,48636.101,65331.307,65331.307
2018-12-31,17516.975,50104.521,67621.496,67621.496
2019-12-31,18639.062,51473.511,70112.573,70112.573


In [16]:
# Preview the year-end index used for reindexing. With end='2021-12' and
# 11 periods the stamps run from 2010-12-31 to 2020-12-31 (see the output).
pd.date_range(end='2021-12', freq='A', periods=11)

DatetimeIndex(['2010-12-31', '2011-12-31', '2012-12-31', '2013-12-31',
               '2014-12-31', '2015-12-31', '2016-12-31', '2017-12-31',
               '2018-12-31', '2019-12-31', '2020-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

In [17]:
# Put the hierarchy frame on a regular year-end ('A') index.
# The index is derived from the dates actually present rather than from a
# hard-coded end date and period count, so it keeps matching the data when
# years are added or removed. A year missing inside the range shows up as an
# explicit NaN row.
observed_dates = pd.to_datetime(hierarchy_data.index)
annual_index = pd.date_range(start=observed_dates.min(),
                             end=observed_dates.max(),
                             freq='A')

hierarchy_data = hierarchy_data.copy()
hierarchy_data.index = observed_dates
hierarchy_data = hierarchy_data.reindex(annual_index)
hierarchy_data

Unnamed: 0,Portugal _LIS,Portugal _R_PT,Portugal,total
2010-12-31,0.0,21911.83,21911.83,21911.83
2011-12-31,0.0,23449.847,23449.847,23449.847
2012-12-31,0.0,23567.587,23567.587,23567.587
2013-12-31,10040.808,17312.124,27352.932,27352.932
2014-12-31,12279.422,36393.35,48672.772,48672.772
2015-12-31,13468.659,39560.489,53029.148,53029.148
2016-12-31,14800.346,44273.949,59074.295,59074.295
2017-12-31,16695.206,48636.101,65331.307,65331.307
2018-12-31,17516.975,50104.521,67621.496,67621.496
2019-12-31,18639.062,51473.511,70112.573,70112.573


In [18]:
# Inspect the first index label of the reindexed hierarchy frame.
hierarchy_data.index[0]

Timestamp('2010-12-31 00:00:00', freq='A-DEC')

In [19]:
# Build the parent -> children mapping that describes the hierarchy.
country_names = data["country"].unique()
zone = data["country_zone"].unique()

# Root node: "total" is the parent of every country.
total = {'total': list(country_names)}

# Each country is the parent of the country_zone labels found on its own rows.
# Reading the children from the rows (instead of matching label prefixes with
# startswith) keeps the mapping correct when one country name is a prefix of
# another. sort=False keeps countries in order of first appearance.
children_by_country = data.groupby("country", sort=False)["country_zone"].unique()
country = {name: list(children) for name, children in children_by_country.items()}

hierarchy = {**total, **country}

In [20]:
# Build the hierarchy tree from the parent -> children mapping and the wide frame.
Htree = HierarchyTree.from_nodes(df=hierarchy_data, nodes=hierarchy)
Htree

- total
   - Portugal 
      |- Portugal _LIS
      - Portugal _R_PT

In [21]:
# Derive the summing matrix and its labels from the tree. Later cells use the
# labels as the key order for the forecast dictionary and as column names.
sum_mat2, sum_mat_labels2 = hts.functions.to_sum_mat(Htree)

In [22]:
# Empty single-row frame (row label 'forecast') with one column per series in
# hierarchy_data; the values are filled in by the following cell.
forecasts2 = pd.DataFrame(columns=hierarchy_data.columns, index=['forecast'])

In [23]:
# Produce the base forecasts outside of the hts package.
# Fit one simple-exponential-smoothing model per series and store its
# one-step-ahead forecast in the matching column of forecasts2.
# SimpleExpSmoothing is imported explicitly at the top of the notebook:
# `import statsmodels` alone does not load the `tsa.holtwinters` submodule, so
# the fully-qualified attribute access only worked when another package had
# already imported it.
for col in hierarchy_data.columns:
    model = SimpleExpSmoothing(hierarchy_data[col].values).fit()
    fcst = list(model.forecast(1))
    forecasts2[col] = fcst

In [24]:
# Ordered container for the per-series forecasts; insertion order is kept.
pred_dict2 = collections.OrderedDict()

In [25]:
# Add the predictions to the dictionary in the same order as the summing
# matrix labels; each value is a one-column ('yhat') frame.
pred_dict2.update(
    (label, pd.DataFrame(data=forecasts2[label].values, columns=['yhat']))
    for label in sum_mat_labels2
)

In [26]:
# Reconcile the pre-computed base forecasts against the summing matrix using
# the 'OLS' method.
revised2 = hts.functions.optimal_combination(pred_dict2, sum_mat2, method='OLS', mse={})

In [27]:
# Wrap the reconciled values in a labelled DataFrame: same row label as the
# base forecasts, columns named after the summing matrix labels.
revised_forecasts2 = pd.DataFrame(
    data=revised2[:, :],
    index=forecasts2.index,
    columns=sum_mat_labels2,
)


In [28]:
# Display the reconciled forecasts.
revised_forecasts2

Unnamed: 0,total,Portugal,Portugal _LIS,Portugal _R_PT
forecast,26652.317737,26652.317737,4056.229516,22596.088222


Using prophet:

In [33]:
# Fit an HTSRegressor with 'prophet' base models and 'OLS' revision on the
# hierarchy, then request a prediction one step ahead.
reg = HTSRegressor(model='prophet', revision_method='OLS').fit(
    df=hierarchy_data,
    nodes=hierarchy,
)
preds_prophet = reg.predict(steps_ahead=1)

Fitting models: 100%|████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.57s/it]
Fitting models: 100%|████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.64s/it]


In [34]:
# Display the reconciled Prophet predictions.
preds_prophet

Unnamed: 0,total,Portugal,Portugal _LIS,Portugal _R_PT
2010-12-31,26021.867464,26021.867464,2545.136856,23476.730608
2011-12-31,35553.475103,35553.475103,5727.863207,29825.611896
2012-12-31,22907.81653,22907.81653,2410.197151,20497.619379
2013-12-31,32901.345687,32901.345687,5771.924303,27129.421384
2014-12-31,42667.32572,42667.32572,9045.152192,33622.173528
2015-12-31,52198.93336,52198.93336,12227.878543,39971.054817
2016-12-31,39553.274535,39553.274535,8910.212957,30643.061577
2017-12-31,49546.803445,49546.803445,12271.940582,37274.862862
2018-12-31,59312.783251,59312.783251,15545.168955,43767.614296
2019-12-31,68844.390654,68844.390654,18727.895783,50116.494872


Using auto_arima:

In [35]:
# Fit an HTSRegressor with 'auto_arima' base models and 'OLS' revision on the
# hierarchy, then request a prediction one step ahead.
reg = HTSRegressor(model='auto_arima', revision_method='OLS').fit(
    df=hierarchy_data,
    nodes=hierarchy,
)
preds_autoarima = reg.predict(steps_ahead=1)

Fitting models: 100%|████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.45s/it]
Fitting models: 100%|████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.26s/it]


In [36]:
# Display the reconciled auto_arima predictions.
preds_autoarima

Unnamed: 0,total,Portugal,Portugal _LIS,Portugal _R_PT
2010-12-31,31450.373256,31450.373256,20212.041187,11238.332068
2011-12-31,23609.565777,23609.565777,13412.59718,10196.968597
2012-12-31,24295.633365,24295.633365,13800.554572,10495.078793
2013-12-31,24348.15398,24348.15398,13830.253921,10517.900058
2014-12-31,28463.594853,28463.594853,18577.912104,9885.682749
2015-12-31,38514.877603,38514.877603,24801.355126,13713.522477
2016-12-31,40745.581642,40745.581642,26349.453391,14396.128251
2017-12-31,43764.031237,43764.031237,28377.3452,15386.686037
2018-12-31,47013.109099,47013.109099,30671.412371,16341.696728
2019-12-31,48233.324858,48233.324858,31559.517235,16673.807623
