# Holt-Winters for a Sample of Timeseries

Repeat (some of) the analysis from the previous notebook but with a large sample

Start with:

- All Spanish ISBNs that have demand in the 12 months prior to 9 months ago, i.e. those that will have a full year of history for forecasting
- Holt-Winters with fixed hyperparameters

In [None]:
#Import libraries
#NB I don't need anything that is only ref'd in helpers.py

from helpers import *

import datetime as dt
import getpass
import os
import random #I'm going to use this for dev and testing
import urllib.parse

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from statsmodels.tsa.holtwinters import ExponentialSmoothing as hwes
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
%matplotlib inline

#This suppresses all warnings in the notebook - only turn it on once happy that the code works
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Redshift user credentials - never paste these into the notebook itself.
#Each value is taken from the environment when set; otherwise it is prompted for
#(getpass hides the password as it is typed, so it never shows up in cell output).
USER = os.environ.get("REDSHIFT_USER") or input("Redshift user: ")
PASSWORD = os.environ.get("REDSHIFT_PASSWORD") or getpass.getpass("Redshift password: ")

FCST_PERIOD = 9   #How many months I want to forecast ahead

In [None]:
#Create SQLAlchemy engine for Redshift database
user = USER
password = PASSWORD
#Cluster endpoint: read from the environment, or prompted for, rather than stored in the notebook
host = os.environ.get("REDSHIFT_HOST") or input("Redshift host: ")
port = '5439'
dbname = 'prod'

#Percent-encode the credentials: characters such as '@', '/' or ':' in a password
#would otherwise be parsed as URL delimiters and corrupt the connection string
url = "postgresql+psycopg2://{0}:{1}@{2}:{3}/{4}".format(
    urllib.parse.quote(user, safe=""),
    urllib.parse.quote(password, safe=""),
    host,
    port,
    dbname,
)
engine = create_engine(url)

## 1. Get the catalog of ISBN/countries

Hardcoded to Spain with demand in the period that we want to forecast (set by FCST_PERIOD above). This eliminates "old" ISBNs that are no longer being sold in Spain

In [None]:
#Catalog query: one row per Spanish ISBN/country key with demand in the last
#FCST_PERIOD months, ordered by total quantity demanded (largest first).
#NOTE(review): the doubled '%%' in the LIKE pattern is presumably there so that the
#DB driver's parameter formatting leaves a single literal '%' - confirm before changing.
query = f"""
select
    isbn + ship_to_country_key as key,
    isbn,
    isbn_short,
    subject_2_key,
    series_key,
    series_short,
    family_key,
    family_name,
    ship_to_country_key as country,
    sum(quantity_demanded) as qty_fcst_period
from r2ibp.f_demand_actual t1
left join r2ibp.lu_product t2
on t1.isbn = t2.isbn13
where last_day(date) <= current_date
and last_day(date) > dateadd(month, -{FCST_PERIOD}, current_date)
and ship_to_country_key = 'ES'
and isbn not like '555%%'
group by isbn, isbn_short, subject_2_key, series_key, series_short, family_key, family_name, ship_to_country_key
order by qty_fcst_period desc
"""

#Context manager so the connection is released even if the query raises
with engine.connect() as conn:
    df_catalog = pd.read_sql_query(query, conn)

## 2. Get demand data for the test cases

Read all the demand data for the selected ISBN/countries. In this case Spanish ISBNs

In [None]:
#Every ISBN/country key returned by the catalog query
key_list = [key for key in df_catalog['key']]
#For dev/testing, swap in a random subset instead:
# key_list = random.sample(list(df_catalog['key']), 1000)

#get_demand comes from helpers.py (via the star import)
df_demand = get_demand(key_list, engine)

In [None]:
#Let's look at the last 12 months
first_month = df_demand['month'].min()
last_full_month = df_demand['month'].max()
#pd.DateOffset does calendar-month arithmetic with pandas alone, so this cell no longer
#relies on `dateutil` (never imported by this notebook) leaking in via `from helpers import *`
twelve_months_ago = pd.Timestamp(last_full_month) - pd.DateOffset(months=12)

#Normalise 'month' to datetimes for the comparison so it works whether the column
#holds dates, datetimes or Timestamps
last_12m = df_demand[pd.to_datetime(df_demand['month']) > twelve_months_ago]
by_key = last_12m.groupby('key')

#Total quantity per key, put into log10 bins.
#The top bin is open-ended (np.inf) so that totals above 100000 really do land in '>10000'
#instead of silently becoming NaN. Totals <= 0 still fall outside every bin.
df_12m_demand = by_key[['qty']].sum()
df_12m_demand['qty_bin'] = pd.cut(df_12m_demand['qty'], [0, 10, 100, 1000, 10000, np.inf],
                           labels = ['<=10', '10-100', '100-1000', '1000-10000', '>10000'])

#Number of rows per key in the window
#NOTE(review): this equals "months with orders" only if df_demand has one row per key/month - confirm
df_12m_demand['mths_w_orders'] = by_key['month'].count()

#And tidy up the temporaries
del last_12m, by_key

#Crosstab: number of keys for each (months with orders, quantity bin) combination
df_crosstab = pd.crosstab(df_12m_demand['mths_w_orders'], columns=df_12m_demand['qty_bin'],
                  values=df_12m_demand['qty'], aggfunc='count', margins = True)
df_crosstab


## 3. Run Holt-Winters

### Fit and Predict Using HWES

In [None]:
#Holt-Winters hyperparameters, held fixed for every series.
#Default config is ['add', False, 'add', False]; only the second element differs here.
#NOTE(review): the meaning of each position is defined by predict_using_hwes in helpers.py - confirm there
config = [
    'add',
    True,
    'add',
    False,
]

df_errors, df_hwes_forecasts = predict_using_hwes(
    df_demand,
    FCST_PERIOD,
    config,
)

### Calculate Accuracy Metrics for ISBN/country combinations forecast

In [None]:
#Accuracy metrics derived from the HW forecasts
#NOTE(review): calc_prediction_metrics is presumably defined in helpers.py; the later cells rely on it
#returning a 'key' column plus the rmse_* columns - confirm
df_metrics = calc_prediction_metrics(df_hwes_forecasts)

#Quick look at the first few rows
df_metrics.head()

## 4. How Good is HW?

### In aggregate

In [None]:
#NOTE(review): presumably plots the HW prediction error against the naive-1 baseline across all keys - see helpers.py
plot_pred_naive1(df_metrics)

### What about some specific examples?
Look for cases where HW has been particularly good or particularly bad

In [None]:
#Drop keys with any missing metric, then order by rmse_pc_diff (smallest first)
df_metrics_sorted = (
    df_metrics
    .dropna()
    .sort_values('rmse_pc_diff')
)

In [None]:
#Show the sorted metrics (pandas truncates the display of long frames)
df_metrics_sorted

Plot the top 12 based on RMSE

NB I should make the plotting code into a function as I'm constantly reusing it


In [None]:
#First 12 keys in df_metrics_sorted order
plot_list = list(df_metrics_sorted['key'].head(12))

plot_sample_preds(plot_list, df_demand, df_hwes_forecasts, FCST_PERIOD)

#NB There are negative values here

#Metrics for the keys plotted above
df_metrics_sorted.loc[df_metrics_sorted['key'].isin(plot_list)]

These are the worst 12 based on RMSE

In [None]:
#Last 12 keys in df_metrics_sorted order
plot_list = list(df_metrics_sorted['key'].tail(12))

plot_sample_preds(plot_list, df_demand, df_hwes_forecasts, FCST_PERIOD)

#Metrics for the keys plotted above
df_metrics_sorted.loc[df_metrics_sorted['key'].isin(plot_list)]

#These are all examples where naive-1 was exactly right (RMSE = 0)

Finally a random selection

In [None]:
#Fixed seed so the same 12 keys are drawn on every run
random.seed(1234)
random_list = random.sample(list(df_metrics_sorted['key']), 12)

#Filter with isin (rather than indexing by random_list) so rows keep df_metrics_sorted's order
cols = ['key', 'rmse_pred', 'rmse_naive1', 'pred_rmse_lower', 'rmse_pc_diff']
df_random_12 = df_metrics_sorted.loc[df_metrics_sorted['key'].isin(random_list), cols]

#Plot in that same order so the charts line up with the metrics table
plot_list = [key for key in df_random_12['key']]

plot_sample_preds(plot_list, df_demand, df_hwes_forecasts, FCST_PERIOD)
df_random_12