In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import random
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from sklearn import metrics 
import math
from typing import Union
from tqdm.auto import tqdm as tqdm

from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform as sp_rand

In [None]:
######### Auxiliary functions ###############
#Printing filenames
def print_dir():
    """Print the full path of every file found below ``/kaggle/input``.

    The directory tree is walked recursively and one path is printed per
    line. Nothing is returned.
    """
    import os

    for folder, _subdirs, names in os.walk('/kaggle/input'):
        full_paths = (os.path.join(folder, name) for name in names)
        for full_path in full_paths:
            print(full_path)

#Plotting
def plot_predictions(predictions, index, method, number_of_values = 84, number_of_values_to_predict = 112):
    """Plot training history, validation actuals and forecast for one product.

    predictions: indexable collection of forecasts; ``predictions[index]`` is drawn.
    index: positional row of the product. The same position is used to read the
        notebook-level ``train_dataset``, ``val_dataset`` and ``df_stv``.
    method: model name inserted into the figure title.
    number_of_values: number of training points (x positions ``0 .. number_of_values - 1``).
    number_of_values_to_predict: exclusive end of the x range used for the
        forecast and validation curves.
    """
    history_axis = np.arange(number_of_values)
    forecast_axis = np.arange(number_of_values, number_of_values_to_predict)

    sns.set()
    plt.figure(figsize=(12,8))
    plt.plot(forecast_axis, predictions[index], label='predicted')
    plt.plot(forecast_axis, val_dataset.iloc[index].values, label='Val')
    plt.plot(history_axis, train_dataset.iloc[index].values, label='Train')
    plt.xlabel('days')
    plt.ylabel('product count')
    plt.legend()
    product_id = df_stv.iloc[index]["id"]
    plt.title(f'Predictions of the {method} for the product: {product_id}')
    plt.show()

    
######### Models #############################    
#Moving_average
def moving_average(series, k):
    '''
    #Row-wise mean of the trailing k columns of a 2-D array
    #(all columns are used when the array has fewer than k of them)
    '''
    trailing_window = series[:, -k:]
    return np.mean(trailing_window, axis = 1)

def moving_averages (series, h, k):
    '''
    #Recursive moving-average forecast for every row of a 2-D table

    #series = DataFrame or 2-D array: one time series per row, time along the columns
    #h = number of future steps to forecast
    #k = window length; every forecast is the mean of the latest k values
    #    (observed or previously forecast). When fewer than k values exist
    #    the mean is taken over all of them.

    #returns a float array of shape (number of rows, h)
    #raises ValueError if h is negative or k is smaller than 1
    '''
    if h < 0:
        raise ValueError("h must be non-negative")
    if k < 1:
        raise ValueError("k must be at least 1")

    history = np.asarray(series, dtype=float)
    n_rows, n_obs = history.shape

    # Allocate the extended series once; appending a column per step with
    # np.append would copy the whole array h times.
    extended = np.empty((n_rows, n_obs + h), dtype=float)
    extended[:, :n_obs] = history
    for step in range(n_obs, n_obs + h):
        window_start = max(step - k, 0)
        extended[:, step] = extended[:, window_start:step].mean(axis=1)

    # Slicing keeps the (n_rows, h) shape even when h == 0.
    return extended[:, n_obs:]


#Exponential smoothing
def exponential_smoothing(trainingdata, trend = 'add', damped = True, seasonal = 'add', seasonal_periods = 28,\
                          smoothing_level = 0, smoothing_slope = 0, smoothing_seasonal = 1, optimized = True, forecasting_period = 28,\
                         use_boxcox = False, remove_bias = True, use_basinhopping = False):
    '''
    #Fit one ExponentialSmoothing model per row of trainingdata and forecast it

    #trainingdata = DataFrame, one time series per row

    #passed to the ExponentialSmoothing constructor:
    #trend, damped, seasonal, seasonal_periods
    #(seasonal_periods = number of periods in one seasonal cycle, e.g. 7 for weekly data)

    #passed to .fit():
    #smoothing_level, smoothing_slope, smoothing_seasonal, optimized,
    #use_boxcox, remove_bias, use_basinhopping

    #forecasting_period = number of steps forecast for every row

    #returns a DataFrame with one row per input row (row label = position in
    #trainingdata) holding that row's forecast
    '''
    forecasts = {}
    #more parameters can be added here from statsmodels.tsa.holtwinters.ExponentialSmoothing
    for position, row in enumerate(trainingdata.itertuples(index=False)):
        model = ExponentialSmoothing(
            np.array(row),
            trend = trend,
            damped = damped,
            seasonal = seasonal,
            seasonal_periods = seasonal_periods,
        )
        fitted = model.fit(
            smoothing_level = smoothing_level,
            smoothing_slope = smoothing_slope,
            smoothing_seasonal = smoothing_seasonal,
            optimized = optimized,
            use_boxcox = use_boxcox,
            remove_bias = remove_bias,
            use_basinhopping = use_basinhopping,
        )
        forecasts[position] = fitted.forecast(forecasting_period)

    # Keys become columns, so transpose to get one row per series.
    return pd.DataFrame(forecasts).T


######### prediction scoring #########################
#Compute rmse
def rmse(predictions, y):
    '''
    #Square root of sklearn's mean_squared_error between targets y and predictions
    '''
    squared_error = mean_squared_error(y, predictions)
    return np.sqrt(squared_error)



In [None]:
#WRMSSE evaluator
class WRMSSEEvaluator(object):
    '''
    Weighted Root Mean Squared Scaled Error (WRMSSE) evaluator.

    The bottom-level series (one row per item/store pair) are summed into
    every grouping listed in `group_ids`. Each aggregated series gets a
    weight (its share of the value of sales over the last 28 training days,
    divided by the number of grouping levels) and a scale (mean squared
    one-step difference of its training history). `score` combines the
    per-series RMSSE values with those weights.

    Both input frames are expected to share the same default row index,
    because id columns and predictions are attached with `pd.concat(axis=1)`.
    '''

    group_ids = ( 'all_id', 'state_id', 'store_id', 'cat_id', 'dept_id', 'item_id',
        ['state_id', 'cat_id'],  ['state_id', 'dept_id'], ['store_id', 'cat_id'],
        ['store_id', 'dept_id'], ['item_id', 'state_id'], ['item_id', 'store_id'])

    def __init__(self, 
                 train_df: pd.DataFrame, 
                 valid_df: pd.DataFrame, 
                 calendar: pd.DataFrame, 
                 prices: pd.DataFrame):
        '''
        Initialize the evaluator and precompute weights and scales.

        train_df: id columns plus 'd_*' sales columns of the training period.
        valid_df: 'd_*' sales columns of the validation period; the id columns
            are taken from train_df when they are missing.
        calendar: needs the columns 'd' and 'wm_yr_wk'.
        prices: needs 'item_id', 'store_id', 'wm_yr_wk' and 'sell_price'.

        The inputs are not modified. References to the training frame,
        calendar and prices are dropped again once weights and scales exist.
        '''
        self.calendar = calendar
        self.prices = prices
        # Work on a private copy: the 'all_id' helper column added below must
        # not leak into the caller's frame (which may itself be a slice of a
        # larger frame). This costs one extra copy of the training data until
        # the end of __init__.
        self.train_df = train_df.copy()
        self.valid_df = valid_df
        self.train_target_columns = [c for c in self.train_df.columns if c.startswith('d_')]
        # Weights are based on the last 28 training days. Selecting from the
        # 'd_*' columns (rather than the last 28 columns of the frame) keeps
        # this correct when the frame carries extra non-day columns.
        self.weight_columns = self.train_target_columns[-28:]

        self.train_df['all_id'] = "all"

        self.id_columns = [c for c in self.train_df.columns if not c.startswith('d_')]
        self.valid_target_columns = [c for c in self.valid_df.columns if c.startswith('d_')]

        if not all(c in self.valid_df.columns for c in self.id_columns):
            self.valid_df = pd.concat([self.train_df[self.id_columns], self.valid_df],
                                      axis=1, 
                                      sort=False)
        self.train_series = self.trans_30490_to_42840(self.train_df, 
                                                      self.train_target_columns, 
                                                      self.group_ids)
        self.valid_series = self.trans_30490_to_42840(self.valid_df, 
                                                      self.valid_target_columns, 
                                                      self.group_ids)
        self.weights = self.get_weight_df()
        self.scale = self.get_scale()
        # Release the large inputs; scoring only needs valid_df, weights and scale.
        self.train_series = None
        self.train_df = None
        self.prices = None
        self.calendar = None

    def get_scale(self):
        '''
        Scaling factor for each aggregated series, ignoring leading zeros:
        the mean squared difference between consecutive training values.
        Returned as an array in the row order of self.train_series.
        '''
        scales = []
        for series in tqdm(self.train_series.to_numpy()):
            # argmax of the boolean mask is the first non-zero position
            # (0 when the series is all zeros, i.e. nothing is trimmed).
            series = series[np.argmax(series != 0):]
            scales.append(((series[1:] - series[:-1]) ** 2).mean())
        return np.array(scales)
    
    def get_name(self, i):
        '''
        Convert a group key to the unique string used to name a series:
        tuples/lists (multi-column groupings) are joined with "--",
        anything else is passed through str().
        '''
        if isinstance(i, (tuple, list)):
            return "--".join(str(part) for part in i)
        return str(i)
    
    def get_weight_df(self) -> pd.DataFrame:
        """
        Return the weight of every aggregated series as a one-column DataFrame
        indexed by series name.

        For each grouping level the value of sales (units * sell_price) over
        `self.weight_columns` is summed per group and normalised to sum to 1
        within the level; the result is divided by the number of levels.
        """
        day_to_week = self.calendar.set_index("d")["wm_yr_wk"].to_dict()
        # Long format: one row per (item, store, day).
        weight_df = self.train_df[["item_id", "store_id"] + self.weight_columns].melt(
            id_vars=["item_id", "store_id"], var_name="d", value_name="value"
        )
        weight_df["wm_yr_wk"] = weight_df["d"].map(day_to_week)
        weight_df = weight_df.merge(
            self.prices, how="left", on=["item_id", "store_id", "wm_yr_wk"]
        )
        weight_df["value"] = weight_df["value"] * weight_df["sell_price"]
        # Back to wide format: one row per (item, store), one column per day.
        weight_df = weight_df.set_index(["item_id", "store_id", "d"]).unstack(level=2)[
            "value"
        ]
        # Restore the row order of train_df. A concrete list of tuples is used
        # because a bare zip object is a one-shot iterator, not a list-like indexer.
        row_keys = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.loc[row_keys, :].reset_index(drop=True)
        weight_df = pd.concat(
            [self.train_df[self.id_columns], weight_df], axis=1, sort=False
        )
        weights_map = {}
        for group_id in tqdm(self.group_ids, leave=False):
            lv_weight = weight_df.groupby(group_id)[self.weight_columns].sum().sum(axis=1)
            lv_weight = lv_weight / lv_weight.sum()
            for key, value in lv_weight.items():
                weights_map[self.get_name(key)] = np.array([value])
        weights = pd.DataFrame(weights_map).T / len(self.group_ids)

        return weights

    def trans_30490_to_42840(self, df, cols, group_ids, dis=False):
        '''
        Aggregate the bottom-level rows of df into one series per group for
        every grouping in group_ids (30490 -> 42840 series on the M5 data).

        df: frame holding the id columns and the columns listed in cols.
        cols: value columns that are summed within each group.
        group_ids: iterable of column names / lists of column names to group by.
        dis: disable the progress bar.

        Returns a DataFrame indexed by series name with one column per entry of cols.
        '''
        series_map = {}
        for group_id in tqdm(group_ids, leave=False, disable=dis):
            grouped = df.groupby(group_id)[cols].sum()
            for key, values in zip(grouped.index, grouped.to_numpy()):
                series_map[self.get_name(key)] = values
        return pd.DataFrame(series_map).T
    
    def get_rmsse(self, valid_preds) -> pd.Series:
        '''
        Return the RMSSE of every aggregated series.

        valid_preds: aggregated predictions in the layout produced by
            trans_30490_to_42840.
        A series whose scale is 0 yields inf or NaN here.
        '''
        score = ((self.valid_series - valid_preds) ** 2).mean(axis=1)
        # self.scale is a plain array, so the division is positional.
        rmsse = (score / self.scale).map(np.sqrt)
        return rmsse

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        '''
        Return the WRMSSE of bottom-level predictions.

        valid_preds: array or DataFrame with the same shape as the validation
            target columns (rows in the order of the training frame). A
            DataFrame must use the validation column names.

        Side effects: stores the per-series RMSSE in self.rmsse and the
        weighted per-series contributions in self.contributors.
        '''
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds],
                                axis=1, 
                                sort=False)
        valid_preds = self.trans_30490_to_42840(valid_preds, 
                                                self.valid_target_columns, 
                                                self.group_ids, 
                                                True)
        self.rmsse = self.get_rmsse(valid_preds)
        self.contributors = pd.concat([self.weights, self.rmsse], 
                                      axis=1, 
                                      sort=False).prod(axis=1)
        return np.sum(self.contributors)

## Main part:

### Preprocessing

In [None]:
# print_dir() prints the file paths itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
print_dir()

INPUT_DIR = '/kaggle/input/m5-forecasting-accuracy'
df_cal = pd.read_csv(f'{INPUT_DIR}/calendar.csv') #calendar data
df_sp = pd.read_csv(f'{INPUT_DIR}/sell_prices.csv') #selling prices
df_ss = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv') # sample submission
df_stv = pd.read_csv(f'{INPUT_DIR}/sales_train_validation.csv')# sales train validation

INPUT_DIR_methods = '../input/m5methods'
weightsdata = pd.read_csv(f'{INPUT_DIR_methods}/validation/weights_validation.csv')
weightsdata = weightsdata.loc[weightsdata['Level_id'] == 'Level12']

ids = sorted(set(df_stv['id']))
# Day columns are the ones named 'd_<n>'; a prefix test cannot accidentally
# match an id column that merely contains the substring 'd_'.
d_cols = [c for c in df_stv.columns if c.startswith('d_')]

Train - Validation split

In [None]:
#train/val split
# Hold out the final 28 days for validation and train on the 84 days
# (3 * 28) immediately before them.
val_dataset = df_stv.loc[:, d_cols[-28:]]
train_dataset = df_stv.loc[:, d_cols[-112:-28]]

### First model: moving average

In [None]:
# Forecast 28 days per product; every step averages the latest 35 values.
predictions_ma = moving_averages(
    train_dataset,
    h=28,
    k=35,
)
print("ma shape ", predictions_ma.shape)

### Second model: Exponential smoothing

In [None]:
# Additive trend and 28-day additive seasonality with fixed (non-optimized)
# smoothing parameters; the result is converted to a plain array straight away.
predictions_es = exponential_smoothing(
    train_dataset,
    trend = 'add',
    damped = False,
    seasonal = 'add',
    seasonal_periods = 28,
    smoothing_level = 0,
    smoothing_slope = 0,
    smoothing_seasonal = 0.5,
    optimized = False,
    forecasting_period = 28,
    use_boxcox = False,
    remove_bias = True,
    use_basinhopping = False,
).to_numpy()
print("ES shape ", predictions_es.shape)

# Grid Search
Source: https://machinelearningmastery.com/how-to-grid-search-triple-exponential-smoothing-for-time-series-forecasting-in-python/ 

In [None]:
# grid search holt winter's exponential smoothing
from math import sqrt
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from numpy import array

In [None]:
# one-step Holt Winter’s Exponential Smoothing forecast
def exp_smoothing_forecast(history, config):
    """Fit ExponentialSmoothing on `history` and return the next-step forecast.

    history: sequence of past observations.
    config: 10-element sequence
        (smoothing_level, trend, smoothing_slope, damped, seasonal,
        smoothing_seasonal, seasonal_periods, use_boxcox, remove_bias,
        use_basinhopping).

    Returns the single value predicted for the step right after `history`.
    """
    (level_coef, trend, slope_coef, damped, seasonal, seasonal_coef,
     periods, boxcox, remove_bias, basinhopping) = config

    observations = array(history)
    fitted = ExponentialSmoothing(
        observations,
        trend=trend,
        damped=damped,
        seasonal=seasonal,
        seasonal_periods=periods,
    ).fit(
        smoothing_level=level_coef,
        smoothing_slope=slope_coef,
        smoothing_seasonal=seasonal_coef,
        optimized=True,
        use_boxcox=boxcox,
        remove_bias=remove_bias,
        use_basinhopping=basinhopping,
    )
    # start == end == len(observations) requests exactly one step past the data
    next_step = len(observations)
    return fitted.predict(next_step, next_step)[0]

In [None]:
# root mean squared error or rmse
def measure_rmse(actual, predicted):
    """Return the square root of sklearn's mean_squared_error(actual, predicted)."""
    mse = mean_squared_error(actual, predicted)
    return sqrt(mse)

In [None]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split a univariate sequence into (train, test), test being the last n_test items.

    data: any sliceable sequence with a length (list, array, Series, ...).
    n_test: number of trailing observations to hold out. 0 gives an empty
        test part; a value larger than len(data) puts everything into test.

    Raises ValueError when n_test is negative.
    """
    if n_test < 0:
        raise ValueError("n_test must be non-negative")
    # Index from the front: with n_test == 0, data[:-0] is empty and data[-0:]
    # is the whole sequence, i.e. the two parts would come back swapped.
    split_at = max(len(data) - n_test, 0)
    return data[:split_at], data[split_at:]

In [None]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    """Walk-forward validation of one configuration on univariate data.

    The last n_test observations are held out. For each of them the model is
    refit on everything seen so far, a one-step forecast is made, and the
    true observation is then added to the history.

    data: sequence of observations (list, array or pandas Series).
    n_test: number of trailing observations used for testing.
    cfg: configuration passed unchanged to exp_smoothing_forecast.

    Returns the error reported by measure_rmse over the held-out observations.
    """
    train, test = train_test_split(data, n_test)
    # seed history with the training observations
    history = list(train)
    predictions = []
    # Iterate over the held-out values themselves instead of test[i]: for a
    # pandas Series with a non-integer index, integer keys in [] rely on a
    # positional fallback that pandas has deprecated.
    for observed in test:
        predictions.append(exp_smoothing_forecast(history, cfg))
        # reveal the true value before forecasting the next step
        history.append(observed)
    return measure_rmse(test, predictions)

In [None]:
# score a model, return None on failure
def score_model(data, n_test, cfg, debug=False):
    """Score one configuration with walk-forward validation.

    data, n_test, cfg: forwarded to walk_forward_validation.
    debug: when True, warnings are shown and any exception propagates.
        Otherwise warnings are silenced and a failing configuration is
        reported with a result of None.

    Returns (key, result) where key is str(cfg) and result is the validation
    error or None. Successful results are also printed.
    """
    # convert config to a key
    key = str(cfg)
    result = None
    if debug:
        result = walk_forward_validation(data, n_test, cfg)
    else:
        # one failure during model validation suggests an unstable config
        try:
            # never show warnings when grid searching, too noisy
            with catch_warnings():
                filterwarnings("ignore")
                result = walk_forward_validation(data, n_test, cfg)
        except Exception:
            # Only ordinary errors mark the config as failed; KeyboardInterrupt
            # and SystemExit still stop the search.
            result = None
    # check for an interesting result
    if result is not None:
        print(' > Model[%s] %.3f' % (key, result))
    return (key, result)

In [None]:
# grid search configs
def grid_search(data, cfg_list, n_test, parallel=True):
    """Score every configuration and return the successful ones, best first.

    data, n_test: forwarded to score_model for each configuration.
    cfg_list: iterable of configurations.
    parallel: when True the configurations are scored with joblib using one
        worker per CPU ('multiprocessing' backend); otherwise sequentially.

    Returns a list of (key, error) tuples sorted by ascending error.
    Configurations whose result is None (failed) are dropped.
    """
    if parallel:
        # execute configs in parallel
        executor = Parallel(n_jobs=cpu_count(), backend='multiprocessing')
        tasks = (delayed(score_model)(data, n_test, cfg) for cfg in cfg_list)
        scores = executor(tasks)
    else:
        scores = [score_model(data, n_test, cfg) for cfg in cfg_list]
    # remove empty results; identity test so only a real None counts as failure
    scored = [entry for entry in scores if entry[1] is not None]
    # sort configs by error, asc
    scored.sort(key=lambda entry: entry[1])
    return scored

In [None]:
def exp_smoothing_configs(seasonal=[None]):
    """Build the full grid of exponential-smoothing configurations.

    Every configuration is a list
    [alpha, trend, trend_coef, damped, seasonal, seasonal_coef,
    seasonal_periods, use_boxcox, remove_bias, use_basinhopping],
    the order in which exp_smoothing_forecast unpacks it.

    seasonal: currently unused; kept for interface compatibility. The default
        list is never modified.

    Returns a list with one configuration per combination of the value lists
    below, ordered as nested loops over them would produce (alpha varies
    slowest, use_basinhopping fastest).
    """
    from itertools import product

    # define config lists
    a_vals = np.linspace(0, 1, 5) #alpha coef
    t_params = ['add', None] #trend
    t_vals = np.linspace(0, 1, 5) #trend coef
    d_params = [False] #dampened trend
    s_params = ['add', None] #seasonality
    s_vals = np.linspace(0, 1, 5) #seasonality coef
    # number of seasonal periods: 0, 28, 56, 84, 112 as plain ints.
    # np.linspace would produce floats, but a period count is an integer.
    # NOTE(review): 0 is presumably only meaningful together with seasonal=None - confirm.
    p_params = list(range(0, 113, 28))
    b_params = [False] #use boxcox
    r_params = [True, False] #remove bias
    use_basinhopping = [True, False] #use bassinhopping

    # create config instances
    grid = product(a_vals, t_params, t_vals, d_params, s_params, s_vals,
                   p_params, b_params, r_params, use_basinhopping)
    return [list(cfg) for cfg in grid]

In [None]:
if __name__ == '__main__':
    # number of trailing days held out in the walk-forward validation
    n_test = 28
    # candidate model configurations
    cfg_list = exp_smoothing_configs()
    # dataset: per-day mean over all products for the last 364 days
    data = df_stv[d_cols[-364:]].mean(axis=0)
    print(data)
    # run the grid search
    scores = grid_search(data, cfg_list, n_test)
    print('done')
    # show the three best configurations
    for cfg, error in scores[0:3]:
        print(cfg, error)

In [None]:
# Ten best configurations (scores is sorted by ascending error).
for cfg, error in scores[0:10]:
    print(f'{cfg} {error}')

In [None]:
# Save the 50 best configurations, one "<rank> <config>: <error>" line each.
# The with-block closes the file even if a write fails.
with open('configs.txt', 'w') as file:
    for counter, (cfg, error) in enumerate(scores[:50], start=1):
        file.write(f'{counter} {cfg}: {error}\n')

### See results

In [None]:
#Compute the wrmsse scores
# Copy both folds: the evaluator adds a helper column to its training frame,
# and a bare iloc slice of df_stv should not be handed to code that writes to it.
train_fold_df = df_stv.iloc[:, :-28].copy()
valid_fold_df = df_stv.iloc[:, -28:].copy()
wrmsse = WRMSSEEvaluator(train_fold_df, valid_fold_df, df_cal, df_sp) #instantiate wrmsse class
print(f'Moving Average total WRMSSE score: {wrmsse.score(predictions_ma)}')
print(f'Exponential Smoothing total WRMSSE score: {wrmsse.score(predictions_es)}')

In [None]:
#plot a random product
# randrange excludes the upper bound; random.randint(0, n) could return n,
# which is one past the last valid row position.
rand_int = random.randrange(val_dataset.shape[0])

#plot moving average predictions
plot_predictions(predictions_ma, rand_int, "moving average", 84, 112)
print("RMSE: ", rmse(predictions_ma[rand_int,:], val_dataset.iloc[rand_int,:]))


#plot exponential smoothing predictions
plot_predictions(predictions_es, rand_int, "exponential smoothing", 84, 112)
print("RMSE: ", rmse(predictions_es[rand_int,:], val_dataset.iloc[rand_int,:]))


Optimizing Models

In [None]:
#Optimizing MA
#Which k returns the best score for moving average
errors = []
arng =  np.arange(5,75,5)
for i in arng:
    preds = moving_averages(train_dataset, h=28, k=i)
    errors.append(rmse(preds, val_dataset))
sns.set()
plt.plot(arng, errors)
plt.xlabel('k value')
plt.ylabel('rmse')
plt.title('Plot of the parameter k and the rmse for the moving average model')
plt.show()
# Read the best k from the candidate array itself so the result stays correct
# if the candidate range or step above is changed.
k = int(arng[np.argmin(errors)])
print("best k value: ", k)

# Submission

Run and submit the best model

In [None]:
# Forecast the 28 submission days with the tuned moving average. The last k
# days are passed in so the first forecast already averages a full k-day
# window (k can be larger than 35). The result is wrapped in a DataFrame
# because the next cell combines it with the id column via pd.concat.
predictions = pd.DataFrame(moving_averages(df_stv[d_cols[-k:]], h=28, k=k))
# Alternative model: exponential smoothing using the last 364 days to make a prediction.
#predictions = exponential_smoothing(df_stv[d_cols[-364:]], smoothing_level = 0, trend = 'add', damped = False, seasonal = 'add', seasonal_periods = 28,
#                                    smoothing_slope = 0, smoothing_seasonal = 0.5, optimized = False, forecasting_period = 28, use_boxcox = False,
#                                    remove_bias = True, use_basinhopping = False)


In [None]:
first_cols = df_stv.loc[:,'id']
# predictions may be a plain array (moving average) or a DataFrame
# (exponential smoothing); normalise it to a frame that shares the id
# column's row index and already carries the F1..F28 column names.
forecast_cols = [f'F{x}' for x in range(1,29)]
forecast_df = pd.DataFrame(np.asarray(predictions), index=first_cols.index, columns=forecast_cols)
results = pd.concat([first_cols, forecast_df], axis=1)
# Keep the sample-submission rows that follow the rows forecast here unchanged.
df_ss = pd.concat([results, df_ss.iloc[len(results):]],axis=0)
df_ss.to_csv('submission.csv', index=False) #make csv

In [None]:
# Sanity check: (rows, columns) of the assembled submission frame.
df_ss.shape

In [None]:
# Display the assembled submission frame for a visual check.
df_ss