In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)

In [2]:
# Reading in the dataset & getting its info.
# The 'Date' column is parsed as datetimes and used as the index;
# df.info() then prints the column names, dtypes and non-null counts.
df = pd.read_csv('prepared_data.csv',index_col='Date',parse_dates=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 72 entries, 2016-10-01 to 2019-09-01
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Vol       72 non-null     float64
 1   Category  72 non-null     object 
dtypes: float64(1), object(1)
memory usage: 1.7+ KB


In [3]:
# Segregating the dataset by category: this cell keeps only the rows
# whose Category is 'BISCUIT'
bis = df.loc[df['Category'] == 'BISCUIT']

In [4]:
# Extracting the biscuit volume column as its own pandas Series
# (.copy() gives an independent copy rather than a reference into `bis`)
bis_s = (bis['Vol']).copy()

In [5]:
# Taking the base-10 log of the biscuit volume series to reduce variance
# NOTE(review): assumes every volume is strictly positive - confirm
bis_log = np.log10(bis_s)

### Creating functions for making simple forecasts on univariate series

In [7]:
# importing necessary packages
from math import sqrt
from numpy import mean
from numpy import median
from multiprocessing import cpu_count
from joblib import Parallel
from joblib import delayed
from warnings import catch_warnings
from warnings import filterwarnings
from sklearn.metrics import mean_squared_error

In [8]:
# Makes one-step simple forecast on the given training set history
def simple_forecast(history, config):
    ''' Makes a one-step naive forecast from historical observations.

    history: observations seen so far, oldest first.
    config: (n, offset, avg_type) where
        n        - for 'persist': how many steps back to copy from
                   (1 = the last observation);
                   for 'mean'/'median': how many observations to average.
        offset   - spacing between the averaged observations: 1 uses the
                   last n consecutive values, k uses the values that lie
                   k, 2k, ..., n*k steps back. Ignored for 'persist'.
        avg_type - one of 'persist', 'mean', 'median'.

    Returns the forecast for the next time step.

    Raises ValueError if avg_type is unknown, if n (or offset, when
    averaging) is not positive, or if averaging would need observations
    from before the start of history. Raises IndexError if 'persist'
    looks back further than history goes.
    '''
    n, offset, avg_type = config
    # an unrecognised method must not silently fall through to the median
    if avg_type not in ('persist', 'mean', 'median'):
        raise ValueError('Unknown avg_type: {!r}'.format(avg_type))
    # -0 == 0, so n == 0 would index the OLDEST value / slice the whole history
    if n < 1:
        raise ValueError('n must be a positive integer, got {}'.format(n))
    # persist value, ignore other config
    if avg_type == 'persist':
        return history[-n]
    if offset < 1:
        raise ValueError('offset must be a positive integer, got {}'.format(offset))
    # skip bad configs; checked for every offset so that offset == 1 cannot
    # quietly average fewer than n values
    if n * offset > len(history):
        raise ValueError('Config beyond end of data: {} {}'.format(n, offset))
    # collect values to average
    if offset == 1:
        values = history[-n:]
    else:
        values = [history[-i * offset] for i in range(1, n + 1)]
    # mean of the collected values
    if avg_type == 'mean':
        return mean(values)
    # median of the collected values
    return median(values)

In [10]:
# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    ''' Splits a univariate sequence into train and test parts, keeping order.

    data: sliceable sequence (list, array, ...).
    n_test: number of trailing observations to put in the test part.

    Returns (train, test). n_test == 0 gives an empty test part; an n_test
    larger than len(data) gives an empty train part.
    Raises ValueError if n_test is negative.
    '''
    if n_test < 0:
        raise ValueError('n_test must be non-negative, got {}'.format(n_test))
    # Slice at an explicit position: data[:-0] is empty and data[-0:] is the
    # whole sequence, so negative-index slicing swaps the parts for n_test == 0.
    split = max(len(data) - n_test, 0)
    return data[:split], data[split:]

In [11]:
# walk-forward validation for univariate data
def walk_forward_validation(data, n_test, cfg):
    ''' Returns the test set RMSE obtained by walk-forward validation.

    data: The time series sequence
    n_test: no. of trailing observations held out as the test set
    cfg : configuration tuple, handed unchanged to simple_forecast
    '''
    train, test = train_test_split(data, n_test)
    # the forecaster only ever sees observations that precede the step being
    # predicted; it starts out knowing the training portion
    seen = list(train)
    forecasts = []
    for step in range(len(test)):
        # one-step forecast from everything seen so far
        forecasts.append(simple_forecast(seen, cfg))
        # reveal the actual value before moving on to the next step
        seen.append(test[step])
    # root of the mean squared error between actuals and forecasts
    return sqrt(mean_squared_error(test, forecasts))

In [12]:
# score a model, return None on failure
def score_model(data, n_test, cfg):
    ''' Calculates the rmse of the naive model by calling
    walk_forward_validation inside a try/except block.

    data: Whole Time series.
    n_test: No. of test observations.
    cfg: configuration to evaluate.

    Returns (str(cfg), rmse); rmse is None when the validation raised.
    '''
    try:
        result = walk_forward_validation(data, n_test, cfg)
    except Exception:
        # one failure during model validation suggests an unstable config.
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # still stop a long grid search.
        result = None
    # the config is stringified so it can serve as a printable key
    return (str(cfg), result)

In [13]:
def grid_search(data, n_test, config, parallel=True):
    ''' Performs a grid search over naive model hyper-parameters.

    data: Whole Time series.
    n_test: No. of test observations.
    config: iterable of configurations to evaluate.
    parallel: kept for backward compatibility only; it is currently ignored
        and every configuration is scored sequentially.

    Returns a list of (str(cfg), rmse) tuples sorted by ascending rmse.
    Configurations whose evaluation failed (rmse is None) are dropped.
    '''
    scores = [score_model(data, n_test, cfg) for cfg in config]
    # drop the configurations that could not be evaluated
    scores = [s for s in scores if s[1] is not None]
    # sort the scores in ascending order (best model first)
    scores.sort(key=lambda x: x[1])
    return scores

In [16]:
def simple_configs(max_length, offsets=[1]):
    '''Builds the grid of configurations for the Naive model.

    max_length: largest look-back to generate (look-backs run from 1 to max_length)
    offsets: periodicity or seasonality values to try, periodicity = 1 for no seasonality

    Returns a list of [look_back, offset, method] lists, where method is one of
    'persist' (simply copy the relevant observation from the historical data),
    'mean' or 'median'.
    '''
    methods = ['persist', 'mean', 'median']
    return [
        [look_back, period, method]
        for look_back in range(1, max_length + 1)
        for period in offsets
        for method in methods
    ]

In [18]:
# Initializing the required data for the grid search on the log-transformed biscuit series
# plain NumPy array of the log10 volumes
data = bis_log.values 
# the last 6 observations are held out as the test set
n_test = 6
# longest look-back to try = number of training observations
max_length = len(data)-n_test
# offsets: 1 = consecutive observations, 12 = every 12th observation
# (presumably a yearly cycle, given the monthly-looking dates - confirm)
config = simple_configs(max_length,[1,12])

In [19]:
# Size of the search space: max_length look-backs x 2 offsets x 3 methods.
# 'persist' ignores the offset, so its entries are repeated once per offset.
print('Total no. of different configurations to be tested are: ',len(config))

Total no. of different configurations to be tested are:  180


### Evaluating various Naive Models

In [20]:
# Evaluating the scores for various configurations for the naive models.
# The result is a list of (config string, test RMSE) pairs sorted by ascending
# RMSE; configurations that failed to evaluate are left out.
scores =  grid_search(data, n_test, config)

In [21]:
# Show the ranked results (best first); the RMSE values are on the log10 scale
scores

[("[3, 1, 'median']", 0.05123110439463994),
 ("[4, 1, 'median']", 0.05928379671261053),
 ("[4, 1, 'mean']", 0.06411667318388294),
 ("[7, 1, 'mean']", 0.06412692110264459),
 ("[5, 1, 'median']", 0.06614641617378819),
 ("[6, 1, 'median']", 0.06634601986525086),
 ("[3, 1, 'mean']", 0.06674453194538722),
 ("[8, 1, 'median']", 0.0680809572218614),
 ("[7, 1, 'median']", 0.06817349955636702),
 ("[6, 1, 'mean']", 0.06864618783044167),
 ("[8, 1, 'mean']", 0.07016235538564773),
 ("[9, 1, 'median']", 0.07085874681484293),
 ("[5, 1, 'mean']", 0.07212922288706589),
 ("[9, 1, 'mean']", 0.07435031093495045),
 ("[2, 1, 'mean']", 0.07653338975089091),
 ("[2, 1, 'median']", 0.07653338975089091),
 ("[10, 1, 'median']", 0.07661428695824515),
 ("[7, 1, 'persist']", 0.07855190557645911),
 ("[7, 12, 'persist']", 0.07855190557645911),
 ("[10, 1, 'mean']", 0.07876384562881074),
 ("[3, 1, 'persist']", 0.07961547844656877),
 ("[3, 12, 'persist']", 0.07961547844656877),
 ("[4, 1, 'persist']", 0.07969902058828587)

### Observations:
### 1) The best naive model has configuration [  'Past_observations_to be_analyzed': 3,  'Period of Seasonality' : 1, 'Method': Median ], i.e. the forecast is the median of the last 3 observations.
### 2) The best naive model has a test set RMSE of 0.0512311 (measured on the log10-transformed series). This is the baseline: any model whose test set RMSE is lower than this figure can be said to have some predictive power.