### Imports

In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error as mae
import itertools
import seaborn as sns
from sklearn.base import BaseEstimator
import statsmodels.api as sm

import warnings
warnings.filterwarnings("ignore")

### Functions

#### Prepare Data

In [3]:
def prepare_data(data, splits=0.8):
    '''
    Splits a date-indexed dataframe chronologically into train/test or
    train/validation/test sets. Rows are never shuffled.
    
    Parameters
    ----------
    data : pandas.core.frame.DataFrame
          Dataset that needs to be split
          Its index must be a pandas DatetimeIndex sorted in increasing order
    splits : float, tuple
             Default = 0.8; train gets the first 80% of the rows, test the remaining 20%
             float: Splits in two sets - train and test
                    Must lie strictly inside (0.0, 1.0)
             tuple(train_size, validation_size): Splits in three sets - train, validation and test
                                                 Both values must be floats strictly inside (0.0, 1.0)
                                                 and "train_size + validation_size" must be < 1.0;
                                                 test receives whatever is left
                                           
    Returns
    -------
    If splits is float - (train, test)
    If splits is a tuple - (train, validation, test)
    Every element is a pandas.core.frame.DataFrame slice of data
    
    Raises
    ------
    TypeError
        data is not a dataframe, its index is not a DatetimeIndex, splits is neither
        float nor tuple, or a tuple entry is not a float
    ValueError
        The index is not sorted, the tuple does not have exactly two entries, or a
        fraction (or the sum of the two fractions) is outside the allowed range
    '''
    if not isinstance(data, pd.DataFrame):
        raise TypeError("data is not a pandas dataframe")

    if not isinstance(data.index, pd.DatetimeIndex):
        raise TypeError("data does not have index in pandas DatetimeIndex format")

    if not isinstance(splits, (float, tuple)):
        raise TypeError("splits is not float or tuple")

    # a chronological split is only meaningful on date-ordered rows
    if not data.index.is_monotonic_increasing:
        raise ValueError("Dates are not sorted in the dataframe")

    n_rows = len(data)

    if isinstance(splits, float):
        # written as a chained comparison so that NaN is rejected here as well
        if not (0.0 < splits < 1.0):
            raise ValueError("splits can be only between 0.0 and 1.0")

        cut = int(splits * n_rows)
        return data.iloc[:cut], data.iloc[cut:]

    # splits is a tuple from here on.
    # The length is validated BEFORE unpacking so that a 0- or 1-element tuple
    # produces the intended message instead of an IndexError.
    if len(splits) != 2:
        raise ValueError("Split should contain only two values")

    train_size, val_size = splits

    for fraction in (train_size, val_size):
        if not isinstance(fraction, float):
            raise TypeError("Should contain only float")

    for fraction in (train_size, val_size):
        if not (0.0 < fraction < 1.0):
            raise ValueError("splits can only be between 0.0 and 1.0")

    # both fractions are positive here, so only the upper bound can be violated
    if train_size + val_size >= 1.0:
        raise ValueError("sum of values in tuples in splits should be <1.0")

    train_end = int(train_size * n_rows)
    val_end = int((train_size + val_size) * n_rows)
    return data.iloc[:train_end], data.iloc[train_end:val_end], data.iloc[val_end:]

#### SARIMA Model

In [4]:
def sarima_model(train, test, val=None, **model_parameters):
    '''
    Fits a SARIMAX model (no exogenous regressors) on the training data and
    returns in-sample and out-of-sample predictions
    
    Parameters
    ----------
    train : pandas.core.frame.DataFrame
            Data the model is fitted on
    test : pandas.core.frame.DataFrame
           Hold-out data; only its length is used, to size the forecast horizon
    val : NoneType or pandas.core.frame.DataFrame
          Default : None
          Optional validation data placed between train and test; only its length is used
    model_parameters : kwargs
                       Forwarded unchanged to statsmodels.tsa.statespace.sarimax.SARIMAX
            
    Returns
    -------
    train_preds : In-sample predictions for positions 1 .. len(train)-1
                  (the first training observation is skipped)
    val_preds : Forecast covering the validation window (only returned when val is given)
    test_preds : Forecast covering the test window
    
    Raises
    ------
    TypeError
        train or test is not a dataframe, or val is neither None nor a dataframe
    '''
    frame_type = pd.core.frame.DataFrame

    if not isinstance(train, frame_type):
        raise TypeError("train is not a pandas dataframe")

    if not isinstance(test, frame_type):
        raise TypeError("test is not a pandas dataframe")

    # val is optional, but when supplied it has to be a dataframe too
    if val is not None and not isinstance(val, frame_type):
        raise TypeError("val is not a pandas dataframe")

    fitted = SARIMAX(train, **model_parameters).fit()

    n_train = len(train)
    n_test = len(test)

    # in-sample predictions start at position 1, not 0
    train_preds = fitted.predict(start=1, end=n_train - 1)

    if val is None:
        # test window follows the training data directly
        test_preds = fitted.predict(start=n_train, end=n_train + n_test - 1)
        return train_preds, test_preds

    # validation window follows train, test window follows validation
    val_stop = n_train + len(val)
    val_preds = fitted.predict(start=n_train, end=val_stop - 1)
    test_preds = fitted.predict(start=val_stop, end=val_stop + n_test - 1)
    return train_preds, val_preds, test_preds

#### ETS Model

In [5]:
def ets_model(train, test, val=None, **model_parameters):
    '''
    Fits statsmodels.tsa.holtwinters.ExponentialSmoothing on the training data with
    fixed smoothing coefficients and returns in-sample and out-of-sample predictions
    
    Parameters
    ----------
    train : pandas.core.frame.DataFrame
            Data the model is fitted on
    test : pandas.core.frame.DataFrame
           Hold-out data; only its length is used, to size the forecast horizon
    val : NoneType or pandas.core.frame.DataFrame
          Default : None
          Optional validation data placed between train and test; only its length is used
    model_parameters: kwargs
                      smoothing_level, smoothing_trend and smoothing_seasonal are taken out
                      and handed to ExponentialSmoothing.fit (each defaults to 0.5);
                      everything else is forwarded to the ExponentialSmoothing constructor
            
    Returns
    -------
    train_preds : In-sample predictions for positions 0 .. len(train)-1
    val_preds : Forecast covering the validation window (only returned when val is given)
    test_preds : Forecast covering the test window
    
    Raises
    ------
    TypeError
        train or test is not a dataframe, or val is neither None nor a dataframe
    '''
    frame_type = pd.core.frame.DataFrame

    if not isinstance(train, frame_type):
        raise TypeError("train is not a pandas dataframe")

    if not isinstance(test, frame_type):
        raise TypeError("test is not a pandas dataframe")

    # val is optional, but when supplied it has to be a dataframe too
    if val is not None and not isinstance(val, frame_type):
        raise TypeError("val is not a pandas dataframe")

    # fit-time arguments are separated from constructor arguments;
    # a coefficient the caller did not supply falls back to 0.5
    fit_arguments = {
        'smoothing_level': model_parameters.pop('smoothing_level', 0.5),
        'smoothing_trend': model_parameters.pop('smoothing_trend', 0.5),
        'smoothing_seasonal': model_parameters.pop('smoothing_seasonal', 0.5),
    }

    fitted = ExponentialSmoothing(train, **model_parameters).fit(**fit_arguments)

    n_train = len(train)
    n_test = len(test)

    train_preds = fitted.predict(start=0, end=n_train - 1)

    if val is None:
        # test window follows the training data directly
        test_preds = fitted.predict(start=n_train, end=n_train + n_test - 1)
        return train_preds, test_preds

    # validation window follows train, test window follows validation
    val_stop = n_train + len(val)
    val_preds = fitted.predict(start=n_train, end=val_stop - 1)
    test_preds = fitted.predict(start=val_stop, end=val_stop + n_test - 1)
    return train_preds, val_preds, test_preds

#### SARIMAX Model

In [6]:
def sarimax_model(X_train, X_test, y_train, y_test, X_val=None, y_val=None, **model_parameters):
    '''
    Fits a SARIMAX model with exogenous regressors on the training data and
    returns in-sample and out-of-sample predictions
    
    Parameters
    ----------
    X_train : pandas.core.frame.DataFrame
              Exogenous features of the training window
    X_test : pandas.core.frame.DataFrame
             Exogenous features of the test window
    y_train : pandas.core.frame.DataFrame
              Target variable of the training window
    y_test : pandas.core.frame.DataFrame
             Target variable of the test window (only its length is used)
    X_val : NoneType or pandas.core.frame.DataFrame
            Default : None
            Exogenous features of the validation window
    y_val : NoneType or pandas.core.frame.DataFrame
            Default : None
            Target variable of the validation window (only its length is used)
    model_parameters : kwargs
                       Forwarded unchanged to statsmodels.tsa.statespace.sarimax.SARIMAX
            
    Returns
    -------
    train_preds : In-sample predictions for positions 1 .. len(train)-1
                  (the first training observation is skipped)
    val_preds : Forecast covering the validation window
                (only returned when both X_val and y_val are given)
    test_preds : Forecast covering the test window
    
    Notes
    -----
    No type validation is performed here; the inputs are passed to statsmodels as-is.
    '''
    sarimax_model_fit = SARIMAX(y_train, exog=X_train, **model_parameters).fit()
    
    # without a complete validation pair the test window follows train directly
    if X_val is None or y_val is None:
        train_preds = sarimax_model_fit.predict(start=1, end=len(X_train)-1, exog=X_train)
        test_preds = sarimax_model_fit.predict(start=len(X_train),
                                               end=len(X_train)+len(X_test)-1,
                                               exog=X_test)
        return train_preds, test_preds

    n_train = len(y_train)
    n_val = len(y_val)
    n_test = len(y_test)

    train_preds = sarimax_model_fit.predict(start=1, end=n_train-1, exog=X_train)
    val_preds = sarimax_model_fit.predict(start=n_train, end=n_train+n_val-1, exog=X_val)

    # The test window starts n_val steps after the end of the fitted sample, so
    # exogenous values are supplied for the validation steps in between as well.
    test_preds = sarimax_model_fit.predict(start=n_train+n_val,
                                           end=n_train+n_val+n_test-1,
                                           exog=pd.concat([X_val, X_test]))

    # Keep only the trailing test-window predictions. This used to be a hard-coded
    # [-18:], which truncated the forecast for any test set longer than 18 rows.
    # start/end above should already bound the result to n_test rows, so this trim
    # is expected to be a no-op safeguard.
    test_preds = test_preds[-n_test:]
    return train_preds, val_preds, test_preds

#### Evaluation Metrics

In [7]:
def evaluation_metrics(actual, preds):
    '''
    Returns the mean absolute error between actual and predicted values
    
    Parameters
    ----------
    actual : Actual values
    preds : Predicted values; must have the same length as actual
    
    Returns
    -------
    error_mae : Mean absolute error as computed by sklearn.metrics.mean_absolute_error
    
    Raises
    ------
    ValueError
        actual and preds differ in length
    '''
    # equal lengths are the only precondition checked here; everything else is left to sklearn
    if len(actual) == len(preds):
        return mae(actual, preds)

    raise ValueError("Length of actual does not equal to length of preds")

#### Prediction Plot

In [8]:
def prediction_plot(plot_title, plot_train, plot_test, plot_preds):
    '''
    Draws train, test and predicted values as three lines on one matplotlib figure
    and displays it
    
    Parameters
    ----------
    plot_title : Title of the plot
    plot_train : Training values over time
    plot_test : Test values over time
    plot_preds : Predicted values over time
    
    Returns
    -------
    None; the figure is shown via plt.show()
    '''
    # (series, legend label) pairs in drawing order
    lines_to_draw = (
        (plot_train, 'Train'),
        (plot_test, 'Test'),
        (plot_preds, 'Prediction'),
    )

    plt.figure(figsize=(20, 6))
    plt.title(plot_title)
    for series, legend_label in lines_to_draw:
        plt.plot(series, label=legend_label)
    plt.legend()
    plt.show()

In [9]:
def val_prediction_plot(plot_title, plot_train, plot_val, plot_test, plot_val_preds, plot_test_preds):
    '''
    Draws train, validation and test values together with the validation and test
    predictions as five lines on one matplotlib figure and displays it
    
    Parameters
    ----------
    plot_title : Title of the plot
    plot_train : Training values over time
    plot_val : Validation values over time
    plot_test : Test values over time
    plot_val_preds : Predicted values on validation data over time
    plot_test_preds : Predicted values on test data over time
    
    Returns
    -------
    None; the figure is shown via plt.show()
    '''
    # (series, legend label) pairs in drawing order
    lines_to_draw = (
        (plot_train, 'Train'),
        (plot_val, 'Validation'),
        (plot_test, 'Test'),
        (plot_val_preds, 'Validation Prediction'),
        (plot_test_preds, 'Test Predictions'),
    )

    plt.figure(figsize=(20, 6))
    plt.title(plot_title)
    for series, legend_label in lines_to_draw:
        plt.plot(series, label=legend_label)
    plt.legend()
    plt.show()

#### All grid searches (SARIMA, ETS, combined)

In [10]:
def grid_search_sarima(train, val,
                       p_range=range(0,2), d_range=range(0,2), q_range=range(0,2),
                       P_range=range(0,2), D_range=range(0,2), Q_range=range(0,2), m=[52]):
    '''
    Performs an exhaustive grid search over SARIMA orders using sarima_model and
    picks the combination with the lowest validation MAE
    
    Parameters
    ----------
    train : Pandas.core.frame.DataFrame
            Train set
    val : Pandas.core.frame.DataFrame
          Validation set
    p_range : range
              Default : range(0,2)
              Range of trend autoregression order
    d_range : range
              Default : range(0,2)
              Range of trend difference order
    q_range : range
              Default : range(0,2)
              Range of trend moving average order
    P_range : range
              Default : range(0,2)
              Range of seasonal autoregression order
    D_range : range
              Default : range(0,2)
              Range of seasonal difference order
    Q_range : range
              Default : range(0,2)
              Range of seasonal moving average order
    m : list
        Default : [52]
        Candidate numbers of time steps for a single seasonal period
        (the default list is only read, never modified)
    
    Returns
    -------
    grid : pandas.core.frame.DataFrame
           One row per evaluated combination with
           columns=['order', 'seasonal_order', 'train_error', 'val_error'].
           Combinations whose predictions contain NaN are recorded with the string 'nan'
           in both error columns; combinations that raise are left out.
    best_params : dict({'order':order_parameters, 'seasonal_order':seasonal_order_parameters})
                  Parameters of the combination with the lowest validation error,
                  or None when no combination produced a usable error
    
    Raises
    ------
    TypeError
        train/val is not a dataframe, a *_range argument is not a range, or m is not a list
    '''
    if not isinstance(train, pd.core.frame.DataFrame):
        raise TypeError("train is not a pandas dataframe")
  
    if not isinstance(val, pd.core.frame.DataFrame):
        raise TypeError("val is not a pandas dataframe")
        
    for candidate_range in (p_range, d_range, q_range, P_range, D_range, Q_range):
        if not isinstance(candidate_range, range):
            raise TypeError("Range inputs are not of type range")
    
    if not isinstance(m, list):
        raise TypeError("m is not a list")

    best_error = float("inf")
    best_params = None
    grid = pd.DataFrame(columns=['order', 'seasonal_order', 'train_error', 'val_error'])
    
    # the loop variable is called `period` so it does not shadow the `m` parameter
    for p, d, q, P, D, Q, period in itertools.product(p_range, d_range, q_range,
                                                      P_range, D_range, Q_range, m):
        order = (p, d, q)
        seasonal_order = (P, D, Q, period)
        
        # Best effort by design: a combination the model cannot fit or score is
        # skipped so that one bad combination does not abort the whole search.
        try:
            # val is passed in the `test` slot, so the second result is the validation forecast
            train_preds, val_preds = sarima_model(train, val, order=order, seasonal_order=seasonal_order)

            # Element-wise NaN test on plain arrays. np.min() on a pandas object
            # skips NaN, so testing only the minimum misses partially-NaN predictions.
            has_nan = bool(np.isnan(np.asarray(train_preds, dtype=float)).any()
                           or np.isnan(np.asarray(val_preds, dtype=float)).any())

            if not has_nan:
                # sarima_model's train predictions start at position 1, hence train[1:]
                train_error = evaluation_metrics(train[1:], train_preds)
                val_error = evaluation_metrics(val, val_preds)
        except Exception:
            continue

        if has_nan:
            grid.loc[len(grid)] = [order, seasonal_order, 'nan', 'nan']
            continue

        grid.loc[len(grid)] = [order, seasonal_order, train_error, val_error]

        # only errors computed in THIS iteration take part in the comparison
        if val_error < best_error:
            best_error = val_error
            best_params = {'order': order, 'seasonal_order': seasonal_order}
    
    return grid, best_params

In [11]:
def grid_search_ets(train, val,
                    trend=['add', 'mul'], seasonal=['add','mul'], seasonal_periods=[52], 
                    smoothing_values=np.arange(0.0,0.3,0.1)):
    '''
    Performs an exhaustive grid search over Exponential Smoothing settings using
    ets_model and picks the combination with the lowest validation MAE
    
    Parameters
    ----------
    train : Pandas.core.frame.DataFrame
            Train set
    val : Pandas.core.frame.DataFrame
          Validation set
    trend : list
            Default : ['add', 'mul']
            Candidate trend components; 1 or 2 entries, each 'add' or 'mul'
    seasonal : list
               Default : ['add', 'mul']
               Candidate seasonal components; 1 or 2 entries, each 'add' or 'mul'
    seasonal_periods : list
                       Default : [52]
                       Candidate seasonal period lengths (not type-checked here)
    smoothing_values : numpy.ndarray
                       Default : np.arange(0.0,0.3,0.1)
                       Candidate values tried independently for smoothing_level,
                       smoothing_trend and smoothing_seasonal
    
    Returns
    -------
    grid : pandas.core.frame.DataFrame
           One row per evaluated combination with
           columns=['trend', 'seasonal', 'seasonal_periods', 
                    'smoothing_level', 'smoothing_trend', 'smoothing_seasonal',
                    'train_error', 'val_error'].
           Combinations whose predictions contain NaN are recorded with the string 'nan'
           in both error columns; combinations that raise are left out.
    best_params : dict({'trend':trend,'seasonal':seasonal,'seasonal_periods':seasonal_periods,
                        'smoothing_level':smoothing_level, 'smoothing_trend':smoothing_trend,'smoothing_seasonal':smoothing_seasonal})
                  Parameters of the combination with the lowest validation error,
                  or None when no combination produced a usable error
    
    Raises
    ------
    TypeError
        train/val is not a dataframe, trend/seasonal is not a list,
        or smoothing_values is not a numpy.ndarray
    ValueError
        trend/seasonal has fewer than 1 or more than 2 entries, or an entry other than 'add'/'mul'
    '''
    if not isinstance(train, pd.core.frame.DataFrame):
        raise TypeError("train is not a pandas dataframe")
     
    if not isinstance(val, pd.core.frame.DataFrame):
        raise TypeError("val is not a pandas dataframe")
    
    for component_options in (trend, seasonal):
        if not isinstance(component_options, list):
            raise TypeError("Trend and/or seasonal is not of type list")
            
        # only 'add' and 'mul' exist, so a list holds 1 or 2 of them
        if not 1 <= len(component_options) <= 2:
            raise ValueError("Trend and seasonal can only have 1 or 2 values")

        for option in component_options:
            if option not in ('add', 'mul'):
                raise ValueError("Trend and seasonal can only have 'add' and/or 'mul' as values in the list")
        
    if not isinstance(smoothing_values, np.ndarray):
        # the original message was cut off after "of type "
        raise TypeError("smoothing_values is not of type numpy.ndarray")

    best_error = float("inf")
    best_params = None
    grid = pd.DataFrame(columns=['trend', 'seasonal', 'seasonal_periods', 
                                 'smoothing_level', 'smoothing_trend', 'smoothing_seasonal',
                                 'train_error', 'val_error'])
    
    # the same candidate values are used for level (alpha), trend (beta) and seasonal (gamma);
    # loop names differ from the parameter names so the arguments are not shadowed
    combinations = itertools.product(trend, seasonal, seasonal_periods,
                                     smoothing_values, smoothing_values, smoothing_values)

    for trend_kind, seasonal_kind, period, alpha, beta, gamma in combinations:
        settings = [trend_kind, seasonal_kind, period, alpha, beta, gamma]
        
        # Best effort by design: a combination the model cannot fit or score is
        # skipped so that one bad combination does not abort the whole search.
        try:
            # val is passed in the `test` slot, so the second result is the validation forecast
            train_preds, val_preds = ets_model(train, val,
                                               trend=trend_kind, seasonal=seasonal_kind,
                                               seasonal_periods=period,
                                               smoothing_level=alpha, smoothing_trend=beta,
                                               smoothing_seasonal=gamma)

            # Element-wise NaN test on plain arrays. np.min() on a pandas object
            # skips NaN, so testing only the minimum misses partially-NaN predictions.
            has_nan = bool(np.isnan(np.asarray(train_preds, dtype=float)).any()
                           or np.isnan(np.asarray(val_preds, dtype=float)).any())

            if not has_nan:
                train_error = evaluation_metrics(train, train_preds)
                val_error = evaluation_metrics(val, val_preds)
        except Exception:
            continue

        if has_nan:
            grid.loc[len(grid)] = settings + ['nan', 'nan']
            continue

        grid.loc[len(grid)] = settings + [train_error, val_error]

        # only errors computed in THIS iteration take part in the comparison
        if val_error < best_error:
            best_error = val_error
            best_params = {'trend': trend_kind,
                           'seasonal': seasonal_kind,
                           'seasonal_periods': period,
                           'smoothing_level': alpha,
                           'smoothing_trend': beta,
                           'smoothing_seasonal': gamma}
    
    return grid, best_params

In [12]:
def grid_search_sarimax(X_train, X_val, y_train, y_val,
                        p_range=range(0,2), d_range=range(0,2), q_range=range(0,2),
                        P_range=range(0,2), D_range=range(0,2), Q_range=range(0,2), m=[52]):
    '''
    Performs an exhaustive grid search over SARIMAX orders using sarimax_model and
    picks the combination with the lowest validation MAE
    
    Parameters
    ----------
    X_train : Pandas.core.frame.DataFrame
              Train set exogenous variables
    X_val : Pandas.core.frame.DataFrame
            Validation set exogenous variables
    y_train : Pandas.core.frame.DataFrame
              Train set output variable
    y_val : Pandas.core.frame.DataFrame
            Validation set output variable
    p_range : range
              Default : range(0,2)
              Range of trend autoregression order
    d_range : range
              Default : range(0,2)
              Range of trend difference order
    q_range : range
              Default : range(0,2)
              Range of trend moving average order
    P_range : range
              Default : range(0,2)
              Range of seasonal autoregression order
    D_range : range
              Default : range(0,2)
              Range of seasonal difference order
    Q_range : range
              Default : range(0,2)
              Range of seasonal moving average order
    m : list
        Default : [52]
        Candidate numbers of time steps for a single seasonal period
        (the default list is only read, never modified)
    
    Returns
    -------
    grid : pandas.core.frame.DataFrame
           One row per evaluated combination with
           columns=['order', 'seasonal_order', 'train_error', 'val_error'].
           Combinations whose predictions contain NaN are recorded with the string 'nan'
           in both error columns; combinations that raise are left out.
    best_params : dict({'order':order_parameters, 'seasonal_order':seasonal_order_parameters})
                  Parameters of the combination with the lowest validation error,
                  or None when no combination produced a usable error
    
    Raises
    ------
    TypeError
        A *_range argument is not a range, or m is not a list
        (the data arguments are not type-checked here)
    '''
    for candidate_range in (p_range, d_range, q_range, P_range, D_range, Q_range):
        if not isinstance(candidate_range, range):
            raise TypeError("Range inputs are not of type range")
    
    if not isinstance(m, list):
        raise TypeError("m is not a list")

    best_error = float("inf")
    best_params = None
    grid = pd.DataFrame(columns=['order', 'seasonal_order', 'train_error', 'val_error'])
    
    # the loop variable is called `period` so it does not shadow the `m` parameter
    for p, d, q, P, D, Q, period in itertools.product(p_range, d_range, q_range,
                                                      P_range, D_range, Q_range, m):
        order = (p, d, q)
        seasonal_order = (P, D, Q, period)
        
        # Best effort by design: a combination the model cannot fit or score is
        # skipped so that one bad combination does not abort the whole search.
        try:
            # the validation pair is passed in the `test` slots, so the second
            # result is the validation forecast
            train_preds, val_preds = sarimax_model(X_train, X_val, y_train, y_val,
                                                   order=order, seasonal_order=seasonal_order)

            # Element-wise NaN test on plain arrays. np.min() on a pandas object
            # skips NaN, so testing only the minimum misses partially-NaN predictions.
            has_nan = bool(np.isnan(np.asarray(train_preds, dtype=float)).any()
                           or np.isnan(np.asarray(val_preds, dtype=float)).any())

            if not has_nan:
                # sarimax_model's train predictions start at position 1, hence y_train[1:]
                train_error = evaluation_metrics(y_train[1:], train_preds)
                val_error = evaluation_metrics(y_val, val_preds)
        except Exception:
            continue

        if has_nan:
            grid.loc[len(grid)] = [order, seasonal_order, 'nan', 'nan']
            continue

        grid.loc[len(grid)] = [order, seasonal_order, train_error, val_error]

        # only errors computed in THIS iteration take part in the comparison
        if val_error < best_error:
            best_error = val_error
            best_params = {'order': order, 'seasonal_order': seasonal_order}
    
    return grid, best_params

In [13]:
def grid_search(train, val, model_name, **parameters):
    '''
    Dispatches to the grid search that belongs to the requested model
    
    Parameters
    ----------
    train : Pandas.core.frame.DataFrame
            Train set
    val : Pandas.core.frame.DataFrame
          Validation set
    model_name : {'SARIMA', 'ETS'} (upper/lower/mixed case accepted)
                 SARIMA : runs grid_search_sarima
                 ETS : runs grid_search_ets
    **parameters : kwargs
                   Forwarded unchanged to the selected grid search function
                   
    Returns
    -------
    grid : pandas.core.frame.DataFrame
           Result table produced by the selected grid search function
    best_params : Dictionary
                  Best-parameter result produced by the selected grid search function
    
    Raises
    ------
    ValueError
        model_name (after upper-casing) is neither 'SARIMA' nor 'ETS'
    '''
    # comparison is case-insensitive for the caller's convenience
    requested = model_name.upper()

    if requested == 'SARIMA':
        search = grid_search_sarima
    elif requested == 'ETS':
        search = grid_search_ets
    else:
        raise ValueError("model_name must be one of {'SARIMA', 'ETS'}")

    grid, best_parameters = search(train, val, **parameters)
    return grid, best_parameters

#### Generate Predictions Full

In [14]:
def generate_predictions_full(dataset, model_name, splits=0.8, perform_grid_search=False, **kwargs):
    '''
    Fits a SARIMA or ETS model on a univariate series, prints the MAE per split,
    plots the predictions and returns them
    
    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame
              Pandas univariate dataframe to be used for predictions
    
    model_name : {'SARIMA', 'ETS'} (Can accept in upper/lower/mixed case)
                 SARIMA : Creates a SARIMA model by calling the sarima_model function
                 ETS : Creates a triple Exponential Smoothing model by calling the ets_model function
    splits : float, tuple
             Default = 0.8; splitting data in two sets - train and test of size 0.8 and 0.2 respectively
             float : Splits in two sets - train and test
                     Only accepts value between range (0.0, 1.0)
             tuple(train_size, validation_size) : Splits in three sets - train, validation and test
                                                  train_size -> Train size
                                                  validation_size -> Validation size
                                                  Should only contain values between range (0.0, 1.0) and 
                                                  "train_size + validation_size" should also be between (0.0, 1.0)
             Splits should be tuple if perform_grid_search is True
    perform_grid_search : boolean
                          Default = False; does not perform grid search
    **kwargs : kwargs
               Keyword arguments that can be used to provide parameters to the model or grid search
               If perform_grid_search=False & model_name='SARIMA' : kwargs =  statsmodels.tsa.statespace.sarimax.SARIMAX parameters
               If perform_grid_search=False & model_name='ETS' : kwargs =  statsmodels.tsa.holtwinters.ExponentialSmoothing parameters
               If perform_grid_search=True & model_name='SARIMA' : kwargs = grid_search_sarima function parameters
               If perform_grid_search=True & model_name='ETS' : kwargs = grid_search_ets function parameters
                                                      
    Prints
    ------
    Split size : float
                 If splits is float - Train and test splits 
                 If splits is a tuple - Train, validation, and test splits
    Grid Search Parameters and Range : Parameters provided by the user for grid search(if perform_grid_search==True)
    Best Parameters : Best parameters found by the grid search(if perform_grid_search==True)
    Model Parameters : Model parameters provided by the user (if perform_grid_search==False)
    Evaluation : Prints a dataframe containing "MAE" for train, val(if splits is tuple), test
    Prediction plot : Predictions on val(if splits is tuple) and test data as a plot
    
    Returns
    -------
    If splits is float : test_preds (predictions for the test split)
    If splits is tuple : (val_preds, test_preds) (predictions for the val and test splits)
    
    Raises
    ------
    TypeError : dataset is not a dataframe, splits is not float/tuple or perform_grid_search is not bool
    ValueError : model_name is not 'SARIMA'/'ETS', or perform_grid_search is True while splits is a float
    '''
    # ---- validate every argument up front so bad input fails before any data is split ----
    if not isinstance(dataset, pd.core.frame.DataFrame):
        raise TypeError("dataset is not a pandas dataframe")
        
    if not isinstance(splits, (float, tuple)):
        raise TypeError("splits is not float or tuple")
    
    # uppercase/lowercase both works for user
    model_name = model_name.upper()
    if model_name not in ('SARIMA', 'ETS'):
        raise ValueError("model_name must be one of {'SARIMA', 'ETS'}")
    
    if not isinstance(perform_grid_search, bool):
        raise TypeError("perform_grid_search should be boolean")
        
    # a tuple means train/val/test, a float means train/test
    has_val = isinstance(splits, tuple)
    
    # grid search needs a validation split to rank the candidates
    if perform_grid_search and not has_val:
        raise ValueError("If perform_grid_search is True then splits should be a tuple")
    
    # ---- split the data ----
    if has_val:
        train, val, test = prepare_data(dataset, splits)
    else:
        train, test = prepare_data(dataset, splits)
            
    # ---- print split sizes ----
    print("------------------Parameters------------------")
    if has_val:
        print("Train Split Size :", splits[0])
        print("Validation Split Size :", splits[1])
        print("Test Split Size :", round(1.0-splits[1]-splits[0],3))
    else:
        print("Train Split Size :", splits)
        print("Test Split Size :", round(1.0-splits, 3))
    
    # ---- resolve the parameters the final model is fitted with ----
    if perform_grid_search:
        # printing user input range
        print("-----Parameters and Range for grid search-----")
        for key, value in kwargs.items():
            print(key, ":", value)
            
        # only the best parameters are needed here; the full grid dataframe is discarded
        _, model_params = grid_search(train, val, model_name, **kwargs)
        print("-------------Best Parameters Found-------------")
        for key, value in model_params.items():
            print(key, ":", value)
    else:
        model_params = kwargs
        print("------------------Parameters------------------")
        for key, value in model_params.items():
            print(key, ":", value)
    
    # ---- fit the model and predict ----
    is_sarima = model_name == 'SARIMA'
    fit_model = sarima_model if is_sarima else ets_model
    
    if has_val:
        train_preds, val_preds, test_preds = fit_model(train, test, val=val, **model_params)
    else:
        train_preds, test_preds = fit_model(train, test, **model_params)
    
    # SARIMA train predictions are scored against train[1:]
    # NOTE(review): presumably sarima_model omits the first in-sample prediction - confirm there
    train_actual = train[1:] if is_sarima else train
    
    # ---- evaluation (MAE per split, in Train / Val / Test order) ----
    scores = [('Train', evaluation_metrics(train_actual, train_preds))]
    if has_val:
        scores.append(('Val', evaluation_metrics(val, val_preds)))
    scores.append(('Test', evaluation_metrics(test, test_preds)))
    
    error_df = pd.DataFrame([[round(score, 4)] for _, score in scores],
                            columns = ['MAE'],
                            index = [label for label, _ in scores])
    
    # ---- plot, report and return ----
    if has_val:
        val_prediction_plot(model_name, train, val, test, val_preds, test_preds)
    else:
        prediction_plot(model_name, train, test, test_preds)
        
    print("------------------Evaluation------------------")
    print(error_df)
    
    if has_val:
        return val_preds, test_preds
    return test_preds

In [15]:
def sarimax_full(dataset, splits=0.8, perform_grid_search=False, *, target_col='Weekly_Sales', **kwargs):
    '''
    Fits a SARIMAX model (target column + exogenous regressors), prints the MAE per split,
    plots the predictions and returns them
    
    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame
              Dataframe containing the target column (see target_col); every other column
              is passed to the model as an exogenous regressor
    splits : float, tuple
             Default = 0.8; splitting data in two sets - train and test of size 0.8 and 0.2 respectively
             float : Splits in two sets - train and test
                     Only accepts value between range (0.0, 1.0)
             tuple(train_size, validation_size) : Splits in three sets - train, validation and test
                                                  train_size -> Train size
                                                  validation_size -> Validation size
                                                  Should only contain values between range (0.0, 1.0) and 
                                                  "train_size + validation_size" should also be between (0.0, 1.0)
             Splits should be tuple if perform_grid_search is True
    perform_grid_search : boolean
                          Default = False; does not perform grid search
    target_col : str (keyword only)
                 Default = 'Weekly_Sales'; name of the column to be predicted
    **kwargs : kwargs
               Keyword arguments that can be used to provide parameters to the model or grid search
               If perform_grid_search=False : kwargs =  statsmodels.tsa.statespace.sarimax.SARIMAX parameters
               If perform_grid_search=True : kwargs = grid_search_sarimax function parameters
                                                      
    Prints
    ------
    Split size : float
                 If splits is float - Train and test splits 
                 If splits is a tuple - Train, validation, and test splits
    Grid Search Parameters and Range : Parameters provided by the user for grid search(if perform_grid_search==True)
    Best Parameters : Best parameters found by the grid search(if perform_grid_search==True)
    Model Parameters : Model parameters provided by the user (if perform_grid_search==False)
    Evaluation : Prints a dataframe containing "MAE" for train, val(if splits is tuple), test
    Prediction plot : Predictions on val(if splits is tuple) and test data as a plot
    
    Returns
    -------
    If splits is float : test_preds (predictions for the test split)
    If splits is tuple : (val_preds, test_preds) (predictions for the val and test splits)
    
    Raises
    ------
    TypeError : dataset is not a dataframe, splits is not float/tuple or perform_grid_search is not bool
    ValueError : perform_grid_search is True while splits is a float
    '''
    # perform_grid_search should be bool
    if not isinstance(perform_grid_search, bool):
        raise TypeError("perform_grid_search should be boolean")
        
    # a tuple means train/val/test, a float means train/test
    has_val = isinstance(splits, tuple)
    
    # grid search needs a validation split to rank the candidates
    if perform_grid_search and not has_val:
        raise ValueError("If perform_grid_search is True then splits should be a tuple")
    
    # same argument checks as generate_predictions_full; without them an unsupported
    # splits type would only show up later as an unbound local variable
    if not isinstance(dataset, pd.core.frame.DataFrame):
        raise TypeError("dataset is not a pandas dataframe")
        
    if not isinstance(splits, (float, tuple)):
        raise TypeError("splits is not float or tuple")
            
    # ---- print split sizes ----
    print("------------------Parameters------------------")
    if has_val:
        print("Train Split Size :", splits[0])
        print("Validation Split Size :", splits[1])
        print("Test Split Size :", round(1.0-splits[1]-splits[0],3))
    else:
        print("Train Split Size :", splits)
        print("Test Split Size :", round(1.0-splits, 3))
            
    # ---- split the data, then separate the target (Y) from the exogenous columns (X) ----
    if has_val:
        train, val, test = prepare_data(dataset, splits)
        val_Y = val[target_col]
        val_X = val.drop(columns=target_col)
    else:
        train, test = prepare_data(dataset, splits)
        
    train_Y = train[target_col]
    train_X = train.drop(columns=target_col)
    
    test_Y = test[target_col]
    test_X = test.drop(columns=target_col)
    
    # ---- resolve the parameters the final model is fitted with ----
    if perform_grid_search:
        # printing user input range
        print("-----Parameters and Range for grid search-----")
        for key, value in kwargs.items():
            print(key, ":", value)

        # only the best parameters are needed here; the full grid dataframe is discarded
        _, model_params = grid_search_sarimax(train_X, val_X, train_Y, val_Y, **kwargs)
        print("-------------Best Parameters Found-------------")
        for key, value in model_params.items():
            print(key, ":", value)
    else:
        model_params = kwargs
        print("------------------Parameters------------------")
        for key, value in model_params.items():
            print(key, ":", value)
    
    # ---- fit the model and predict ----
    if has_val:
        # validation data is always passed by keyword so both call sites agree
        train_preds, val_preds, test_preds = sarimax_model(train_X, test_X, train_Y, test_Y,
                                                           X_val=val_X, y_val=val_Y, **model_params)
    else:
        train_preds, test_preds = sarimax_model(train_X, test_X, train_Y, test_Y, **model_params)
    
    # ---- evaluation (MAE per split, in Train / Val / Test order) ----
    # train predictions are scored against train_Y[1:]
    # NOTE(review): presumably sarimax_model omits the first in-sample prediction - confirm there
    scores = [('Train', evaluation_metrics(train_Y[1:], train_preds))]
    if has_val:
        scores.append(('Val', evaluation_metrics(val_Y, val_preds)))
    scores.append(('Test', evaluation_metrics(test_Y, test_preds)))
    
    error_df = pd.DataFrame([[round(score, 4)] for _, score in scores],
                            columns = ['MAE'],
                            index = [label for label, _ in scores])
    
    # ---- plot, report and return ----
    if has_val:
        val_prediction_plot("SARIMAX", train_Y, val_Y, test_Y, val_preds, test_preds)
    else:
        prediction_plot("SARIMAX", train_Y, test_Y, test_preds)
        
    print("------------------Evaluation------------------")
    print(error_df)
    
    if has_val:
        return val_preds, test_preds
    return test_preds

In [16]:
def generate_predictions(dataset, model_name, splits=0.8, perform_grid_search=False, **kwargs):
    '''
    Returns predictions for the specified model by calling either generate_predictions_full or sarimax_full functions
    
    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame
              Dataframe to be used for predictions
              SARIMA / ETS : univariate dataframe (passed to generate_predictions_full)
              SARIMAX : dataframe passed to sarimax_full, which by default reads the
                        'Weekly_Sales' column as the target and treats the other columns as regressors
    
    model_name : {'SARIMA', 'ETS', 'SARIMAX'} (Can accept in upper/lower/mixed case)
                 SARIMA : Creates a SARIMA model
                 ETS : Creates a triple Exponential Smoothing model
                 SARIMAX : Creates a SARIMAX model
    splits : float, tuple
             Default = 0.8; splitting data in two sets - train and test of size 0.8 and 0.2 respectively
             float : Splits in two sets - train and test
                     Only accepts value between range (0.0, 1.0)
             tuple(train_size, validation_size) : Splits in three sets - train, validation and test
                                                  train_size -> Train size
                                                  validation_size -> Validation size
                                                  Should only contain values between range (0.0, 1.0) and 
                                                  "train_size + validation_size" should also be between (0.0, 1.0)
             Splits should be tuple if perform_grid_search is True
    perform_grid_search : boolean
                          Default = False; does not perform grid search
    **kwargs : kwargs
               Keyword arguments that can be used to provide parameters to the model or grid search of the specified model
    
    Returns
    -------
    Whatever the called function returns:
    If splits is float : test_preds (predictions for the test split)
    If splits is tuple : (val_preds, test_preds) (predictions for the val and test splits)
    
    Raises
    ------
    ValueError : model_name is not one of 'SARIMA', 'ETS', 'SARIMAX'
    Any error raised by generate_predictions_full / sarimax_full for invalid arguments
    '''
    # uppercase/lowercase both works for user
    model_name = model_name.upper()
    if model_name not in ('SARIMA', 'ETS', 'SARIMAX'):
        raise ValueError("model_name must be one of {'SARIMA', 'ETS', 'SARIMAX'}")
        
    # Delegate unconditionally: the called function validates perform_grid_search itself,
    # so a non-boolean value raises there instead of silently falling through and returning None
    if model_name == 'SARIMAX':
        return sarimax_full(dataset, splits, perform_grid_search, **kwargs)
    
    return generate_predictions_full(dataset, model_name, splits, perform_grid_search, **kwargs)

### Preparing data + Predictions

#### Prep

In [17]:
df = pd.read_csv("Walmart_Store_sales.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'Walmart_Store_sales.csv'

In [None]:
df.head()

In [None]:
df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%Y")

In [None]:
# Dropping columns to make data univariate

df_copy = df.copy()
df_copy.drop(columns=['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment'], inplace=True)

df_copy

In [None]:
dict_of_df = {f'df_store_{i}' : df_copy.loc[df_copy.Store==i].set_index('Date') for i in df_copy['Store'].unique()}

In [None]:
df_preds = df_copy.copy()
df_preds

In [None]:
# df_preds with 4 columns Store, date, weekly_sales, predictions(np.nan)

df_preds['Predictions']=np.nan

In [None]:
predicted_store_df = pd.DataFrame()

#### SARIMA [order=(2,0,2), seasonal_order=(1,0,0,52)]

In [None]:
# merged_df stores the final output generated by the model
merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])

In [None]:
# Fitting a model for each store separately and generating predictions for SARIMA [order=(2,0,2), seasonal_order=(1,0,0,52)]

for dataframe, store_df in dict_of_df.items():
    # keys have the form 'df_store_<n>'; the last token is the store number
    store_num = int(dataframe.split(sep='_')[-1])
    
    # the model takes a univariate frame, so the Store column is removed in place from the
    # shared dict entry; errors='ignore' keeps the cell re-runnable once it is already gone
    store_df.drop(columns='Store', inplace=True, errors='ignore')
    
    test_preds = generate_predictions_full(store_df, "SARIMA", order=(2,0,2), seasonal_order=(1,0,0,52))
    
    # turn the forecast into a frame with 'Date' and 'Predictions' columns
    predictions = pd.DataFrame(test_preds)
    predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    predictions.reset_index(names='Date', inplace=True)
    
    # attach the forecast to this store's rows; the NaN placeholder column of df_preds
    # comes back as Predictions_x and the forecast as Predictions_y
    predicted_store_df = pd.merge(df_preds.loc[df_preds['Store']==store_num], predictions, on=['Date'])
    predicted_store_df.rename(columns={"Predictions_y":"Predictions"}, inplace=True)
    predicted_store_df.drop(columns="Predictions_x", inplace=True)
    
    # accumulate the results of all stores
    merged_df = pd.merge(merged_df, predicted_store_df, how='outer')

In [None]:
sarima_total_eval = evaluation_metrics(merged_df['Weekly_Sales'], merged_df['Predictions'])
sarima_total_eval

In [None]:
# Storing final predictions to csv

# merged_df.to_csv("Walmart_Predictions.csv", encoding='utf-8')

#### ETS [trend='add', seasonal='mul', seasonal_periods=52]

In [None]:
# For storing final_predictions

merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])

In [None]:
# Fitting a model for each store separately and generating predictions for ETS [trend='add', seasonal='mul', seasonal_periods=52]
# default smoothing_level, smoothing_trend, smoothing_seasonal

for dataframe, store_df in dict_of_df.items():
    # keys have the form 'df_store_<n>'; the last token is the store number
    store_num = int(dataframe.split(sep='_')[-1])
    
    # make sure the frame is univariate even when no earlier cell has removed the Store
    # column yet; errors='ignore' makes this a no-op when it is already gone
    store_df.drop(columns='Store', inplace=True, errors='ignore')
    
    test_preds = generate_predictions_full(store_df, "ets",
                                           trend='add', seasonal='mul', seasonal_periods=52)
    
    # turn the forecast into a frame with a 'Date' column
    predictions = pd.DataFrame(test_preds)
    predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    predictions.reset_index(names='Date', inplace=True)
    
    # attach the forecast to this store's rows
    predicted_store_df = pd.merge(df_preds.loc[df_preds['Store']==store_num], predictions, on=['Date'])
    predicted_store_df.rename(columns={"Predictions_y":"Predictions"}, inplace=True)
    print(predicted_store_df)
    
    # accumulate the results of all stores
    merged_df = pd.merge(merged_df, predicted_store_df, how='outer')

In [None]:
# Keep only the model output as the final 'Predictions' column: drop the existing
# 'Predictions' column, then give that name to the column labelled 0.
# NOTE(review): presumably the ETS forecast is an unnamed Series, so pd.DataFrame()
# labels its column 0 and the dropped column is the NaN placeholder from df_preds - confirm.
merged_df.drop(columns='Predictions', inplace=True)
merged_df.rename(columns={0:"Predictions"}, inplace=True)

In [None]:
# Storing predictions to csv

# merged_df.to_csv("Walmart_Predictions_ETS.csv", encoding='utf-8')

In [None]:
merged_df.dropna(inplace=True)

In [None]:
ets_total_eval = evaluation_metrics(merged_df['Weekly_Sales'], merged_df['Predictions'])
ets_total_eval

#### ETS with grid search [trend=['add','mul'], seasonal=['add', 'mul'], seasonal_periods=[52], smoothing_values= [0.0, 0.1, 0.2]]

In [None]:
val_merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])
test_merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])

In [None]:
# Fitting a model for each store separately and generating predictions for ETS using grid search
# trend=['add','mul'], seasonal=['add','mul'], seasonal_periods=[52], smoothing_values=np.arange(0.0,0.3,0.1)

for dataframe, store_df in dict_of_df.items():
    # keys have the form 'df_store_<n>'; the last token is the store number
    store_num = int(dataframe.split(sep='_')[-1])
    
    # make sure the frame is univariate even when no earlier cell has removed the Store
    # column yet; errors='ignore' makes this a no-op when it is already gone
    store_df.drop(columns='Store', inplace=True, errors='ignore')
    
    val_preds, test_preds = generate_predictions_full(store_df, "ets", perform_grid_search=True, splits=(0.75, 0.125),
                                                      trend=['add','mul'], seasonal=['add','mul'], seasonal_periods=[52],
                                                      smoothing_values=np.arange(0.0,0.3,0.1))
    
    # rows of this store; both forecasts are attached to them by Date
    store_rows = df_preds.loc[df_preds['Store']==store_num]
    
    # validation forecast -> accumulated in val_merged_df
    val_predictions = pd.DataFrame(val_preds)
    val_predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    val_predictions.reset_index(names='Date', inplace=True)
    val_predicted_store_df = pd.merge(store_rows, val_predictions, on=['Date'])
    val_merged_df = pd.merge(val_merged_df, val_predicted_store_df, how='outer')
    
    # test forecast -> accumulated in test_merged_df
    test_predictions = pd.DataFrame(test_preds)
    test_predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    test_predictions.reset_index(names='Date', inplace=True)
    test_predicted_store_df = pd.merge(store_rows, test_predictions, on=['Date'])
    test_merged_df = pd.merge(test_merged_df, test_predicted_store_df, how='outer')

In [None]:
# Keep only the model output as the final 'Predictions' column of both result frames:
# drop the existing 'Predictions' column, then give that name to the column labelled 0.
# NOTE(review): presumably the ETS forecasts are unnamed Series, so pd.DataFrame()
# labels their column 0 and the dropped column is the NaN placeholder from df_preds - confirm.
val_merged_df.drop(columns="Predictions", inplace=True)
test_merged_df.drop(columns="Predictions", inplace=True)

val_merged_df.rename(columns={0:"Predictions"}, inplace=True)
test_merged_df.rename(columns={0:"Predictions"}, inplace=True)

In [None]:
# # Storing predictions to csv

# val_merged_df.to_csv("Val_Grid_ETS.csv", encoding='utf-8')
# test_merged_df.to_csv("Test_Grid_ETS.csv", encoding='utf-8')

In [None]:
ets_grid_total_eval = evaluation_metrics(val_merged_df['Weekly_Sales'], val_merged_df['Predictions'])
ets_grid_total_eval

#### SARIMA with grid search [p_range=range(0, 2), d_range=range(0, 1), q_range=range(0, 2), P_range=range(0, 2), D_range=range(0, 1), Q_range=range(0, 2), m=[52]]

In [None]:
val_merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])
test_merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])

In [None]:
p_range=range(0, 2)
d_range=range(0, 1)
q_range=range(0, 2)
P_range=range(0, 2)
D_range=range(0, 1)
Q_range=range(0, 2)
m=[52]

In [None]:
# Fitting a model for each store separately and generating predictions for SARIMA using grid search
# p_range=range(0, 2), d_range=range(0, 1), q_range=range(0, 2),
# P_range=range(0, 2), D_range=range(0, 1), Q_range=range(0, 2), m=[52]

for dataframe, store_df in dict_of_df.items():
    # keys have the form 'df_store_<n>'; the last token is the store number
    store_num = int(dataframe.split(sep='_')[-1])
    
    # make sure the frame is univariate even when no earlier cell has removed the Store
    # column yet; errors='ignore' makes this a no-op when it is already gone
    store_df.drop(columns='Store', inplace=True, errors='ignore')
    
    val_preds, test_preds = generate_predictions_full(store_df, model_name="sarima", splits=(0.75,0.125), perform_grid_search=True, 
                                                      p_range=p_range, d_range=d_range, q_range=q_range,
                                                      P_range=P_range, D_range=D_range, Q_range=Q_range, m=m)
    
    # rows of this store; both forecasts are attached to them by Date
    store_rows = df_preds.loc[df_preds['Store']==store_num]
    
    # validation forecast -> accumulated in val_merged_df
    val_predictions = pd.DataFrame(val_preds)
    val_predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    val_predictions.reset_index(names='Date', inplace=True)
    val_predicted_store_df = pd.merge(store_rows, val_predictions, on=['Date'])
    val_merged_df = pd.merge(val_merged_df, val_predicted_store_df, how='outer')
    
    # test forecast -> accumulated in test_merged_df
    test_predictions = pd.DataFrame(test_preds)
    test_predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    test_predictions.reset_index(names='Date', inplace=True)
    test_predicted_store_df = pd.merge(store_rows, test_predictions, on=['Date'])
    test_merged_df = pd.merge(test_merged_df, test_predicted_store_df, how='outer')

In [None]:
# Reduce both result frames to a single 'Predictions' column holding the model output:
# 'Predictions_x' and 'Predictions' are dropped and 'Predictions_y' takes the final name.
# NOTE(review): presumably 'Predictions_x' is the NaN placeholder from df_preds (left side of
# the per-store merge), 'Predictions_y' the forecast (right side) and 'Predictions' the empty
# column the accumulator frame was created with - confirm against the merged output.
val_merged_df.drop(columns=['Predictions_x', 'Predictions'], inplace=True)
test_merged_df.drop(columns=['Predictions_x', 'Predictions'], inplace=True)

val_merged_df.rename(columns={"Predictions_y": "Predictions"}, inplace=True)
test_merged_df.rename(columns={"Predictions_y": "Predictions"}, inplace=True)

In [None]:
# # Storing predictions to csv

val_merged_df.to_csv("Val_Grid_SARIMA.csv", encoding='utf-8')
test_merged_df.to_csv("Test_Grid_SARIMA.csv", encoding='utf-8')

In [None]:
sarima_grid_total_eval = evaluation_metrics(val_merged_df['Weekly_Sales'], val_merged_df['Predictions'])
sarima_grid_total_eval

#### SARIMAX Prep

In [None]:
df.set_index('Date', inplace=True)

In [None]:
df_agg = df.resample('W').mean()

In [None]:
# chronological 80/20 split of the weekly aggregate: first 80% of the rows for training, the rest for testing
split_at = int(0.8*(len(df_agg)))
train, test = df_agg[:split_at], df_agg[split_at:]

In [None]:
train.drop(columns='Store', inplace=True)
test.drop(columns='Store', inplace=True)

In [None]:
train_y = train['Weekly_Sales']
train_x = train.drop(columns='Weekly_Sales')

test_y = test['Weekly_Sales']
test_x = test.drop(columns='Weekly_Sales')

In [None]:
train_preds, test_preds = sarimax_model(train_x, test_x, train_y, test_y, order=(2,0,0), seasonal_order=(1,0,0,52))

In [None]:
prediction_plot("SARIMAX", train_y, test_y, test_preds)

In [None]:
dict_of_df2 = {f'df_store_{i}' : df.loc[df.Store==i] for i in df_copy['Store'].unique()}

#### SARIMAX (order=(2,0,2), seasonal_order=(1,0,0,52))

In [None]:
merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])

In [None]:
# Fitting a model for each store separately and generating predictions for SARIMAX [order=(2,0,2), seasonal_order=(1,0,0,52)]
for dataframe, store_df in dict_of_df2.items():
    # keys have the form 'df_store_<n>'; the last token is the store number
    store_num = int(dataframe.split(sep='_')[-1])
    
    # Store is an identifier, not a regressor: remove it in place from the shared dict entry;
    # errors='ignore' keeps the cell re-runnable once the column is already gone
    store_df.drop(columns='Store', inplace=True, errors='ignore')
    
    test_preds = sarimax_full(store_df, order=(2,0,2), seasonal_order=(1,0,0,52))
    
    # turn the forecast into a frame with 'Date' and 'Predictions' columns
    predictions = pd.DataFrame(test_preds)
    predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    predictions.reset_index(names='Date', inplace=True)
    
    # outer merge keeps every row of the store; rows without a forecast get NaN predictions
    predicted_store_df = pd.merge(df_preds.loc[df_preds['Store']==store_num], predictions, on=['Date'], how='outer')
    predicted_store_df.rename(columns={"Predictions_y":"Predictions"}, inplace=True)
    predicted_store_df.drop(columns="Predictions_x", inplace=True)
    
    # accumulate the results of all stores
    merged_df = pd.merge(merged_df, predicted_store_df, how='outer')

In [None]:
# merged_df.to_csv("SARIMAX.csv", encoding='utf-8')

In [None]:
merged_df.dropna(inplace=True)

In [None]:
sarimax_total_eval = evaluation_metrics(merged_df['Weekly_Sales'], merged_df['Predictions'])
sarimax_total_eval

#### SARIMAX with grid search [p_range = range(0,2), d_range = range(0,1), q_range = range(0,2), P_range = range(0,2), D_range = range(0,1), Q_range = range(0,2), m=[52]]

In [None]:
p_range = range(0,2)
d_range = range(0,1)
q_range = range(0,2)
P_range = range(0,2)
D_range = range(0,1)
Q_range = range(0,2)
m=[52]

In [None]:
val_merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])
test_merged_df = pd.DataFrame(columns=['Store', 'Weekly_Sales', 'Date', 'Predictions'])

In [None]:
# Fitting a model for each store separately and generating predictions for SARIMAX using grid search
# p_range=range(0, 2), d_range=range(0, 1), q_range=range(0, 2), 
# P_range=range(0, 2), D_range=range(0, 1), Q_range=range(0, 2), m=[52]

for dataframe, store_df in dict_of_df2.items():
    # keys have the form 'df_store_<n>'; the last token is the store number
    store_num = int(dataframe.split(sep='_')[-1])
    
    # Store is an identifier, not a regressor: make sure it is gone even when no earlier
    # cell has removed it yet; errors='ignore' makes this a no-op when it is already gone
    store_df.drop(columns='Store', inplace=True, errors='ignore')
    
    val_preds, test_preds = sarimax_full(store_df, splits=(0.75,0.125), perform_grid_search=True, 
                                         p_range=p_range, d_range=d_range, q_range=q_range,
                                         P_range=P_range, D_range=D_range, Q_range=Q_range, m=m)
    
    # rows of this store; both forecasts are attached to them by Date
    store_rows = df_preds.loc[df_preds['Store']==store_num]
    
    # validation forecast -> accumulated in val_merged_df
    val_predictions = pd.DataFrame(val_preds)
    val_predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    val_predictions.reset_index(names='Date', inplace=True)
    val_predicted_store_df = pd.merge(store_rows, val_predictions, on=['Date'])
    val_merged_df = pd.merge(val_merged_df, val_predicted_store_df, how='outer')
    
    # test forecast -> accumulated in test_merged_df
    test_predictions = pd.DataFrame(test_preds)
    test_predictions.rename(columns={'predicted_mean':'Predictions'}, inplace=True)
    test_predictions.reset_index(names='Date', inplace=True)
    test_predicted_store_df = pd.merge(store_rows, test_predictions, on=['Date'])
    test_merged_df = pd.merge(test_merged_df, test_predicted_store_df, how='outer')

In [None]:
# Reduce both result frames to a single 'Predictions' column holding the model output:
# 'Predictions' and 'Predictions_x' are dropped and 'Predictions_y' takes the final name.
# NOTE(review): presumably 'Predictions_x' is the NaN placeholder from df_preds (left side of
# the per-store merge), 'Predictions_y' the forecast (right side) and 'Predictions' the empty
# column the accumulator frame was created with - confirm against the merged output.
val_merged_df.drop(columns=['Predictions', 'Predictions_x'], inplace=True)
test_merged_df.drop(columns=['Predictions', 'Predictions_x'], inplace=True)

val_merged_df.rename(columns={'Predictions_y':'Predictions'}, inplace=True)
test_merged_df.rename(columns={'Predictions_y':'Predictions'}, inplace=True)

In [None]:
sarimax_grid_total_eval = evaluation_metrics(val_merged_df['Weekly_Sales'], val_merged_df['Predictions'])
sarimax_grid_total_eval

In [None]:
# val_merged_df.to_csv("Val_Grid_SARIMAX.csv", encoding='utf-8')
# test_merged_df.to_csv("Test_Grid_SARIMAX.csv", encoding='utf-8')

In [None]:
print(sarima_total_eval)
print(ets_total_eval)
print(ets_grid_total_eval)
print(sarima_grid_total_eval)
print(sarimax_total_eval)
print(sarimax_grid_total_eval)

#### Grid Search sklearn GridSearchCV

In [None]:
# Candidate parameters for the grid search:
# 'order' -> every (p,d,q) with p in {0,1}, d = 0, q in {0,1}
# 'seasonal_order' -> every (P,D,Q,m) over the same ranges with m = 12
sarima_params_grid = {
    'order': list(itertools.product(range(0,2), range(0,1), range(0,2))),
    'seasonal_order': list(itertools.product(range(0,2), range(0,1), range(0,2), [12])),
}

In [None]:
class SARIMAWrapper(BaseEstimator):
    '''
    Estimator with a fit/predict interface around statsmodels SARIMAX
    
    Parameters
    ----------
    order : value passed to SARIMAX as `order`
    seasonal_order : value passed to SARIMAX as `seasonal_order`
    
    Attributes
    ----------
    model : the SARIMAX model created by the last call to fit
    model_fit : the result of fitting `model`
    '''
    def __init__(self, order, seasonal_order):
        self.order, self.seasonal_order = order, seasonal_order
    
    def fit(self, X, y):
        '''
        Builds a SARIMAX model on y and fits it; X is accepted but not used
        
        Returns
        -------
        self
        '''
        sarima = SARIMAX(y, order=self.order, seasonal_order=self.seasonal_order)
        self.model = sarima
        self.model_fit = sarima.fit()
        return self
    
    def predict(self, X):
        '''
        Forecasts as many steps as X has rows; only the length of X is used
        '''
        horizon = len(X)
        return self.model_fit.forecast(steps=horizon)

In [None]:
sarima_wrapper = SARIMAWrapper(order=(1,0,0), seasonal_order=(1,0,0,12))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import TimeSeriesSplit

# evaluation_metrics is an error measure (MAE), so lower is better. GridSearchCV always
# maximises the score; greater_is_better=False makes the scorer return the negated error,
# so the candidate with the smallest MAE wins (reported scores are therefore negative).
mae_scorer = make_scorer(evaluation_metrics, greater_is_better=False)

# the parameter grid defined for this search is named sarima_params_grid
sarima_grid_search = GridSearchCV(sarima_wrapper, sarima_params_grid, scoring=mae_scorer, 
                                  cv=TimeSeriesSplit(n_splits=5), error_score='raise', return_train_score=True)