In [1]:
import pandas as pd
from pmdarima import auto_arima
from tqdm import tqdm

In [None]:
def get_future_preds(all_data_df, X_train, X_test, model, justlags=False, daily=False, n_days=None, refit=True):
    '''
    Forecast the target recursively, one step at a time, feeding each prediction back in as a lag.

    The first prediction is made from the first row of X_test. Every later prediction is made from a 
    feature row rebuilt from the known history plus the predictions produced so far, using one week of 
    lags ('t-1' ... 't-N', with N = 7 * observations per day).

    Parameters
    ----------

    all_data_df : pandas.DataFrame
        Data frame containing all the information (including external data) without lags and with the 
        timestamp as index. The target column has to be named 'TotalEntries'. When justlags is False the 
        index has to be named 'Timestamp', because the external columns are merged on that name.

    X_train : pandas.DataFrame
        Data used to fit the model, with timestamp as index. Only its index is read here (first and last 
        timestamps).

    X_test : pandas.DataFrame
        Data used to test the model, with timestamp as index; has to be continuous with X_train 
        (otherwise the results will be inaccurate). Its first row feeds the first prediction and its 
        index labels every prediction.

    model : predictive model
        Already fit model exposing predict(X) and, when refit is True, fit(X, y). It has to be fit 
        consistently with the hypothesis being tested (daily data or not, all columns or just lags).

    justlags : bool
        If True only the lag columns are used as features; otherwise the remaining columns of 
        all_data_df are merged in as well (default is False).

    daily : bool
        Whether the data is grouped by day (1 observation per day) or not (48 observations per day) 
        (default is False).

    n_days : int
        Number of days to forecast. If not provided, one prediction is made per row of X_test.

    refit : bool
        Whether to refit the model after each new prediction (default is True). Refitting modifies the 
        model object passed in.

    Returns
    -------

    pandas.DataFrame with a single 'TotalEntries' column holding the predictions, indexed by the 
    corresponding X_test timestamps.

    Raises
    ------

    ValueError
        If n_days asks for more predictions than X_test has rows.

    '''
    # Work on sorted copies so the caller's frames are never reordered in place.
    all_data_df = all_data_df.sort_index()
    X_train = X_train.sort_index()
    X_test = X_test.sort_index()

    obs_per_day = 1 if daily else 48
    n_lags = obs_per_day * 7
    n_it = X_test.shape[0] if n_days is None else obs_per_day * n_days
    if n_it > X_test.shape[0]:
        # Every prediction is labelled with an X_test timestamp, so the horizon cannot exceed it.
        raise ValueError('Requested ' + str(n_it) + ' predictions but X_test only has '
                         + str(X_test.shape[0]) + ' rows.')

    lag_names = ['t-' + str(lag) for lag in range(1, n_lags + 1)]
    oldest_lag = lag_names[-1]
    # Moving one step forward: the current target becomes 't-1' and every lag ages by one.
    step_forward = {'TotalEntries': 't-1'}
    step_forward.update(zip(lag_names[:-1], lag_names[1:]))

    # Known history of the target up to the end of the training period. The cut-off is the index label 
    # itself: a string cut-off at exactly midnight would make pandas include that whole day.
    history = all_data_df.loc[:X_train.index[-1], ['TotalEntries']].astype('float64')

    exog = None if justlags else all_data_df.drop(columns=['TotalEntries'])

    predictions = []
    X_data = None

    for i in tqdm(range(n_it)):
        if i == 0:
            features = X_test[0:1]
        else:
            # NOTE(review): the non-lag columns of this row belong to the previous timestamp, not to 
            # the one being predicted - confirm this is intended.
            features = X_data[-1:].drop(columns=[oldest_lag]).rename(columns=step_forward)

        y_hat = pd.DataFrame(model.predict(features), index=X_test.index[i:i + 1], columns=['TotalEntries'])
        predictions.append(y_hat)

        # Concatenate the prediction to the known data
        history = pd.concat([history, y_hat]).sort_index()

        # Create lags (built in one go to avoid inserting hundreds of columns one by one)
        target = history['TotalEntries']
        lagged = {'TotalEntries': target.to_numpy()}
        for lag, name in enumerate(lag_names, start=1):
            lagged[name] = target.shift(lag).to_numpy()
        # The first n_lags rows do not have a full week of lags
        X_data = pd.DataFrame(lagged, index=history.index).iloc[n_lags:]

        # Append the rest of the columns
        if exog is not None:
            X_data = X_data.merge(exog.loc[X_train.index[0]:y_hat.index[0]], how='left', on='Timestamp')

        if refit:
            # Fit the model with all the data including the last prediction
            model.fit(X_data.drop(columns=['TotalEntries']), X_data['TotalEntries'])

    if not predictions:
        return pd.DataFrame(columns=['TotalEntries'], index=X_test.index[:0], dtype='float64')

    return pd.concat(predictions).sort_index()

In [None]:
def ARIMA_future_preds(all_data_df, X_train, X_test, model, exogenous=False, justlags=False, daily=False, n_days=None):
    '''
    Forecast the target with an already fit ARIMA model.

    Without exogenous data the whole horizon is forecast in a single call. With exogenous data the 
    horizon grows one step per iteration: the exogenous rows (one week of lags, plus the external 
    columns unless justlags is True) are rebuilt from the previous forecast, the whole horizon is 
    forecast again, and the last step of each forecast is kept.

    Parameters
    ----------

    all_data_df : pandas.DataFrame
        Data frame containing all the information (including external data) without lags and with the 
        timestamp as index. The target column has to be named 'TotalEntries'. When external columns are 
        merged (exogenous True and justlags False) the index has to be named 'Timestamp'. Not read when 
        exogenous is False.

    X_train : pandas.DataFrame
        Data used to fit the model, with timestamp as index. Only its index is read here (first and last 
        timestamps). Not read when exogenous is False.

    X_test : pandas.DataFrame
        Data used to test the model, with timestamp as index; has to be continuous with X_train 
        (otherwise the results will be inaccurate). Its index labels the predictions and its first row 
        is the exogenous input of the first step.

    model : predictive model
        Already fit model whose predict accepts the n_periods, exogenous and return_conf_int keywords 
        and returns the predictions together with their confidence intervals. It has to be fit 
        consistently with the hypothesis being tested (daily data or not, all columns or just lags).

    exogenous : bool
        Whether the model was fit using exogenous data or not (default is False).

    justlags : bool
        If True only the lag columns are used as exogenous data; otherwise the remaining columns of 
        all_data_df are merged in as well (default is False). Only used when exogenous is True.

    daily : bool
        Whether the data is grouped by day (1 observation per day) or not (48 observations per day) 
        (default is False).

    n_days : int
        Number of days to forecast. If not provided, one prediction is made per row of X_test.


    Returns
    -------

    pandas.DataFrame with a 'TotalEntries' column holding the predictions and another pandas.DataFrame 
    with the 'PUF_min' and 'PUF_max' bounds of the confidence interval of each prediction, both indexed 
    by the corresponding X_test timestamps.

    Raises
    ------

    ValueError
        If n_days asks for more predictions than X_test has rows.

    '''
    confint_cols = ['PUF_min', 'PUF_max']

    # Sorted copy: the caller's frame is never reordered in place.
    X_test = X_test.sort_index()

    obs_per_day = 1 if daily else 48
    n_lags = obs_per_day * 7
    n_it = X_test.shape[0] if n_days is None else obs_per_day * n_days
    if n_it > X_test.shape[0]:
        # Every prediction is labelled with an X_test timestamp, so the horizon cannot exceed it.
        raise ValueError('Requested ' + str(n_it) + ' predictions but X_test only has '
                         + str(X_test.shape[0]) + ' rows.')
    if n_it <= 0:
        no_rows = X_test.index[:0]
        return (pd.DataFrame(columns=['TotalEntries'], index=no_rows, dtype='float64'),
                pd.DataFrame(columns=confint_cols, index=no_rows, dtype='float64'))

    if not exogenous:
        # A single multi-step forecast, labelled with the first n_it timestamps of X_test so that a 
        # horizon shorter than X_test also works.
        horizon_index = X_test.index[:n_it]
        raw_hat, raw_confint = model.predict(n_periods=n_it, return_conf_int=True)
        y_grow_hat = pd.DataFrame(raw_hat, index=horizon_index, columns=['TotalEntries'])
        y_grow_confint = pd.DataFrame(raw_confint, index=horizon_index, columns=confint_cols)
        return y_grow_hat.sort_index(), y_grow_confint.sort_index()

    all_data_df = all_data_df.sort_index()
    X_train = X_train.sort_index()

    lag_names = ['t-' + str(lag) for lag in range(1, n_lags + 1)]
    oldest_lag = lag_names[-1]
    # Moving one step forward: the current target becomes 't-1' and every lag ages by one.
    step_forward = {'TotalEntries': 't-1'}
    step_forward.update(zip(lag_names[:-1], lag_names[1:]))

    # Known history of the target up to the end of the training period. The cut-off is the index label 
    # itself: a string cut-off at exactly midnight would make pandas include that whole day.
    X_init = all_data_df.loc[:X_train.index[-1], ['TotalEntries']].astype('float64')

    extra_cols = None if justlags else all_data_df.drop(columns=['TotalEntries'])

    hat_rows = []
    confint_rows = []
    X_data = None

    for i in tqdm(range(n_it)):
        horizon = i + 1
        horizon_index = X_test.index[:horizon]

        if i == 0:
            exog_rows = X_test[0:1]
        else:
            # One row per step of the horizon, rebuilt from the previous forecast.
            # NOTE(review): the non-lag columns of each row belong to the previous timestamp, not to 
            # the one being predicted - confirm this is intended.
            exog_rows = X_data[-horizon:].drop(columns=[oldest_lag]).rename(columns=step_forward)

        raw_hat, raw_confint = model.predict(n_periods=horizon, exogenous=exog_rows, return_conf_int=True)
        y_hat = pd.DataFrame(raw_hat, index=horizon_index, columns=['TotalEntries'])
        y_confint = pd.DataFrame(raw_confint, index=horizon_index, columns=confint_cols)

        # Only the newest step of each re-forecast is kept in the output
        hat_rows.append(y_hat[-1:])
        confint_rows.append(y_confint[-1:])

        # Concatenate the latest forecast to the known data
        X_grow = pd.concat([X_init, y_hat]).sort_index()

        # Create lags (built in one go to avoid inserting hundreds of columns one by one)
        target = X_grow['TotalEntries']
        lagged = {'TotalEntries': target.to_numpy()}
        for lag, name in enumerate(lag_names, start=1):
            lagged[name] = target.shift(lag).to_numpy()
        # The first n_lags rows do not have a full week of lags
        X_data = pd.DataFrame(lagged, index=X_grow.index).iloc[n_lags:]

        # Append the rest of the columns
        if extra_cols is not None:
            X_data = X_data.merge(extra_cols.loc[X_train.index[0]:y_hat.index[-1]], how='left', on='Timestamp')

    return pd.concat(hat_rows).sort_index(), pd.concat(confint_rows).sort_index()