## Prophet model training and predicting

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit

In [None]:
import math, datetime, time, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import itertools
import tqdm
from prophet import Prophet


def load_data(data_path):
    """Read the CSV at `data_path` (path or file-like object) into a DataFrame."""
    return pd.read_csv(data_path)

inbound = load_data("inbound_loads.csv")
# NOTE(review): "laods" looks like a typo for "loads" — confirm the file name on disk before changing it.
outbound = load_data("outbound_laods.csv")
weather = load_data("weather.csv")
# The pallet history is split over ten files ([0]..[9]). Read them all and
# concatenate once instead of re-copying a growing frame on every iteration.
pallet = pd.concat(
    [load_data(f"Pallet_history_Gold_Spike[{part}].csv") for part in range(10)]
)
trainentest = load_data("demand_kWtrain_val.csv")
# Positional split at row 273988: rows before it go to `train`, the rest to `topredict`.
train = trainentest.iloc[:273988,:]
topredict = trainentest.iloc[273988:, :]

# Drop the pallet columns that are not used further on.
pallet = pallet.drop(columns=['lot_code',
                              'tran_type',
                              'final_pallet_code',
                              'warehouse_facility_id',
                              'source_system_id'])

### For alldates_df, concat the features

In [None]:
# Load the per-timestamp frame; its first (unnamed) CSV column holds the
# timestamps, which become the parsed 'datetime_local' index.
alldates_df = (
    load_data('alldates_df.csv')
    .rename(columns={'Unnamed: 0': 'datetime_local'})
    .assign(datetime_local=lambda frame: pd.to_datetime(frame['datetime_local']))
    .set_index('datetime_local')
)

### 1. Create test and train dataframe

In [None]:
def normalize_column(df, columnname):
    """
    Return a copy of `df` in which `columnname` is replaced by
    `<columnname>_normalized`, its min-max scaled version (values in [0, 1]).

    The input frame is left untouched. A constant column (max == min) maps to 0.0
    rather than producing NaN through a 0/0 division; existing NaN values stay NaN.
    """
    values = df[columnname]
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant column: multiplying by 0.0 yields 0.0 and keeps NaN positions.
        scaled = values * 0.0
    else:
        scaled = (values - lowest) / span
    return df.drop(columns=[columnname]).assign(**{f'{columnname}_normalized': scaled})

def add_one_hot_encoder(df, colname):
    """
    Replace `colname` by one indicator column per distinct value.

    The indicator columns are named `<colname>_<value>` and appended after the
    remaining columns; the result is a new DataFrame.
    """
    indicator_columns = pd.get_dummies(df[colname], prefix=colname)
    remaining = df.drop(columns=colname)
    return remaining.join(indicator_columns)

def interpolate_column(df, colname):
    """
    Return a copy of `df` in which `colname` is replaced by
    `<colname>_interpolated`, with gaps filled by linear interpolation.

    The input frame is not modified, so the function can safely be called on a
    slice of another DataFrame without triggering SettingWithCopyWarning.
    """
    filled = df[colname].interpolate(method='linear')
    return df.drop(columns=[colname]).assign(**{f'{colname}_interpolated': filled})

In [None]:
# Load the combined feature table and parse its timestamp column.
all_data = load_data("train_test_features_df.csv")
print(all_data.shape)
all_data['datetime_local'] = pd.to_datetime(all_data['datetime_local'])

# Columns handled by each preprocessing step. Interpolation renames a column to
# '<name>_interpolated', which is why the normalisation list uses the suffixed names.
interpolate = ['Temperature', 'Relative Humidity']
to_normalize = ['Relative Humidity_interpolated', 'Temperature_interpolated', 'weight_1h',
                'weight_5h', 'weight_10h', 'weight_23h', 'pallet_movement_5min', 'doors_open']
add_one_hot_encoding = ['weekday', 'year', 'month']

for x in interpolate:
    all_data = interpolate_column(all_data, x)
for x in to_normalize:
    all_data = normalize_column(all_data, x)
for x in add_one_hot_encoding:
    all_data = add_one_hot_encoder(all_data, x)

# Rows before the cutoff form the training set; rows from the cutoff onwards are
# the period to predict.
prediction_start = '2021-10-11 06:08'
# .copy() makes both frames independent of all_data, so the later column
# changes do not act on a slice (SettingWithCopyWarning).
all_data_train = all_data[all_data['datetime_local'] < prediction_start].copy()
all_data_predict = all_data[all_data['datetime_local'] >= prediction_start].copy()

# Keep the prediction timestamps separately: 'datetime_local' is removed from
# all_data_predict on the next line.
predictions_dates = all_data_predict['datetime_local']
all_data_predict = all_data_predict.drop('datetime_local', axis=1)

# Fill gaps in the training target, keeping the original column name.
all_data_train = interpolate_column(all_data_train, 'demand_kW')
all_data_train = all_data_train.rename(columns={'demand_kW_interpolated': 'demand_kW'})
all_data_predict = all_data_predict.drop('demand_kW', axis=1)

In [None]:
from prophet.plot import plot_plotly, plot_components_plotly

"""
Function to test prophet with
all_df: all the data you have
dummy_freq: frequency of datapoints you want the model to be fitted on test{day,hour,15min, 1 min}
regressors: set of columns in new_df you want to give to the model
cross_folds: amount of cross validation slices you want to use
changepoint_prior_scales: flexibility of the model to fit
"""

def prophet_model_prediction(all_df, dummy_freq, regressors, n_splits = 5, changepoint_prior_scales = [0.05], add_month = False, add_hour = False):
    """
    Cross-validate Prophet on `all_df` for several resampling frequencies,
    regressor combinations and changepoint prior scales.

    Parameters
    ----------
    all_df : DataFrame
        Indexed by timestamp (it is passed to `.resample`), with a 'demand_kW'
        column and one column per regressor name that is used.
    dummy_freq : iterable of str
        Pandas offset aliases to resample to, e.g. ['h'] or ['15min']. The last
        value of each bin is kept.
    regressors : iterable of list of str
        Regressor combinations; each inner list is evaluated as one model.
        `[[]]` evaluates a model without extra regressors.
    n_splits : int
        Number of TimeSeriesSplit folds.
    changepoint_prior_scales : iterable of float
        Each value is passed to Prophet as `changepoint_prior_scale` and
        evaluated separately. The argument is only read, never mutated.
    add_month, add_hour : bool
        Add a monthly (30.5 days) / hourly (1/24 day) seasonality to the model.

    Returns
    -------
    list
        `[m, test_rmse, all_rmses, timings]` of the last evaluated
        configuration: the last fitted model, the last fold's test RMSE,
        `[train_rmses, test_rmses]` over the folds, and
        `[fit_s, predict_s, scoring_s, total_s]` of the last fold. All four are
        None when nothing was evaluated.
    """
    month_minutes = 43000  # look-ahead window after each training fold, in minutes
    best_month_rmse = float('inf')
    # Returned values; they stay None when there is nothing to iterate over.
    m = test_rmse = all_rmses = timings = None

    # Full-resolution data with the timestamp index exposed as column 'ds'.
    # rename_axis makes this independent of the original index name.
    all_df_forplotting = all_df.rename_axis('ds').reset_index()

    for freq in dummy_freq:
        resampled_dummy_df = all_df.resample(freq).last()

        for regs in regressors:
            print(regs)
            # A regressor listed twice must only become one column.
            unique_regs = list(dict.fromkeys(regs))

            # Prophet input: 'ds', 'y' and the regressor columns. Rows with
            # missing values are dropped because NaN breaks the RMSE computation.
            prophet_dummy_df = (
                resampled_dummy_df[['demand_kW', *unique_regs]]
                .rename(columns={'demand_kW': 'y'})
                .rename_axis('ds')
                .reset_index()
                .dropna()
                .reset_index(drop=True)
            )
            # Ground truth and regressor values used to score the look-ahead window.
            month_lookup = all_df_forplotting[['ds', 'demand_kW', *unique_regs]]

            for scale in changepoint_prior_scales:
                print(f'changepoint_prior_scale: {scale}')
                ts = TimeSeriesSplit(n_splits=n_splits)
                train_rmses = []
                test_rmses = []
                y_pred_month = None

                for count, (train_idx, test_idx) in enumerate(ts.split(prophet_dummy_df), start=1):
                    print(f'Starting fold {count}')

                    # A Prophet object can only be fitted once, so build a new one per fold.
                    m = Prophet(changepoint_prior_scale=scale)
                    if add_month:
                        m.add_seasonality(name='monthly', period=30.5, fourier_order=5)
                    if add_hour:
                        # Seasonality periods are expressed in days: one hour is 1/24 day.
                        m.add_seasonality(name='hourly', period=1 / 24, fourier_order=5)
                    for reg in unique_regs:
                        m.add_regressor(reg)

                    cv_train = prophet_dummy_df.iloc[train_idx]
                    cv_test = prophet_dummy_df.iloc[test_idx]
                    future_train = cv_train.drop(columns='y')
                    future_test = cv_test.drop(columns='y')

                    train_start = time.time()
                    m.fit(cv_train)
                    train_stop = time.time()

                    predict_start = time.time()
                    # Minute-level window directly after the training fold, joined
                    # with the known demand and regressors. Timestamps without a
                    # match cannot be predicted or scored, so they are dropped.
                    month_future_minutes = (
                        m.make_future_dataframe(periods=month_minutes, freq='min', include_history=False)
                        .merge(month_lookup, on='ds', how='left')
                        .dropna(subset=['demand_kW', *unique_regs])
                    )
                    y_pred_test = m.predict(future_test)
                    y_pred_train = m.predict(future_train)
                    y_pred_month = m.predict(month_future_minutes) if len(month_future_minutes) else None
                    predict_stop = time.time()

                    # RMSE as sqrt(MSE): works on every scikit-learn version,
                    # unlike the removed `squared=False` argument.
                    rmse_start = time.time()
                    train_rmse = np.sqrt(mean_squared_error(cv_train['y'], y_pred_train['yhat']))
                    test_rmse = np.sqrt(mean_squared_error(cv_test['y'], y_pred_test['yhat']))
                    if y_pred_month is None:
                        month_rmse = float('nan')
                    else:
                        month_rmse = np.sqrt(mean_squared_error(month_future_minutes['demand_kW'], y_pred_month['yhat']))
                    rmse_stop = time.time()
                    train_rmses.append(train_rmse)
                    test_rmses.append(test_rmse)

                    timings = [(train_stop - train_start), (predict_stop - predict_start), (rmse_stop - rmse_start), (rmse_stop - train_start)]
                    print("these are the taken regulators:",   regs)
                    print(f'Fold {count} train error: {train_rmse}. Test error: {test_rmse}. month error: {month_rmse}.Time taken: {timings[3]} s.')
                    # Report every new best look-ahead error seen so far.
                    if month_rmse < best_month_rmse:
                        best_month_rmse = month_rmse
                        print("FUCKYES", freq)

                all_rmses = [train_rmses, test_rmses]
                # Plot the look-ahead forecast of the last fold, if there was one.
                if y_pred_month is not None:
                    m.plot(y_pred_month)

    return [m, test_rmse, all_rmses, timings]
            
        #print(f'Mean Absolute Error = {mae}')
        #modelname = str(regressors) + 'with frequency' + str(freq)

# Final Prophet model
Create the training set

In [None]:
# Resample the training data to 15-minute steps (last value per bin) and build
# the two-column frame Prophet expects: 'ds' (timestamp) and 'y' (target).
forresample = all_data_train.set_index('datetime_local')
all_data_15min = forresample.resample('15min').last()
prophet_dummy_df = pd.DataFrame(
    {'ds': all_data_15min.index, 'y': all_data_15min['demand_kW']}
).reset_index(drop=True)
prophet_dummy_df

Create prediction set

In [None]:
# 'datetime_local' was dropped from all_data_predict during preprocessing; the
# prediction timestamps were kept in predictions_dates, so build the frame from those.
prediction_df = pd.DataFrame({'ds': predictions_dates, 'datetime_local': predictions_dates})
prediction_df = prediction_df.set_index('datetime_local')
prediction_df

Fit the model with the trainset

In [None]:
# Fit a Prophet model with default settings on the 15-minute training frame
# (columns 'ds' and 'y').
m = Prophet()
m.fit(prophet_dummy_df)

Make predictions with the prediction set and plot the prediction

In [None]:
# Forecast every timestamp in prediction_df with the fitted model and plot the result.
prophet_predictions = m.predict(prediction_df)
m.plot(prophet_predictions)

In [None]:
# Write the forecast values to disk. The forecast lives in prophet_predictions;
# no variable called results_prophet is created in this notebook.
prophet_predictions['yhat'].to_csv('results_prophet.csv')

Demand values we have: 12/31/2018 21:15:00 up to and including 10/11/2021 6:07:00.
Predicting the demand from 10/11/2021 6:08:00 up to and including 12/13/2021 17:59:00.

#### Basic test with 5 folds
Best next-month prediction error is 345.

In [None]:
# Baseline: 15-minute resampling, a single empty regressor combination, n_splits=5.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['15min'], [[]], n_splits = 5)

In [None]:
# Same baseline on hourly resampled data, n_splits=5.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [[]], n_splits = 5)

In [None]:
# Hourly baseline again, this time with n_splits=2.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [[]], n_splits = 2)

#### Test different regressors all together
leads to overfitting

In [None]:
# All candidate regressors passed as ONE combination, so a single model uses
# them all at once ('weight_1h_normalized' was listed twice; once is enough).
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [['weight_1h_normalized',
       'Relative Humidity_interpolated_normalized',
       'Temperature_interpolated_normalized',
       'weight_5h_normalized', 'weight_10h_normalized',
       'weight_23h_normalized', 'pallet_movement_5min_normalized',
       'doors_open_normalized']], changepoint_prior_scales = [0.5])


#### Test all regressors individually
Findings summarized: each regressor adds very little; it gives a slightly lower train error and test error, but also more overfitting.

The train error is around 190-200, the test error is around 500, and the 2-month expectation error is around 237.

In [None]:

# Evaluate every regressor on its own: `regressors` is a list of combinations,
# so each regressor is wrapped in its own single-element list.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
candidate_regressors = ['weight_1h_normalized',
       'Relative Humidity_interpolated_normalized',
       'Temperature_interpolated_normalized',
       'weight_5h_normalized', 'weight_10h_normalized',
       'weight_23h_normalized', 'pallet_movement_5min_normalized',
       'doors_open_normalized']
prophet_model_prediction(new_df, ['h'], [[reg] for reg in candidate_regressors], changepoint_prior_scales = [0.5])


#### Test for y2021 data
Findings: using only 2021 data is worse than using all years, for both the hourly and the 15-minute data frequency.
for hour datafrequency and no regressors
fold 2:
Fold 2 train error: 303.91675967775853. Test error: 520.8706442095779. month error: 438.8608590535741

for 15 min data and no regressors
Fold 2 train error: 286.14426214818167. Test error: 512.0793509099431. month error: 435.20339042603246.

In [None]:
# Hourly resampling, no extra regressors, on the `y2021_df` frame.
# NOTE(review): `y2021_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(y2021_df, ['h'], [[]])

In [None]:
# 15-minute resampling, no extra regressors, on the `y2021_df` frame.
# NOTE(review): `y2021_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(y2021_df, ['15min'], [[]])

#### Test for all years data
findings: overfitting is happening
for hour frequency
fold 2: train error 203, test error 418, month error 354

for 15 minutes frequency
fold 2: train error 200, test error 1107, month error 432 

In [None]:
# Hourly resampling, no extra regressors, default n_splits, on `new_df`.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [[]])

In [None]:
# 15-minute resampling, no extra regressors, default n_splits, on `new_df`.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['15min'], [[]])

#### Test to include month cycle/hour cycle
when using allyears, hourly, add month = true,
train error 202, test error 502, month error 365

In [None]:
# Hourly data, no extra regressors, with the additional 'monthly' seasonality enabled.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [[]], add_month = True)

When using allyears, hourly, hour = true
train error 202, test error 532, month error 363

In [None]:
# Hourly data, no extra regressors, with the additional 'hourly' seasonality enabled.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [[]], add_hour = True)

#### Test different changepoint_prior_scales for Temp. 
Findings: there is no difference.

In [None]:
# Hourly data with the temperature regressor only, passing three candidate changepoint prior scales.
# NOTE(review): `new_df` is not defined in any cell shown here — confirm it exists before this cell runs.
prophet_model_prediction(new_df, ['h'], [['Temperature_interpolated_normalized']], changepoint_prior_scales = [0.5, 0.1, 0.05])