In [47]:
import numpy as np
import pandas as pd
import datetime
import time
from tqdm import tqdm

%run ./../custom_functions/train_dev_test_split.ipynb
%run ./../custom_functions/get_future_predictions.ipynb

from sklearn.linear_model import LinearRegression
from pmdarima import auto_arima

In [5]:
ravaln_df = pd.read_csv('./../../../Databases/clean_data/ravaln_lags_data.csv')
ravaln_df.set_index('Timestamp', inplace=True)
ravaln_df.sort_index(inplace=True)

ravaln_daily = pd.read_csv('./../../../Databases/clean_data/ravaln_dailylags_data.csv')
ravaln_daily.set_index('Timestamp', inplace=True)
ravaln_daily.sort_index(inplace=True)

In [7]:
# Let's import the marag data with no lags for future comparative purposes
no_lags_ravn = pd.read_csv('./../../../Databases/clean_data/ravaln_data.csv')
no_lags_ravn.set_index('Timestamp', inplace=True)
no_lags_ravn.sort_index(inplace=True)

daily_nolags_ravn = pd.read_csv('./../../../Databases/clean_data/ravaln_daily_data.csv')
daily_nolags_ravn.set_index('Timestamp', inplace=True)
daily_nolags_ravn.sort_index(inplace=True)

In [24]:
ravn_results = pd.DataFrame
ravn_runtime = pd.DataFrame

### Linear regression: high frequency data, all columns

In [9]:
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(ravaln_df.drop(['TotalEntries'], axis=1), 
                                                                      ravaln_df['TotalEntries'])

lr_X_train = pd.concat([X_train, X_dev])
lr_X_train.sort_index(inplace=True)
lr_y_train = pd.concat([y_train, y_dev])
lr_y_train.sort_index(inplace=True)

In [10]:
lr_model = LinearRegression().fit(lr_X_train, lr_y_train)

In [12]:
start_PUF_allcols = time.time()

lr_all_future_df = get_future_preds(no_lags_ravn, lr_X_train, X_test, lr_model)

end_PUF_allcols = time.time()
lr_runtime_PUF_allcols = end_PUF_allcols - start_PUF_allcols
lr_runtime_PUF_allcols = pd.DataFrame(pd.Series(lr_runtime_PUF_allcols, name = 'PUF_allcols_lr_runtime'))
lr_all_future_df.columns = ['TotEntr_PUF_allcols']

100%|██████████| 797/797 [06:27<00:00,  1.95it/s]


In [21]:
lr_all_future_df.head()

Unnamed: 0_level_0,TotEntr_PUF_allcols
Timestamp,Unnamed: 1_level_1
2019-02-08 09:30:00,126.068275
2019-02-08 10:00:00,162.71997
2019-02-08 10:30:00,214.956078
2019-02-08 11:00:00,308.750372
2019-02-08 11:30:00,290.580447


In [44]:
ravn_results = pd.DataFrame(pd.Series(ravaln_daily['TotalEntries'][-(lr_preds.shape[0]):], name='obs_values'))
lr_all_future_df.index = pd.to_datetime(lr_all_future_df.index)
lr_preds = lr_all_future_df.groupby(pd.Grouper(freq='D')).sum()
lr_preds.columns = ['LinRegr']
ravn_results = pd.concat([ravn_results, lr_preds], axis=1)

In [45]:
ravn_results

Unnamed: 0_level_0,obs_values,LinRegr
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-02-08,24997,21984.887256
2019-02-09,18222,18049.197069
2019-02-10,7258,10736.641499
2019-02-11,28144,20376.449161
2019-02-12,27086,24769.473837
2019-02-13,25454,25100.944879
2019-02-14,26758,25314.641734
2019-02-15,24904,24521.579505
2019-02-16,16227,17803.784057
2019-02-17,6753,12991.384473


### ARIMA: low frequency data, just lags

In [53]:
lags_dPUF_df = ravaln_daily.drop(['T', 'PPT', 'Weekday/Weekend'], axis = 1)

X_train, X_dev, ARIMA_dlags_X_test, y_train, y_dev, ARIMA_dlags_y_test = train_dev_test_split(lags_dPUF_df.drop(['TotalEntries'], axis=1), 
                                                                      lags_dPUF_df['TotalEntries'])

ARIMA_dlags_X_train = pd.concat([X_train, X_dev])
ARIMA_dlags_X_train.sort_index(inplace=True)
ARIMA_dlags_y_train = pd.concat([y_train, y_dev])
ARIMA_dlags_y_train.sort_index(inplace=True)

In [54]:
# Model tuning with exogenous data (allcols)

start_dARIMA_tune = time.time()
daily_ARIMA_model = auto_arima(ARIMA_dlags_y_train, exogenous=ARIMA_dlags_X_train, start_p=1, start_q=1,
                           start_P=0, seasonal=True, suppress_warnings=True,
                           error_action='ignore', trace=True, stepwise=True)

end_dARIMA_tune = time.time()
dARIMA_tune_runtime = end_dARIMA_tune - start_dARIMA_tune

Fit ARIMA: order=(1, 0, 1) seasonal_order=(0, 0, 0, 1); AIC=1324.035, BIC=1348.122, Fit time=0.155 seconds
Fit ARIMA: order=(0, 0, 0) seasonal_order=(0, 0, 0, 1); AIC=1334.037, BIC=1353.743, Fit time=0.032 seconds
Fit ARIMA: order=(1, 0, 0) seasonal_order=(0, 0, 0, 1); AIC=1327.245, BIC=1349.142, Fit time=0.095 seconds
Fit ARIMA: order=(0, 0, 1) seasonal_order=(0, 0, 0, 1); AIC=1322.058, BIC=1343.954, Fit time=0.112 seconds
Fit ARIMA: order=(0, 0, 2) seasonal_order=(0, 0, 0, 1); AIC=1324.311, BIC=1348.398, Fit time=0.156 seconds
Fit ARIMA: order=(1, 0, 2) seasonal_order=(0, 0, 0, 1); AIC=1324.263, BIC=1350.538, Fit time=0.406 seconds
Total fit time: 0.960 seconds


In [55]:
daily_ARIMA_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,66.0
Model:,"SARIMAX(0, 0, 1)",Log Likelihood,-651.029
Date:,"Wed, 22 May 2019",AIC,1322.058
Time:,20:05:53,BIC,1343.954
Sample:,0,HQIC,1330.71
,- 66,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,132.5462,7198.934,0.018,0.985,-1.4e+04,1.42e+04
x1,-0.0164,0.143,-0.115,0.909,-0.297,0.264
x2,0.0833,0.154,0.542,0.588,-0.218,0.385
x3,0.0325,0.123,0.263,0.792,-0.209,0.274
x4,0.0589,0.125,0.472,0.637,-0.186,0.304
x5,0.1289,0.152,0.850,0.395,-0.168,0.426
x6,0.0232,0.115,0.201,0.841,-0.203,0.249
x7,0.6742,0.156,4.322,0.000,0.368,0.980
ma.L1,0.7763,0.141,5.514,0.000,0.500,1.052

0,1,2,3
Ljung-Box (Q):,21.05,Jarque-Bera (JB):,8.98
Prob(Q):,0.99,Prob(JB):,0.01
Heteroskedasticity (H):,0.19,Skew:,-0.36
Prob(H) (two-sided):,0.0,Kurtosis:,4.65


In [56]:
dARIMA_model = daily_ARIMA_model.fit(ARIMA_dlags_y_train, exogenous=ARIMA_dlags_X_train)

In [57]:
start_ARIMA_dPUF_allcols = time.time()
dARIMA_df, dARIMA_confint = ARIMA_future_preds(daily_nolags_ravn, ARIMA_dlags_X_train, ARIMA_dlags_X_test, dARIMA_model, exogenous=True, daily=True)

end_ARIMA_dPUF_allcols = time.time()
dARIMA_runtime_PUF_allcols = end_ARIMA_dPUF_allcols - start_ARIMA_dPUF_allcols


  0%|          | 0/17 [00:00<?, ?it/s]


ValueError: Provided exogenous values are not of the appropriate shape. Required (2, 7), got (2, 10).