In [1]:
import numpy as np
import pandas as pd
import datetime
import time
from tqdm import tqdm
from ipynb.fs.full.train_dev_test_split import train_dev_test_split
from ipynb.fs.full.get_future_predictions import get_future_preds
from sklearn.linear_model import LinearRegression

In [2]:
# Half-hourly dataset with lag columns, indexed and ordered by 'Timestamp'.
# The 'outliers' column is not used for the time being, so it is dropped.
marag_df = (
    pd.read_csv('./../../../Databases/marag_lags_data.csv')
    .set_index('Timestamp')
    .sort_index()
    .drop(columns=['outliers'])
)

# Daily dataset with lag columns, indexed and ordered the same way.
marag_daily = (
    pd.read_csv('./../../../Databases/marag_dailylags_data.csv')
    .set_index('Timestamp')
    .sort_index()
)

In [3]:
# Versions of the data without lag columns, kept for later comparisons and
# passed to get_future_preds in the unknown-future scenarios.
no_lags_marag = (
    pd.read_csv('./../../../Databases/marag_data.csv')
    .set_index('Timestamp')
    .sort_index()
    .drop(columns=['outliers'])
)

daily_nolags_marag = (
    pd.read_csv('./../../../Databases/marag_daily_data.csv')
    .set_index('Timestamp')
    .sort_index()
)

In [4]:
# Placeholders for the tables that will collect the predictions of every
# scenario and the running time of every scenario (half-hourly and daily).
# They must be *instances*: `pd.DataFrame` without parentheses would bind the
# class itself, which has no data and fails on any DataFrame operation.

lr_results_df = pd.DataFrame()
lr_runtime_df = pd.DataFrame()

lr_dailyresults_df = pd.DataFrame()
lr_runtime_dailydf = pd.DataFrame()

## Original data (48 obs. / day)

### All columns, known future

In this phase of the linear regression analysis, we estimate the predicted values using lags computed from the actual observed values. Because these lags are already known for the whole test period, information from the future is used for the predictions.

In [5]:
# Separate predictors from the target ('TotalEntries') and split them, then
# merge the dev partition back into the training data in index order.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    marag_df.drop(columns=['TotalEntries']),
    marag_df['TotalEntries'],
)

lr_X_train = pd.concat([X_train, X_dev]).sort_index()
lr_y_train = pd.concat([y_train, y_dev]).sort_index()

In [24]:
# Time the known-future / all-columns scenario: fitting, predicting and wrapping
# the predictions in a Series aligned with the test index.
# perf_counter() is monotonic and high-resolution, so the measured duration is
# not distorted by system clock adjustments (unlike time.time()).
start_PKF_allcols = time.perf_counter()
lr_model = LinearRegression().fit(lr_X_train, lr_y_train)
lr_y_hat = pd.Series(lr_model.predict(X_test), index=y_test.index, name='TotEntr_PKF_allcols')
end_PKF_allcols = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_PKF_allcols = pd.DataFrame(
    {'PKF_allcols_lr_runtime': [end_PKF_allcols - start_PKF_allcols]}
)

In [25]:
# Start the results table: observed test values side by side with the
# predictions of the first scenario, matched on the timestamp.
lr_observed_values = y_test.to_frame(name='Observed_values')
lr_results_df = lr_observed_values.merge(lr_y_hat.to_frame(), how='left', on='Timestamp')
lr_results_df.head()

Unnamed: 0_level_0,Observed_values,TotEntr_PKF_allcols
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-24 00:00:00,20,5.473787
2018-09-24 00:30:00,0,-3.694076
2018-09-24 01:00:00,0,-12.101636
2018-09-24 01:30:00,0,-14.948868
2018-09-24 02:00:00,0,5.511497


In [35]:
# Start the runtime table from the one-row runtime frame of the first scenario.
lr_runtime_df = lr_runtime_PKF_allcols
lr_runtime_df.head()

Unnamed: 0,PKF_allcols_lr_runtime
0,0.490008


### Just lags, known future

We will use only the lag features, which are again computed from the actual observed values; the external variables (temperature, precipitation, open/closed and weekday/weekend) are not included.

In [17]:
# Remove the external variables so that only the target and its lags remain.
lags_PKF_df = marag_df.drop(
    columns=['Temperature', 'Precipitation', 'Open/Closed', 'Weekday/Weekend']
)
lags_PKF_df.columns

Index(['TotalEntries', 't-1', 't-2', 't-3', 't-4', 't-5', 't-6', 't-7', 't-8',
       't-9',
       ...
       't-327', 't-328', 't-329', 't-330', 't-331', 't-332', 't-333', 't-334',
       't-335', 't-336'],
      dtype='object', length=337)

In [18]:
# Split the lags-only data and merge the dev partition back into the training
# data in index order.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    lags_PKF_df.drop(columns=['TotalEntries']),
    lags_PKF_df['TotalEntries'],
)

lr_lags_X_train = pd.concat([X_train, X_dev]).sort_index()
lr_lags_y_train = pd.concat([y_train, y_dev]).sort_index()

In [19]:
# Time the known-future / lags-only scenario: fitting, predicting and wrapping
# the predictions in a Series aligned with the test index.
# perf_counter() is monotonic and high-resolution, so the measured duration is
# not distorted by system clock adjustments (unlike time.time()).
start_PKF_lags = time.perf_counter()
lr_lags_model = LinearRegression().fit(lr_lags_X_train, lr_lags_y_train)
lr_lags_y_hat = pd.Series(lr_lags_model.predict(X_test), index=y_test.index, name='TotEntr_PKF_lags')
end_PKF_lags = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_PKF_lags = pd.DataFrame(
    {'PKF_lags_lr_runtime': [end_PKF_lags - start_PKF_lags]}
)

In [50]:
# Append the lags-only, known-future predictions as a new column
# (inner join on the timestamp).
lr_results_df = lr_results_df.merge(right=lr_lags_y_hat, how='inner', on='Timestamp')
lr_results_df.head()

Unnamed: 0_level_0,TotEntr_PKF_allcols,TotEntr_PKF_lags
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-24 00:00:00,5.473787,-3.599823
2018-09-24 00:30:00,-3.694076,-11.082617
2018-09-24 01:00:00,-12.101636,-15.233982
2018-09-24 01:30:00,-14.948868,-19.498661
2018-09-24 02:00:00,5.511497,2.829622


In [36]:
# Add this scenario's runtime as a new column of the runtime table.
lr_runtime_df = pd.concat(
    [lr_runtime_df, lr_runtime_PKF_lags],
    axis='columns',
)
lr_runtime_df.head()

Unnamed: 0,PKF_allcols_lr_runtime,PKF_lags_lr_runtime
0,0.490008,0.502996


### All columns, unknown future

In [None]:
# Redo the all-columns split (the split variables were overwritten by the
# lags-only section) and merge the dev partition back into the training data.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    marag_df.drop(columns=['TotalEntries']),
    marag_df['TotalEntries'],
)

lr_X_train = pd.concat([X_train, X_dev]).sort_index()
lr_y_train = pd.concat([y_train, y_dev]).sort_index()

In [None]:
# Fit the all-columns linear regression on the combined train + dev data.
lr_model = LinearRegression().fit(X=lr_X_train, y=lr_y_train)

In [6]:
# Time the unknown-future / all-columns scenario.
# perf_counter() is monotonic, which matters for a run this long: a system clock
# adjustment in the middle would corrupt a time.time() difference.
# NOTE(review): the model is fitted in a preceding cell, so this runtime excludes
# fitting time, whereas the known-future scenarios time fitting as well.
# NOTE(review): get_future_preds presumably produces the forecasts step by step
# over the test rows (it shows a progress bar) - confirm in get_future_predictions.
start_PUF_allcols = time.perf_counter()

lr_all_future_df = get_future_preds(no_lags_marag, lr_X_train, X_test, lr_model)

end_PUF_allcols = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_PUF_allcols = pd.DataFrame(
    {'PUF_allcols_lr_runtime': [end_PUF_allcols - start_PUF_allcols]}
)
lr_all_future_df.columns = ['TotEntr_PUF_allcols']

100%|██████████| 7392/7392 [2:07:16<00:00,  1.16s/it]  

7636.272913217545





In [51]:
# Append the all-columns, unknown-future predictions as a new column
# (inner join on the timestamp).
lr_results_df = lr_results_df.merge(right=lr_all_future_df, how='inner', on='Timestamp')
lr_results_df.head()

Unnamed: 0_level_0,TotEntr_PKF_allcols,TotEntr_PKF_lags,TotEntr_PUF_allcols
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-24 00:00:00,5.473787,-3.599823,5.473787
2018-09-24 00:30:00,-3.694076,-11.082617,0.307627
2018-09-24 01:00:00,-12.101636,-15.233982,-4.623486
2018-09-24 01:30:00,-14.948868,-19.498661,-8.131185
2018-09-24 02:00:00,5.511497,2.829622,-8.020188


In [39]:
# Add this scenario's runtime as a new column of the runtime table.
lr_runtime_df = pd.concat(
    [lr_runtime_df, lr_runtime_PUF_allcols],
    axis='columns',
)
lr_runtime_df.head()

Unnamed: 0,PKF_allcols_lr_runtime,PKF_lags_lr_runtime,PUF_allcols_lr_runtime
0,0.490008,0.502996,7636.272913


### Just lags, unknown future

In [56]:
# Remove the external variables so that only the target and its lags remain.
lags_PUF_df = marag_df.drop(
    columns=['Temperature', 'Precipitation', 'Open/Closed', 'Weekday/Weekend']
)
lags_PUF_df.columns

Index(['TotalEntries', 't-1', 't-2', 't-3', 't-4', 't-5', 't-6', 't-7', 't-8',
       't-9',
       ...
       't-327', 't-328', 't-329', 't-330', 't-331', 't-332', 't-333', 't-334',
       't-335', 't-336'],
      dtype='object', length=337)

In [57]:
# Split the lags-only data and merge the dev partition back into the training
# data in index order.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    lags_PUF_df.drop(columns=['TotalEntries']),
    lags_PUF_df['TotalEntries'],
)

lr_Ulags_X_train = pd.concat([X_train, X_dev]).sort_index()
lr_Ulags_y_train = pd.concat([y_train, y_dev]).sort_index()

In [58]:
# Fit the lags-only linear regression on the combined train + dev data.
lr_lags_model = LinearRegression().fit(X=lr_Ulags_X_train, y=lr_Ulags_y_train)

In [59]:
# Time the unknown-future / lags-only scenario.
# perf_counter() is monotonic, which matters for a run this long: a system clock
# adjustment in the middle would corrupt a time.time() difference.
# NOTE(review): the model is fitted in a preceding cell, so this runtime excludes
# fitting time, whereas the known-future scenarios time fitting as well.
start_PUF_lags = time.perf_counter()

lr_all_future_lags_df = get_future_preds(no_lags_marag, lr_Ulags_X_train, X_test, lr_lags_model, justlags=True)

end_PUF_lags = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_PUF_lags = pd.DataFrame(
    {'PUF_lags_lr_runtime': [end_PUF_lags - start_PUF_lags]}
)
lr_all_future_lags_df.columns = ['TotEntr_PUF_lags']

100%|██████████| 7392/7392 [2:14:39<00:00,  1.18s/it]  


In [64]:
# Append the lags-only, unknown-future predictions as a new column. The merge
# has to run for the column to exist after a fresh Restart & Run All; the
# membership check keeps the cell idempotent, since merging twice would add
# suffixed duplicate columns.
if 'TotEntr_PUF_lags' not in lr_results_df.columns:
    lr_results_df = lr_results_df.merge(lr_all_future_lags_df, on='Timestamp')
lr_results_df.head()

Unnamed: 0_level_0,TotEntr_PKF_allcols,TotEntr_PKF_lags,TotEntr_PUF_allcols,TotEntr_PUF_lags
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-24 00:00:00,5.473787,-3.599823,5.473787,-3.599823
2018-09-24 00:30:00,-3.694076,-11.082617,0.307627,-27.350093
2018-09-24 01:00:00,-12.101636,-15.233982,-4.623486,-38.901444
2018-09-24 01:30:00,-14.948868,-19.498661,-8.131185,-48.46683
2018-09-24 02:00:00,5.511497,2.829622,-8.020188,-33.71257


In [75]:
# Add this scenario's runtime as a new column of the runtime table. The concat
# has to run for the column to exist after a fresh Restart & Run All; the
# membership check keeps the cell idempotent, since concatenating twice would
# duplicate the column.
if 'PUF_lags_lr_runtime' not in lr_runtime_df.columns:
    lr_runtime_df = pd.concat([lr_runtime_df, lr_runtime_PUF_lags], axis = 1)
lr_runtime_df.head()

Unnamed: 0,PKF_allcols_lr_runtime,PKF_lags_lr_runtime,PUF_allcols_lr_runtime,PUF_lags_lr_runtime
0,0.490008,0.502996,7636.272913,8079.679445


In [97]:
# NOTE(review): the export of the half-hourly results and runtimes is disabled,
# while the equivalent export of the daily tables at the end of the notebook is
# active. Presumably this was done to avoid overwriting files that were already
# saved - confirm, and re-enable if these CSVs must be regenerated.
#lr_results_df.to_csv('./../../../Databases/lr_results.csv', index=True)
#lr_runtime_df.to_csv('./../../../Databases/lr_runtime_scenarios.csv')

## Daily data (1 obs. / day)

### All columns, known future

In [7]:
# Split the daily data into predictors and target ('TotalEntries'), then merge
# the dev partition back into the training data in index order.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    marag_daily.drop(columns=['TotalEntries']),
    marag_daily['TotalEntries'],
)

lrd_X_train = pd.concat([X_train, X_dev]).sort_index()
lrd_y_train = pd.concat([y_train, y_dev]).sort_index()

In [44]:
# Time the daily known-future / all-columns scenario: fitting, predicting and
# wrapping the predictions in a Series aligned with the test index.
# perf_counter() is monotonic and high-resolution, which matters for a run that
# lasts only a few milliseconds (time.time() can be too coarse for that).
start_dPKF_allcols = time.perf_counter()
lr_dmodel = LinearRegression().fit(lrd_X_train, lrd_y_train)
lrd_y_hat = pd.Series(lr_dmodel.predict(X_test), index=y_test.index, name='DayEntr_PKF_allcols')
end_dPKF_allcols = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lrd_runtime_PKF_allcols = pd.DataFrame(
    {'day_PKF_allcols_lr_runtime': [end_dPKF_allcols - start_dPKF_allcols]}
)

In [45]:
# Start the daily results table: observed test values side by side with the
# predictions of the first daily scenario, matched on the timestamp.
lrd_observed_values = y_test.to_frame(name='daily_observed_values')
lr_dailyresults_df = lrd_observed_values.merge(lrd_y_hat.to_frame(), how='left', on='Timestamp')
lr_dailyresults_df.head()

Unnamed: 0_level_0,daily_observed_values,DayEntr_PKF_allcols
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-24,5719,9222.246578
2018-09-25,9998,8512.299774
2018-09-26,9816,9365.59659
2018-09-27,10087,9516.592876
2018-09-28,10742,10151.044256


In [46]:
# Start the daily runtime table from the one-row runtime frame of the first
# daily scenario.
lr_runtime_dailydf = lrd_runtime_PKF_allcols
lr_runtime_dailydf.head()

Unnamed: 0,day_PKF_allcols_lr_runtime
0,0.005399


### Just lags, known future

In [10]:
# Remove the external variables so that only the daily target and its lags remain.
lags_dPKF_df = marag_daily.drop(columns=['T', 'PPT', 'Weekday/Weekend'])
lags_dPKF_df.columns

Index(['TotalEntries', 't-1', 't-2', 't-3', 't-4', 't-5', 't-6', 't-7'], dtype='object')

In [11]:
# Split the daily lags-only data and merge the dev partition back into the
# training data in index order.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    lags_dPKF_df.drop(columns=['TotalEntries']),
    lags_dPKF_df['TotalEntries'],
)

lr_dlags_X_train = pd.concat([X_train, X_dev]).sort_index()
lr_dlags_y_train = pd.concat([y_train, y_dev]).sort_index()

In [49]:
# Time the daily known-future / lags-only scenario: fitting, predicting and
# wrapping the predictions in a Series aligned with the test index.
# perf_counter() is monotonic and high-resolution, which matters for a run that
# lasts only a few milliseconds (time.time() can be too coarse for that).
start_PKF_dlags = time.perf_counter()
lr_dlags_model = LinearRegression().fit(lr_dlags_X_train, lr_dlags_y_train)
lr_dlags_y_hat = pd.Series(lr_dlags_model.predict(X_test), index=y_test.index, name='DayEntr_PKF_lags')
end_PKF_dlags = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_dPKF_lags = pd.DataFrame(
    {'day_PKF_lags_lr_runtime': [end_PKF_dlags - start_PKF_dlags]}
)

In [50]:
# Append the daily lags-only, known-future predictions as a new column
# (inner join on the timestamp).
lr_dailyresults_df = lr_dailyresults_df.merge(right=lr_dlags_y_hat, how='inner', on='Timestamp')
lr_dailyresults_df.head()

Unnamed: 0_level_0,daily_observed_values,DayEntr_PKF_allcols,DayEntr_PKF_lags
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-24,5719,9222.246578,8760.960808
2018-09-25,9998,8512.299774,8406.381977
2018-09-26,9816,9365.59659,9865.235621
2018-09-27,10087,9516.592876,9957.557774
2018-09-28,10742,10151.044256,9956.419883


In [51]:
# Add this scenario's runtime as a new column of the daily runtime table.
lr_runtime_dailydf = pd.concat(
    [lr_runtime_dailydf, lr_runtime_dPKF_lags],
    axis='columns',
)
lr_runtime_dailydf.head()

Unnamed: 0,day_PKF_allcols_lr_runtime,day_PKF_lags_lr_runtime
0,0.005399,0.004501


### All columns, unknown future

In [52]:
# Redo the daily all-columns split (the split variables were overwritten by the
# lags-only section) and merge the dev partition back into the training data.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    marag_daily.drop(columns=['TotalEntries']),
    marag_daily['TotalEntries'],
)

lrd_X_train = pd.concat([X_train, X_dev]).sort_index()
lrd_y_train = pd.concat([y_train, y_dev]).sort_index()

In [53]:
# Fit the daily all-columns linear regression on the combined train + dev data.
lr_dmodel = LinearRegression().fit(X=lrd_X_train, y=lrd_y_train)

In [54]:
# Time the daily unknown-future / all-columns scenario.
# perf_counter() is monotonic and high-resolution, so the measured duration is
# not distorted by system clock adjustments (unlike time.time()).
# NOTE(review): the model is fitted in a preceding cell, so this runtime excludes
# fitting time, whereas the known-future scenarios time fitting as well.
start_dPUF_allcols = time.perf_counter()

lr_dall_future_df = get_future_preds(daily_nolags_marag, lrd_X_train, X_test, lr_dmodel, daily=True)

end_dPUF_allcols = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_dPUF_allcols = pd.DataFrame(
    {'day_PUF_allcols_lr_runtime': [end_dPUF_allcols - start_dPUF_allcols]}
)
lr_dall_future_df.columns = ['DayEntr_PUF_allcols']

100%|██████████| 154/154 [00:02<00:00, 59.42it/s]


In [55]:
# Append the daily all-columns, unknown-future predictions as a new column
# (inner join on the timestamp).
lr_dailyresults_df = lr_dailyresults_df.merge(right=lr_dall_future_df, how='inner', on='Timestamp')
lr_dailyresults_df.head()

Unnamed: 0_level_0,daily_observed_values,DayEntr_PKF_allcols,DayEntr_PKF_lags,DayEntr_PUF_allcols
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-09-24,5719,9222.246578,8760.960808,9222.246578
2018-09-25,9998,8512.299774,8406.381977,9594.328462
2018-09-26,9816,9365.59659,9865.235621,9627.414999
2018-09-27,10087,9516.592876,9957.557774,9882.440764
2018-09-28,10742,10151.044256,9956.419883,10252.806498


In [56]:
# Add this scenario's runtime as a new column of the daily runtime table.
lr_runtime_dailydf = pd.concat(
    [lr_runtime_dailydf, lr_runtime_dPUF_allcols],
    axis='columns',
)
lr_runtime_dailydf.head()

Unnamed: 0,day_PKF_allcols_lr_runtime,day_PKF_lags_lr_runtime,day_PUF_allcols_lr_runtime
0,0.005399,0.004501,2.544438


### Just lags, unknown future

In [57]:
# Remove the external variables so that only the daily target and its lags remain.
lags_dPUF_df = marag_daily.drop(columns=['T', 'PPT', 'Weekday/Weekend'])
lags_dPUF_df.columns

Index(['TotalEntries', 't-1', 't-2', 't-3', 't-4', 't-5', 't-6', 't-7'], dtype='object')

In [58]:
# Split the daily lags-only data and merge the dev partition back into the
# training data in index order.
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(
    lags_dPUF_df.drop(columns=['TotalEntries']),
    lags_dPUF_df['TotalEntries'],
)

lr_Udlags_X_train = pd.concat([X_train, X_dev]).sort_index()
lr_Udlags_y_train = pd.concat([y_train, y_dev]).sort_index()

In [59]:
# Fit the daily lags-only linear regression on the combined train + dev data.
lr_Udlags_model = LinearRegression().fit(X=lr_Udlags_X_train, y=lr_Udlags_y_train)

In [60]:
# Time the daily unknown-future / lags-only scenario.
# perf_counter() is monotonic and high-resolution, so the measured duration is
# not distorted by system clock adjustments (unlike time.time()).
# NOTE(review): the model is fitted in a preceding cell, so this runtime excludes
# fitting time, whereas the known-future scenarios time fitting as well.
start_dPUF_lags = time.perf_counter()

lr_dlags_future_df = get_future_preds(daily_nolags_marag, lr_Udlags_X_train, X_test, lr_Udlags_model, justlags=True, daily=True)

end_dPUF_lags = time.perf_counter()

# One-row frame holding the elapsed seconds; the column name identifies the scenario.
lr_runtime_dPUF_lags = pd.DataFrame(
    {'day_PUF_lags_lr_runtime': [end_dPUF_lags - start_dPUF_lags]}
)
lr_dlags_future_df.columns = ['DayEntr_PUF_lags']

100%|██████████| 154/154 [00:01<00:00, 86.70it/s]


In [61]:
# Append the daily lags-only, unknown-future predictions as a new column
# (inner join on the timestamp).
lr_dailyresults_df = lr_dailyresults_df.merge(right=lr_dlags_future_df, how='inner', on='Timestamp')
lr_dailyresults_df.head()

Unnamed: 0_level_0,daily_observed_values,DayEntr_PKF_allcols,DayEntr_PKF_lags,DayEntr_PUF_allcols,DayEntr_PUF_lags
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-24,5719,9222.246578,8760.960808,9222.246578,8760.960808
2018-09-25,9998,8512.299774,8406.381977,9594.328462,9210.085039
2018-09-26,9816,9365.59659,9865.235621,9627.414999,9319.118087
2018-09-27,10087,9516.592876,9957.557774,9882.440764,10085.018821
2018-09-28,10742,10151.044256,9956.419883,10252.806498,9952.815953


In [62]:
# Add this scenario's runtime as a new column of the daily runtime table.
lr_runtime_dailydf = pd.concat(
    [lr_runtime_dailydf, lr_runtime_dPUF_lags],
    axis='columns',
)
lr_runtime_dailydf.head()

Unnamed: 0,day_PKF_allcols_lr_runtime,day_PKF_lags_lr_runtime,day_PUF_allcols_lr_runtime,day_PUF_lags_lr_runtime
0,0.005399,0.004501,2.544438,1.782149


In [63]:
# Persist the daily predictions and the daily scenario runtimes as CSV files.
lr_dailyresults_df.to_csv(
    path_or_buf='./../../../Databases/lr_dailyresults.csv',
    index=True,
)
lr_runtime_dailydf.to_csv(
    path_or_buf='./../../../Databases/lr_runtime_dailyscenarios.csv',
)