In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [20, 10]
%matplotlib inline

In [2]:
original_df = pd.read_excel('/home/quark/Documents/projects/TFM/Databases/data.xlsx')
original_df.sort_values(['Timestamp'], ascending=True, inplace=True)
original_df['Timestamp'] = pd.to_datetime(original_df['Timestamp'], unit='s')

marag_df = original_df[original_df['SiteId'] == 2][['Timestamp', 'TotalEntries']]
marag_df.set_index('Timestamp', inplace=True)
marag_df.sort_index(inplace=True)
marag_df = marag_df['2017-01-09 00:00:00':]
#Since from this date the time series is constinuous, we slice our dataframe consequently

##### Walk-forward validation

In [40]:
#First we need to split the raw data

#def X_data_walk_forward(X_data):
#    cut1 = int(0.6*len(X_data))
#    X_train_raw = X_data[:(cut1)]
#    return X_train_raw
#X_train_raw = X_data_walk_forward(marag_df)

In [41]:
#Transform

#Data transformation
#define method to transform data, for example, create lags:
def create_lags(dataframe, N):
    temp_df = dataframe.copy()
    for i in range(1,N+1):
        temp_df['t-' + str(i)] = temp_df['TotalEntries'].shift(i)
    
    return temp_df.iloc[N:]
df_with_lags = create_lags(marag_df, 48)

In [42]:
df_with_lags.head()

Unnamed: 0_level_0,TotalEntries,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t-8,t-9,...,t-39,t-40,t-41,t-42,t-43,t-44,t-45,t-46,t-47,t-48
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-10 00:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 00:30:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 01:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 01:30:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 02:00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
###Note: it's recommended to use TimSeries Split, but for the time being we'll use train_test_split

#Initial data preparation:
def train_dev_test_split(X_data, y_data):
    
##CODE IMPROVEMENT: raise error if X and y of different size!    
    cut1 = int(0.6*len(X_data))
    cut2 = ((len(X_data)-cut1)//2) + cut1
    X_train = X_data[:cut1]
    X_dev = X_data[cut1:cut2]
    X_test = X_data[cut2:]
    y_train = y_data[:cut1]
    y_dev = y_data[cut1:cut2]
    y_test = y_data[cut2:]
    
    return X_train, X_dev, X_test, y_train, y_dev, y_test
    
#X_train, X_temp, y_train, y_temp = train_test_split(men_df.drop('TotalEntriesMen', axis = 1), men_df['TotalEntriesMen'], test_size = 0.4, random_state = 42, shuffle=False)
X_train, X_dev, X_test, y_train, y_dev, y_test = train_dev_test_split(df_with_lags.drop(['TotalEntries'], axis=1), df_with_lags['TotalEntries'])


In [7]:
X_train.head()

Unnamed: 0_level_0,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t-8,t-9,t-10,...,t-39,t-40,t-41,t-42,t-43,t-44,t-45,t-46,t-47,t-48
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-10 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 01:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-01-10 02:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
X_train_raw.tail()

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 02:00:00,0
2018-04-20 02:30:00,0
2018-04-20 03:00:00,0
2018-04-20 03:30:00,0
2018-04-20 04:00:00,0


In [45]:
X_dev.head()

Unnamed: 0_level_0,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t-8,t-9,t-10,...,t-39,t-40,t-41,t-42,t-43,t-44,t-45,t-46,t-47,t-48
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-20 14:00:00,311.0,246.0,203.0,172.0,202.0,147.0,134.0,177.0,160.0,221.0,...,415.0,486.0,356.0,394.0,236.0,285.0,271.0,357.0,317.0,394.0
2018-04-20 14:30:00,443.0,311.0,246.0,203.0,172.0,202.0,147.0,134.0,177.0,160.0,...,417.0,415.0,486.0,356.0,394.0,236.0,285.0,271.0,357.0,317.0
2018-04-20 15:00:00,407.0,443.0,311.0,246.0,203.0,172.0,202.0,147.0,134.0,177.0,...,344.0,417.0,415.0,486.0,356.0,394.0,236.0,285.0,271.0,357.0
2018-04-20 15:30:00,517.0,407.0,443.0,311.0,246.0,203.0,172.0,202.0,147.0,134.0,...,269.0,344.0,417.0,415.0,486.0,356.0,394.0,236.0,285.0,271.0
2018-04-20 16:00:00,420.0,517.0,407.0,443.0,311.0,246.0,203.0,172.0,202.0,147.0,...,246.0,269.0,344.0,417.0,415.0,486.0,356.0,394.0,236.0,285.0


In [8]:
#Fit model

#Define model to be used
model = LinearRegression().fit(X_train, y_train)

NameError: name 'LinearRegression' is not defined

In [83]:
len(X_dev)

7450

In [79]:
X_train_raw = marag_df[:X_train[-1:].index.astype(str)[0]]

In [82]:
#Walk-forward validation

X_train_raw = marag_df[:X_train[-1:].index.astype(str)[0]]

def walk_forward_validation(X_train, y_train, X_test, L, model):

    '''This function integrates a walk-forward validation method for multiple models.
    Parameters: model, #lags, train data, test data, length of test_data'''

    X_grow = X_train_raw.astype('float64').sort_index()
    y_grow_hat = []
    #RMSE_record = []
    for i in range(len(X_test)):
        y_hat = model.predict(X_test[i:i+1])
        #RMSE_record.append(y_hat)
        y_grow_hat.append(y_hat)
        y_hat = pd.DataFrame(y_hat, index=X_test[i:(i+1)].index, columns=['TotalEntries'])
        #y_hat = pd.DataFrame(y_hat, columns=['TotalEntries'])
        #y_hat.set_index(X_dev[i:(i+1)].index, inplace=True)
        X_grow = pd.concat([X_grow, y_hat])
        X_data = create_lags(X_grow, L)
        model.fit(X_data.drop(['TotalEntries'], axis=1), X_data['TotalEntries'])
        #y_hat_trans = create_lags(pd.DataFrame(y_hat), 49)
        #X_grow = pd.concat(X_grow, y_hat_trans)
        #model.fit(X_grow)
    
    
    return y_grow_hat


    
    
    
    
    #def recurrent_prediction(y_train, n_steps, reg):
    #x_dev = list(y_train[-1:-(n_steps +1):-1])
    #y_dev_hat = []
    #for i in range(n_steps):
     #   y_hat = reg.predict([x_dev])[0]
     #   y_dev_hat.append(y_hat)
     #   x_dev.pop(-1)
     #   x_dev = [y_hat] + x_dev
    #return y_dev_hat

#y_test_hat = recurrent_prediction(y_train, 48, lr_model)
#np.sqrt(mean_squared_error(y_test[:48], y_test_hat))

In [23]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression().fit(X_train, y_train)
y_hat = lr_model.predict(X_dev[:1])

In [24]:
y_hat

array([365.18346681])

In [25]:
y_hat = pd.DataFrame(y_hat, index=X_dev[0:(0+1)].index, columns=['TotalEntries'])

In [26]:
y_hat

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 14:00:00,365.183467


In [11]:
y_hat = pd.DataFrame(y_hat, index=X_dev[:1].index, columns=['TotalEntries'])
y_hat

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 14:00:00,365.183467


In [12]:
y_hat = pd.DataFrame(y_hat, columns=['TotalEntries'])
y_hat

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 14:00:00,365.183467


In [13]:
y_hat = pd.DataFrame(y_hat, index=X_dev[:1].index)
y_hat

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 14:00:00,365.183467


In [14]:
y_hat = pd.DataFrame(y_hat).set_index(X_dev[:1].index, inplace=True)
y_hat

In [15]:
type(y_train)

pandas.core.series.Series

In [28]:
y_train.head()

Timestamp
2017-01-10 00:00:00    0
2017-01-10 00:30:00    0
2017-01-10 01:00:00    0
2017-01-10 01:30:00    0
2017-01-10 02:00:00    0
Name: TotalEntries, dtype: int64

In [16]:
y_train[-10:]

Timestamp
2018-04-20 09:00:00    221
2018-04-20 09:30:00    160
2018-04-20 10:00:00    177
2018-04-20 10:30:00    134
2018-04-20 11:00:00    147
2018-04-20 11:30:00    202
2018-04-20 12:00:00    172
2018-04-20 12:30:00    203
2018-04-20 13:00:00    246
2018-04-20 13:30:00    311
Name: TotalEntries, dtype: int64

In [27]:
pd.concat([y_train, (y_hat)])[-10:]

Unnamed: 0_level_0,0,TotalEntries
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-04-20 09:30:00,160.0,
2018-04-20 10:00:00,177.0,
2018-04-20 10:30:00,134.0,
2018-04-20 11:00:00,147.0,
2018-04-20 11:30:00,202.0,
2018-04-20 12:00:00,172.0,
2018-04-20 12:30:00,203.0,
2018-04-20 13:00:00,246.0,
2018-04-20 13:30:00,311.0,
2018-04-20 14:00:00,,365.183467


In [80]:
X_grow = X_train_raw.astype('float64').sort_index()
X_grow.tail()

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 11:30:00,202.0
2018-04-20 12:00:00,172.0
2018-04-20 12:30:00,203.0
2018-04-20 13:00:00,246.0
2018-04-20 13:30:00,311.0


In [81]:
pd.concat([X_grow, y_hat]).tail()

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 12:00:00,172.0
2018-04-20 12:30:00,203.0
2018-04-20 13:00:00,246.0
2018-04-20 13:30:00,311.0
2018-04-20 14:00:00,365.183467


In [78]:
marag_df[:X_train[-1:].index.astype(str)[0]].tail()

Unnamed: 0_level_0,TotalEntries
Timestamp,Unnamed: 1_level_1
2018-04-20 11:30:00,202
2018-04-20 12:00:00,172
2018-04-20 12:30:00,203
2018-04-20 13:00:00,246
2018-04-20 13:30:00,311
