## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import copy as cp

## Import Data

In [17]:
data = pd.read_csv('data/sales_train_evaluation.csv')
data.head()

## Arrange Data for prediction

In [48]:
N = 1941
t = np.arange(0, N, 1).reshape(-1,1)

## train X and Y have to be declared here

item_id = 'HOBBIES_1_001'
store_id = 'CA_1'
example = data[(data['item_id'] == item_id) & (data['store_id'] == 'CA_1')].to_numpy()[0][6:]
example.size

previous_days = 4
x1 = np.array([example[i-1] if i - 1 >= 0 else 0 for i in range(1,N)] ).reshape(-1,1)
# np.insert(x1, 1, 4120, axis=0)
x1.size

1940

## Sliding Windows Model

In [2]:
# Random Data

N = 600
t = np.arange(0, N, 1).reshape(-1,1)
t = np.array([t[i] + np.random.rand(1)/4 for i in range(len(t))])
t = np.array([t[i] - np.random.rand(1)/7 for i in range(len(t))])
t = np.array(np.round(t, 2))

x1 = np.round((np.random.random(N) * 5).reshape(-1,1), 2)
x2 = np.round((np.random.random(N) * 5).reshape(-1,1), 2)
x3 = np.round((np.random.random(N) * 5).reshape(-1,1), 2)

n = np.round((np.random.random(N) * 2).reshape(-1,1), 2)

y = np.array([((np.log(np.abs(2 + x1[t])) - x2[t-1]**2) + 0.02*x3[t-3]*np.exp(x1[t-1])) for t in range(len(t))])
y = np.round(y+n, 2)

In [3]:
dataset = pd.DataFrame(np.concatenate((t, x1, x2, x3, x4, y), axis=1), 
                       columns=['t', 'x1', 'x2', 'x3', 'y'])

deltaT = np.array([(dataset.t[i + 1] - dataset.t[i]) for i in range(len(dataset)-1)])
deltaT = np.concatenate((np.array([0]), deltaT))

dataset.insert(1, '∆t', deltaT)
dataset.head(3)

Unnamed: 0,t,∆t,x1,x2,x3,y
0,-0.12,0.0,1.51,3.14,2.1,2.0
1,1.01,1.13,3.7,3.1,0.93,-7.31
2,1.94,0.93,4.92,3.91,0.28,-3.14


In [4]:
dataset = pd.DataFrame(np.concatenate((t, x1, x2, x3, y), axis=1), 
                       columns=['t', 'x1', 'x2', 'x3', 'y'])

deltaT = np.array([(dataset.t[i + 1] - dataset.t[i]) for i in range(len(dataset)-1)])
deltaT = np.concatenate((np.array([0]), deltaT))

dataset.insert(1, '∆t', deltaT)
dataset.head(3)

Unnamed: 0,t,∆t,x1,x2,x3,y
0,-0.12,0.0,1.51,3.14,2.1,2.0
1,1.01,1.13,3.7,3.1,0.93,-7.31
2,1.94,0.93,4.92,3.91,0.28,-3.14


In [5]:
class WindowSlider(object):
    
    def __init__(self, window_size = 5):        
        '''
        Window Slider object
        ====================
        w: window_size - number of time steps to look back
        o: offset between last reading and temperature
        r: response_size - number of time steps to predict
        l: maximum length to slide - (#observation - w)
        p: final predictors - (#predictors * w)
        '''
        self.w = window_size
        self.o = 0
        self.r = 1       
        self.l = 0
        self.p = 0
        self.names = []
        
    def re_init(self, arr):
        '''
        Helper function to initializate to 0 a vector
        '''
        arr = np.cumsum(arr)
        return arr - arr[0]
                

    def collect_windows(self, X, window_size=5, offset=0, previous_y=False):
        '''
        Input: X is the input matrix, each column is a variable
        Returns: diferent mappings window-output
        '''
        cols = len(list(X)) - 1
        N = len(X)
        
        self.o = offset
        self.w = window_size
        self.l = N - (self.w + self.r) + 1
        if not previous_y: self.p = cols * (self.w)
        if previous_y: self.p = (cols + 1) * (self.w)
        
        # Create the names of the variables in the window
        # Check first if we need to create that for the response itself
        if previous_y: x = cp.deepcopy(X)
        if not previous_y: x = X.drop(X.columns[-1], axis=1)  
        
        for j, col in enumerate(list(x)):        
                
            for i in range(self.w):
                
                name = col + ('(%d)' % (i+1))
                self.names.append(name)
        
        # Incorporate the timestamps where we want to predict
        for k in range(self.r):
            
            name = '∆t' + ('(%d)' % (self.w + k + 1))
            self.names.append(name)
            
        self.names.append('Y')
                
        df = pd.DataFrame(np.zeros(shape=(self.l, (self.p + self.r + 1))), 
                          columns=self.names)
        
        # Populate by rows in the new dataframe
        for i in range(self.l):
            
            slices = np.array([])
            
            # Flatten the lags of predictors
            for p in range(x.shape[1]):
            
                line = X.values[i:self.w + i, p]
                # Reinitialization at every window for ∆T
                if p == 0: line = self.re_init(line)
                    
                # Concatenate the lines in one slice    
                slices = np.concatenate((slices, line)) 
 
            # Incorporate the timestamps where we want to predict
            line = np.array([self.re_init(X.values[i:i+self.w+self.r, 0])[-1]])
            y = np.array(X.values[self.w + i + self.r - 1, -1]).reshape(1,)
            slices = np.concatenate((slices, line, y))
            
            # Incorporate the slice to the cake (df)
            df.iloc[i,:] = slices
            
        return df

In [6]:
from sklearn.model_selection import train_test_split


In [7]:
trainset, testset = train_test_split(dataset, test_size = 0.2, shuffle = False)

In [8]:
trainset

Unnamed: 0,t,∆t,x1,x2,x3,y
0,-0.12,0.00,1.51,3.14,2.10,2.00
1,1.01,1.13,3.70,3.10,0.93,-7.31
2,1.94,0.93,4.92,3.91,0.28,-3.14
3,3.05,1.11,0.29,3.26,3.59,-8.18
4,3.97,0.92,3.69,2.35,1.25,-8.25
...,...,...,...,...,...,...
475,474.96,0.91,2.04,1.10,3.15,-0.76
476,475.98,1.02,2.11,1.67,3.08,0.57
477,476.91,0.93,2.44,3.61,4.91,0.29
478,477.97,1.06,1.04,3.23,4.08,-10.65


In [9]:
w = 5
train_constructor = WindowSlider()
train_windows = train_constructor.collect_windows(trainset.iloc[:,1:], previous_y=False)

test_constructor = WindowSlider()
test_windows = test_constructor.collect_windows(testset.iloc[:,1:], previous_y=False)

train_constructor_y_inc = WindowSlider()
train_windows_y_inc = train_constructor_y_inc.collect_windows(trainset.iloc[:,1:], previous_y=True)

test_constructor_y_inc = WindowSlider()
test_windows_y_inc = test_constructor_y_inc.collect_windows(testset.iloc[:,1:], previous_y=True)

train_windows.head(3)

Unnamed: 0,∆t(1),∆t(2),∆t(3),∆t(4),∆t(5),x1(1),x1(2),x1(3),x1(4),x1(5),...,x2(3),x2(4),x2(5),x3(1),x3(2),x3(3),x3(4),x3(5),∆t(6),Y
0,0.0,1.13,2.06,3.17,4.09,1.51,3.7,4.92,0.29,3.69,...,3.91,3.26,2.35,2.1,0.93,0.28,3.59,1.25,5.05,-2.87
1,0.0,0.93,2.04,2.96,3.92,3.7,4.92,0.29,3.69,0.04,...,3.26,2.35,2.48,0.93,0.28,3.59,1.25,3.24,4.9,-3.85
2,0.0,1.11,2.03,2.99,3.97,4.92,0.29,3.69,0.04,0.78,...,2.35,2.48,0.68,0.28,3.59,1.25,3.24,4.15,5.02,3.11


In [10]:
# ________________ Y_pred = current Y ________________ 
bl_trainset = cp.deepcopy(trainset)
bl_testset = cp.deepcopy(testset)

bl_y = pd.DataFrame(bl_testset['y'])
bl_y_pred = bl_y.shift(periods=1)

bl_residuals = bl_y_pred - bl_y
bl_rmse = np.sqrt(np.sum(np.power(bl_residuals,2)) / len(bl_residuals))
print('RMSE = %.2f' % bl_rmse)
print('Time to train = 0 seconds')
## RMSE = 11.28

RMSE = 11.09
Time to train = 0 seconds


In [11]:
# ______________ MULTIPLE LINEAR REGRESSION ______________ #
from sklearn.linear_model import LinearRegression
import time
lr_model = LinearRegression()
lr_model.fit(trainset.iloc[:,:-1], trainset.iloc[:,-1])

t0 = time.time()
lr_y = testset['y'].values
lr_y_fit = lr_model.predict(trainset.iloc[:,:-1])
lr_y_pred = lr_model.predict(testset.iloc[:,:-1])
tF = time.time()

lr_residuals = lr_y_pred - lr_y
lr_rmse = np.sqrt(np.sum(np.power(lr_residuals,2)) / len(lr_residuals))
print('RMSE = %.2f' % lr_rmse)
print('Time to train = %.2f seconds' % (tF - t0))
## RMSE = 8.61 
## Time to train = 0.00 seconds

RMSE = 8.10
Time to train = 0.01 seconds


In [14]:
# ___________ MULTIPLE LINEAR REGRESSION ON WINDOWS ___________ 
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(train_windows.iloc[:,:-1], train_windows.iloc[:,-1])

t0 = time.time()
lr_y = test_windows['Y'].values
lr_y_fit = lr_model.predict(train_windows.iloc[:,:-1])
lr_y_pred = lr_model.predict(test_windows.iloc[:,:-1])
tF = time.time()

lr_residuals = lr_y_pred - lr_y
lr_rmse = np.sqrt(np.sum(np.power(lr_residuals,2)) / len(lr_residuals))
print('RMSE = %.2f' % lr_rmse)
print('Time to train = %.2f seconds' % (tF - t0))
## RMSE = 3.84
## Time to train = 0.00 seconds

RMSE = 2.28
Time to train = 0.01 seconds


In [13]:
test_windows

Unnamed: 0,∆t(1),∆t(2),∆t(3),∆t(4),∆t(5),x1(1),x1(2),x1(3),x1(4),x1(5),...,x2(3),x2(4),x2(5),x3(1),x3(2),x3(3),x3(4),x3(5),∆t(6),Y
0,0.0,0.99,1.94,2.98,4.07,0.98,0.11,2.00,2.32,0.78,...,3.50,2.81,4.63,2.72,4.41,2.95,4.37,2.61,4.89,-20.08
1,0.0,0.95,1.99,3.08,3.90,0.11,2.00,2.32,0.78,0.34,...,2.81,4.63,4.47,4.41,2.95,4.37,2.61,4.58,4.96,-16.50
2,0.0,1.04,2.13,2.95,4.01,2.00,2.32,0.78,0.34,2.26,...,4.63,4.47,3.17,2.95,4.37,2.61,4.58,3.89,5.00,-7.16
3,0.0,1.09,1.91,2.97,3.96,2.32,0.78,0.34,2.26,2.00,...,4.47,3.17,1.00,4.37,2.61,4.58,3.89,1.47,5.05,0.87
4,0.0,0.82,1.88,2.87,3.96,0.78,0.34,2.26,2.00,0.67,...,3.17,1.00,2.06,2.61,4.58,3.89,1.47,0.60,4.91,-1.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,0.0,0.99,2.03,3.09,4.09,3.52,2.17,0.18,2.10,2.15,...,5.00,0.81,2.87,0.48,0.14,2.07,1.42,1.32,4.98,-5.87
111,0.0,1.04,2.10,3.10,3.99,2.17,0.18,2.10,2.15,1.61,...,0.81,2.87,3.43,0.14,2.07,1.42,1.32,0.94,5.00,-9.11
112,0.0,1.06,2.06,2.95,3.96,0.18,2.10,2.15,1.61,2.64,...,2.87,3.43,3.65,2.07,1.42,1.32,0.94,1.57,4.98,-10.92
113,0.0,1.00,1.89,2.90,3.92,2.10,2.15,1.61,2.64,4.61,...,3.43,3.65,0.95,1.42,1.32,0.94,1.57,0.56,5.02,3.67
