In [24]:
import numpy as np
import pandas as pd
import copy as cp

In [8]:
# Seed the global NumPy generator so that Restart & Run All regenerates the
# exact same synthetic series every time.
SEED = 42
np.random.seed(SEED)

N = 600

# Irregular timestamps: integer ticks 0..N-1, pushed forward by a uniform jitter
# in [0, 1/4) and pulled back by a uniform jitter in [0, 1/7), rounded to 2 decimals.
t = np.arange(0, N, 1).reshape(-1, 1)
t = t + np.random.rand(N, 1) / 4
t = t - np.random.rand(N, 1) / 7
t = np.round(t, 2)

# Three predictors, each an (N, 1) column of uniform draws in [0, 5).
x1 = np.round((np.random.random(N) * 5).reshape(-1, 1), 2)
x2 = np.round((np.random.random(N) * 5).reshape(-1, 1), 2)
x3 = np.round((np.random.random(N) * 5).reshape(-1, 1), 2)

# Additive noise, uniform in [0, 2).
n = np.round((np.random.random(N) * 2).reshape(-1, 1), 2)

# Response at step k:
#   y[k] = log|2 + x1[k]| - x2[k-1]**2 + 0.02 * x3[k-3] * exp(x1[k-1]) + n[k]
# np.roll(a, s, axis=0)[k] == a[k - s], including the wrap-around that plain
# negative indexing gives: the first rows take their lagged inputs from the END
# of the series (e.g. y[0] uses x2[-1] and x3[-3]).
x1_lag1 = np.roll(x1, 1, axis=0)
x2_lag1 = np.roll(x2, 1, axis=0)
x3_lag3 = np.roll(x3, 3, axis=0)

y = (np.log(np.abs(2 + x1)) - x2_lag1**2) + 0.02 * x3_lag3 * np.exp(x1_lag1)
y = np.round(y + n, 2)

In [9]:
# Put the raw (N, 1) series side by side, one named column per signal.
dataset = pd.DataFrame(
    np.concatenate((t, x1, x2, x3, y), axis=1),
    columns=['t', 'x1', 'x2', 'x3', 'y'],
)

# Time elapsed since the previous sample; the first sample has no predecessor,
# so its ∆t is set to 0.
deltaT = np.concatenate((np.array([0]), np.diff(dataset['t'].values)))

# Place ∆t right after the absolute timestamp.
dataset.insert(1, '∆t', deltaT)
dataset.head(3)

Unnamed: 0,t,∆t,x1,x2,x3,y
0,0.08,0.0,0.79,4.43,0.96,-8.97
1,1.12,1.04,4.14,0.35,2.11,-16.07
2,2.01,0.89,0.83,1.76,0.31,4.28


In [12]:
# NOTE: this cell repeats the dataset construction done by the previous cell.
# It rebuilds `dataset` from the raw arrays, so running both is harmless but redundant.
raw_series = (t, x1, x2, x3, y)
dataset = pd.DataFrame(np.concatenate(raw_series, axis=1),
                       columns=['t', 'x1', 'x2', 'x3', 'y'])

# ∆t[i] = t[i] - t[i-1], with 0 for the very first sample.
stamps = dataset['t'].values
deltaT = np.concatenate((np.array([0]), stamps[1:] - stamps[:-1]))

dataset.insert(1, '∆t', deltaT)
dataset.head(3)

Unnamed: 0,t,∆t,x1,x2,x3,y
0,0.08,0.0,0.79,4.43,0.96,-8.97
1,1.12,1.04,4.14,0.35,2.11,-16.07
2,2.01,0.89,0.83,1.76,0.31,4.28


In [25]:
class WindowSlider(object):
    '''
    Builds a table of sliding windows from a time-ordered DataFrame.

    Each output row holds the last `w` values of every predictor column
    (flattened column by column), the elapsed time up to the step being
    predicted, and the response value at that step.
    '''

    def __init__(self, window_size = 5):
        '''
        Window Slider object
        ====================
        w: window_size - number of time steps to look back
        o: offset - stored by collect_windows; not used in the window computation
        r: response_size - number of time steps to predict (fixed to 1)
        l: maximum length to slide - (#observations - (w + r) + 1)
        p: final predictors - (#predictors * w)
        names: column names of the most recently built window table
        '''
        self.w = window_size
        self.o = 0
        self.r = 1
        self.l = 0
        self.p = 0
        self.names = []

    def re_init(self, arr):
        '''
        Turn a run of time increments into elapsed time since the first entry:
        cumulative sum of `arr`, shifted so that the first element is 0.
        '''
        arr = np.cumsum(arr)
        return arr - arr[0]

    def collect_windows(self, X, window_size=5, offset=0, previous_y=False):
        '''
        Build the window -> response table for X.

        Input:
            X: DataFrame, one variable per column. Column 0 is treated as the
               time increment (∆t) and the last column as the response.
            window_size: look-back length; overrides the value given to the
               constructor.
            offset: stored on the instance as `o`; currently has no effect.
            previous_y: when True, lagged values of the response column are
               included among the predictors.
        Returns: DataFrame with one row per window position. Columns are
            '<col>(1)'..'<col>(w)' for every predictor column, then '∆t(w+1)'
            (elapsed time from the window start to the predicted step), then 'Y'.
        Raises: ValueError if window_size < 1 or X has too few rows.
        '''
        if window_size < 1:
            raise ValueError('window_size must be at least 1, got %r' % (window_size,))

        n_obs = len(X)
        self.o = offset
        self.w = window_size
        self.l = n_obs - (self.w + self.r) + 1
        if self.l < 0:
            raise ValueError('X has %d rows, too few for window_size=%d'
                             % (n_obs, self.w))

        # The response column only contributes lagged features when requested.
        predictors = X if previous_y else X.drop(X.columns[-1], axis=1)
        n_pred = predictors.shape[1]
        self.p = n_pred * self.w

        # Rebuild the names from scratch on every call: appending to the list
        # kept on the instance would make a second call produce duplicated
        # names and a column-count mismatch.
        names = ['%s(%d)' % (col, lag)
                 for col in predictors.columns
                 for lag in range(1, self.w + 1)]
        # Timestamp(s) of the step(s) we want to predict, then the response.
        names.extend('∆t(%d)' % (self.w + k + 1) for k in range(self.r))
        names.append('Y')
        self.names = names

        # Convert once and fill a plain array; the DataFrame is built at the end.
        values = X.values
        table = np.zeros(shape=(self.l, self.p + self.r + 1))

        for i in range(self.l):
            stop = i + self.w

            # One lag vector per predictor column, in column order.
            pieces = [values[i:stop, p] for p in range(n_pred)]
            if pieces:
                # ∆t restarts from 0 at the beginning of every window.
                pieces[0] = self.re_init(pieces[0])

            # Elapsed time from the window start up to the predicted step.
            horizon = self.re_init(values[i:stop + self.r, 0])[-1]
            target = values[stop + self.r - 1, -1]
            pieces.append([horizon, target])

            table[i, :] = np.concatenate(pieces)

        return pd.DataFrame(table, columns=names)

In [17]:
from sklearn.model_selection import train_test_split


In [21]:
# Hold out 20% of the rows for testing. shuffle=False keeps the rows in their
# original (time) order, so the test set is the tail of the series.
trainset, testset = train_test_split(dataset, test_size = 0.2, shuffle = False)

In [22]:
# Show the training split (pandas elides the middle rows of a long frame).
trainset

Unnamed: 0,t,∆t,x1,x2,x3,y
0,0.08,0.00,0.79,4.43,0.96,-8.97
1,1.12,1.04,4.14,0.35,2.11,-16.07
2,2.01,0.89,0.83,1.76,0.31,4.28
3,3.06,1.05,4.42,4.01,3.82,-0.60
4,4.20,1.14,4.39,1.68,1.80,-9.88
...,...,...,...,...,...,...
475,475.01,0.85,4.24,1.66,1.39,0.99
476,475.97,0.96,2.08,2.24,2.36,5.07
477,477.09,1.12,4.28,4.16,2.89,-1.73
478,477.99,0.90,4.97,3.29,0.60,-12.60


In [26]:
# Look-back window length shared by every windowed table below. It is passed
# explicitly so that changing it here actually changes the windows.
w = 5

# Column 0 of the split frames is the absolute timestamp `t`; it is dropped so
# that ∆t becomes the first column handed to the slider.
train_features = trainset.iloc[:, 1:]
test_features = testset.iloc[:, 1:]

# Windows built from the predictors only.
train_constructor = WindowSlider()
train_windows = train_constructor.collect_windows(train_features, window_size=w, previous_y=False)

test_constructor = WindowSlider()
test_windows = test_constructor.collect_windows(test_features, window_size=w, previous_y=False)

# Windows that also include the lagged response among the predictors.
train_constructor_y_inc = WindowSlider()
train_windows_y_inc = train_constructor_y_inc.collect_windows(train_features, window_size=w, previous_y=True)

test_constructor_y_inc = WindowSlider()
test_windows_y_inc = test_constructor_y_inc.collect_windows(test_features, window_size=w, previous_y=True)

train_windows.head(3)

Unnamed: 0,∆t(1),∆t(2),∆t(3),∆t(4),∆t(5),x1(1),x1(2),x1(3),x1(4),x1(5),...,x2(3),x2(4),x2(5),x3(1),x3(2),x3(3),x3(4),x3(5),∆t(6),Y
0,0.0,1.04,1.93,2.98,4.12,0.79,4.14,0.83,4.42,4.39,...,1.76,4.01,1.68,0.96,2.11,0.31,3.82,1.8,5.01,0.24
1,0.0,0.89,1.94,3.08,3.97,4.14,0.83,4.42,4.39,3.81,...,4.01,1.68,1.82,2.11,0.31,3.82,1.8,0.77,4.89,2.78
2,0.0,1.05,2.19,3.08,4.0,0.83,4.42,4.39,3.81,2.88,...,1.68,1.82,2.58,0.31,3.82,1.8,0.77,3.07,4.91,-2.99
