In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

import time

# Globally suppress Python warnings emitted after this point.
warnings.filterwarnings('ignore')

In [3]:
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return the memory used by the current process in GiB, rounded to 2 decimals.

    The value is the first field of ``psutil``'s ``memory_info()`` result for
    this process, divided by 2**30.
    """
    current_process = psutil.Process(os.getpid())
    used_bytes = current_process.memory_info()[0]
    used_gib = used_bytes / 2.**30
    return np.round(used_gib, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format ``num`` as a human-readable size with binary (1024-based) prefixes.

    Parameters
    ----------
    num : int or float
        Quantity to format (may be negative).
    suffix : str, default 'B'
        Unit appended after the binary prefix.

    Returns
    -------
    str
        ``num`` scaled down by powers of 1024 until its magnitude is below
        1024, printed with one decimal, followed by the prefix and ``suffix``
        (e.g. ``'1.5KiB'``). Values too large for 'Zi' are reported in 'Yi'.
    """
    binary_prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    position = 0
    while position < len(binary_prefixes):
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, binary_prefixes[position], suffix)
        value = value / 1024.0
        position += 1
    # Every listed prefix was exhausted: fall back to the largest one handled.
    return "%.1f%s%s" % (value, 'Yi', suffix)

## Lag Features 

A lag feature is a fancy name for a variable that contains data from prior time steps. If we have time-series data, we can convert it into rows. Every row contains data about one observation and includes all previous occurrences of that observation.

Let us assume that I have time-series data about a process. I want to predict the next value of the target variable using the last five values of that variable and other features which describe the process.

In [4]:
import numpy as np
import pandas as pd

In [5]:
# Toy data set: two observations (id 1 and id 2) with six time steps each.
# Each row is (id, constant_feature, time_dependent_feature, target_variable).
input_data = pd.DataFrame(
    data=[
        (1, 5, 12, 4),
        (1, 5, 16, 5),
        (1, 5, 20, 6),
        (1, 5, 8, 3),
        (1, 5, 10, 3.5),
        (1, 5, 22, 6.5),
        (2, 8, 12, 44),
        (2, 8, 10, 33),
        (2, 8, 14, 50),
        (2, 8, 8, 15),
        (2, 8, 0, 0),
        (2, 8, 3, -5),
    ],
    columns=[
        'id',
        'constant_feature',
        'time_dependent_feature',
        'target_variable',
    ],
)

We want to get something like
[
    #  ['target_lag_1', 'target_lag_2', 'target_lag_3', 'target_lag_4',
    #   'target_lag_5', 'time_dependent_feature_lag_0',
    #   'time_dependent_feature_lag_1', 'time_dependent_feature_lag_2',
    #   'time_dependent_feature_lag_3', 'time_dependent_feature_lag_4',
    #   'time_dependent_feature_lag_5', 'constant_feature']
    [ 3.5,  3. ,  6. ,  5. ,  4. , 22. , 10. ,  8. , 20. , 16. , 12. , 5. ],
    [ 0., 15., 50., 33., 44.,  3.,  0.,  8., 14., 10., 12.,  8.]
]

First, we have to index the data frame by the observation identifier. This makes it easy to retrieve the time-series data of a single observation.

In [7]:
# Index the frame by observation id. The guard keeps this cell idempotent:
# re-running it after 'id' has already become the index would otherwise raise
# a KeyError, because the 'id' column no longer exists.
if input_data.index.name != 'id':
    input_data = input_data.set_index('id')

In [8]:
input_data

Unnamed: 0_level_0,constant_feature,time_dependent_feature,target_variable
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5,12,4.0
1,5,16,5.0
1,5,20,6.0
1,5,8,3.0
1,5,10,3.5
1,5,22,6.5
2,8,12,44.0
2,8,10,33.0
2,8,14,50.0
2,8,8,15.0


Now let us find out which unique observations we have.

In [9]:
# Distinct labels of input_data's index; the loop below handles one label at a time.
unique_ids = input_data.index.unique()

In [10]:
unique_ids

Int64Index([1, 2], dtype='int64', name='id')

We are going to iterate over those values and follow these steps:

1. Select only the time-series data related to that one observation.
2. Extract all values of the time-series variables (time_dependent_feature and target_variable).
3. Shift the target variable five times to get five lag features and the new dependent feature (the most recent observation).
4. Shift the other time-series variable six times to get all lag values of that independent feature.
5. Copy the non-time-series variables
6. Split the data frame into the independent features and the dependent features
7. Store them in arrays that will be used later for feature scaling, splitting into training/validation/test sets, and finally for the training of model.

In [12]:
# Number of previous target values used as lag features. The time-dependent
# feature additionally keeps its current value (lag 0), i.e. N_LAGS + 1 columns.
N_LAGS = 5

X = []  # one flattened feature vector per observation id
Y = []  # one Series of dependent values per observation id

for identifier in unique_ids:
    # 1. Rows that belong to this observation only. Selecting with a list keeps
    #    the result a DataFrame even when an id has a single row (a scalar
    #    label would return a Series and break the column handling below).
    single_process_data = input_data.loc[[identifier]]

    # 2. Keep the time-series columns; the target becomes the dependent value 'y'.
    #    rename() returns a new frame, so input_data itself is never modified.
    data = single_process_data[['target_variable', 'time_dependent_feature']].rename(
        columns={'target_variable': 'y'}
    )

    # 3. Previous N_LAGS values of the target as lag features
    #    (the un-shifted 'y' stays the dependent value).
    for lag in range(1, N_LAGS + 1):
        data['target_lag_{}'.format(lag)] = data['y'].shift(lag)

    # 4. Current value (lag 0) and previous N_LAGS values of time_dependent_feature.
    for lag in range(0, N_LAGS + 1):
        data['time_dependent_feature_lag_{}'.format(lag)] = data['time_dependent_feature'].shift(lag)

    # 5. Copy the constant column. Assign by position: every row of one
    #    observation shares the same index label, so there is nothing to align on.
    data['constant_feature'] = single_process_data['constant_feature'].to_numpy()

    # Rows without a complete lag history contain NaN from shift(); drop them.
    data = data.dropna()

    # 6. Split into the dependent value and the independent features.
    y = data['y']
    x = data.drop(columns=['y', 'time_dependent_feature'])

    # 7. Collect the results.
    # NOTE(review): flatten() yields a single sample only when exactly one row
    # survives dropna(), i.e. when the observation has exactly N_LAGS + 1 rows
    # (true for the sample data). Longer series would be concatenated into one
    # long vector -- confirm the intended behaviour before using real data.
    X.append(np.array(x).flatten())
    Y.append(y)

In [13]:
X

[array([ 3.5,  3. ,  6. ,  5. ,  4. , 22. , 10. ,  8. , 20. , 16. , 12. ,
         5. ]),
 array([ 0., 15., 50., 33., 44.,  3.,  0.,  8., 14., 10., 12.,  8.])]

In [14]:
Y

[id
 1    6.5
 Name: y, dtype: float64,
 id
 2   -5.0
 Name: y, dtype: float64]

Now the X variable contains all the independent features, and the Y variable contains the dependent feature.

https://www.mikulskibartosz.name/forecasting-time-series-using-lag-features/#:~:text=A%20lag%20features%20is%20a,previous%20occurrences%20of%20that%20observation.