This notebook tries with n_lag = 4

In [1]:
# Input: 
path_data = '../../01 data/train_pollution.csv'
target_var = 'target_carbon_monoxide'
split_ratio_train = 0.65
split_ratio_val = 0.15
split_ratio_test = 0.2

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.preprocessing import StandardScaler

In [3]:
# Convert to datetime
def date_parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

# Load data
data = pd.read_csv(path_data)
data['date_time'] = data['date_time'].apply(date_parser)
print(data.shape)
data.head(3)

(7111, 12)


Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1


In [4]:
# Select features
# If univariate, select the only target var.
# If multivariate, select the target var and features.
data = data[
    ['target_carbon_monoxide', 'sensor_1', 'sensor_2', 'sensor_5', 'target_benzene', 'target_nitrogen_oxides']
]

In [5]:
# Feature engineering with lag time
def feature_shift_time(data: pd.DataFrame, lag=1, n_out=1, dropnan=True):
    var_names = list(data.columns)
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    column, feature = list(), list()
    
    # input sequence (t-n, ... t-1)
    for i in range(lag, 0, -1):
        column.append(df.shift(i))
        feature += [f'{v}(t-{i})' for v in var_names]
    
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        column.append(df.shift(-i))
        if i == 0:
            feature += [f'{v}(t)' for v in var_names]
        else:
            feature += [f'{v}(t+{i})' for v in var_names]
    # put it all together
    agg = pd.concat(column, axis=1)
    agg.columns = feature
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [6]:
# Train, val, test split
end_train = int(round(split_ratio_train * len(data), 0))
end_val = int(round((split_ratio_val + split_ratio_train) * len(data), 0))

In [7]:
# Standard scaler
# scaler = StandardScaler()
# scaler.fit(data.iloc[:end_train, :])
# Save the scaler
# pickle.dump(scaler, open('../../03 outputs/multivariate/01/time_scaler.pickle', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

# Load standard scaler
scaler = pickle.load(open('../../03 outputs/multivariate/01/time_scaler.pickle', 'rb'))

In [8]:
### Set the n_lag based on the PACF
n_lag = 4

# Scale
data_prep = data.copy()
data_prep = pd.DataFrame(scaler.transform(data), columns=data_prep.columns)

# Feature engineering with lag time
data_prep = feature_shift_time(data_prep, lag=n_lag, n_out=1)

# Remove features at time t
drop_col = [c for c in data_prep.columns if '(t)' in c and c != target_var]
data_prep = data_prep.drop(columns=[])

print(data_prep.shape)
data_prep.head(3)

(7107, 30)


Unnamed: 0,target_carbon_monoxide(t-4),sensor_1(t-4),sensor_2(t-4),sensor_5(t-4),target_benzene(t-4),target_nitrogen_oxides(t-4),target_carbon_monoxide(t-3),sensor_1(t-3),sensor_2(t-3),sensor_5(t-3),...,sensor_2(t-1),sensor_5(t-1),target_benzene(t-1),target_nitrogen_oxides(t-1),target_carbon_monoxide(t),sensor_1(t),sensor_2(t),sensor_5(t),target_benzene(t),target_nitrogen_oxides(t)
4,0.548406,1.579574,0.661965,1.054539,0.371265,0.470854,0.212758,1.035971,-0.113891,0.223051,...,0.044701,0.965651,0.027344,0.576968,-0.290715,1.000267,-0.251881,0.580076,-0.466107,-0.041843
5,0.212758,1.035971,-0.113891,0.223051,0.05725,-0.297634,0.29667,1.301487,0.047033,0.535337,...,-0.251881,0.580076,-0.466107,-0.041843,-0.542452,0.743301,-0.855152,0.040859,-0.765169,-0.418268
6,0.29667,1.301487,0.047033,0.535337,-0.047421,0.017357,0.29667,1.246674,0.044701,0.965651,...,-0.855152,0.040859,-0.765169,-0.418268,-0.542452,0.86047,-0.964767,-0.487173,-0.86984,-0.737727


In [26]:
# Create X and y
X_col = [c for c in data_prep if '(t)' not in c]
y_col = [f'{target_var}(t)']

X_train = data_prep.loc[:end_train, X_col]
X_val = data_prep.loc[end_train+1:end_val, X_col]
X_test = data_prep.loc[end_val+1:, X_col]

y_train = data_prep.loc[:end_train, y_col]
y_val = data_prep.loc[end_train+1:end_val, y_col]
y_test = data_prep.loc[end_val+1:, y_col]

In [27]:
# Reshape input to be 3D [n of samples, n of lag timesteps, n of features]
n_features = len(data.columns)
X_train = X_train.values.reshape((X_train.shape[0], n_lag, n_features))
X_val = X_val.values.reshape((X_val.shape[0], n_lag, n_features))
X_test = X_test.values.reshape((X_test.shape[0], n_lag, n_features))
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(4619, 4, 6) (4619, 1) (1067, 4, 6) (1067, 1) (1421, 4, 6) (1421, 1)


In [29]:
# Save
np.save('../../03 outputs/multivariate/01/X_train_lag4.npy', X_train)
np.save('../../03 outputs/multivariate/01/y_train_lag4.npy', y_train)
np.save('../../03 outputs/multivariate/01/X_val_lag4.npy', X_val)
np.save('../../03 outputs/multivariate/01/y_val_lag4.npy', y_val)
np.save('../../03 outputs/multivariate/01/X_test_lag4.npy', X_test)
np.save('../../03 outputs/multivariate/01/y_test_lag4.npy', y_test)