# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 

# Data paths in this notebook are built relative to the directory that sits
# one level above the kernel's current working directory.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]

In [2]:
# Load the cleaned training split.
# The path is assembled with os.path.join so it resolves on every OS; a
# hard-coded "\" separator is only understood by Windows.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
train_path = os.path.join(parent_dir, "data", "processed", "cleaned_train_dataset.pkl")
df_train = pd.read_pickle(train_path)

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,RH,AH
0,2004-03-11 00:00:00,Thursday,1.2,1185.0,3.6,62.0,1462.0,77.0,1333.0,733.0,11.3,56.8,0.7603
1,2004-03-11 01:00:00,Thursday,1.0,1136.0,3.3,62.0,1453.0,76.0,1333.0,730.0,10.7,60.0,0.7702
2,2004-03-11 02:00:00,Thursday,0.9,1094.0,2.3,45.0,1579.0,60.0,1276.0,620.0,10.7,59.7,0.7648


Shape:  (5640, 13)


In [3]:
# Print the per-column dtype and non-null count plus the memory footprint of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5640 entries, 0 to 5639
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date_time    5640 non-null   datetime64[ns]
 1   day_of_week  5640 non-null   category      
 2   COGT         5640 non-null   float64       
 3   PT08.S1CO    5640 non-null   float64       
 4   C6H6GT       5640 non-null   float64       
 5   NOxGT        5640 non-null   float64       
 6   PT08.S3NOx   5640 non-null   float64       
 7   NO2GT        5640 non-null   float64       
 8   PT08.S4NO2   5640 non-null   float64       
 9   PT08.S5O3    5640 non-null   float64       
 10  T            5640 non-null   float64       
 11  RH           5640 non-null   float64       
 12  AH           5640 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(11)
memory usage: 534.4 KB


In [4]:
# Load the cleaned validation split.
# The path is assembled with os.path.join so it resolves on every OS; a
# hard-coded "\" separator is only understood by Windows.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
valid_path = os.path.join(parent_dir, "data", "processed", "cleaned_validation_dataset.pkl")
df_valid = pd.read_pickle(valid_path)

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,RH,AH
0,2004-11-01 00:00:00,Monday,3.2,1353.0,15.9,318.0,584.0,69.0,1723.0,2150.0,20.1,71.3,1.6564
1,2004-11-01 01:00:00,Monday,3.7,1407.0,17.8,338.0,548.0,77.0,1780.0,2519.0,20.1,71.1,1.6498
2,2004-11-01 02:00:00,Monday,3.5,1333.0,16.7,353.0,552.0,68.0,1767.0,1925.0,19.6,73.2,1.6543


Shape:  (1464, 13)


In [5]:
# Print the per-column dtype and non-null count plus the memory footprint of the validation frame.
df_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464 entries, 0 to 1463
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date_time    1464 non-null   datetime64[ns]
 1   day_of_week  1464 non-null   category      
 2   COGT         1464 non-null   float64       
 3   PT08.S1CO    1464 non-null   float64       
 4   C6H6GT       1464 non-null   float64       
 5   NOxGT        1464 non-null   float64       
 6   PT08.S3NOx   1464 non-null   float64       
 7   NO2GT        1464 non-null   float64       
 8   PT08.S4NO2   1464 non-null   float64       
 9   PT08.S5O3    1464 non-null   float64       
 10  T            1464 non-null   float64       
 11  RH           1464 non-null   float64       
 12  AH           1464 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(11)
memory usage: 138.9 KB


In [6]:
# Target columns; lagged copies of each one are built further down.
list_target_vals = [
    'COGT',
    'C6H6GT',
    'NOxGT',
    'NO2GT',
]

## Train Dataset

In [7]:
# Rolling-window statistics for every measurement column of the training frame.
# Window lengths are counted in rows (the data is hourly):
#    3 -> short-term volatility
#    6 -> a quarter of a day
#   12 -> half a day
#   24 -> a full day
list_select_period = [3, 6, 12, 24]
stat_names = ('mean', 'std', 'max', 'min')
value_cols = [c for c in df_train.columns if c not in ['date_time', 'day_of_week']]

rolling_features = {}
for a_col in value_cols:
    for a_lag in list_select_period:
        window = df_train[a_col].rolling(window=a_lag)
        for stat in stat_names:
            # shift(1) moves each statistic down one row, so a row is described
            # by the window that ended on the previous row, never by its own value.
            rolling_features[f'rolling_{a_col}_{stat}_{a_lag}'] = getattr(window, stat)().shift(1)

# Assemble all new columns at once and append them to the right of the frame.
df_rolling = pd.DataFrame(rolling_features)
df_train = pd.concat([df_train, df_rolling], axis=1)

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,...,rolling_AH_max_6,rolling_AH_min_6,rolling_AH_mean_12,rolling_AH_std_12,rolling_AH_max_12,rolling_AH_min_12,rolling_AH_mean_24,rolling_AH_std_24,rolling_AH_max_24,rolling_AH_min_24
0,2004-03-11 00:00:00,Thursday,1.2,1185.0,3.6,62.0,1462.0,77.0,1333.0,733.0,...,,,,,,,,,,
1,2004-03-11 01:00:00,Thursday,1.0,1136.0,3.3,62.0,1453.0,76.0,1333.0,730.0,...,,,,,,,,,,
2,2004-03-11 02:00:00,Thursday,0.9,1094.0,2.3,45.0,1579.0,60.0,1276.0,620.0,...,,,,,,,,,,


Shape:  (5640, 189)


In [8]:
# Lagged copies of each target column: the value observed 1, 2, 3, 6, 9, 12
# and 24 rows earlier, named "<target>_lag_<n>".
list_select_period = [1, 2, 3, 6, 9, 12, 24]
lagged_features = [
    df_train[a_target].shift(a_lag).rename(f"{a_target}_lag_{a_lag}")
    for a_target in list_target_vals
    for a_lag in list_select_period
]

# Stack the lagged series side by side ...
df_lags = pd.concat(lagged_features, axis=1)

# ... and attach them to the right of the training frame.
df_train = pd.concat([df_train, df_lags], axis=1)

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,...,NOxGT_lag_9,NOxGT_lag_12,NOxGT_lag_24,NO2GT_lag_1,NO2GT_lag_2,NO2GT_lag_3,NO2GT_lag_6,NO2GT_lag_9,NO2GT_lag_12,NO2GT_lag_24
0,2004-03-11 00:00:00,Thursday,1.2,1185.0,3.6,62.0,1462.0,77.0,1333.0,733.0,...,,,,,,,,,,
1,2004-03-11 01:00:00,Thursday,1.0,1136.0,3.3,62.0,1453.0,76.0,1333.0,730.0,...,,,,77.0,,,,,,
2,2004-03-11 02:00:00,Thursday,0.9,1094.0,2.3,45.0,1579.0,60.0,1276.0,620.0,...,,,,76.0,77.0,,,,,


Shape:  (5640, 217)


In [9]:
# Keep only fully populated rows (the leading rows have NaN rolling/lag
# values because their windows are incomplete), then renumber the index from 0.
has_all_values = df_train.notna().all(axis=1)
df_train = df_train[has_all_values].reset_index(drop=True)

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,...,NOxGT_lag_9,NOxGT_lag_12,NOxGT_lag_24,NO2GT_lag_1,NO2GT_lag_2,NO2GT_lag_3,NO2GT_lag_6,NO2GT_lag_9,NO2GT_lag_12,NO2GT_lag_24
0,2004-03-12 00:00:00,Friday,1.7,1080.0,5.9,122.0,1254.0,97.0,1375.0,816.0,...,184.0,104.0,62.0,53.0,85.0,136.0,151.0,126.0,95.0,77.0
1,2004-03-12 01:00:00,Friday,1.9,1044.0,6.4,133.0,1247.0,110.0,1378.0,832.0,...,193.0,146.0,62.0,97.0,53.0,85.0,172.0,131.0,112.0,76.0
2,2004-03-12 02:00:00,Friday,1.4,988.0,4.1,82.0,1396.0,91.0,1304.0,692.0,...,243.0,207.0,45.0,110.0,97.0,53.0,165.0,135.0,128.0,60.0


Shape:  (5616, 217)


In [10]:
# One-hot encode the hour of day and the weekday.
# The hour is stored as a categorical with all 24 levels declared up front, so
# get_dummies always emits hour_0 ... hour_23 even when an hour is missing from
# the data. Without this, the set of dummy columns depends on which hours
# happen to be present, and train/validation schemas could silently diverge.
df_train['hour'] = pd.Categorical(df_train['date_time'].dt.hour, categories=list(range(24)))
df_train = pd.get_dummies(df_train, columns=['hour', 'day_of_week'])

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,...,hour_21,hour_22,hour_23,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday,day_of_week_Sunday
0,2004-03-12 00:00:00,1.7,1080.0,5.9,122.0,1254.0,97.0,1375.0,816.0,8.3,...,False,False,False,False,False,False,False,True,False,False
1,2004-03-12 01:00:00,1.9,1044.0,6.4,133.0,1247.0,110.0,1378.0,832.0,7.7,...,False,False,False,False,False,False,False,True,False,False
2,2004-03-12 02:00:00,1.4,988.0,4.1,82.0,1396.0,91.0,1304.0,692.0,7.1,...,False,False,False,False,False,False,False,True,False,False


Shape:  (5616, 247)


In [11]:
# Remove one dummy per encoded variable; the removed level is the base group:
#   hour        -> hour_0 (00:00:00)
#   day_of_week -> day_of_week_Sunday
reference_levels = ["hour_0", 'day_of_week_Sunday']
df_train = df_train.drop(columns=reference_levels)

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,...,hour_20,hour_21,hour_22,hour_23,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday
0,2004-03-12 00:00:00,1.7,1080.0,5.9,122.0,1254.0,97.0,1375.0,816.0,8.3,...,False,False,False,False,False,False,False,False,True,False
1,2004-03-12 01:00:00,1.9,1044.0,6.4,133.0,1247.0,110.0,1378.0,832.0,7.7,...,False,False,False,False,False,False,False,False,True,False
2,2004-03-12 02:00:00,1.4,988.0,4.1,82.0,1396.0,91.0,1304.0,692.0,7.1,...,False,False,False,False,False,False,False,False,True,False


Shape:  (5616, 245)


In [12]:
# Cast every boolean column (the one-hot dummies) to integer 0/1; all other
# columns keep their dtype.
bool_cols = df_train.select_dtypes(include='bool').columns
df_train = df_train.astype(dict.fromkeys(bool_cols, 'int'))

display(df_train.head(3))
print("Shape: ", df_train.shape)

Unnamed: 0,date_time,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,...,hour_20,hour_21,hour_22,hour_23,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday
0,2004-03-12 00:00:00,1.7,1080.0,5.9,122.0,1254.0,97.0,1375.0,816.0,8.3,...,0,0,0,0,0,0,0,0,1,0
1,2004-03-12 01:00:00,1.9,1044.0,6.4,133.0,1247.0,110.0,1378.0,832.0,7.7,...,0,0,0,0,0,0,0,0,1,0
2,2004-03-12 02:00:00,1.4,988.0,4.1,82.0,1396.0,91.0,1304.0,692.0,7.1,...,0,0,0,0,0,0,0,0,1,0


Shape:  (5616, 245)


## Validation Dataset

In [13]:
# Rolling-window statistics for every measurement column of the validation frame.
# Window lengths are counted in rows (the data is hourly):
#    3 -> short-term volatility
#    6 -> a quarter of a day
#   12 -> half a day
#   24 -> a full day
list_select_period = [3, 6, 12, 24]
stat_names = ('mean', 'std', 'max', 'min')
value_cols = [c for c in df_valid.columns if c not in ['date_time', 'day_of_week']]

rolling_features = {}
for a_col in value_cols:
    for a_lag in list_select_period:
        window = df_valid[a_col].rolling(window=a_lag)
        for stat in stat_names:
            # shift(1) moves each statistic down one row, so a row is described
            # by the window that ended on the previous row, never by its own value.
            rolling_features[f'rolling_{a_col}_{stat}_{a_lag}'] = getattr(window, stat)().shift(1)

# Assemble all new columns at once and append them to the right of the frame.
df_rolling = pd.DataFrame(rolling_features)
df_valid = pd.concat([df_valid, df_rolling], axis=1)

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,...,rolling_AH_max_6,rolling_AH_min_6,rolling_AH_mean_12,rolling_AH_std_12,rolling_AH_max_12,rolling_AH_min_12,rolling_AH_mean_24,rolling_AH_std_24,rolling_AH_max_24,rolling_AH_min_24
0,2004-11-01 00:00:00,Monday,3.2,1353.0,15.9,318.0,584.0,69.0,1723.0,2150.0,...,,,,,,,,,,
1,2004-11-01 01:00:00,Monday,3.7,1407.0,17.8,338.0,548.0,77.0,1780.0,2519.0,...,,,,,,,,,,
2,2004-11-01 02:00:00,Monday,3.5,1333.0,16.7,353.0,552.0,68.0,1767.0,1925.0,...,,,,,,,,,,


Shape:  (1464, 189)


In [14]:
# Lagged copies of each target column: the value observed 1, 2, 3, 6, 9, 12
# and 24 rows earlier, named "<target>_lag_<n>".
list_select_period = [1, 2, 3, 6, 9, 12, 24]
lagged_features = [
    df_valid[a_target].shift(a_lag).rename(f"{a_target}_lag_{a_lag}")
    for a_target in list_target_vals
    for a_lag in list_select_period
]

# Stack the lagged series side by side ...
df_lags = pd.concat(lagged_features, axis=1)

# ... and attach them to the right of the validation frame.
df_valid = pd.concat([df_valid, df_lags], axis=1)

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,...,NOxGT_lag_9,NOxGT_lag_12,NOxGT_lag_24,NO2GT_lag_1,NO2GT_lag_2,NO2GT_lag_3,NO2GT_lag_6,NO2GT_lag_9,NO2GT_lag_12,NO2GT_lag_24
0,2004-11-01 00:00:00,Monday,3.2,1353.0,15.9,318.0,584.0,69.0,1723.0,2150.0,...,,,,,,,,,,
1,2004-11-01 01:00:00,Monday,3.7,1407.0,17.8,338.0,548.0,77.0,1780.0,2519.0,...,,,,69.0,,,,,,
2,2004-11-01 02:00:00,Monday,3.5,1333.0,16.7,353.0,552.0,68.0,1767.0,1925.0,...,,,,77.0,69.0,,,,,


Shape:  (1464, 217)


In [15]:
# Keep only fully populated rows (the leading rows have NaN rolling/lag
# values because their windows are incomplete), then renumber the index from 0.
has_all_values = df_valid.notna().all(axis=1)
df_valid = df_valid[has_all_values].reset_index(drop=True)

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,day_of_week,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,...,NOxGT_lag_9,NOxGT_lag_12,NOxGT_lag_24,NO2GT_lag_1,NO2GT_lag_2,NO2GT_lag_3,NO2GT_lag_6,NO2GT_lag_9,NO2GT_lag_12,NO2GT_lag_24
0,2004-11-02 00:00:00,Tuesday,2.3,1122.0,10.5,259.0,660.0,62.0,1586.0,1048.0,...,216.0,319.0,318.0,71.0,79.0,86.0,95.0,72.0,90.0,69.0
1,2004-11-02 01:00:00,Tuesday,1.6,983.0,7.6,227.0,752.0,56.0,1472.0,928.0,...,376.0,320.0,338.0,62.0,71.0,79.0,91.0,88.0,91.0,77.0
2,2004-11-02 02:00:00,Tuesday,1.1,885.0,4.8,147.0,878.0,49.0,1374.0,813.0,...,496.0,174.0,353.0,56.0,62.0,71.0,94.0,110.0,64.0,68.0


Shape:  (1440, 217)


In [16]:
# Hour of day, stored as a categorical with all 24 levels declared up front.
# A later get_dummies call therefore always emits hour_0 ... hour_23, even if
# some hour never occurs in the validation split, keeping its columns aligned
# with the training frame.
df_valid['hour'] = pd.Categorical(df_valid['date_time'].dt.hour, categories=list(range(24)))

In [17]:
# Expand 'hour' and 'day_of_week' into one indicator column per level; the
# two source columns are replaced by their dummies.
encode_cols = ['hour', 'day_of_week']
df_valid = pd.get_dummies(df_valid, columns=encode_cols)

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,...,hour_21,hour_22,hour_23,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday,day_of_week_Sunday
0,2004-11-02 00:00:00,2.3,1122.0,10.5,259.0,660.0,62.0,1586.0,1048.0,18.4,...,False,False,False,False,True,False,False,False,False,False
1,2004-11-02 01:00:00,1.6,983.0,7.6,227.0,752.0,56.0,1472.0,928.0,18.2,...,False,False,False,False,True,False,False,False,False,False
2,2004-11-02 02:00:00,1.1,885.0,4.8,147.0,878.0,49.0,1374.0,813.0,17.9,...,False,False,False,False,True,False,False,False,False,False


Shape:  (1440, 247)


In [18]:
# Remove one dummy per encoded variable; the removed level is the base group:
#   hour        -> hour_0 (00:00:00)
#   day_of_week -> day_of_week_Sunday
reference_levels = ["hour_0", 'day_of_week_Sunday']
df_valid = df_valid.drop(columns=reference_levels)

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,...,hour_20,hour_21,hour_22,hour_23,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday
0,2004-11-02 00:00:00,2.3,1122.0,10.5,259.0,660.0,62.0,1586.0,1048.0,18.4,...,False,False,False,False,False,True,False,False,False,False
1,2004-11-02 01:00:00,1.6,983.0,7.6,227.0,752.0,56.0,1472.0,928.0,18.2,...,False,False,False,False,False,True,False,False,False,False
2,2004-11-02 02:00:00,1.1,885.0,4.8,147.0,878.0,49.0,1374.0,813.0,17.9,...,False,False,False,False,False,True,False,False,False,False


Shape:  (1440, 245)


In [19]:
# Cast every boolean column (the one-hot dummies) to integer 0/1; all other
# columns keep their dtype.
bool_cols = df_valid.select_dtypes(include='bool').columns
df_valid = df_valid.astype(dict.fromkeys(bool_cols, 'int'))

display(df_valid.head(3))
print("Shape: ", df_valid.shape)

Unnamed: 0,date_time,COGT,PT08.S1CO,C6H6GT,NOxGT,PT08.S3NOx,NO2GT,PT08.S4NO2,PT08.S5O3,T,...,hour_20,hour_21,hour_22,hour_23,day_of_week_Monday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_Thursday,day_of_week_Friday,day_of_week_Saturday
0,2004-11-02 00:00:00,2.3,1122.0,10.5,259.0,660.0,62.0,1586.0,1048.0,18.4,...,0,0,0,0,0,1,0,0,0,0
1,2004-11-02 01:00:00,1.6,983.0,7.6,227.0,752.0,56.0,1472.0,928.0,18.2,...,0,0,0,0,0,1,0,0,0,0
2,2004-11-02 02:00:00,1.1,885.0,4.8,147.0,878.0,49.0,1374.0,813.0,17.9,...,0,0,0,0,0,1,0,0,0,0


Shape:  (1440, 245)


## Export Data

In [20]:
# Persist both engineered frames next to the cleaned inputs.
# Paths are assembled with os.path.join so the export works on every OS; a
# hard-coded "\" separator is only understood by Windows.
processed_dir = os.path.join(parent_dir, "data", "processed")
df_train.to_pickle(os.path.join(processed_dir, "feature_engineering_train_dataset.pkl"))
df_valid.to_pickle(os.path.join(processed_dir, "feature_engineering_validation_dataset.pkl"))