# Feature Engineering

Load data

In [7]:
import pandas as pd
import gc
# Read the raw series and parse the timestamp column as timezone-aware UTC in one chained step.
# NOTE(review): the frame is named `train` but is read from test_series.parquet — confirm this is intended.
train = pd.read_parquet('../data/test_series.parquet').assign(
    timestamp=lambda frame: pd.to_datetime(frame["timestamp"], utc=True)
)
# Pristine snapshot; each feature cell below restores `train` from it after printing its preview.
train_reset = train.copy()

### Timestamp derivatives
- Hour
- Weekday
- Weekend
- Month
- Day

In [8]:
# Calendar features derived from the UTC timestamp via the vectorized `.dt` accessor.
timestamps = train['timestamp']

# Hour of day (0-23)
train['hour'] = timestamps.dt.hour
# Day of week, Monday=0 ... Sunday=6
train['weekday'] = timestamps.dt.weekday
# Weekend flag: 1 for Saturday/Sunday (weekday >= 5), otherwise 0.
# A vectorized comparison replaces the per-row `.apply(lambda ...)`, which makes one
# Python call per row; the cast to int64 keeps the same dtype and values as before.
train['weekend'] = (train['weekday'] >= 5).astype('int64')
# Month of year (1-12)
train['month'] = timestamps.dt.month
# Day of month
train['day'] = timestamps.dt.day

print(train.head())
# Restore the pristine frame so the next section starts from the raw columns only.
train = train_reset.copy()



      series_id  step                 timestamp  anglez    enmo  hour  \
0  038441c925bb     0 2018-08-14 19:30:00+00:00  2.6367  0.0217    19   
1  038441c925bb     1 2018-08-14 19:30:05+00:00  2.6368  0.0215    19   
2  038441c925bb     2 2018-08-14 19:30:10+00:00  2.6370  0.0216    19   
3  038441c925bb     3 2018-08-14 19:30:15+00:00  2.6368  0.0213    19   
4  038441c925bb     4 2018-08-14 19:30:20+00:00  2.6368  0.0215    19   

   weekday  weekend  month  day  
0        1        0      8   14  
1        1        0      8   14  
2        1        0      8   14  
3        1        0      8   14  
4        1        0      8   14  


### Feature cross
- Multiplying anglez & enmo

In [9]:
# Interaction feature: element-wise product of the two sensor channels.
interaction = train['anglez'].mul(train['enmo'])
train['anglez_enmo'] = interaction
print(train.head())
# Drop the demo column again by restoring the pristine snapshot.
train = train_reset.copy()

      series_id  step                 timestamp  anglez    enmo  anglez_enmo
0  038441c925bb     0 2018-08-14 19:30:00+00:00  2.6367  0.0217     0.057216
1  038441c925bb     1 2018-08-14 19:30:05+00:00  2.6368  0.0215     0.056691
2  038441c925bb     2 2018-08-14 19:30:10+00:00  2.6370  0.0216     0.056959
3  038441c925bb     3 2018-08-14 19:30:15+00:00  2.6368  0.0213     0.056164
4  038441c925bb     4 2018-08-14 19:30:20+00:00  2.6368  0.0215     0.056691


### Rolling statistics
- For the 'enmo' and 'anglez' columns, calculates rolling statistics (mean, median, max, min, skewness and kurtosis) and n-step differences over several window sizes

In [12]:
# Group by recording so that no diff or rolling window spans two different series.
train_gp_id = train.groupby('series_id')

# Rolling statistics to compute, in the order the columns are appended to the frame.
ROLLING_STATS = ('mean', 'median', 'max', 'min', 'skew', 'kurt')

for col in ['enmo', 'anglez']:
    col_by_series = train_gp_id[col]
    for n in [5, 30, 60, 120]:

        # n-step difference within each series (first n rows of every series are NaN).
        train[f'{col}_diff_{n}'] = col_by_series.diff(periods=n)

        # Every statistic, including mean and median, is computed on the per-series rolling
        # window; rolling over the ungrouped column would mix the tail of one series into
        # the head of the next.
        rolling = col_by_series.rolling(n)
        for stat in ROLLING_STATS:
            # The groupby-rolling result is indexed by (series_id, original row label).
            # Dropping only the series_id level keeps the row labels, so the assignment
            # aligns on the index instead of relying on positional order.
            train[f'{col}_{stat}_{n}'] = getattr(rolling, stat)().reset_index(level=0, drop=True)
        gc.collect()

    print(f'Created diff and smoothed derivates from {col}')

print(train.head())
# Restore the pristine frame so the next section starts from the raw columns only.
train = train_reset.copy()

Created diff and smoothed derivates from enmo
Created diff and smoothed derivates from anglez
      series_id  step                 timestamp  anglez    enmo  enmo_diff_5  \
0  038441c925bb     0 2018-08-14 19:30:00+00:00  2.6367  0.0217          NaN   
1  038441c925bb     1 2018-08-14 19:30:05+00:00  2.6368  0.0215          NaN   
2  038441c925bb     2 2018-08-14 19:30:10+00:00  2.6370  0.0216          NaN   
3  038441c925bb     3 2018-08-14 19:30:15+00:00  2.6368  0.0213          NaN   
4  038441c925bb     4 2018-08-14 19:30:20+00:00  2.6368  0.0215          NaN   

   enmo_mean_5  enmo_median_5  enmo_max_5  enmo_min_5  ...  anglez_min_60  \
0          NaN            NaN         NaN         NaN  ...            NaN   
1          NaN            NaN         NaN         NaN  ...            NaN   
2          NaN            NaN         NaN         NaN  ...            NaN   
3          NaN            NaN         NaN         NaN  ...            NaN   
4      0.02152         0.0215      0.021

### Time lag and lead
- Creates lag and lead features for 'enmo' and 'anglez', allowing the model to consider past and future values
- The lag size depends on the time horizon of interest: with one sample every 5 seconds, a lag of 720 steps is one hour and a lag of 17280 steps is one day

In [15]:
# Samples are 5 seconds apart, so 720 steps span one hour and 17280 steps span one day.
STEPS_PER_HOUR = 720
STEPS_PER_DAY = 17280

# Rebuild the groupby from the current `train`. Reusing a groupby created in an earlier
# cell would bind to the frame that existed before `train` was reset, which keeps that
# older frame alive and makes this cell depend on hidden kernel state.
train_gp_id = train.groupby('series_id')

for col in ['enmo', 'anglez']:
    col_by_series = train_gp_id[col]

    # Hourly lags within each series. The range stop is exclusive, so the largest lag is
    # 16560 steps (23 hours); the full-day lag of 17280 is not created.
    for n in range(STEPS_PER_HOUR, STEPS_PER_DAY, STEPS_PER_HOUR):
        train[f'{col}_lag_{n}'] = col_by_series.shift(n)
    gc.collect()
    print(f'Created lag derivates from {col}')

    # Short leads (next 1..9 samples) within each series.
    for n in range(1, 10):
        train[f'{col}_lead_{n}'] = col_by_series.shift(-n)
    gc.collect()
    print(f'Created lead derivates from {col}')

print(train.head())
# Restore the pristine frame so any following section starts from the raw columns only.
train = train_reset.copy()

Created lag derivates from enmo
Created lead derivates from enmo
Created lag derivates from anglez
Created lead derivates from anglez
      series_id  step                 timestamp  anglez    enmo  enmo_lag_720  \
0  038441c925bb     0 2018-08-14 19:30:00+00:00  2.6367  0.0217           NaN   
1  038441c925bb     1 2018-08-14 19:30:05+00:00  2.6368  0.0215           NaN   
2  038441c925bb     2 2018-08-14 19:30:10+00:00  2.6370  0.0216           NaN   
3  038441c925bb     3 2018-08-14 19:30:15+00:00  2.6368  0.0213           NaN   
4  038441c925bb     4 2018-08-14 19:30:20+00:00  2.6368  0.0215           NaN   

   enmo_lag_1440  enmo_lag_2160  enmo_lag_2880  enmo_lag_3600  ...  \
0            NaN            NaN            NaN            NaN  ...   
1            NaN            NaN            NaN            NaN  ...   
2            NaN            NaN            NaN            NaN  ...   
3            NaN            NaN            NaN            NaN  ...   
4            NaN            N