# Feature Engineering

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import time
import gc

## Daten laden

In [2]:
# Load only the series_id column: this frame is just used to enumerate the
# series ids; the remaining columns are read per series by make_features_chunck.
train_series = pd.read_parquet('../../data/processed/train.parquet', columns=['series_id'])

## Neue Features erstellen

In [3]:
# Hyperparameters
# NUM_STEPS is the default `periods` of make_features_chunck: the lag (in rows)
# of the *_diff features and the window length (in rows) of the *_rolling_*
# features.
# NOTE(review): the sample rows shown in this notebook are 5 s apart, so 20 rows
# would span roughly 100 s -- confirm the sampling interval is constant.
NUM_STEPS = 20

In [4]:
def make_features_chunck(series_id, periods=NUM_STEPS, path='../../data/processed/train.parquet'):
    """Read one series from a parquet file and add the engineered feature columns.

    Parameters
    ----------
    series_id
        Value matched against the ``series_id`` column. Only rows with this id
        are read (the filter is passed to ``pd.read_parquet``).
    periods : int, default NUM_STEPS
        Lag, in rows, used for the ``*_diff`` columns and window length, in
        rows, used for the centered ``*_rolling_*`` columns.
    path : str, default '../../data/processed/train.parquet'
        Parquet file to read. It must provide the ``series_id``, ``timestamp``,
        ``anglez`` and ``enmo`` columns used below.

    Returns
    -------
    pandas.DataFrame
        The rows of the requested series with these columns appended, in this
        order: ``hour``, ``anglez_abs``, ``anglez_diff``, ``enmo_diff``,
        ``anglez_x_enmo`` and then, for each of mean/max/min/std,
        ``anglez_rolling_<stat>`` followed by ``enmo_rolling_<stat>``.
    """
    signal_cols = ("anglez", "enmo")

    df = pd.read_parquet(path, filters=[('series_id', '=', series_id)])

    # Characters 11-12 of the timestamp string (e.g. '15' in
    # '2018-08-14T15:30:00-0400'); the value is kept as a string.
    df["hour"] = df['timestamp'].str[11:13]

    df["anglez_abs"] = df["anglez"].abs()

    # The frame holds a single series (enforced by the read filter), so a plain
    # diff is equivalent to a per-series diff. The first `periods` rows have no
    # predecessor and are back-filled from the first valid difference.
    for col in signal_cols:
        df[f"{col}_diff"] = df[col].diff(periods=periods).bfill().astype('float32')

    df['anglez_x_enmo'] = df['anglez'] * df['enmo']

    # One centered rolling window per signal, reused for every statistic.
    windows = {col: df[col].rolling(periods, center=True) for col in signal_cols}

    # Statistic-major order keeps the column layout anglez_*, enmo_* per stat.
    for stat in ("mean", "max", "min", "std"):
        for col in signal_cols:
            rolled = getattr(windows[col], stat)()
            # Incomplete windows at both ends of the series yield NaN; fill
            # them from the nearest valid value (backward first, then forward).
            df[f"{col}_rolling_{stat}"] = rolled.bfill().ffill().astype('float32')

    return df

In [5]:
def make_features(train):
    """Build the feature table for every series listed in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``series_id`` column; each unique id is processed once,
        in order of first appearance.

    Returns
    -------
    pandas.DataFrame
        The per-series results of ``make_features_chunck`` stacked vertically
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when ``train`` contains no series ids.
    """
    # Every per-series frame has to stay alive until the final concat, so
    # there is nothing that could be freed inside the loop.
    frames = [
        make_features_chunck(series_id)
        for series_id in tqdm(train.series_id.unique())
    ]

    # ignore_index=True creates the new RangeIndex during concatenation,
    # avoiding a separate reset_index pass over the combined frame.
    return pd.concat(frames, ignore_index=True)

In [6]:
# Time the full feature build. perf_counter() is a monotonic clock intended for
# measuring intervals, so the result is not affected by system clock changes
# during this long-running cell.
start_time = time.perf_counter()
train_with_features = make_features(train_series)
print(f'Feature Engineering took {time.perf_counter() - start_time} seconds')

100%|████████████████████████████████████████████████████████████████████████████████| 269/269 [21:32<00:00,  4.80s/it]


Feature Engineering took 1748.421234846115 seconds


In [7]:
# Preview the first rows of the combined feature table.
train_with_features.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,unknown,hour,anglez_abs,anglez_diff,enmo_diff,anglez_x_enmo,anglez_rolling_mean,enmo_rolling_mean,anglez_rolling_max,enmo_rolling_max,anglez_rolling_min,enmo_rolling_min,anglez_rolling_std,enmo_rolling_std
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217,1,0,15,2.6367,41.462601,-0.0047,0.057216,7.573975,0.02233,54.8498,0.0395,2.4129,0.0166,15.339381,0.004213
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215,1,0,15,2.6368,41.462601,-0.0047,0.056691,7.573975,0.02233,54.8498,0.0395,2.4129,0.0166,15.339381,0.004213
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216,1,0,15,2.637,41.462601,-0.0047,0.056959,7.573975,0.02233,54.8498,0.0395,2.4129,0.0166,15.339381,0.004213
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213,1,0,15,2.6368,41.462601,-0.0047,0.056164,7.573975,0.02233,54.8498,0.0395,2.4129,0.0166,15.339381,0.004213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215,1,0,15,2.6368,41.462601,-0.0047,0.056691,7.573975,0.02233,54.8498,0.0395,2.4129,0.0166,15.339381,0.004213


## Daten speichern

In [8]:
# Persist the combined feature table as parquet in the same directory the
# processed training data was read from.
train_with_features.to_parquet('../../data/processed/train_with_features.parquet')