# Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

from tqdm import tqdm

import time
import torch
import gc

pd.set_option('display.max_columns', None)

## Daten laden

In [2]:
IS_TRAINING = False
if IS_TRAINING:
    FILE = '../../data/processed/train_series_split_normalized.parquet'
else:
    FILE = '../../data/processed/validation_series_split_normalized.parquet'
series = pd.read_parquet(FILE, columns=['num_series_id'])

## Features

In [3]:
LAGS_FUTURE = [f"t_lag_{i}" for i in range(-1, -25, -1)]
LAGS_PAST = reversed([f"t_lag_{i}" for i in range(1, 25)])
FEATURES = [*LAGS_PAST, 't_0', *LAGS_FUTURE]

In [4]:
len(FEATURES)

49

## Neue Features erstellen

In [5]:
folder = "train/" if IS_TRAINING else "validation/"
file_path_prefix = "../../data/processed/transformer/" + folder

def save_chunk(num_series_id, chunk):
    series_length, series_columns = chunk[FEATURES].values.shape
    X = torch.from_numpy(np.array(np.ravel(chunk[FEATURES].values).tolist())
                           .reshape(series_length, series_columns, 2)).to(torch.float32)
    torch.save(X, file_path_prefix + str(num_series_id) + ".pt")

In [6]:
def make_features_chunk(num_series_id):
    df = pd.read_parquet(FILE, filters=[('num_series_id','=',num_series_id)])
    df['t_0'] = df[['anglez', 'enmo']].values.tolist()

    columns_past = []
    for i in range(1, 25):
        df[f'anglez_lag_{i}'] = df["anglez"].shift(i).bfill()
        df[f'enmo_lag_{i}'] = df["enmo"].shift(i).bfill()
        df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
        columns_past.extend([f'anglez_lag_{i}', f'enmo_lag_{i}'])
    df = df.drop(columns=columns_past)


    columns_future = []
    for i in range(-1, -25, -1):
        df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
        df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
        df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
        columns_future.extend([f'anglez_lag_{i}', f'enmo_lag_{i}'])
    df = df.drop(columns=columns_future)

    
    return df.reset_index(drop=True)

In [7]:
def make_features(series):
    overview_data = []

    for num_series_id in tqdm(series.num_series_id.unique()):
        chunk = make_features_chunk(num_series_id)
        save_chunk(num_series_id, chunk)
        overview_data.append(
            chunk[['num_series_id', 'step', 'awake', 'critical_event_point']].reset_index().rename(columns={'index':'series_index'}).copy()[['num_series_id', 'step', 'awake', 'critical_event_point', 'series_index']]
        )
    
        del chunk
        gc.collect()
   
    return pd.concat(overview_data).reset_index(drop=True)   

In [8]:
start_time = time.time()
overview = make_features(series)
print(f'Feature Engineering took {time.time() - start_time} seconds')

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_la

  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
  df[f'anglez_lag_{i}'] = df["anglez"].shift(i).ffill()
  df[f'enmo_lag_{i}'] = df["enmo"].shift(i).ffill()
  df[f't_lag_{i}'] = df[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
100%|█████████████████████████████████████████████████████████████████

Feature Engineering took 714.6610052585602 seconds


## Overview speichern

In [9]:
overview

Unnamed: 0,num_series_id,step,awake,critical_event_point,series_index
0,7,0,1,0.0,0
1,7,1,1,0.0,1
2,7,2,1,0.0,2
3,7,3,1,0.0,3
4,7,4,1,0.0,4
...,...,...,...,...,...
16484066,276,620635,1,0.0,603354
16484067,276,620636,1,0.0,603355
16484068,276,620637,1,0.0,603356
16484069,276,620638,1,0.0,603357


In [10]:
overview.to_parquet(file_path_prefix + "overview.parquet")