In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import time

import gc

pd.options.mode.chained_assignment = None  # default='warn'

## Hyperparams

In [2]:
NUM_STEPS = 20

## Load Data

In [3]:
train_events = pd.read_csv('../data/raw/train_events.csv')

## Add Label to Series data

In [6]:
def get_merged_series(series):
    train_series = pd.read_parquet('../data/raw/train_series.parquet', filters=[('series_id','=',series)])
    train_events_filtered = train_events.query('series_id == @series')
    
    train_events_filtered = train_events_filtered.dropna()
    train_events_filtered['step'] = train_events_filtered['step'].astype('int')
    train_events_filtered['awake'] = train_events_filtered['event'].replace({'onset': 1, 'wakeup': 0})

    # Merge Series with events and fill empty values with last observed event (awake until onset event...)
    merged = pd.merge(train_series, train_events_filtered[['step', 'awake']], on='step', how='left')
    merged['awake'] = merged['awake'].bfill(axis='rows')
    merged['awake'] = merged['awake'].fillna(1) # awake
    merged["awake"] = merged["awake"].astype("int")
    return merged

In [7]:
train_data = []
lightweight_series_ids = []

total_len = train_events.series_id.nunique()

for i, series_id in enumerate(train_events.series_id.unique()):
    print(f'Step {i} of {total_len}')
    train = get_merged_series(series_id)
    if len(lightweight_series_ids) < 10:
        lightweight_series_ids.append(series_id)
    train_data.append(train)
    del train
    gc.collect()

zzzz_train = pd.concat(train_data).reset_index(drop=True)

Step 0 of 277
Step 1 of 277
Step 2 of 277
Step 3 of 277
Step 4 of 277
Step 5 of 277
Step 6 of 277
Step 7 of 277
Step 8 of 277
Step 9 of 277
Step 10 of 277
Step 11 of 277
Step 12 of 277
Step 13 of 277
Step 14 of 277
Step 15 of 277
Step 16 of 277
Step 17 of 277
Step 18 of 277
Step 19 of 277
Step 20 of 277
Step 21 of 277
Step 22 of 277
Step 23 of 277
Step 24 of 277
Step 25 of 277
Step 26 of 277
Step 27 of 277
Step 28 of 277
Step 29 of 277
Step 30 of 277
Step 31 of 277
Step 32 of 277
Step 33 of 277
Step 34 of 277
Step 35 of 277
Step 36 of 277
Step 37 of 277
Step 38 of 277
Step 39 of 277
Step 40 of 277
Step 41 of 277
Step 42 of 277
Step 43 of 277
Step 44 of 277
Step 45 of 277
Step 46 of 277
Step 47 of 277
Step 48 of 277
Step 49 of 277
Step 50 of 277
Step 51 of 277
Step 52 of 277
Step 53 of 277
Step 54 of 277
Step 55 of 277
Step 56 of 277
Step 57 of 277
Step 58 of 277
Step 59 of 277
Step 60 of 277
Step 61 of 277
Step 62 of 277
Step 63 of 277
Step 64 of 277
Step 65 of 277
Step 66 of 277
Step 

In [8]:
zzzz_train.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217,1
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215,1
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216,1
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213,1
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215,1


In [9]:
zzzz_train.to_parquet('../data/processed/train_series.parquet')

## Feature Engineering

In [11]:
def feature_eng(df):
    start_time = time.time()
    for col in ['anglez', 'enmo']:
        for agg in ['median', 'mean', 'min', 'max']:
            print(f'Generating {agg} for Column {col}')
            df[f'{col}_{agg}'] = df.groupby('series_id')[col].rolling(NUM_STEPS, center=True).agg(agg).fillna(method="bfill").fillna(method="ffill").astype(np.float32).values

        print(f'Generating diff for Column {col}')    
        df[f'{col}_diff'] = df.groupby('series_id')[col].diff(periods=NUM_STEPS).fillna(method="bfill").fillna(method="ffill").astype(np.float32)

    # Feature combination
    print(f'Generating anglez x enmo combination')
    df['anglezxenmo'] = df['anglez'] * df['enmo']
    #df = reduce_memory(df)
    print(f'Feature Engineering took {time.time() - start_time} seconds')

    return df

In [12]:
zzzz_train = feature_eng(zzzz_train)

Generating median for Column anglez
Generating mean for Column anglez
Generating min for Column anglez
Generating max for Column anglez
Generating diff for Column anglez
Generating median for Column enmo
Generating mean for Column enmo
Generating min for Column enmo
Generating max for Column enmo
Generating diff for Column enmo
Generating anglez x enmo combination
Feature Engineering took 780.657142162323 seconds


In [14]:
zzzz_train.head(20)

Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake,anglez_median,anglez_mean,anglez_min,anglez_max,anglez_diff,enmo_median,enmo_mean,enmo_min,enmo_max,enmo_diff,anglezxenmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.057216
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.056691
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.056959
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.056164
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.056691
5,038441c925bb,5,2018-08-14T15:30:25-0400,2.6367,0.0217,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.057216
6,038441c925bb,6,2018-08-14T15:30:30-0400,2.6367,0.0217,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.057216
7,038441c925bb,7,2018-08-14T15:30:35-0400,2.6367,0.0218,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.05748
8,038441c925bb,8,2018-08-14T15:30:40-0400,2.798,0.0223,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.062395
9,038441c925bb,9,2018-08-14T15:30:45-0400,3.0847,0.0217,1,2.6367,7.573975,2.4129,54.8498,41.462601,0.02165,0.02233,0.0166,0.0395,-0.0047,0.066938


In [15]:
zzzz_train.to_parquet('../data/processed/train_series.parquet')

In [16]:
zzzz_train_lightweight = zzzz_train[zzzz_train.series_id.isin(lightweight_series_ids)]

In [17]:
zzzz_train_lightweight.to_parquet('../data/processed/train_series_lightweight.parquet')