In [7]:
import pandas as pd
import numpy as np

from tqdm import tqdm

## Hyperparams

In [5]:
NUM_STEPS = 20

## Load Data

In [26]:
train_events = pd.read_csv('../data/raw/train_events.csv')
train_series = pd.read_parquet('../data/raw/train_series.parquet')

In [31]:
train_series.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215


In [44]:
sample_series = train_series[train_series.series_id == '038441c925bb']

In [45]:
sample_series

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.636700,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.636800,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637000,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.636800,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.636800,0.0215
...,...,...,...,...,...
389875,038441c925bb,389875,2018-09-06T04:59:35-0400,-27.373899,0.0110
389876,038441c925bb,389876,2018-09-06T04:59:40-0400,-27.493799,0.0110
389877,038441c925bb,389877,2018-09-06T04:59:45-0400,-27.533701,0.0111
389878,038441c925bb,389878,2018-09-06T04:59:50-0400,-28.003599,0.0111


## Reduce Memory Usage

For each value, we check if it can be stored in a smaller datatype to reduce the used storage.  

In [28]:
def reduce_memory(df):
    start_memory = df.memory_usage().sum() / 1024**2
    print(f'Memory usage before cleanup is {start_memory:.2f} MB')
    
    for col in tqdm(df.columns):
        column_type = df[col].dtype
        if column_type != object:
            max_value = df[col].max()
            min_value = df[col].min()
            if str(column_type) == 'uint32':
                if min_value > np.iinfo(np.uint8).min and max_value < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif min_value > np.iinfo(np.uint16).min and max_value < np.iinfo(np.uint16).max:
                    df[col] = df[col].astype(np.uint16)
            elif str(column_type) == 'float32':
                if min_value > np.finfo(np.float16).min and max_value < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
        df['series_id'] = df['series_id'].astype('category')

    end_memory = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after cleanup is {end_memory:.2f} MB')
    
    improvement = (start_memory - end_memory) / start_memory * 100
    print(f'Memory usage improved by {improvement:.2f}%')
    
    return df

## Feature Engineering

In [50]:
def feature_eng(df):
    for col in tqdm(['anglez', 'enmo']):
        for agg in tqdm(['median', 'mean', 'min', 'max']):
            df[f'{col}_{agg}'] = df.groupby('series_id')[col].rolling(NUM_STEPS, center=True).agg(agg).astype(np.float32).values
        df[f'{col}_diff'] = df.groupby('series_id')[col].diff(periods=NUM_STEPS).astype(np.float32)
    
    df = reduce_memory(df)

    return df

In [52]:
sample_series = feature_eng(sample_series)

  0%|          | 0/2 [00:00<?, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_{agg}'] = df.groupby('series_id')[col].rolling(NUM_STEPS, center=True).agg(agg).astype(np.float32).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}_{agg}'] = df.groupby('series_id')[col].rolling(NUM_STEPS, center=True).agg(agg).astype(np.float32).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

Memory usage before cleanup is 28.26 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['series_id'] = df['series_id'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['series_id'] = df['series_id'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float16)
100%|██████████| 15/15 [00:02<00:00,  5.22it/s]

Memory usage after cleanup is 16.73 MB
Memory usage improved by 40.79%





In [53]:
sample_series

Unnamed: 0,series_id,step,timestamp,anglez,enmo,anglez_median,anglez_mean,anglez_min,anglez_max,anglez_diff,enmo_median,enmo_mean,enmo_min,enmo_max,enmo_diff
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.636719,0.021698,,,,,,,,,,
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.636719,0.021500,,,,,,,,,,
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.636719,0.021606,,,,,,,,,,
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.636719,0.021301,,,,,,,,,,
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.636719,0.021500,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389875,038441c925bb,389875,2018-09-06T04:59:35-0400,-27.375000,0.011002,,,,,-0.731445,,,,,-0.0013
389876,038441c925bb,389876,2018-09-06T04:59:40-0400,-27.500000,0.011002,,,,,-0.751953,,,,,-0.0014
389877,038441c925bb,389877,2018-09-06T04:59:45-0400,-27.531250,0.011101,,,,,-0.909668,,,,,-0.0009
389878,038441c925bb,389878,2018-09-06T04:59:50-0400,-28.000000,0.011101,,,,,-1.461914,,,,,-0.0009
