In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

train_events_path = "child-mind-institute-detect-sleep-states/train_events.csv"
train_series_path = "child-mind-institute-detect-sleep-states/train_series.parquet"
test_series_path = "child-mind-institute-detect-sleep-states/test_series.parquet"

#train_events_path = "/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv"
#train_series_path = "/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet"
#test_series_path = "kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet"


In [2]:
train_events = pd.read_csv("child-mind-institute-detect-sleep-states/train_events.csv")
train_series = pd.read_parquet("child-mind-institute-detect-sleep-states/train_series.parquet")
test_series = pd.read_parquet("child-mind-institute-detect-sleep-states/test_series.parquet")
train_events.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


---------------

# In this first step, we should try to combine train_events with train_series to create a binary target feature "awake", where 0 represents the asleep state and 1 represents the awake state. This will make it easier to do EDA and predcition using machine learning

In [3]:
test_series

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.636700,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.636800,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637000,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.636800,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.636800,0.0215
...,...,...,...,...,...
445,0402a003dae9,145,2018-12-18T12:57:05-0500,-59.696899,0.0601
446,0402a003dae9,146,2018-12-18T12:57:10-0500,-35.656601,0.0427
447,0402a003dae9,147,2018-12-18T12:57:15-0500,-21.582399,0.0309
448,0402a003dae9,148,2018-12-18T12:57:20-0500,-42.616001,0.0328


## Check that all of the data start with onset

In [4]:
train_events.groupby('series_id').head(1)["event"].unique()

array(['onset'], dtype=object)

## Check that series data ends with awake 

In [5]:
train_events.groupby('series_id').tail(1)["event"].unique()

array(['wakeup'], dtype=object)

## Check if any series has NaN

In [6]:
no_nan_series = train_events.groupby('series_id')['step'].apply(lambda x: x.isnull().any())
no_nan_series.value_counts()

step
True     240
False     37
Name: count, dtype: int64

In [7]:
no_nan_series = no_nan_series.drop('31011ade7c0a') # incomplete events data
no_nan_series = no_nan_series.drop('a596ad0b82aa') # incomplete events data
no_nan_series

series_id
038441c925bb    True
03d92c9f6f8a    True
0402a003dae9    True
04f547b8017d    True
05e1944c3818    True
                ... 
fa149c3c4bde    True
fb223ed2278c    True
fbf33b1a2c10    True
fcca183903b7    True
fe90110788d2    True
Name: step, Length: 275, dtype: bool

## In this function, we are combining train_series with train_events to create a binary dataset, for a given series ID. Time points where the individual are asleep are labeled 1 and awake, 0.

In [8]:
def get_train_series(series): # takes in a series ID and returns
    train_series = pd.read_parquet(train_series_path, filters=[('series_id','=',series)])
    train_events = pd.read_csv(train_events_path).query('series_id == @series')
    
    train_events = train_events.dropna()
    train_events["step"]  = train_events["step"].astype("int")
    train_events["awake"] = train_events["event"].replace({"onset":1,"wakeup":0})

    train = pd.merge(train_series, train_events[['step','awake']], on='step', how='left')
    train["awake"] = train["awake"].bfill(axis ='rows')

    train['awake'] = train['awake'].fillna(1) # awake
    train["awake"] = train["awake"].astype("int")
    return(train)

In [9]:
train = get_train_series(no_nan_series[0])
train

ArrowNotImplementedError: Function 'equal' has no kernel matching input types (large_string, bool)

In [None]:
pd.__version__

'2.0.3'

In [None]:
train_events

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400
...,...,...,...,...,...
14505,fe90110788d2,33,wakeup,560604.0,2017-09-06T04:07:00-0400
14506,fe90110788d2,34,onset,574620.0,2017-09-06T23:35:00-0400
14507,fe90110788d2,34,wakeup,581604.0,2017-09-07T09:17:00-0400
14508,fe90110788d2,35,onset,,


In [None]:
train_series

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.636700,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.636800,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637000,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.636800,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.636800,0.0215
...,...,...,...,...,...
127946335,fe90110788d2,592375,2017-09-08T00:14:35-0400,-27.277500,0.0204
127946336,fe90110788d2,592376,2017-09-08T00:14:40-0400,-27.032499,0.0233
127946337,fe90110788d2,592377,2017-09-08T00:14:45-0400,-26.841200,0.0202
127946338,fe90110788d2,592378,2017-09-08T00:14:50-0400,-26.723900,0.0199


### From the Kaggle description, serires_id is the 