# INFO-371 Final Project

This is our final project which attempts to predict which events in a participant's accelerometer data signals
the beginning and end of sleep. Over the course of many nights, accelerometer data is collected from a worn
device on the participant's wrist at every moment of the day. Sleep scientists then annotate this data with
events labeled "onset", signaling the participant has begun sleeping, or "wakeup", signaling the participant
has just awoken.

By analysing this time series data, in this notebook we create a prediction model using data mining and machine
learning.

In [1]:
#Imports
import os
import pandas as pd
import pathlib
import pyarrow as pa

In [2]:
# Load the data from our local directory
input_data_dir = os.path.join(os.path.realpath(pathlib.Path().cwd()), 'input_data')

In [3]:
# Transform the train_events dataset types
train_events = pd.read_csv(
    os.path.join(input_data_dir, 'train_events.csv'),
    usecols=['series_id', 'event', 'step', 'night']
)
train_events.dropna(inplace=True)
train_events['step'] = train_events['step'].astype(int)
train_events['event'] = train_events['event'].astype('category')
train_events.sort_values(by=['series_id', 'night'])

Unnamed: 0,series_id,night,event,step
0,038441c925bb,1,onset,4992
1,038441c925bb,1,wakeup,10932
2,038441c925bb,2,onset,20244
3,038441c925bb,2,wakeup,27492
4,038441c925bb,3,onset,39996
...,...,...,...,...
14501,fe90110788d2,32,wakeup,547152
14502,fe90110788d2,33,onset,556560
14503,fe90110788d2,33,wakeup,560604
14504,fe90110788d2,34,onset,574620


In [4]:
# Load the series data
train_series = pd.read_parquet(
    os.path.join(input_data_dir, 'train_series.parquet'),
    columns=["series_id", "step", "anglez", "enmo"]
)

In [5]:
train_series.head(5)

Unnamed: 0,series_id,step,anglez,enmo
0,038441c925bb,0,2.6367,0.0217
1,038441c925bb,1,2.6368,0.0215
2,038441c925bb,2,2.637,0.0216
3,038441c925bb,3,2.6368,0.0213
4,038441c925bb,4,2.6368,0.0215


In [6]:
# Merge the two data frames
merged_data = pd.merge_ordered(train_series, train_events, on=['series_id', 'step'])
merged_data['event'] = merged_data['event'].astype('category')
merged_data['series_id'] = merged_data['series_id'].astype('category')

In [7]:
# uncomment to test selecting data from the merge table for a specific ID and range of steps
# 12 steps per minute * 60 min/hr * 3hr
three_hours = 12 * 180
start = train_events.at[0, 'step']
mod_start = start - three_hours - 6
stop = train_events.at[1, 'step']
mod_stop = stop + three_hours
single_day = merged_data[
    (merged_data.series_id == '038441c925bb') &
    (merged_data.step >= mod_start) &
    (merged_data.step <= mod_stop)
]

def fill_event_per_row(row):
    if start < row.step and row.step < stop:
        row.event = "sleep"
    if start > row.step or stop < row.step:
        row.event = "awake"
    return row

# uncomment to test replacing empty event data in the merge table
single_day = single_day.apply(fill_event_per_row, axis='columns', result_type='broadcast')
single_day.tail(int(three_hours + 25))

Unnamed: 0,series_id,step,anglez,enmo,night,event
10908,038441c925bb,10908,-63.0564,0.0206,,sleep
10909,038441c925bb,10909,-63.177898,0.0203,,sleep
10910,038441c925bb,10910,-63.108101,0.0206,,sleep
10911,038441c925bb,10911,-63.192501,0.0201,,sleep
10912,038441c925bb,10912,-63.139,0.0206,,sleep
...,...,...,...,...,...,...
13088,038441c925bb,13088,3.0795,0.2188,,awake
13089,038441c925bb,13089,-23.1852,0.2995,,awake
13090,038441c925bb,13090,-21.6586,0.2878,,awake
13091,038441c925bb,13091,-22.790899,0.4395,,awake


In [8]:
single_day.head(int(three_hours + 25))

Unnamed: 0,series_id,step,anglez,enmo,night,event
2826,038441c925bb,2826,-1.2445,0.0465,,awake
2827,038441c925bb,2827,-38.2528,0.0831,,awake
2828,038441c925bb,2828,-7.1711,0.0373,,awake
2829,038441c925bb,2829,4.8734,0.0188,,awake
2830,038441c925bb,2830,-15.3135,0.073,,awake
...,...,...,...,...,...,...
5006,038441c925bb,5006,-62.7841,0.0065,,sleep
5007,038441c925bb,5007,-63.420502,0.0061,,sleep
5008,038441c925bb,5008,-64.812202,0.0058,,sleep
5009,038441c925bb,5009,-64.928902,0.0056,,sleep


In [41]:
# 12 steps per minute * 60 min/hr * 3hr
three_hours = 12 * 180

def data_by_series_id_start_stop(series_id, start, stop):
    data = merged_data[
        (merged_data.series_id == series_id) &
        (merged_data.step >= mod_start) &
        (merged_data.step <= mod_stop)
    ]
    def fill_event_per_row(row):
        if start < row.step and row.step < stop:
            row.event = "sleep"
        if start > row.step or stop < row.step:
            row.event = "awake"
        return row
    return data.apply(fill_event_per_row, axis='columns', result_type='broadcast')


step_range_lists = {}
events = train_events.itertuples()
for onset_event in events:
    series_id = onset_event.series_id
    onset_night = onset_event.night
    wakeup_event = next(events) # calling next() prevents `for` from looping over 'wakeup' events
    wakeup_id = wakeup_event.series_id
    wakeup_night = wakeup_event.night
    start = onset_event.step
    stop = wakeup_event.step

    if not step_range_lists.get(series_id):
        step_range_lists[series_id] = []

    if onset_night == wakeup_night and series_id == wakeup_id:
        step_range_lists[series_id].append((start, stop))

print(step_range_lists['038441c925bb'])

StopIteration: 

In [72]:
merged_data.dtypes

series_id     object
step          uint32
anglez       float32
enmo         float32
event         object
dtype: object