# INFO-371 Final Project

This is our final project which attempts to predict which events in a participant's accelerometer data signals
the beginning and end of sleep. Over the course of many nights, accelerometer data is collected from a worn
device on the participant's wrist at every moment of the day. Sleep scientists then annotate this data with
events labeled "onset", signaling the participant has begun sleeping, or "wakeup", signaling the participant
has just awoken.

By analysing this time series data, in this notebook we create a prediction model using data mining and machine
learning.

In [50]:
#Imports
import os
import pandas as pd
import pathlib
import pyarrow as pa

In [51]:
# Load the data from our local directory
input_data_dir = os.path.join(os.path.realpath(pathlib.Path().cwd()), 'input_data')

In [86]:
# Transform the train_events dataset types
train_events = pd.read_csv(
    os.path.join(input_data_dir, 'train_events.csv'),
    usecols=['series_id', 'event', 'step', 'night']
)
train_events.dropna(inplace=True)
train_events.reset_index(inplace = True, drop=True)
train_events['step'] = train_events['step'].astype(int)
# train_events['event'] = train_events['event'].astype('category')

rows_to_delete = []

for index, row in train_events.iterrows():
    delete = True
    if index > 0:
        previous_row = train_events.iloc[index - 1]
        if previous_row['night'] == row['night'] and previous_row['event'] != row['event']:
            delete = False

    if index < len(train_events) - 1:
        next_row = train_events.iloc[index + 1]
        if next_row['night'] == row['night'] and next_row['event'] != row['event']:
            delete = False
    if delete:
        rows_to_delete.append(index)

# Drop rows by their indices
train_events.drop(rows_to_delete, inplace=True)
train_events.sort_values(by=['series_id', 'night'])

Unnamed: 0,series_id,night,event,step
0,038441c925bb,1,onset,4992
1,038441c925bb,1,wakeup,10932
2,038441c925bb,2,onset,20244
3,038441c925bb,2,wakeup,27492
4,038441c925bb,3,onset,39996
...,...,...,...,...
9580,fe90110788d2,32,wakeup,547152
9581,fe90110788d2,33,onset,556560
9582,fe90110788d2,33,wakeup,560604
9583,fe90110788d2,34,onset,574620


In [64]:
print(rows_to_delete)

[310, 957, 2668, 9099, 9352]


In [87]:
# Load the series data
train_series = pd.read_parquet(
    os.path.join(input_data_dir, 'train_series.parquet'),
    columns=["series_id", "step", "anglez", "enmo"]
)

In [88]:
# Merge the two data frames
merged_data = pd.merge_ordered(train_series, train_events, on=['series_id', 'step'])
# merged_data['event'] = merged_data['event'].astype('category')
merged_data['series_id'] = merged_data['series_id'].astype('category')

In [77]:
merged_data.at[390000, 'series_id']

'03d92c9f6f8a'

In [113]:
merged_data[['d1_enmo', 'd1_anglez']] = merged_data[['enmo', 'anglez']] - merged_data[['enmo', 'anglez']].shift(1)
merged_data[['d6_enmo', 'd6_anglez']] = merged_data[['enmo', 'anglez']] - merged_data[['enmo', 'anglez']].shift(6)
merged_data.head(25)

Unnamed: 0,series_id,step,anglez,enmo,night,event,d1_enmo,d1_anglez,d6_enmo,d6_anglez
0,038441c925bb,0,2.6367,0.0217,,,,,,
1,038441c925bb,1,2.6368,0.0215,,,-0.0002,0.0001,,
2,038441c925bb,2,2.637,0.0216,,,0.0001,0.0002,,
3,038441c925bb,3,2.6368,0.0213,,,-0.0003,-0.0002,,
4,038441c925bb,4,2.6368,0.0215,,,0.0002,0.0,,
5,038441c925bb,5,2.6367,0.0217,,,0.0002,-0.0001,,
6,038441c925bb,6,2.6367,0.0217,,,0.0,0.0,0.0,0.0
7,038441c925bb,7,2.6367,0.0218,,,0.0001,0.0,0.0003,-0.0001
8,038441c925bb,8,2.798,0.0223,,,0.0005,0.1613,0.0007,0.161
9,038441c925bb,9,3.0847,0.0217,,,-0.0006,0.2867,0.0004,0.4479


In [115]:
merged_data.tail(25)
merged_data.dropna(subset='d6_enmo', inplace=True)
merged_data.reset_index(inplace=True, drop=True)
merged_data.head(25)

Unnamed: 0,series_id,step,anglez,enmo,night,event,d1_enmo,d1_anglez,d6_enmo,d6_anglez
6,038441c925bb,6,2.6367,0.0217,,,0.0,0.0,0.0,0.0
7,038441c925bb,7,2.6367,0.0218,,,0.0001,0.0,0.0003,-0.0001
8,038441c925bb,8,2.798,0.0223,,,0.0005,0.1613,0.0007,0.161
9,038441c925bb,9,3.0847,0.0217,,,-0.0006,0.2867,0.0004,0.4479
10,038441c925bb,10,2.7802,0.0229,,,0.0012,-0.3045,0.0014,0.1434
11,038441c925bb,11,2.5205,0.0216,,,-0.0013,-0.2597,-0.0001,-0.1162
12,038441c925bb,12,2.4129,0.0218,,,0.0002,-0.1076,0.0001,-0.2238
13,038441c925bb,13,2.4129,0.0214,,,-0.0004,0.0,-0.0004,-0.2238
14,038441c925bb,14,2.4129,0.0214,,,0.0,0.0,-0.0009,-0.3851
15,038441c925bb,15,2.4129,0.0214,,,0.0,0.0,-0.0003,-0.6718


In [118]:
# finding the rows that need to be deleted
last_id = merged_data.iloc[0]['series_id']
last_night = 0
last_wakeup_index = 0
rows_to_delete = []
for index, row in merged_data.head(200000).iterrows():

    if row['event'] == 'onset':
        if row['night'] - last_night != 1 or row['series_id'] != last_id:
            rows_to_delete.append((last_wakeup_index, index))
            if row['series_id'] != last_id:
                last_id = row['series_id']
                last_night = row['night']
        else:
            last_night = row['night']
    if row['event'] == 'wakeup':
        last_wakeup_index = index


In [119]:
print(rows_to_delete)

[(62856, 91296), (97860, 109500), (118524, 127296), (133332, 159972), (167400, 177036), (180804, 194220)]


In [None]:
# Delete the 'night' row
merged_data.drop(columns='night', inplace=True)

In [None]:
# Deleting the rows
# 3 hour buffer
# buffer = 12*180
# for tuple in
# merged_data.loc[start_index + buffer:end_index - buffer , 'column_name'] = new_value

In [100]:
for wakeup, onset in rows_to_delete:
    merged_data.at[wakeup+1, 'event'] = 'woke'
    merged_data.at[onset+1, 'event'] = 'sleep'
print(merged_data.at[rows_to_delete[0][1] + 1, 'event'])

sleep
