# INFO-371 Final Project

This is our final project, which attempts to predict which events in a participant's accelerometer data signal
the beginning and end of sleep. Over the course of many nights, accelerometer data is collected at every moment
of the day from a device worn on the participant's wrist. Sleep scientists then annotate this data with
events labeled "onset", signaling that the participant has begun sleeping, or "wakeup", signaling that the
participant has just awoken.

In this notebook, we analyse this time-series data to create a prediction model using data mining and machine
learning.

In [1]:
# Imports
import os
import pathlib

import numpy as np
import pandas as pd
import pyarrow as pa

# Input and output folders both live directly under the resolved working directory
input_data_dir, output_data_dir = (
    os.path.join(os.path.realpath(pathlib.Path.cwd()), folder)
    for folder in ('input_data', 'output_data')
)

# Raw inputs: the annotated events and the accelerometer series
train_events_file, train_series_file = (
    os.path.join(input_data_dir, filename)
    for filename in ('train_events.csv', 'train_series.parquet')
)
# Checkpoint / result files written by the later cells
rows_delete_file, sleep_delta_file = (
    os.path.join(output_data_dir, filename)
    for filename in ("rows_to_delete.csv", 'sleep_delta_data.parquet')
)

In [2]:
# Load the annotated events, keeping only the columns this notebook uses
train_events = pd.read_csv(train_events_file, usecols=['series_id', 'event', 'step', 'night'])
# Rows with any missing value (e.g. no step) cannot be aligned with the series data
train_events.dropna(inplace=True)
train_events.reset_index(inplace=True, drop=True)
train_events['step'] = train_events['step'].astype(int)

# Locate pairs of onset and wakeup events, between which is relevant series data.
# An event is kept only if the row directly before or after it is the *other*
# event type for the same night of the same series. The series check matters:
# night numbers are presumably counted per series, so without it an orphaned
# event could be "paired" with an event from a neighbouring series.
series_ids = train_events['series_id']
nights = train_events['night']
events = train_events['event']

has_partner = pd.Series(False, index=train_events.index)
for offset in (1, -1):  # 1 -> compare with previous row, -1 -> with next row
    # At the frame edges the shifted values are NaN, so the equality tests
    # are False and the edge rows simply have no neighbour on that side.
    has_partner |= (
        series_ids.eq(series_ids.shift(offset))
        & nights.eq(nights.shift(offset))
        & events.ne(events.shift(offset))
    )

# Drop the unpaired rows by their indices
rows_to_delete = train_events.index[~has_partner].tolist()
train_events.drop(rows_to_delete, inplace=True)
train_events['series_id'] = train_events['series_id'].astype('category')

In [3]:
# Read only the accelerometer columns that the rest of the notebook uses
train_series = pd.read_parquet(train_series_file, columns=["series_id", "step", "anglez", "enmo"])
train_series['series_id'] = train_series['series_id'].astype('category')

# Ordered merge of the readings with the event annotations, keyed on
# (series_id, step); event columns are NaN on rows without an annotation
merged_data = pd.merge_ordered(left=train_series, right=train_events, on=['series_id', 'step'])
# Recast the key after the merge (presumably the merge does not keep the
# categorical dtype when the two sides' categories differ -- verify)
merged_data['series_id'] = merged_data['series_id'].astype('category')

In [4]:
# Change in enmo / anglez relative to 1 row earlier (5s ago) and 6 rows earlier (30s ago).
# NOTE(review): the lag is taken over the whole frame, not per series_id, so the
# first rows of each series are differenced against the tail of the previous one.
merged_data[['d1_enmo', 'd1_anglez']] = merged_data[['enmo', 'anglez']].diff(periods=1)
merged_data[['d6_enmo', 'd6_anglez']] = merged_data[['enmo', 'anglez']].diff(periods=6)

In [5]:
EVENT_WOKE = 'woke'
EVENT_SLEEP = 'sleep'

# Finding the rows that need to be deleted.
#
# Only rows carrying an 'onset' or 'wakeup' annotation do any work below, so
# iterate over those alone instead of walking every sensor reading. The
# selection is a snapshot taken before the loop writes any state markers.
event_rows = merged_data.loc[
    merged_data['event'].isin(['onset', 'wakeup']),
    ['series_id', 'event', 'night'],
]
# The state marker goes on the row *after* the event; never write past the
# final row, which would append a new, otherwise-empty row to the frame.
final_index = merged_data.index[-1]

last_id = merged_data.iloc[0]['series_id']
last_night = last_wakeup_index = 0
rows_to_delete = []
for index, series_id, event, night in event_rows.itertuples(index=True, name=None):
    if event == 'onset':
        if index < final_index:
            merged_data.at[index + 1, 'event'] = EVENT_SLEEP
        # Flag the span between the previous wakeup and this onset whenever the
        # nights are not consecutive or a new series has started.
        if night - last_night != 1 or series_id != last_id:
            rows_to_delete.append((last_wakeup_index, index))
        # Always advance the tracker. Leaving last_night untouched after a
        # skipped night in the same series would make every later onset in
        # that series look like a gap as well.
        last_id = series_id
        last_night = night
    else:  # 'wakeup'
        if index < final_index:
            merged_data.at[index + 1, 'event'] = EVENT_WOKE
        last_wakeup_index = index

# Write row deletions to csv outfile (recovery checkpoint)
pd.DataFrame(data=rows_to_delete, columns=['wakeup', 'onset']).to_csv(rows_delete_file, index=False)

In [6]:
# Uncomment to load the deletion rows from csv (recovery checkpoint)
# rows_to_delete = pd.read_csv(rows_delete_file).to_numpy()

In [7]:
# Deleting the rows
# 3 hour buffer kept on each side of a flagged span
# (12 rows per minute at one row every 5s, times 180 minutes)
buffer = 12 * 180

# First six rows can be ignored because they will have NaN d6_ column data
deletion_chunks = [np.arange(0, 6, dtype=np.uint32)]
# One index range per flagged (wakeup, onset) span, shrunk by the buffer at both
# ends; a span shorter than two buffers yields an empty range and deletes nothing
deletion_chunks.extend(
    np.arange(wakeup + buffer, onset - buffer - 1, dtype=np.uint32)
    for wakeup, onset in rows_to_delete
)
# Join once at the end rather than re-copying a growing array on every iteration
all_row_deletions = np.concatenate(deletion_chunks)

merged_data.drop(all_row_deletions, inplace=True)

# Delete the 'night' col, we're finished with it ('step' is dropped in a later cell)
merged_data.drop(columns='night', inplace=True)
merged_data.reset_index(inplace=True, drop=True)

In [10]:
# Label the very first row as awake so the fill below has a value to start from
merged_data.loc[0, 'event'] = EVENT_WOKE
# Forward fill: every missing value (the event label in particular) takes the
# most recent non-missing value above it in the same column
merged_data.ffill(inplace=True)
# The labels are a small fixed set of strings, so store them as a categorical
merged_data['event'] = merged_data['event'].astype('category')

In [11]:
# The step column is no longer needed; remove it and renumber the rows from 0
del merged_data['step']
merged_data.reset_index(drop=True, inplace=True)

In [12]:
# Persist the transformed frame as parquet, without the row index
merged_data.to_parquet(path=sleep_delta_file, index=False)

In [13]:
# Uncomment to load transformed data from outfile (recovery checkpoint)
# sleep_delta_data = pd.read_parquet(sleep_delta_file)
# sleep_delta_data.dtypes