# INFO-371 Final Project

This is our final project which attempts to predict which events in a participant's accelerometer data signals
the beginning and end of sleep. Over the course of many nights, accelerometer data is collected from a worn
device on the participant's wrist at every moment of the day. Sleep scientists then annotate this data with
events labeled "onset", signaling the participant has begun sleeping, or "wakeup", signaling the participant
has just awoken.

By analysing this time series data, in this notebook we create a prediction model using data mining and machine
learning.

In [2]:
# Imports
import os
import pathlib
import arff
import math

import numpy as np
import pandas as pd
import pyarrow as pa

# Set up data directories
input_data_dir = os.path.join(os.path.realpath(pathlib.Path().cwd()), 'input_data')
output_data_dir = os.path.join(os.path.realpath(pathlib.Path().cwd()), 'output_data')

# Set up input files
train_events_file = os.path.join(input_data_dir, 'train_events.csv')
train_series_file = os.path.join(input_data_dir, 'train_series.parquet')
# Set up output files
rows_delete_file = os.path.join(output_data_dir, "rows_to_delete.csv")
sleep_delta_file = os.path.join(output_data_dir, 'sleep_delta_data.parquet')
sleep_delta_file_arff = os.path.join(output_data_dir, 'sleep_delta_data.arff')
condensed_sdf_arff = os.path.join(output_data_dir, 'sleep_delta_data_condensed.arff')

In [2]:
# Transform the train_events dataset types
train_events = pd.read_csv(train_events_file, usecols=['series_id', 'event', 'step', 'night'])
train_events.dropna(inplace=True)
train_events.reset_index(inplace=True, drop=True)
train_events['step'] = train_events['step'].astype(int)

rows_to_delete = []

train_events_n = len(train_events)

# Locate pairs of onset and wakeup events, between which is relevant series data
for index, row in train_events.iterrows():
    delete = True
    if index > 0:
        previous_row = train_events.iloc[index - 1]
        if previous_row['night'] == row['night'] and previous_row['event'] != row['event']:
            delete = False

    if index < train_events_n - 1:
        next_row = train_events.iloc[index + 1]
        if next_row['night'] == row['night'] and next_row['event'] != row['event']:
            delete = False
    if delete:
        rows_to_delete.append(index)

# Drop rows by their indices
train_events.drop(rows_to_delete, inplace=True)
train_events['series_id'] = train_events['series_id'].astype('category')

In [17]:
# Load the series data
train_series = pd.read_parquet(
    train_series_file,
    columns=["series_id", "step", "anglez", "enmo"],
)
train_series['series_id'] = train_series['series_id'].astype('category')
print(train_series)

              series_id    step     anglez    enmo
0          038441c925bb       0   2.636700  0.0217
1          038441c925bb       1   2.636800  0.0215
2          038441c925bb       2   2.637000  0.0216
3          038441c925bb       3   2.636800  0.0213
4          038441c925bb       4   2.636800  0.0215
...                 ...     ...        ...     ...
127946335  fe90110788d2  592375 -27.277500  0.0204
127946336  fe90110788d2  592376 -27.032499  0.0233
127946337  fe90110788d2  592377 -26.841200  0.0202
127946338  fe90110788d2  592378 -26.723900  0.0199
127946339  fe90110788d2  592379 -31.521601  0.0205

[127946340 rows x 4 columns]


In [None]:
# Merge the two data frames
merged_data = pd.merge_ordered(train_series, train_events, on=['series_id', 'step'])
merged_data['series_id'] = merged_data['series_id'].astype('category')

In [4]:
# Generate enmo + anglez deltas 1 row behind (5s ago) and 6 rows behind (30s ago)
merged_data[['d1_enmo', 'd1_anglez']] = merged_data[['enmo', 'anglez']] - merged_data[['enmo', 'anglez']].shift(1)
merged_data[['d6_enmo', 'd6_anglez']] = merged_data[['enmo', 'anglez']] - merged_data[['enmo', 'anglez']].shift(6)

In [5]:
EVENT_WOKE = 'woke'
EVENT_SLEEP = 'sleep'

# Finding the rows that need to be deleted
last_id = merged_data.iloc[0]['series_id']
last_night = last_wakeup_index = 0
rows_to_delete = []
for index, row in merged_data.iterrows():
    if row['event'] == 'onset':
        merged_data.at[index + 1, 'event'] = EVENT_SLEEP
        if row['night'] - last_night != 1 or row['series_id'] != last_id:
            rows_to_delete.append((last_wakeup_index, index))
            if row['series_id'] != last_id:
                last_id = row['series_id']
                last_night = row['night']
        else:
            last_night = row['night']
    if row['event'] == 'wakeup':
        merged_data.at[index + 1, 'event'] = EVENT_WOKE
        last_wakeup_index = index

# Write row deletions to csv outfile (recovery checkpoint)
pd.DataFrame(data=rows_to_delete, columns=['wakeup', 'onset']).to_csv(rows_delete_file, index=False)

In [6]:
# Uncomment to load the deletion rows from csv (recovery checkpoint)
# rows_to_delete = pd.read_csv(rows_delete_file).to_numpy()

In [7]:
# Deleting the rows
# 3 hour buffer
buffer = 12 * 180

# First six rows can be ignored because they will have NaN d6_ column data
all_row_deletions = np.array(range(0, 6), dtype=np.uint32)
for wakeup, onset in rows_to_delete:
    all_row_deletions = np.hstack(
        (all_row_deletions, np.arange(wakeup + buffer, onset - buffer - 1, dtype=np.uint32)),
        dtype=np.uint32
    )

merged_data.drop(all_row_deletions, inplace=True)

# Delete the 'night' col, we're finished with it
merged_data.drop(columns='night', inplace=True)
# Delete the 'step' and 'night' cols, we're finished with them
# merged_data.drop(columns=['step', 'night'], inplace=True)
merged_data.reset_index(inplace=True, drop=True)

In [10]:
# # Forward fill the events col
merged_data.at[0, 'event'] = EVENT_WOKE
merged_data.ffill(inplace=True)
merged_data['event'] = merged_data['event'].astype('category')

In [11]:
# Drop the step column, we're finished with it
merged_data.drop(columns='step', inplace=True)
merged_data.reset_index(inplace=True, drop=True)

In [12]:
# Write transformed data to outfile
merged_data.to_parquet(sleep_delta_file, index=False)

In [3]:
# Uncomment to load transformed data from outfile (recovery checkpoint)
sleep_delta_data = pd.read_parquet(sleep_delta_file)
sleep_delta_data.dtypes

             series_id     anglez    enmo event  d1_enmo  d1_anglez  d6_enmo  \
0         038441c925bb   2.636700  0.0217  woke   0.0000   0.000000   0.0000   
1         038441c925bb   2.636700  0.0218  woke   0.0001   0.000000   0.0003   
2         038441c925bb   2.798000  0.0223  woke   0.0005   0.161300   0.0007   
3         038441c925bb   3.084700  0.0217  woke  -0.0006   0.286700   0.0004   
4         038441c925bb   2.780200  0.0229  woke   0.0012  -0.304500   0.0014   
...                ...        ...     ...   ...      ...        ...      ...   
67027587  fe90110788d2 -27.277500  0.0204  woke   0.0001   0.172501   0.0007   
67027588  fe90110788d2 -27.032499  0.0233  woke   0.0029   0.245001   0.0028   
67027589  fe90110788d2 -26.841200  0.0202  woke  -0.0031   0.191299  -0.0002   
67027590  fe90110788d2 -26.723900  0.0199  woke  -0.0003   0.117300  -0.0008   
67027591  fe90110788d2 -31.521601  0.0205  woke   0.0006  -4.797701   0.0003   

          d6_anglez  
0          0.0000

In [6]:
# Write arff file
chunk_size = 100000
total_rows = len(sleep_delta_data)
num_chunks = math.ceil(total_rows / chunk_size)

arff.dump(sleep_delta_file_arff
      , sleep_delta_data.values
      , relation='Movement_and_Sleep'
      , names=sleep_delta_data.columns)

In [20]:
# Remove wakeup and onset events
sleep_delta_data = sleep_delta_data[(sleep_delta_data['event'] != 'onset') & (sleep_delta_data['event'] != 'wakeup')]

# Group the entries
factor = 6
sleep_delta_data['group'] = sleep_delta_data.index // factor

             anglez      enmo   d1_enmo  d1_anglez   d6_enmo  d6_anglez  \
index                                                                     
0          2.742800  0.022000 -0.000017  -0.019367  0.000450   0.106000   
1          2.412950  0.021533  0.000017  -0.017883 -0.000467  -0.329850   
2         53.731201  0.019017 -0.002450   5.884783 -0.002517  51.318253   
3        -13.351600  0.021250  0.001100 -19.618366  0.002233 -67.082802   
4        -79.979088  0.013517  0.000117  -0.000417 -0.007733 -66.627487   
...             ...       ...       ...        ...       ...        ...   
11171261 -31.670416  0.012417  0.003217   0.494233  0.012417   1.904184   
11171262 -30.088882  0.027950  0.002767  -0.234050  0.015533   1.581533   
11171263 -28.166201  0.023433 -0.002583   0.681800 -0.004517   1.922683   
11171264 -27.316267  0.020850 -0.000033   0.180250 -0.002583   0.849934   
11171265 -29.122749  0.020200  0.000150  -2.340200 -0.000250  -1.474550   

             series_id e

In [None]:
# Aggregate the entries into groups, calculating the mean for numeric values and the mode for categorical ones
aggregation_methods = {
    'anglez': 'mean',
    'enmo': 'mean',
    'd1_enmo': 'mean',
    'd1_anglez': 'mean',
    'd6_enmo': 'mean',
    'd6_anglez': 'mean',
    'series_id': lambda x: x.mode()[0] if not x.mode().empty else None,
    'event': lambda x: x.mode()[0] if not x.mode().empty else None,
}
condensed_df = sleep_delta_data.groupby('group').agg(aggregation_methods)


In [30]:
# Write arff file
factor = 10
chunk_size = 100000
total_rows = len(sleep_delta_data) // factor
# num_chunks = math.ceil(total_rows / chunk_size)

arff.dump(condensed_sdf_arff
      , sleep_delta_data.head(total_rows).values
      , relation='Movement_and_Sleep'
      , names=sleep_delta_data.iloc[:total_rows].columns)

In [1]:
# Load file, split it and save a part of it
condensed_df.drop(['group', 'series_id'])

NameError: name 'condensed_df' is not defined