Incorporate the activity dataset and the features created from it. The objective is to
test another model against the most recent dataset and to generate insights for further development.

In [1]:
import os
import requests
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from zipfile import ZipFile


In [2]:
# Load the four raw tables from the zipped dataset of the selected ingestion day.
current_path = os.getcwd()
day_to_load = '10102019'  # enter the day to load data files from in format "ddmmyyyy"

# Build the archive path with os.path.join instead of manual '/' concatenation.
dataset_zip_path = os.path.join(current_path, 'data_ingestion', day_to_load, 'dataset.zip')


def _read_zipped_csv(archive, member_name):
    """Read one CSV member of an open ZipFile into a DataFrame.

    The member handle is closed as soon as pandas has finished reading it.
    """
    with archive.open(member_name) as member:
        return pd.read_csv(member)


with ZipFile(dataset_zip_path, mode='r') as zip_file:
    users = _read_zipped_csv(zip_file, 'users_data.csv')
    sleep = _read_zipped_csv(zip_file, 'sleep_data.csv')
    activity = _read_zipped_csv(zip_file, 'activity_data.csv')
    readiness = _read_zipped_csv(zip_file, 'readiness_data.csv')

## Users data handling:

In [8]:
# NOTE: dropping the 'email' column is currently disabled in this cell.

# Columns of the users table that must be parsed as numbers.
num_columns = ['age', 'height', 'weight']
for column_name in num_columns:
    users[column_name] = pd.to_numeric(users[column_name])

# Turn the textual gender into a boolean flag (True only for 'male') and
# rename the column so that its name reflects the new meaning.
users['gender'] = users['gender'].eq('male')
users.rename(columns={'gender': 'is_male'}, inplace=True)

  result = method(y)


## Sleep data handling:

In [None]:
# Columns of the sleep table that must be parsed as numbers.
# ('breath_average' used to be listed twice; the duplicate label was removed.)
num_types = ['awake', 'bedtime_end_delta', 'bedtime_start_delta', 'breath_average',
             'deep', 'duration', 'efficiency', 'hr_average', 'hr_lowest', 'light',
             'midpoint_at_delta', 'midpoint_time', 'onset_latency', 'period_id', 'rem', 'restless',
             'rmssd', 'score', 'score_alignment', 'score_deep', 'score_disturbances', 'score_efficiency',
             'score_latency', 'score_rem', 'score_total', 'temperature_delta', 'temperature_deviation',
             'temperature_trend_deviation', 'timezone', 'total']

# Columns of the sleep table that must be parsed as dates.
date_types = ['summary_date']

# Convert the sleep columns themselves (the source used to be the users table,
# which does not hold these columns).
sleep[num_types] = sleep[num_types].apply(pd.to_numeric)
# Dates have to be real datetimes: the `.dt` accessor below requires it.
sleep[date_types] = sleep[date_types].apply(pd.to_datetime)

# Day of the week as an integer, Monday=0 ... Sunday=6.
sleep['weekday'] = sleep['summary_date'].dt.weekday

# Monday to Friday (0-4) count as workdays.
sleep['is_workday'] = sleep['weekday'] < 5

# One-hot encode the weekday; this replaces 'weekday' with 'weekday_0' ... 'weekday_6'
# (only for the weekdays that actually occur in the data).
sleep = pd.get_dummies(sleep, columns=['weekday'])

# NOTE(review): the target names end with a stray ']' (e.g. 'weekday_mon]').
# This looks like a typo for 'weekday_mon' or 'weekday_mon[bool]'; the names are
# kept unchanged here - confirm the intended schema before renaming them.
sleep.rename(columns={'weekday_0': 'weekday_mon]',
                      'weekday_1': 'weekday_tue]',
                      'weekday_2': 'weekday_wed]',
                      'weekday_3': 'weekday_thu]',
                      'weekday_4': 'weekday_fri]',
                      'weekday_5': 'weekday_sat]',
                      'weekday_6': 'weekday_sun]'},
             inplace=True)

# Empty frame whose columns are the current sleep columns followed by the names
# of the features that are going to be engineered for the full dataframe.
# NOTE(review): the per-user loop further down reads from `df` and appends to
# `nf_df`, and it uses unit-suffixed names such as 'score_D-1[%]' - confirm which
# variable this empty frame is meant to be bound to and which names are final.
score_feature_names = ['score_D-1', 'score_D-2', 'rol_score_7d', 'rol_score_14d',
                       'rol_score_21d', 'is_traveling']

# Six deviation buckets (-3 ... +3, without 0) for each of the three measures.
deviation_levels = ['-3', '-2', '-1', '+1', '+2', '+3']
deviation_feature_names = [measure + level
                           for measure in ('bedtime_start_dev', 'bedtime_end_dev', 'duration_dev')
                           for level in deviation_levels]

engineered_columns = pd.Index(score_feature_names + deviation_feature_names)
df = pd.DataFrame(columns=sleep.columns.append(engineered_columns))

def _add_deviation_flags(frame, value_col, prefix, window=21, min_periods=10, cap=3):
    """Add boolean deviation-bucket columns for `value_col` to `frame` (in place).

    Each value is compared with the rolling mean and standard deviation of the
    observations that precede it (the series is lagged by one day before the
    rolling statistics are taken). The resulting z-score is rounded away from
    zero, capped to [-cap, +cap] and exposed as one boolean column per non-zero
    bucket, named '<prefix>-3[bool]' ... '<prefix>+3[bool]'.

    Every bucket column is always created, so all users end up with the same
    set of columns even when a bucket never occurs for one of them. Rows whose
    z-score is NaN (not enough history) or exactly 0 are False in all buckets.

    Assumes `frame` has a unique DatetimeIndex - TODO confirm against the data.

    Returns the same `frame` object for convenience.
    """
    lagged = frame[value_col].shift(1, freq='D')
    history = lagged.rolling(window=window, min_periods=min_periods)
    # The lagged statistics are labelled with "date + 1 day"; bring them back
    # onto the dates that are actually present in the frame.
    rolling_mean = history.mean().reindex(frame.index)
    rolling_std = history.std().reindex(frame.index)

    z_score = (frame[value_col] - rolling_mean) / rolling_std
    # Round away from zero (0.2 -> 1, -0.2 -> -1), then cap the extremes.
    bucket = (np.sign(z_score) * np.ceil(z_score.abs())).clip(-cap, cap)

    for level in range(-cap, cap + 1):
        if level == 0:
            continue
        frame['{}{:+d}[bool]'.format(prefix, level)] = bucket == level
    return frame


# Build the engineered features user by user, because every lag and rolling
# statistic must only look at that user's own history.
# `df` (input) and `nf_df` (accumulated output) must already exist.
# NOTE(review): `nf_df` is not defined in the visible part of the notebook.
per_user_frames = []
for user in df['user_id[str]'].unique():
    # Index by summary date so that date-based shifts can be used.
    # set_index returns a new frame, so the assignments below never write
    # into a slice of `df`.
    single_user_df = df[df['user_id[str]'] == user].set_index('summary_date[t]')
    score = single_user_df['score[%]']

    # Scores of the previous day and of two days before. The shift is
    # date-based: a missing day yields NaN instead of silently taking the
    # previous row. (D-2 used to be looked up with a one-row shift, which
    # returned the D-1 value.)
    single_user_df['score_D-1[%]'] = score.shift(1, freq='D')
    single_user_df['score_D-2[%]'] = score.shift(2, freq='D')

    # Rolling average scores over the last 7, 14 and 21 observations,
    # lagged by one day so that the current score is never part of its own average.
    lagged_score = score.shift(1, freq='D')
    for days, min_obs in ((7, 3), (14, 10), (21, 17)):
        single_user_df['rol_score_{}d[%]'.format(days)] = (
            lagged_score.rolling(window=days, min_periods=min_obs).mean())

    # A night counts as "traveling" when its timezone differs from the
    # user's most frequent timezone.
    timezone = single_user_df['timezone[s]']
    single_user_df['is_traveling[bool]'] = timezone != timezone.value_counts().idxmax()

    # Deviation buckets (-3, -2, -1, +1, +2, +3) for bedtime start, bedtime end
    # and sleep duration, each against its own 21-observation history.
    _add_deviation_flags(single_user_df, 'bedtime_start_delta[s]', 'bedtime_start_dev')
    _add_deviation_flags(single_user_df, 'bedtime_end_delta[s]', 'bedtime_end_dev')
    _add_deviation_flags(single_user_df, 'duration[s]', 'duration_dev')

    # Re-index by user_date[str].
    # NOTE(review): this discards the summary-date index (as the original did);
    # call reset_index() first if the date must survive as a column.
    single_user_df = single_user_df.set_index('user_date[str]')

    per_user_frames.append(single_user_df)

# Extend the output dataset once, instead of growing it inside the loop
# (DataFrame.append no longer exists in current pandas).
if per_user_frames:
    nf_df = pd.concat([nf_df] + per_user_frames)

