# Data ingestion and transformation notebook

This notebook uses the data ingestion standard pipeline implementation.

In [3]:
import json
import numpy as np
import pandas as pd
import os
import zipfile
from zipfile import ZipFile
from datetime import date
import time
import requests

In [4]:
%reload_ext autoreload
%autoreload 2

In [47]:
# Helper functions for feature engineering:

def decode_stacked_json(stacked_json_string, pos=0, decoder=json.JSONDecoder()):
    """Yield multiple JSON objects and restart parsing from the previous position
    Input must be of the form: {'key1': value1, 'key2': value2}{'key3': value3, 'key4': value4"""
    while True:
        try:
            json_object, pos = decoder.raw_decode(stacked_json_string, pos)
        except json.JSONDecodeError:
            break
        yield json_object
       
        
def after_wake_exercise(class_5min,bedtime_end):
    """returns the number of minutes with medium to high MET scores 
    within the first 3 hours of wake-time as a proxy for whether or not exercise 
    occurs after waking up"""
    if isinstance(class_5min,float):
        return np.nan
    elif isinstance(bedtime_end,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))
        #--take the timestamp from the datetime string, extracts hh:mm data, and converts to a number--#
        wake_hr_min = int(''.join(bedtime_end.split('T')[1][0:5].split(':')))
        #---calculate minutes lapsed since 4am and wake up time---#
        #---rescale minutes into 5 minute intervals to find the number of elements at which to offset class_5min--#
        offset = int(((wake_hr_min - 400)/100)*(60/5))

        #subset observations between wake up and 3 hrs post wake up (24*5=120 min)
        morning_obs = class_5min_list[offset:offset+36]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in morning_obs if obs >=3])*5
        return total_min


def before_sleep_exercise(class_5min,bedtime_start):
    """returns the number of minutes with medium to high MET scores 
    within the last 3 hours of wake-time as a proxy for whether or not exercise 
    occurs in the evening close to bedtime"""
    if isinstance(class_5min,float):
        return np.nan
    elif isinstance(bedtime_start,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))
        #--take the timestamp from the datetime string, extracts hh:mm data, and converts to a number--#
        sleep_hr_min = int(''.join(bedtime_start.split('T')[1][0:5].split(':')))
        #---calculate minutes lapsed since 4am and wake up time---#
        #---rescale minutes into 5 minute intervals to find the number of elements at which to offset class_5min--#
        offset = int(((sleep_hr_min - 400)/100)*(60/5))

        #subset observations 3 hours before sleep time
        evening_obs = class_5min_list[offset-36:offset]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in evening_obs if obs >=3])*5
        return total_min


def noon_exercise(class_5min):
    """returns the number of minutes with medium to high MET scores 
    between noon and two local time"""
    if isinstance(class_5min,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))

        #---calculate minutes lapsed since 4am and 12 (8*12 (5min intervasl in an hr)---#

        offset = 96

        #subset observations between noon and 2 pm
        noon_obs = class_5min_list[offset:offset+24]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in noon_obs if obs >=3])*5
        return total_min
 
    
def evening_exercise(class_5min):
    """returns the number of minutes with medium to high MET scores 
    between noon and two local time"""
    if isinstance(class_5min,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))

        #---calculate minutes lapsed since 4am and 5pm (13*12) --there are 12 5min intervasl in an hr---#

        offset = 156

        #subset observations between 5pm and 7 pm post wake up
        eve_obs = class_5min_list[offset:offset+24]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in eve_obs if obs >=3])*5
        return total_min


def after_midnight(timestamp_str):
    """indicator for whether or not bedtime started after midnight (in early am hours)"""
    day, hour = timestamp_str.split('T')
    hr = int(hour[0:2]) #extract hr from timestamp
    if 0 <= hr < 6:
        return 1
    else:
        return 0


def age_bin(e):
    """Bins: 20s, 30s, 40s, and 50s +"""
    if  20 <= e < 30:
        return "20s"
    elif 30 <= e < 40:
        return "30s"
    elif 40 <= e < 50:
        return "40s"
    elif e > 50:
        return "50s plus"
    else:
        return np.nan
   
    
def height_bin(e):
    """Bins: 0, 150, 160, 170, 180, 190. In centimeters """
    if  0 < e < 150:
        return "less than 150 cm"
    elif 150 <= e <= 160:
        return "150s"
    elif 160 <= e <= 170:
        return "160s"
    elif 170 <= e <= 180:
        return "170s"
    elif 180 <= e <= 190:
        return "180s"
    elif e > 190:
        return "greater than 190 cm"
    else:
        return np.nan
    
    
def weight_bin(e):
    """Bins: 0, 65, 80, 95. In kilograms"""
    if  0 < e <=65:
        return "less than 65 kg"
    elif 65 <= e <= 80:
        return "65 to 80 kg"
    elif 80 <= e <= 95:
        return "80 to 95 kg"
    elif e > 95:
        return "more than 95 kg"
    else:
        return np.nan


In [48]:
full_path = 'data_ingestion/{}'.format(date.today().strftime('%d%m%Y'))
os.makedirs(full_path, exist_ok=True)

# create a "log file" on the log ingestion directory. Can be helpful to debug and later on for feature monitoring.
log_file = open(full_path + '/log.txt', 'a+')
log_file.write(time.asctime() + ' - Starting data ingestion process\n')

# if a data file exists, load it, otherwise, create the directory, download a new one and load the DFs:
if not os.path.isfile(full_path + '/raw_dataset.zip'):    
    start = time.time() # time the data ingestion from Memento
    log_file.write(time.asctime() + ' - Day file not found. Starting data download\n')
    
    # try: 
    ACCESS_TOKEN = ##ENVIRONMENT VARIABLE HERE##
    headers = {'Authorization': 'Bearer ' + ACCESS_TOKEN }
    URL = "https://api.mementolabs.io/data/dataset"
    # changed this code to deal with streaming json from 10/20/2019 on. Must double check if this will be the final 
    # format
    data = requests.get(URL, headers=headers).content.decode(encoding='utf-8')

    # save the raw data to disk as zip it to allow re-processing without new download
    zip_file = full_path + '/raw_dataset.zip'
    zipObj = ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED)
    zipObj.writestr('raw_dataset.txt', data)
    zipObj.close()

    print('total run time:',time.time() - start)
    log_file.write(time.asctime() + ' - JSON File downloaded and saved. Time: ' + str(time.time() - start) + 'secs.\n')
    

total run time: 35.71577286720276


In [49]:
# define intermediate data frames:
users = pd.DataFrame()
sleep = pd.DataFrame()
readiness = pd.DataFrame()
activity = pd.DataFrame()
experiments = pd.DataFrame()

raw_data = str(zipfile.ZipFile(full_path + '/raw_dataset.zip').read('raw_dataset.txt'), encoding='utf-8')
log_file.write(time.asctime() + ' - raw data file loaded\n')

start = time.time()
log_file.write(time.asctime() + ' - creating user-dependent features\n')

for user_json in decode_stacked_json(raw_data):
    # in the stacked json file, each json object corresponds to all data of one user. So here I will perform all 
    # feature transformation that are user-dependent before I append the whole user data to a final data frame.
    user_user = pd.DataFrame.from_records([user_json['userInfo']['userInfo']])
    
    try: # user signup date may be useful to define if a user model is relevant or not:
        user_user['signupDate'] = pd.to_datetime(user_json['signupDate'])                  
    except KeyError:
        user_user['signupDate'] = np.nan
        
    try:
        # one hot encode gender data:
        user_user['gender'] = np.where(user_user['gender'] == 'male', True, False)
    except KeyError:
        user_user['gender'] = np.nan
        
    try:
        user_user['age'] = user_user['age']
    except KeyError:
        user_user['age'] = np.nan
        
    try:
        user_user['height'] = user_user['height']
    except KeyError:
        user_user['height'] = np.nan
    
    try:
        user_user['weight'] = user_user['weight']
    except KeyError:
        user_user['weight'] = np.nan
        
    user_user.rename(columns={'gender':'is_male'}, inplace=True)
    
    # send to a final users data frame:
    users = users.append(user_user)
    
    
    # skip users without sleep records (strictly experiment users, or users of other hardware)
    if len(user_json['sleep']) == 0: continue
    
    # unpack data for each user in a specific data frame
    user_sleep = pd.DataFrame.from_records([i for i in user_json['sleep']])
    user_readiness = pd.DataFrame.from_records([i for i in user_json['readiness']])
    user_activity = pd.DataFrame.from_records([i for i in user_json['activity']])
    
    # Creating the new activity related features. Because several activity features depends on user info and/or 
    # sleep features, I will add the features needed here:
    user_activity['user_date'] = user_user['user_id'][0] + '|' + user_activity['summary_date']
    
    user_activity['age'] = user_user['age'][0]
    user_activity['height'] = user_user['height'][0]
    user_activity['weight'] = user_user['weight'][0]
    user_activity['is_male'] = user_user['is_male'][0]
    user_activity['user_id'] = user_user['user_id'][0]
    
    user_activity = pd.merge(user_activity, user_sleep[['bedtime_start', 'bedtime_end','summary_date']], 
                            on='summary_date', how='left')
    
    user_activity['met_min_medium_plus'] = user_activity['met_min_medium'] + \
                                                    user_activity['met_min_high']
    
    user_activity['age_bin'] = user_activity['age'].apply(age_bin)
    user_activity['height_bin'] = user_activity['height'].apply(height_bin)
    user_activity['weight_bin'] = user_activity['weight'].apply(weight_bin)
    user_activity['afterwake_exercise_min'] = [after_wake_exercise(c,b) for c,b in \
                                               zip(user_activity['class_5min'],
                                                   user_activity['bedtime_end'])]
    user_activity['beforesleep_exercise_min'] = [before_sleep_exercise(c,b) for c,b in \
                                                 zip(user_activity['class_5min'],
                                                     user_activity['bedtime_start'])]
    user_activity['noon_exercise_min'] = user_activity['class_5min'].apply(noon_exercise)
    user_activity['eve_exercise_min'] = user_activity['class_5min'].apply(evening_exercise)
    
    user_activity.set_index('user_date', inplace = True)
    
    activity = activity.append(user_activity)   
    
    #----------------sleep features---------#
    
    # sleep features normalizing and tranformations: (QQ why do this--maybe rename with _norm suffix??)
    user_sleep['awake_norm'] = user_sleep['awake']/user_sleep['duration']
    user_sleep['deep_norm'] = user_sleep['deep']/user_sleep['duration']
    user_sleep['light_norm'] = user_sleep['light']/user_sleep['duration']
    user_sleep['onset_latency_norm'] = user_sleep['onset_latency']/user_sleep['duration']
    user_sleep['rem_norm'] = user_sleep['rem']/user_sleep['duration']
    user_sleep['restless_norm'] = user_sleep['light']/100
    
    #applying after midnight flag
    user_sleep['afterMidnight'] = user_sleep['bedtime_start'].apply(after_midnight)
    
    # bins for sleep score to Shiraz's models:
    bins = [0, 75, 85, 100]
    names = ['fair', 'good', 'greate']
    user_sleep['good_sleep'] = pd.cut(user_sleep['score'], bins=bins, labels=names)
    
    user_sleep['user_date'] = user_user['user_id'][0] + '|' + user_sleep['summary_date']
    
    #converting to datetime for datetime manipulation
    user_sleep['summary_date'] = pd.to_datetime(user_sleep['summary_date'])
    
    # build dummy to day of the week.
    user_sleep['weekday'] = user_sleep['summary_date'].dt.weekday
    
    # build is_workday
    user_sleep['is_workday'] = np.where(user_sleep['weekday'] < 5, True, False)
    
    # one-hot encode weekdays
    user_sleep = pd.get_dummies(user_sleep, columns=['weekday'])
    
    user_sleep.rename(columns = {'weekday_0':'weekday_mon', 
                              'weekday_1':'weekday_tue', 
                              'weekday_2': 'weekday_wed',
                              'weekday_3': 'weekday_thu',
                              'weekday_4': 'weekday_fri',
                              'weekday_5': 'weekday_sat',
                              'weekday_6': 'weekday_sun'}, 
                              inplace = True)

    user_sleep.set_index('summary_date', inplace = True)

    # build D - 1 and D - 2 scores:
    user_sleep['score_D-1'] = user_sleep['score'].shift()[user_sleep.index.shift(1,freq='1D')]
    user_sleep['score_D-2'] = user_sleep['score'].shift()[user_sleep.index.shift(1,freq='2D')]
    user_sleep['deep_D-1'] = user_sleep['deep'].shift()[user_sleep.index.shift(1,freq='1D')]
    user_sleep['deep_D-2'] = user_sleep['deep'].shift()[user_sleep.index.shift(1,freq='2D')]
    user_sleep['rem_D-1'] = user_sleep['rem'].shift()[user_sleep.index.shift(1,freq='1D')]
    user_sleep['rem_D-2'] = user_sleep['rem'].shift()[user_sleep.index.shift(1,freq='2D')]

    # build 7, 14, and 21 rolling average scores: (QQ: why shift a day here with min periods fewer than window??)
    user_sleep['rol_score_7d'] = pd.DataFrame.rolling(user_sleep['score'].shift(1, freq='1D'),
                                                      window=7, min_periods=3).mean()
    user_sleep['rol_score_14d'] = pd.DataFrame.rolling(user_sleep['score'].shift(1, freq='1D'),
                                                      window=14, min_periods=10).mean()
    user_sleep['rol_score_21d'] = pd.DataFrame.rolling(user_sleep['score'].shift(1, freq='1D'),
                                                      window=21, min_periods=17).mean()

    # build is_traveling
    user_sleep['is_traveling'] = np.where(user_sleep['timezone'] == 
                                                    user_sleep['timezone'].value_counts().idxmax(), False, True)

    # build rol_bedtime_start_21d
    user_sleep['rol_bedtime_start_21d'] = pd.DataFrame.rolling(user_sleep['bedtime_start_delta'].shift(1, freq='1D'),
                                                                 window=21, min_periods=10).mean()

    # build avg_bedtime_start_delta, create dummy variables for deviation (-3, -2, -1, 1, 2, 3)
    user_sleep['rol_bedtime_start_std_21d'] = pd.DataFrame.rolling(user_sleep['bedtime_start_delta'].shift(1, freq='1D'),
                                                                 window=21, min_periods=10).std()

    user_sleep['bedtime_start_dev'] = (user_sleep['bedtime_start_delta'] - user_sleep['rol_bedtime_start_21d'])/\
                                      user_sleep['rol_bedtime_start_std_21d']

    user_sleep['bedtime_start_dev'] = np.where(user_sleep['bedtime_start_dev'] >= 0, 
                                                        np.ceil(user_sleep['bedtime_start_dev']),
                                                        np.floor(user_sleep['bedtime_start_dev']))
    # cap standard deviations to -3 or +3:
    user_sleep['bedtime_start_dev'] = np.where(user_sleep['bedtime_start_dev'] <= -3,-3, user_sleep['bedtime_start_dev'])

    user_sleep['bedtime_start_dev'] = np.where(user_sleep['bedtime_start_dev'] >= 3,3,
                                                        user_sleep['bedtime_start_dev'])

    user_sleep = pd.get_dummies(user_sleep, columns=['bedtime_start_dev'])

    user_sleep.rename(columns = { 'bedtime_start_dev_-3.0':'bedtime_start_dev-3', 
                                  'bedtime_start_dev_-2.0':'bedtime_start_dev-2', 
                                  'bedtime_start_dev_-1.0': 'bedtime_start_dev-1',
                                  'bedtime_start_dev_1.0': 'bedtime_start_dev+1',
                                  'bedtime_start_dev_2.0': 'bedtime_start_dev+2',
                                  'bedtime_start_dev_3.0': 'bedtime_start_dev+3'},
                                  inplace = True)

    user_sleep.drop(labels = ['rol_bedtime_start_21d','rol_bedtime_start_std_21d'], axis = 1, inplace = True)

    # build avg_bedtime_end_delta, create dummy variables for deviation. (-3, -2, -1, 1, 2, 3)
    user_sleep['rol_bedtime_end_21d'] = pd.DataFrame.rolling(user_sleep['bedtime_end_delta'].shift(1, freq='1D'),
                                                                 window=21, min_periods=10).mean()

    user_sleep['rol_bedtime_end_std_21d'] = pd.DataFrame.rolling(user_sleep['bedtime_end_delta'].shift(1, freq='1D'),
                                                                 window=21, min_periods=10).std()

    user_sleep['bedtime_end_dev'] = (user_sleep['bedtime_end_delta'] - user_sleep['rol_bedtime_end_21d'])/\
                                    user_sleep['rol_bedtime_end_std_21d']

    user_sleep['bedtime_end_dev'] = np.where(user_sleep['bedtime_end_dev'] >= 0, 
                                             np.ceil(user_sleep['bedtime_end_dev']),
                                             np.floor(user_sleep['bedtime_end_dev']))
    # cap standard deviations to -3 or +3:
    user_sleep['bedtime_end_dev'] = np.where(user_sleep['bedtime_end_dev'] <= -3,-3,
                                             user_sleep['bedtime_end_dev'])

    user_sleep['bedtime_end_dev'] = np.where(user_sleep['bedtime_end_dev'] >= 3,3,
                                             user_sleep['bedtime_end_dev'])

    user_sleep = pd.get_dummies(user_sleep, columns=['bedtime_end_dev'])

    user_sleep.rename(columns = {'bedtime_end_dev_-3.0':'bedtime_end_dev-3', 
                                 'bedtime_end_dev_-2.0':'bedtime_end_dev-2', 
                                 'bedtime_end_dev_-1.0': 'bedtime_end_dev-1',
                                 'bedtime_end_dev_1.0': 'bedtime_end_dev+1',
                                 'bedtime_end_dev_2.0': 'bedtime_end_dev+2',
                                 'bedtime_end_dev_3.0': 'bedtime_end_dev+3'},
                                 inplace = True)

    user_sleep.drop(labels = ['rol_bedtime_end_21d','rol_bedtime_end_std_21d'], axis = 1, inplace = True)

    # build avg_duration, create dummy variables for deviation (-3, -2, -1, 1, 2, 3)
    user_sleep['rol_duration_21d'] = pd.DataFrame.rolling(user_sleep['duration'].shift(1, freq='1D'),
                                                                 window=21, min_periods=10).mean()

    user_sleep['rol_duration_std_21d'] = pd.DataFrame.rolling(user_sleep['duration'].shift(1, freq='1D'),
                                                                 window=21, min_periods=10).mean()

    user_sleep['duration_dev'] = (user_sleep['duration'] - user_sleep['rol_duration_21d'])/\
                                 user_sleep['rol_duration_std_21d']

    user_sleep['duration_dev'] = np.where(user_sleep['duration_dev'] >= 0, 
                                          np.ceil(user_sleep['duration_dev']),
                                          np.floor(user_sleep['duration_dev']))

    # cap standard deviations to -3 or +3:
    user_sleep['duration_dev'] = np.where(user_sleep['duration_dev'] <= -3,-3, user_sleep['duration_dev'])

    user_sleep['duration_dev'] = np.where(user_sleep['duration_dev'] >= 3,3, user_sleep['duration_dev'])

    user_sleep = pd.get_dummies(user_sleep, columns=['duration_dev'])

    user_sleep.rename(columns = {'duration_dev_-3.0':'duration_dev-3', 
                              'duration_dev_-2.0':'duration_dev-2', 
                              'duration_dev_-1.0':'duration_dev-1',
                              'duration_dev_1.0': 'duration_dev+1',
                              'duration_dev_2.0': 'duration_dev+2',
                              'duration_dev_3.0': 'duration_dev+3'},
                              inplace = True)

    user_sleep.drop(labels = ['rol_duration_21d','rol_duration_std_21d'], axis = 1, inplace = True)
        
    user_sleep.set_index('user_date', inplace = True)

    sleep = sleep.append(user_sleep)
    # As we are not doing any transformation in the readiness data frame, append directly to the final one:
    user_readiness['user_date'] = user_user['user_id'][0] + '|' + user_readiness['summary_date']
    user_readiness.set_index('user_date', inplace=True)
    readiness = readiness.append(user_readiness)    

In [50]:
# Save each DF as a csv file (case needed to work on a single file in teh future
users_file_path = full_path + '/users_data.csv'
sleep_file_path = full_path + '/sleep_data.csv'
readiness_file_path = full_path + '/readiness_data.csv'
activity_file_path = full_path + '/activity_data.csv'
df_gold_file_path = full_path + '/gold.csv'
experiments_file_path = full_path + '/experiments.csv'

# Note that you can't drop duplicates when  rows are lists since lists are unhashable
activity.drop(['bedtime_start', 'bedtime_end'], axis=1, inplace=True)
readiness.drop('summary_date', axis=1, inplace=True)

# adding column name prefixes since the sleep, activity, and readiness data sets share common names
sleep = sleep.add_prefix('sleep_')
activity = activity.add_prefix('activity_')
readiness = readiness.add_prefix('readiness_')
users = users.add_prefix('user_')


# merge all dfs in a big "gold" df; 
users.to_csv(users_file_path, index=None, encoding='utf-8')
sleep.to_csv(sleep_file_path, index=None, encoding='utf-8')
readiness.to_csv(readiness_file_path, index=None, encoding='utf-8')
activity.to_csv(activity_file_path, index=None, encoding='utf-8')
df_gold = sleep.merge(activity, left_index=True, right_index=True)
df_gold = df_gold.merge(readiness, left_index=True, right_index=True)
#--keeping copy of sleep userid for testing reasons
df_gold['user_id'] = list(pd.Series(df_gold.index).apply(lambda x: x.split('|')[0]))
#writing out consolidated dataset to file as gold dataset
df_gold.to_csv(df_gold_file_path, index=None, encoding='utf-8')

In [111]:
#read in gold set
df_gold_file_path = 'data_ingestion/27102019/gold.csv'
df_gold = pd.read_csv(df_gold_file_path)

In [112]:
df_gold.head()

Unnamed: 0,sleep_afterMidnight,sleep_awake,sleep_awake_norm,sleep_bedtime_end,sleep_bedtime_end_delta,sleep_bedtime_end_dev+1,sleep_bedtime_end_dev+2,sleep_bedtime_end_dev+3,sleep_bedtime_end_dev-1,sleep_bedtime_end_dev-2,...,readiness_period_id,readiness_score,readiness_score_activity_balance,readiness_score_previous_day,readiness_score_previous_night,readiness_score_recovery_index,readiness_score_resting_hr,readiness_score_sleep_balance,readiness_score_temperature,user_id
0,1,4380,0.136961,2018-10-12T12:28:03+01:00,44883,0.0,0.0,0.0,0.0,0.0,...,2,83,100,47,65,66,96,96,84,3GFNJNPROVUFMPSYCY4I2ZHED5LF4IP4
1,1,6810,0.206364,2018-10-13T12:20:18+02:00,44418,0.0,0.0,0.0,0.0,0.0,...,0,75,69,88,53,30,97,87,91,3GFNJNPROVUFMPSYCY4I2ZHED5LF4IP4
2,1,5250,0.155694,2018-10-14T12:40:03+02:00,45603,0.0,0.0,0.0,0.0,0.0,...,0,80,55,85,71,70,89,92,97,3GFNJNPROVUFMPSYCY4I2ZHED5LF4IP4
3,1,2610,0.088057,2018-10-15T11:22:02+02:00,40922,0.0,0.0,0.0,0.0,0.0,...,1,81,78,90,68,68,89,87,99,3GFNJNPROVUFMPSYCY4I2ZHED5LF4IP4
4,1,7350,0.247475,2018-10-16T13:55:53+01:00,50153,0.0,0.0,0.0,0.0,0.0,...,0,73,74,86,54,80,90,64,90,3GFNJNPROVUFMPSYCY4I2ZHED5LF4IP4


### PT CURRENT MODELING + ROUTINE FEATURES and WEEKLY AVG FEATURES

In [114]:
#modeling helper functions

#----function to compute some statistic on the dataset based on an n day window-----#
def mean_roll (dataframe,window):
    '''compute the statistic on an n day window and return the 
    resulting dataframe'''
    df_collect = []
    
    for usr in set(dataframe['user_id']):
        usr_df = dataframe[dataframe['user_id']==usr]\
        .rolling(on ='activity_summary_date',window=window)\
        .mean()
        
        df_collect.append(usr_df)
    
    return pd.concat(df_collect)

#----function to compute period-over-period to gauge consistency (routine) from week-to week---#
def PoP(dataframe,usrid,date_col, metric_cols,days_offset):
    '''for each datetime in the dataframe's date column, computes the metric column's
    difference from the period days prior:
    returns WoW differences with the corresponding userid and datetime as a tuple'''
    #--first recorded observation date + days_offset
    init_date = min(dataframe[date_col]) + pd.DateOffset(days_offset)
    pop_list = []
    for dt in dataframe[date_col]:
        if dt < init_date: # won't have 7 days prior for the first 7 observations so flag as np.nan
            pop_list.append([usrid,dt]+[np.nan for i in np.arange(len(metric_cols))])
        else:
            try: #---need to account for missing dates
                pop_diff_list = []
                for metric in metric_cols:
                    #grab the value from the prior 7(or n) days -- need to index on 0 since values returns array of array
                    prd_prior = dataframe[dataframe[date_col] == dt + pd.DateOffset(-days_offset)][metric].values[0]
                    #grab the current value
                    prd_current = dataframe[dataframe[date_col] == dt][metric].values[0]
                    #take the diff - list of differences
                    try:
                        pop_diff = (prd_current/prd_prior)-1
                    except ZeroDivisionError: #1 if prior period was 0 (basically capping at 100% increase)
                        pop_diff = 1
                   
                    pop_diff_list.append(pop_diff)
                    
                pop_list.append([usrid,dt]+[i for i in pop_diff_list])
            except:
                #---if missing dates then use nan values
                pop_list.append([usrid,dt]+[np.nan for i in np.arange(len(metric_cols))])

    return pop_list


def process_data_frame(mod_df,target_df,base_n_days,next_n_days,features):

    '''builds a dataframe of feature values and target values for each person-day in the dataset
    based on the features passed and the dataframes used as the model feature and target values'''
    feature_space = []
    targets = []

    cap_date = max(mod_df['activity_summary_date']) - pd.DateOffset(base_n_days)
    # need at least n days of data to compute target value
    target_period_start = min(mod_df['activity_summary_date']) + pd.DateOffset(base_n_days) #7


    #iteratively build feature to target observations for training/test data
    while target_period_start <= cap_date:

        baseline_period = target_period_start - pd.DateOffset(base_n_days) #n day prior to target sleep score
        baseline_data = mod_df[mod_df['activity_summary_date'] == baseline_period]
        #corresponds to sleep score n days ahead -- target data as avg score 3 days from now = target_period_start - pd.DateOffset(4)
        target_data = target_df[target_df['activity_summary_date'] == (target_period_start - pd.DateOffset(base_n_days-next_n_days))]\
        .filter(['user_id','sleep_score'])\
        .rename(columns={'sleep_score': 'target_score'})

        temp_df = pd.merge(baseline_data,target_data, on = 'user_id',how ='left')
        #binarizing outcome - if score increases or remains the same then 1, else 0
        temp_df['target_binary'] = (temp_df['target_score'] >= temp_df['sleep_score']).astype(int)


        feature_vals = temp_df[temp_df['activity_summary_date'] == baseline_period]\
        .filter(features)\
        .values

        target_vals = temp_df[temp_df['activity_summary_date'] == baseline_period]\
        .filter(['target_binary','target_score'])\
        .values

        feature_space.append(feature_vals)
        targets.append(target_vals)
        # move forward 1 day
        target_period_start = target_period_start + pd.DateOffset(+1)


    features_block = []
    target_block = []

    for f in feature_space:
        for array in f:
            features_block.append(array)

    for t in targets:
        for array in t:
            target_block.append(array)

    data_set = pd.concat([pd.DataFrame(features_block,columns = features),
           pd.DataFrame(target_block,columns = ['target_binary','target_score'])],axis = 1)
    
    return data_set


In [117]:
deviation_columns = list(df_gold.columns[df_gold.columns.str.contains('_deviation')].values)

# note i include user_id and summary date to map back to 7 day rolled dataframe
seven_day_vars = ['user_id','activity_summary_date','activity_score_meet_daily_targets',
                  'activity_score_training_frequency','activity_score_training_volume',
                  'activity_score_recovery_time'] + deviation_columns

In [118]:
#variables of interest to roll on an n day window

'''the below are the variables i rolled (you can include more if need be)'''

activity_vars_to_roll = ['activity_summary_date',
    'activity_cal_total','activity_high','activity_medium','activity_steps','activity_inactive',
    'activity_non_wear','activity_score','activity_met_min_medium', 'activity_met_min_high',
    'activity_met_min_medium_plus','activity_score_move_every_hour','activity_score_stay_active',
    'activity_beforesleep_exercise_min','activity_afterwake_exercise_min','activity_noon_exercise_min', 
    'activity_eve_exercise_min','activity_age_bin','activity_is_male'
]

sleep_vars_to_roll = [
    'sleep_user_id','sleep_bedtime_start', 'sleep_bedtime_end','sleep_score',
    'sleep_bedtime_start_delta','sleep_isWeekend','sleep_afterMidnight','sleep_onset_latency','sleep_duration',
    'sleep_breath_average','sleep_is_traveling','sleep_score_disturbances']

#-----collecting all variables to roll on an n day window------#
vars_to_roll = activity_vars_to_roll + sleep_vars_to_roll + ['user_id']

In [119]:
#splitting the pre-computed 7 day average variables into their own dataframe 
df_to_roll = df_gold.filter(vars_to_roll)
df_7Day_vars = df_gold.filter(seven_day_vars)
df_7Day_vars['activity_summary_date'] = pd.to_datetime(df_7Day_vars['activity_summary_date'])
df_to_roll['activity_summary_date'] = pd.to_datetime(df_to_roll['activity_summary_date'])

In [120]:
#building a 7 day rolling average dataframe
df_7_mean  = mean_roll(df_to_roll,7)

In [121]:
#variables for computing % change WoW to capture consistency/routine in sleep'''
activity_metric_cols = ['activity_score_training_frequency','activity_score_training_volume','activity_score_recovery_time',
               'activity_score_meet_daily_targets']

sleep_metric_cols = ['sleep_onset_latency','sleep_duration','sleep_bedtime_start_delta','sleep_score_disturbances']

In [122]:
#---will take a few minutes to run since it's calling the PoP function for each user-----#
'''calculating WoW % changes for the weekly target variables to attempt to capture consistency/routine'''

activity_wow_list = []
for usr in set(df_7Day_vars['user_id']):
    temp_df = df_7Day_vars[df_7Day_vars['user_id']==usr]
    activity_wow_list.append(PoP(temp_df,usr,'activity_summary_date',metric_cols = activity_metric_cols,days_offset = 7))
    
sleep_wow_list = []
for usr in set(df_7_mean['user_id']):
    temp_df = df_7_mean[df_7_mean['user_id']==usr]
    sleep_wow_list.append(PoP(temp_df,usr,'activity_summary_date',metric_cols = sleep_metric_cols,days_offset = 7))
    
    

#need to figure out a better way to do this, but this basically makes a dataframe of the wow metrics
activity_d = []
for usr in activity_wow_list:
    for i in usr:
        activity_d.append(
            {'user_id': i[0], 'activity_summary_date': i[1], 'activity_score_training_frequency_wow':i[2],
                  'activity_score_training_volume_wow':i[3],'activity_score_recovery_time_wow':i[4],
                 'activity_score_meet_daily_targets_wow':i[5]}
        )

        
sleep_d = []
for usr in sleep_wow_list:
    for i in usr:
        sleep_d.append(
            {'user_id': i[0], 'activity_summary_date': i[1], 'sleep_onset_latency_wow':i[2],
                  'sleep_duration_wow':i[3],'sleep_bedtime_start_delta_wow':i[4],'sleep_score_disturbances_wow':i[5]
            }
        )
    

#dataframe with WoW %change for each day  
activity_wow_df = pd.DataFrame(activity_d).replace(np.inf, 1)
sleep_wow_df = pd.DataFrame(sleep_d).replace(np.inf, 1)        

wow_df = pd.merge(activity_wow_df,sleep_wow_df, on = ['user_id','activity_summary_date'],how ='left')

In [123]:
#---combining the PoP metrics (week-over-week) with the pre-computed weekly metrics
df_7Day_WoW = pd.merge(wow_df,df_7Day_vars, on = ['user_id','activity_summary_date'],how ='left')

In [124]:
#modeling 7 day averages and combining the 7-day based metrics + wow data
#---this is the consolidated dataframe that i'll use for further modeling---#
mod_df = pd.merge(df_7_mean,df_7Day_WoW, on = ['user_id','activity_summary_date'],how ='left')

In [125]:
#setting routine variable based on how often and how much exercise changes week-to-week
mod_df['activity_routine_score'] = (
    np.absolute(mod_df['activity_score_training_volume_wow']) + 
    np.absolute(mod_df['activity_score_training_frequency_wow'])+
    np.absolute(mod_df['activity_score_meet_daily_targets_wow'])+
    np.absolute(mod_df['activity_score_recovery_time_wow']))/4

#setting sleep routine variable based on how often and how much sleep habits change week-to-week
mod_df['sleep_routine_score'] = (
    np.absolute(mod_df['sleep_onset_latency_wow']) + 
    np.absolute(mod_df['sleep_duration_wow']) +
    np.absolute(mod_df['sleep_bedtime_start_delta_wow'])+
    np.absolute(mod_df['sleep_score_disturbances_wow']))/4

In [126]:
#one hot encoding age bins and weight bins
mod_df = pd.get_dummies(mod_df,columns = ['activity_age_bin'])
mod_df = mod_df.drop_duplicates()

In [127]:
#writing out
mod_df.to_csv('mod_df.csv',index = False)

In [128]:
#reading in
mod_df = pd.read_csv('mod_df.csv',parse_dates = ['activity_summary_date'])

In [129]:
max(pd.to_datetime(mod_df['activity_summary_date']))

Timestamp('2019-10-26 00:00:00')

In [130]:
#these are the variables that i chose for modeling
my_variables = ['activity_summary_date','sleep_score','sleep_afterMidnight','sleep_onset_latency','sleep_duration',
            'sleep_score_disturbances','sleep_temperature_deviation','sleep_bedtime_start_delta','sleep_routine_score',
            'activity_cal_total','activity_steps','activity_score_meet_daily_targets','activity_met_min_medium_plus',
            'activity_score_training_frequency','activity_score_training_volume','activity_routine_score','activity_score_move_every_hour',
            'activity_score_stay_active','activity_afterwake_exercise_min','activity_beforesleep_exercise_min','sleep_is_traveling',
            'activity_is_male','activity_age_bin_20s', 'activity_age_bin_30s','activity_age_bin_40s', 'activity_age_bin_50s plus']



In [131]:
data_set = process_data_frame(mod_df = mod_df,target_df = mod_df,base_n_days = 7,next_n_days = 7,features=my_variables)


In [132]:
#writing the above to file
data_set.to_csv('my_dataset.csv',index = False)

In [133]:
data_set = pd.read_csv('my_dataset.csv',parse_dates = ['activity_summary_date'])

#### Modeling

In [134]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor

In [135]:
# splitting training and test set by recency
train = data_set[data_set['activity_summary_date'] <= pd.Timestamp(2019, 9, 1, 12)]
test = data_set[data_set['activity_summary_date'] > pd.Timestamp(2019, 9, 1, 12)]
#-dropping summary date since its not needed
train = train.drop(['activity_summary_date'],axis=1)
test = test.drop(['activity_summary_date'],axis=1)

In [136]:
#dropping NaNs - still need to resolve how we want to handle NAs
train = train.dropna()
test = test.dropna()

In [137]:
#drop summary date from features
features = [f for f in my_variables if f != 'activity_summary_date']
rf_regr = RandomForestRegressor()
rf_regr.fit(train[features],train['target_score'])
rf_reg_pred = rf_regr.predict(test[features])



In [138]:
#-----RESULTS of Predicting Avg Sleep Score Over Next 7 Days Based on Prior 7 Days--------#
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,explained_variance_score,median_absolute_error
print('R2 of: ',r2_score(test['target_score'], rf_reg_pred))
print('MSE of: ', mean_squared_error(test['target_score'], rf_reg_pred))
print('MAE of: ', mean_absolute_error(test['target_score'], rf_reg_pred))
print('Explained Variance:', explained_variance_score(test['target_score'], rf_reg_pred))
print('Median Absolute Error',median_absolute_error(test['target_score'], rf_reg_pred))

R2 of:  0.7063363955947766
MSE of:  17.436819185289743
MAE of:  3.2244947022802353
Explained Variance: 0.7066639029458599
Median Absolute Error 2.5160714285714363


In [139]:
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(rf_regr, random_state=1).fit(test[features], test['target_score'])
eli5.show_weights(perm, feature_names = test[features].columns.tolist())

Weight,Feature
0.8979  ± 0.0355,sleep_score
0.1305  ± 0.0072,sleep_score_disturbances
0.0320  ± 0.0040,sleep_afterMidnight
0.0128  ± 0.0046,activity_steps
0.0120  ± 0.0041,activity_cal_total
0.0097  ± 0.0019,activity_score_move_every_hour
0.0080  ± 0.0023,sleep_bedtime_start_delta
0.0060  ± 0.0014,activity_score_meet_daily_targets
0.0060  ± 0.0026,sleep_duration
0.0057  ± 0.0034,activity_age_bin_50s plus


In [140]:
#recommender based on disturbances
def recommender(model,user_data,actionable_features):
    base_prediction = model.predict(user_data)[0]
    adj_predictions = []
    for a in actionable_features:
        #reset the copy on each iteration
        user_data_copy = user_data.copy()
        if a in {'sleep_bedtime_start_delta','sleep_routine_score','activity_routine_score'}:
            #all features above have an ideal target of 0
            user_data_copy[a] = 0
            adj_prediction = model.predict(user_data_copy)[0]
            if adj_prediction > base_prediction:
                adj_predictions.append([a,adj_prediction])

        elif a in {'sleep_score_disturbances','activity_score_training_frequency','activity_score_training_volume'}:
            #all features above have an ideal target of 95+
            user_data_copy[a] = 95
            adj_prediction = model.predict(user_data_copy)[0]
            if adj_prediction > base_prediction:
                adj_predictions.append([a,adj_prediction])
        elif a in {'activity_steps'}:
            #1 mile is approximately 2000 steps (will increment avg by 2000)
            user_data_copy[a] = user_data_copy[a].values[0] + 2000
            adj_prediction = model.predict(user_data_copy)[0]   
            if adj_prediction > base_prediction:
                adj_predictions.append([a,adj_prediction])

        elif a == 'sleep_afterMidnight':
            if user_data_copy[a].values[0] > 0:
                user_data_copy[a] = 0
                adj_prediction = model.predict(user_data_copy)[0]   
                if adj_prediction > base_prediction:
                    adj_predictions.append([a,adj_prediction])
        else:
            if a == 'sleep_onset_latency':
                user_data_copy[a] = 900    
                adj_prediction = model.predict(user_data_copy)[0]   
                if adj_prediction > base_prediction:
                    adj_predictions.append([a,adj_prediction])
                    
    return adj_predictions

In [141]:
actionable_features = [
 'sleep_afterMidnight',
 'sleep_onset_latency',
 'sleep_score_disturbances',
 'sleep_bedtime_start_delta',
 'sleep_routine_score',
'activity_cal_total',
'activity_routine_score',
'activity_steps',
'activity_score_training_frequency',
'activity_score_training_volume']

In [145]:
#Ed's userID is TE2CPSSWP4QUGFAJQZ5FHITIKPNCCICX
ed_data = mod_df[mod_df['user_id'] == 'TE2CPSSWP4QUGFAJQZ5FHITIKPNCCICX']
ed_test = ed_data[ed_data['activity_summary_date'] == '2019-10-03'].filter(features)
recommender(model = rf_regr,user_data = ed_test,actionable_features = actionable_features)

[['sleep_score_disturbances', 79.57142857142857],
 ['sleep_routine_score', 70.19999999999999],
 ['activity_routine_score', 71.69999999999999],
 ['activity_score_training_volume', 70.44285714285714]]

In [146]:
#actual
ed_data[ed_data['activity_summary_date'] == '2019-10-10'].filter(['sleep_score'])

Unnamed: 0,sleep_score
10037,73.714286


In [147]:
#predicted
rf_regr.predict(ed_test[features])

array([69.84285714])

In [148]:
ed_data[ed_data['activity_summary_date'] == '2019-10-03'].filter(features)

Unnamed: 0,sleep_score,sleep_afterMidnight,sleep_onset_latency,sleep_duration,sleep_score_disturbances,sleep_temperature_deviation,sleep_bedtime_start_delta,sleep_routine_score,activity_cal_total,activity_steps,...,activity_score_move_every_hour,activity_score_stay_active,activity_afterwake_exercise_min,activity_beforesleep_exercise_min,sleep_is_traveling,activity_is_male,activity_age_bin_20s,activity_age_bin_30s,activity_age_bin_40s,activity_age_bin_50s plus
10030,72.0,0.0,767.142857,28302.857143,58.714286,0.01,-4805.571429,0.136449,2362.142857,7230.571429,...,97.857143,69.714286,70.0,52.857143,0.0,1.0,1,0,0,0


In [149]:
edx = df_gold[(df_gold['activity_summary_date'] == '2019-10-10') & (df_gold['user_id'] == 'TE2CPSSWP4QUGFAJQZ5FHITIKPNCCICX')]

In [150]:
edx[edx.columns[edx.columns.str.contains('score')]]

Unnamed: 0,sleep_rol_score_14d,sleep_rol_score_21d,sleep_rol_score_7d,sleep_score,sleep_score_D-1,sleep_score_D-2,sleep_score_alignment,sleep_score_deep,sleep_score_disturbances,sleep_score_efficiency,...,activity_score_training_frequency,activity_score_training_volume,readiness_score,readiness_score_activity_balance,readiness_score_previous_day,readiness_score_previous_night,readiness_score_recovery_index,readiness_score_resting_hr,readiness_score_sleep_balance,readiness_score_temperature
18798,73.928571,73.0,74.714286,64,81.0,81.0,84.0,97.0,57.0,74.0,...,40,49,65,80,79,33,81,66,74,96
