<h2>
    <center>Modeling Some Future Date Sleep Score <br> 
    Based on the Last 7 Days of Data
    </center>
</h2> 


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import requests
import json
import numpy as np
import pandas as pd
import pprint as pp
import os
from os import mkdir
from os import chdir
from zipfile import ZipFile
from datetime import datetime,date
import time
from pandas.io.json import json_normalize

In [3]:
# We'll also import seaborn, a Python graphing library
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)

In [4]:
# Create a ZipFile Object and load dataset.zip in it
# with ZipFile('dataset_10-14-2019.zip', 'r') as zipObj:
#    # Extract all the contents of zip file in current directory
#    zipObj.extractall()

In [22]:
sleep_file_path = 'sleep_data.csv'
activity_file_path = 'activity_data.csv'

sleep = pd.read_csv(sleep_file_path)
activity = pd.read_csv(activity_file_path)

# drop duplicate values
sleep = sleep.drop_duplicates()
activity = activity.drop_duplicates()


# adding column name prefixes since the sleep and activity data sets share common names
sleep = sleep.add_prefix('sleep_')
activity = activity.add_prefix('activity_')
#set up join key to merge the two dataframes
sleep['user_date_key'] = sleep['sleep_user_id'] + '|' + sleep['sleep_summary_date']
activity['user_date_key'] = activity['activity_user_id'] + '|' + activity['activity_summary_date']
#join activity observations with sleep observations
df = pd.merge(sleep,activity, on = 'user_date_key',how ='left')

In [23]:
df['sleep_summary_date'] = pd.to_datetime(df['sleep_summary_date'])

In [24]:
def isTraveling(df):
    '''flags whether or not the user is in the same timezone as the time zone they are 
    most frequently in'''
    user_istraveling_list = []
    for user in df['sleep_user_id'].unique():
        single_user_df = df[df['sleep_user_id'] == user].filter(['sleep_timezone','user_date_key'])
        single_user_df['is_traveling'] = np.where(single_user_df['sleep_timezone'] == 
                                                    single_user_df['sleep_timezone'].value_counts().idxmax(), False, True)
        usr_datekey = single_user_df.filter(['is_traveling','user_date_key']).values
        
        for i in usr_datekey:
            user_istraveling_list.append(i)
            
    return pd.DataFrame(columns = ['is_traveling','user_date_key'],data = user_istraveling_list)

In [25]:
df = pd.merge(df,isTraveling(df), on = 'user_date_key',how ='left')

In [26]:
def AfterWakeExcercise(class_5min,bedtime_end):
    '''returns the number of minutes with medium to high MET scores 
    within the first 3 hours of waketime as a proxy for whether or not exercise 
    occurs after waking up'''
    if isinstance(class_5min,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))
        #--take the timestamp from the datetime string, extracts hh:mm data, and converts to a number--#
        wake_hr_min = int(''.join(bedtime_end.split('T')[1][0:5].split(':')))
        #---calculate minutes lapsed since 4am and wake up time---#
        #---rescale minutes into 5 minute intervals to find the number of elements at which to offset class_5min--#
        offset = int(((wake_hr_min - 400)/100)*(60/5))

        #subset observations between wake up and 3 hrs post wake up (24*5=120 min)
        morning_obs = class_5min_list[offset:offset+36]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in morning_obs if obs >=3])*5
        return total_min


def BeforeSleepExcercise(class_5min,bedtime_start):
    '''returns the number of minutes with medium to high MET scores 
    within the last 3 hours of waketime as a proxy for whether or not exercise 
    occurs in the evening close to bedtime'''
    if isinstance(class_5min,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))
        #--take the timestamp from the datetime string, extracts hh:mm data, and converts to a number--#
        sleep_hr_min = int(''.join(bedtime_start.split('T')[1][0:5].split(':')))
        #---calculate minutes lapsed since 4am and wake up time---#
        #---rescale minutes into 5 minute intervals to find the number of elements at which to offset class_5min--#
        offset = int(((sleep_hr_min - 400)/100)*(60/5))

        #subset observations 3 hours before sleep time
        evening_obs = class_5min_list[offset-36:offset]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in evening_obs if obs >=3])*5
        return total_min

def NoonExcercise(class_5min):
    '''returns the number of minutes with medium to high MET scores 
    between noon and two local time'''
    if isinstance(class_5min,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))

        #---calculate minutes lapsed since 4am and 12 (8*12 (5min intervasl in an hr)---#

        offset = 96

        #subset observations between noon and 2 pm
        noon_obs = class_5min_list[offset:offset+24]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in noon_obs if obs >=3])*5
        return total_min
    

def EveningExcercise(class_5min):
    '''returns the number of minutes with medium to high MET scores 
    between noon and two local time'''
    if isinstance(class_5min,float):
        return np.nan
    else:
        #convert the str integer into a list of integers
        class_5min_list = list(map(int, class_5min))

        #---calculate minutes lapsed since 4am and 5pm (13*12) --there are 12 5min intervasl in an hr---#

        offset = 156

        #subset observations between 5pm and 7 pm post wake up
        eve_obs = class_5min_list[offset:offset+24]

        #tally total minutes spent in medium to high intensity exercise (3-4)
        total_min = sum([1 for obs in eve_obs if obs >=3])*5
        return total_min

In [27]:
#helper functions to flag weekend and instances where bedtime started at or after midnight
def after_midnight(timestamp_str):
    '''indicator for whether or not bedtime started after midnight (in early am hours)'''
    d,t = timestamp_str.split('T')
    hr = int(t[0:2]) #extract hr from timestamp
    if hr >= 0 and hr < 6:
        return 1
    else:
        return 0

def isWeekend(timestamp_str):
    '''indicator for whether or not local day falls on a friday or saturday
    note: dayofweek attribute returns integers (0-6) with 0 being monday and 
    6 being sunday'''
    day = pd.Timestamp(timestamp_str).dayofweek
    if day in (5,6):
        return 1
    else:
        return 0

In [28]:
#functions for binning demo data

def age_bin(e):
    if  e >= 20 and e < 30:
        return "20s"
    elif e >= 30 and e < 40:
        return "30s"
    elif e >= 40 and e < 50:
        return "40s"
    else:
        return "50s plus"
    
def height_bin(e):
    if  e > 0 and e < 170:
        return "less than 170 cm"
    elif e >= 170 and e <= 180:
        return "170 to 180 cm"
    else:
        return "greater than 180 cm"
    
def weight_bin(e):
    if  e > 0 and e <= 69:
        return "less than 70 kg"
    elif e >= 70 and e <= 80:
        return "70 to 80 kg"
    else:
        return "greater than 80 kg"

Note: the measurments aren't perfectly calibrated. For instance, we can see situations where activity_class_5min shows intervals of high activity, but the activity.high measure shows 0 MET minutes of high activity.
* so we might want to look at both medium and high activity as well as the activity class_5min
* (see example below)

In [29]:
#note: this must be missing from the api calls; it's not showing up in our data
df['activity_met_min_medium_plus'] = df['activity_met_min_medium'] + df['activity_met_min_high']

In [30]:
#----adding additional features----#
# df['age_bin'] = df['age'].apply(age_bin)
# df['height_bin'] = df['height'].apply(height_bin)
# df['weight_bin'] = df['weight'].apply(weight_bin)
df['sleep_afterMidnight'] = df['sleep_bedtime_start'].apply(after_midnight)
df['sleep_isWeekend'] = df['sleep_bedtime_start'].apply(isWeekend)
df['afterwake_exercise_min'] = [AfterWakeExcercise(c,b) for c,b in zip(df['activity_class_5min'],df['sleep_bedtime_end'])]
df['beforesleep_exercise_min'] = [BeforeSleepExcercise(c,b) for c,b in zip(df['activity_class_5min'],df['sleep_bedtime_start'])]
df['noon_exercise_min'] = df['activity_class_5min'].apply(NoonExcercise)
df['eve_exercise_min'] = df['activity_class_5min'].apply(EveningExcercise)

## Start Here Before Rolling Data

the following are measurements that are already on a 7 day window (or 14 day window as with the deviation scores) per the api documentation so I didn't roll them since they're already computed as averages




In [None]:
deviation_columns = list(df.columns[df.columns.str.contains('_deviation')].values)

# note i include user_id and summary date to map back to 7 day rolled dataframe
seven_day_vars = ['sleep_user_id','sleep_summary_date','activity_score_meet_daily_targets',
                  'activity_score_training_frequency','activity_score_training_volume',
                  'activity_score_recovery_time'] + deviation_columns

In [31]:
#variables of interest to roll on an n day window

'''the below are the variables i rolled (you can include more if need be)'''

activity_vars_to_roll = [
    'activity_cal_total','activity_high','activity_medium','activity_steps','activity_inactive',
    'activity_non_wear','activity_score','activity_met_min_medium', 'activity_met_min_high',
    'activity_met_min_medium_plus','activity_score_move_every_hour','activity_score_stay_active',
    'beforesleep_exercise_min','afterwake_exercise_min','noon_exercise_min', 'eve_exercise_min'
]

sleep_vars_to_roll = [
    'sleep_user_id','sleep_bedtime_start', 'sleep_bedtime_end','sleep_summary_date','sleep_score',
    'sleep_bedtime_start_delta','sleep_isWeekend','sleep_afterMidnight','sleep_onset_latency','sleep_duration',
    'sleep_breath_average','is_traveling','sleep_score_disturbances']


In [32]:
#-----collecting all variables to roll on an n day window------#
vars_to_roll = activity_vars_to_roll + sleep_vars_to_roll

In [33]:
#splitting the pre-computed 7 day average variables into their own dataframe 
df_to_roll = df.filter(vars_to_roll)
df_7Day_vars = df.filter(seven_day_vars)
df_7Day_vars['sleep_summary_date'] = pd.to_datetime(df_7Day_vars['sleep_summary_date'])
df_to_roll['sleep_summary_date'] = pd.to_datetime(df_to_roll['sleep_summary_date'])

In [34]:
#----function to compute some statistic on the dataset based on an n day window-----#
def mean_roll (dataframe,window):
    '''compute the statistic on an n day window and return the 
    resulting dataframe'''
    df_collect = []
    
    for usr in set(dataframe['sleep_user_id']):
        usr_df = dataframe[dataframe['sleep_user_id']==usr]\
        .rolling(on ='sleep_summary_date',window=window)\
        .mean()
        
        df_collect.append(usr_df)
    
    return pd.concat(df_collect)

### Now Rolling Dataframe of variables that don't include those 7 day variables

In [35]:
#building a 7 day rolling average dataframe
df_7_mean  = mean_roll(df_to_roll,7)


The below is the period-over-period function i use

In [36]:
def PoP(dataframe,usrid,date_col, metric_cols,days_offset):
    '''for each datetime in the dataframe's date column, computes the metric column's
    difference from the period days prior:
    returns WoW differences with the corresponding userid and datetime as a tuple'''
    #--first recorded observation date + days_offset
    init_date = min(dataframe[date_col]) + pd.DateOffset(days_offset)
    pop_list = []
    for dt in dataframe[date_col]:
        if dt < init_date: # won't have 7 days prior for the first 7 observations so flag as np.nan
            pop_list.append([usrid,dt]+[np.nan for i in np.arange(len(metric_cols))])
        else:
            try: #---need to account for missing dates
                pop_diff_list = []
                for metric in metric_cols:
                    #grab the value from the prior 7(or n) days -- need to index on 0 since values returns array of array
                    prd_prior = dataframe[dataframe[date_col] == dt + pd.DateOffset(-days_offset)][metric].values[0]
                    #grab the current value
                    prd_current = dataframe[dataframe[date_col] == dt][metric].values[0]
                    #take the diff - list of differences
                    try:
                        pop_diff = (prd_current/prd_prior)-1
                    except ZeroDivisionError: #1 if prior period was 0 (basically capping at 100% increase)
                        pop_diff = 1
                   
                    pop_diff_list.append(pop_diff)
                    
                pop_list.append([usrid,dt]+[i for i in pop_diff_list])
            except:
                #---if missing dates then use nan values
                pop_list.append([usrid,dt]+[np.nan for i in np.arange(len(metric_cols))])

    return pop_list

The following metrics incorporate a measure based on the last seven days and, together, make up 75% of the activity score. A WoW delta in this metrics can help us determine if the user demonstrates a consistency from one week to the next whereby the most consistency would be a delta of zero. We could use this as a proxy for routine, meaning that the frequency and amount of exercise doesn't change much from one week to the next and the person's goals and rest doesn't change much from one week to the next:
- activity.score_training_frequency
- activity.score_training_volume
- activity.score_recovery_time
- activity.score_meet_daily_targets

In [37]:
#variables for computing % change WoW to capture consistency/routine in sleep'''
activity_metric_cols = ['activity_score_training_frequency','activity_score_training_volume','activity_score_recovery_time',
               'activity_score_meet_daily_targets']

sleep_metric_cols = ['sleep_onset_latency','sleep_duration','sleep_bedtime_start_delta','sleep_score_disturbances']

In [38]:
#---will take a few minutes to run since it's calling the PoP function for each user-----#
'''calculating WoW % changes for the weekly target variables to attempt to capture consistency/routine'''
activity_wow_list = []
for usr in set(df_7Day_vars['sleep_user_id']):
    temp_df = df_7Day_vars[df_7Day_vars['sleep_user_id']==usr]
    activity_wow_list.append(PoP(temp_df,usr,'sleep_summary_date',metric_cols = activity_metric_cols,days_offset = 7))
    
sleep_wow_list = []
for usr in set(df_7_mean['sleep_user_id']):
    temp_df = df_7_mean[df_7_mean['sleep_user_id']==usr]
    sleep_wow_list.append(PoP(temp_df,usr,'sleep_summary_date',metric_cols = sleep_metric_cols,days_offset = 7))

In [39]:
#need to figure out a better way to do this, but this basically makes a dataframe of the wow metrics
activity_d = []
for usr in activity_wow_list:
    for i in usr:
        activity_d.append(
            {'sleep_user_id': i[0], 'sleep_summary_date': i[1], 'activity_score_training_frequency_wow':i[2],
                  'activity_score_training_volume_wow':i[3],'activity_score_recovery_time_wow':i[4],
                 'activity_score_meet_daily_targets_wow':i[5]}
        )

        
sleep_d = []
for usr in sleep_wow_list:
    for i in usr:
        sleep_d.append(
            {'sleep_user_id': i[0], 'sleep_summary_date': i[1], 'sleep_onset_latency_wow':i[2],
                  'sleep_duration_wow':i[3],'sleep_bedtime_start_delta_wow':i[4],'sleep_score_disturbances_wow':i[5]
            }
        )
    

#dataframe with WoW %change for each day  
activity_wow_df = pd.DataFrame(activity_d).replace(np.inf, 1)
sleep_wow_df = pd.DataFrame(sleep_d).replace(np.inf, 1)        

wow_df = pd.merge(activity_wow_df,sleep_wow_df, on = ['sleep_user_id','sleep_summary_date'],how ='left')

In [41]:
#---combining the PoP metrics (week-over-week) with the pre-computed weekly metrics
df_7Day_WoW = pd.merge(wow_df,df_7Day_vars, on = ['sleep_user_id','sleep_summary_date'],how ='left')

In [45]:
#modeling 7 day averages and combining the 7-day based metrics + wow data
#---this is the consolidated dataframe that i'll use for further modeling---#
mod_df = pd.merge(df_7_mean,df_7Day_WoW, on = ['sleep_user_id','sleep_summary_date'],how ='left')

In [46]:
#setting routine variable based on how often and how much exercise changes week-to-week
mod_df['activity_routine_score'] = (
    np.absolute(mod_df['activity_score_training_volume_wow']) + 
    np.absolute(mod_df['activity_score_training_frequency_wow'])+
    np.absolute(mod_df['activity_score_meet_daily_targets_wow'])+
    np.absolute(mod_df['activity_score_recovery_time_wow']))/4

#setting sleep routine variable based on how often and how much sleep habits change week-to-week
mod_df['sleep_routine_score'] = (
    np.absolute(mod_df['sleep_onset_latency_wow']) + 
    np.absolute(mod_df['sleep_duration_wow']) +
    np.absolute(mod_df['sleep_bedtime_start_delta_wow'])+
    np.absolute(mod_df['sleep_score_disturbances_wow']))/4

In [48]:
mod_df = mod_df.drop_duplicates()

In [49]:
# mod_df.to_csv('mod_df.csv',index = False)

In [53]:
mod_df = pd.read_csv('mod_df.csv',parse_dates = ['sleep_summary_date'])

The function below basically consolidates modeling features and targets into a single dataframe that can be used for training and cross validating models - it doesn't include the userID, but perhaps we can include it

In [31]:
def process_data_frame(mod_df,target_df,base_n_days,next_n_days,features):

    '''builds a dataframe of feature values and target values for each person-day in the dataset
    based on the features passed and the dataframes used as the model feature and target values'''
    feature_space = []
    targets = []

    cap_date = max(mod_df['sleep_summary_date']) - pd.DateOffset(base_n_days)
    # need at least n days of data to compute target value
    target_period_start = min(mod_df['sleep_summary_date']) + pd.DateOffset(base_n_days) #7


    #iteratively build feature to target observations for training/test data
    while target_period_start <= cap_date:

        baseline_period = target_period_start - pd.DateOffset(base_n_days) #n day prior to target sleep score
        baseline_data = mod_df[mod_df['sleep_summary_date'] == baseline_period]
        #corresponds to sleep score n days ahead -- target data as avg score 3 days from now = target_period_start - pd.DateOffset(4)
        target_data = target_df[target_df['sleep_summary_date'] == (target_period_start - pd.DateOffset(base_n_days-next_n_days))]\
        .filter(['sleep_user_id','sleep_score'])\
        .rename(columns={'sleep_score': 'target_score'})

        temp_df = pd.merge(baseline_data,target_data, on = 'sleep_user_id',how ='left')
        #binarizing outcome - if score increases or remains the same then 1, else 0
        temp_df['target_binary'] = (temp_df['target_score'] >= temp_df['sleep_score']).astype(int)


        feature_vals = temp_df[temp_df['sleep_summary_date'] == baseline_period]\
        .filter(features)\
        .values

        target_vals = temp_df[temp_df['sleep_summary_date'] == baseline_period]\
        .filter(['target_binary','target_score'])\
        .values

        feature_space.append(feature_vals)
        targets.append(target_vals)
        # move forward 1 day
        target_period_start = target_period_start + pd.DateOffset(+1)


    features_block = []
    target_block = []

    for f in feature_space:
        for array in f:
            features_block.append(array)

    for t in targets:
        for array in t:
            target_block.append(array)

    data_set = pd.concat([pd.DataFrame(features_block,columns = features),
           pd.DataFrame(target_block,columns = ['target_binary','target_score'])],axis = 1)
    
    return data_set
    
    

In [11]:
#these are the variables that i chose for modeling
my_variables = ['sleep_summary_date','sleep_score','sleep_afterMidnight','sleep_onset_latency','sleep_duration',
            'sleep_score_disturbances','sleep_temperature_deviation','sleep_bedtime_start_delta','sleep_routine_score',
            'activity_cal_total','activity_steps','activity_score_meet_daily_targets','activity_met_min_medium_plus',
            'activity_score_training_frequency','activity_score_training_volume','activity_routine_score','activity_score_move_every_hour',
            'activity_score_stay_active','afterwake_exercise_min','beforesleep_exercise_min','is_traveling']

In [67]:
data_set = process_data_frame(mod_df = mod_df,target_df = mod_df,base_n_days = 7,next_n_days = 7,features=my_variables)

In [68]:
#writing the above to file
data_set.to_csv('my_dataset.csv',index = False)

In [4]:
data_set = pd.read_csv('my_dataset.csv',parse_dates = ['sleep_summary_date'])

## You can end here, the rest is just modeling

### Modeling Future Avg Sleep Scores with Random Forest

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor

In [7]:
# splitting training and test set by recency
train = data_set[data_set['sleep_summary_date'] <= pd.Timestamp(2019, 9, 1, 12)]
test = data_set[data_set['sleep_summary_date'] > pd.Timestamp(2019, 9, 1, 12)]
#-dropping summary date since its not needed
train = train.drop(['sleep_summary_date'],axis=1)
test = test.drop(['sleep_summary_date'],axis=1)

In [8]:
#dropping NaNs - still need to resolve how we want to handle NAs
train = train.dropna()
test = test.dropna()

<h4>
    <center>Random Forest Results<br> 
    Next Week's Avg Sleep Score Based on the Last 7 Days of Data (with Sleep Score as a Feature)
    </center>
</h4> 

The below is intended to help us predict the last 7 day's avg sleep score 7 days from now

In [15]:
#drop summary date from features
features = [f for f in my_variables if f != 'sleep_summary_date']
rf_regr = RandomForestRegressor()
rf_regr.fit(train[features],train['target_score'])
rf_reg_pred = rf_regr.predict(test[features])

In [16]:
#-----RESULTS of Predicting Avg Sleep Score Over Next 7 Days Based on Prior 7 Days--------#
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,explained_variance_score,median_absolute_error
print('R2 of: ',r2_score(test['target_score'], rf_reg_pred))
print('MSE of: ', mean_squared_error(test['target_score'], rf_reg_pred))
print('MAE of: ', mean_absolute_error(test['target_score'], rf_reg_pred))
print('Explained Variance:', explained_variance_score(test['target_score'], rf_reg_pred))
print('Median Absolute Error',median_absolute_error(test['target_score'], rf_reg_pred))

R2 of:  0.7053442164676733
MSE of:  17.19840780099022
MAE of:  3.1907100591715984
Explained Variance: 0.7062093142937167
Median Absolute Error 2.6000000000000014


In [17]:
#----feature importance----#
for f,i in zip(features,rf_regr.feature_importances_):
    print(f,round(i,2))

sleep_score 0.72
sleep_afterMidnight 0.01
sleep_onset_latency 0.02
sleep_duration 0.02
sleep_score_disturbances 0.03
sleep_temperature_deviation 0.01
sleep_bedtime_start_delta 0.03
sleep_routine_score 0.01
activity_cal_total 0.02
activity_steps 0.02
activity_score_meet_daily_targets 0.01
activity_met_min_medium_plus 0.01
activity_score_training_frequency 0.0
activity_score_training_volume 0.01
activity_routine_score 0.01
activity_score_move_every_hour 0.01
activity_score_stay_active 0.02
afterwake_exercise_min 0.02
beforesleep_exercise_min 0.02
is_traveling 0.01


In [18]:
perm = PermutationImportance(rf_regr, random_state=1).fit(test[features], test['target_score'])
eli5.show_weights(perm, feature_names = test[features].columns.tolist())

Weight,Feature
1.0937  ± 0.0906,sleep_score
0.0708  ± 0.0096,sleep_score_disturbances
0.0300  ± 0.0054,sleep_bedtime_start_delta
0.0152  ± 0.0054,activity_cal_total
0.0142  ± 0.0033,sleep_duration
0.0142  ± 0.0012,sleep_afterMidnight
0.0124  ± 0.0055,activity_score_meet_daily_targets
0.0104  ± 0.0056,activity_steps
0.0078  ± 0.0018,sleep_onset_latency
0.0034  ± 0.0024,activity_score_training_volume


In [90]:
ab_reg = AdaBoostRegressor()
ab_reg.fit(train[features],train['target_score'])
ab_reg_pred = ab_reg.predict(test[features])
#-----RESULTS of Predicting Avg Sleep Score Over Next 7 Days Based on Prior 7 Days with AdaBoost--------#
print('R2 of: ',r2_score(test['target_score'], ab_reg_pred))
print('MSE of: ', mean_squared_error(test['target_score'], ab_reg_pred))
print('MAE of: ', mean_absolute_error(test['target_score'], ab_reg_pred))
print('Explained Variance:', explained_variance_score(test['target_score'], ab_reg_pred))
print('Median Absolute Error',median_absolute_error(test['target_score'], ab_reg_pred))

R2 of:  0.6590830785058923
MSE of:  19.898568328866954
MAE of:  3.5269448968944963
Explained Variance: 0.6657254453864163
Median Absolute Error 2.954186592900541


In [88]:
benchmark_feature = ['sleep_summary_date','sleep_score']

benchmark_data_set = process_data_frame(mod_df = mod_df,target_df = mod_df,base_n_days = 7,next_n_days = 7,features=benchmark_feature)
# splitting training and test set by recency
benchmark_train = benchmark_data_set[data_set['sleep_summary_date'] <= pd.Timestamp(2019, 9, 1, 12)]
benchmark_test = benchmark_data_set[data_set['sleep_summary_date'] > pd.Timestamp(2019, 9, 1, 12)]
#-dropping summary date since its not needed
benchmark_train = benchmark_train.drop(['sleep_summary_date'],axis=1)
benchmark_test = benchmark_test.drop(['sleep_summary_date'],axis=1)
#dropping NaNs
benchmark_train = benchmark_train.dropna()
benchmark_test = benchmark_test.dropna()

#drop summary date from features
features = [f for f in benchmark_feature if f != 'sleep_summary_date']
bmrf_reg = RandomForestRegressor()
bmrf_reg.fit(train[features],train['target_score'])
bmrf_reg_pred = bmrf_reg.predict(test[features])

#-----RESULTS of Predicting Avg Sleep Score Over Next 7 Days Based on Prior 7 Days Avg Sleep Score--------#
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,explained_variance_score,median_absolute_error
print('R2 of: ',r2_score(test['target_score'], bmrf_reg_pred))
print('MSE of: ', mean_squared_error(test['target_score'], bmrf_reg_pred))
print('MAE of: ', mean_absolute_error(test['target_score'], bmrf_reg_pred))
print('Explained Variance:', explained_variance_score(test['target_score'], bmrf_reg_pred))
print('Median Absolute Error',median_absolute_error(test['target_score'], bmrf_reg_pred))

R2 of:  0.684785500120024
MSE of:  18.3983747026172
MAE of:  3.2738942466911705
Explained Variance: 0.6854244872933202
Median Absolute Error 2.5480272199110985


### Modeling Future Avg Sleep Score Direction with Logistic Regression

<h4>
    <center>Logistic Regression Results<br> 
    Accuracy of Predicting the direction of next week's avg sleep score based on the last 7 days
    </center>
</h4> 

In [43]:
# Fit to data and predict using pipeline with standard scaling.
features = [f for f in my_variables if f != 'sleep_summary_date']

std_lr_clf = make_pipeline(StandardScaler(),
                        LogisticRegression(solver='lbfgs', multi_class='ovr'))

std_lr_clf.fit(train[features],train['target_binary'])
pred_test_lr_std = std_lr_clf.predict(test[features])

from sklearn.metrics import accuracy_score
print('logistic regression accuracy', str(accuracy_score(test['target_binary'], pred_test_lr_std)))

logistic regression accuracy 0.6248520710059171


In [44]:
#based on prior sleep score only

features = ['sleep_score']

std_lr_clf = make_pipeline(StandardScaler(),
                        LogisticRegression(solver='lbfgs', multi_class='ovr'))

std_lr_clf.fit(train[features],train['target_binary'])
pred_test_lr_std = std_lr_clf.predict(test[features])

from sklearn.metrics import accuracy_score
print('logistic regression accuracy', str(accuracy_score(test['target_binary'], pred_test_lr_std)))

logistic regression accuracy 0.5816568047337278


### Gradient Boosting

In [72]:
features = [f for f in my_variables if f not in ('sleep_summary_date')]
gb_regr = GradientBoostingRegressor()
gb_regr.fit(train[features],train['target_score'])
gb_reg_pred = gb_regr.predict(test[features])

#-----RESULTS of Predicting Avg Sleep Score Over Next 7 Days Based on Prior 7 Days--------#
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error,explained_variance_score,median_absolute_error
print('R2 of: ',r2_score(test['target_score'], gb_reg_pred))
print('MSE of: ', mean_squared_error(test['target_score'], gb_reg_pred))
print('MAE of: ', mean_absolute_error(test['target_score'], gb_reg_pred))
print('Explained Variance:', explained_variance_score(test['target_score'], gb_reg_pred))
print('Median Absolute Error',median_absolute_error(test['target_score'], gb_reg_pred))

R2 of:  0.7242085478013509
MSE of:  16.09733841324481
MAE of:  3.0702702549892513
Explained Variance: 0.724270974657712
Median Absolute Error 2.4635413900253624


In [73]:
for f,i in zip(features,gb_regr.feature_importances_):
    print(f,round(i,2))

sleep_score 0.92
sleep_afterMidnight 0.01
sleep_onset_latency 0.0
sleep_duration 0.01
sleep_score_disturbances 0.02
sleep_temperature_deviation 0.0
sleep_bedtime_start_delta 0.01
sleep_routine_score 0.0
activity_cal_total 0.0
activity_steps 0.01
activity_score_meet_daily_targets 0.01
activity_met_min_medium_plus 0.0
activity_score_training_frequency 0.0
activity_score_training_volume 0.0
activity_routine_score 0.0
activity_score_move_every_hour 0.0
activity_score_stay_active 0.0
afterwake_exercise_min 0.0
beforesleep_exercise_min 0.0
is_traveling 0.0


### Messing around with Coefficient Adjustments 

to make this work, you're gonna need to iterate through each feature and make an adjustment and then test it against the original prediction to see if it actually increases from the original prediction. you can then select all adjustments that result in a predicted increase
 - things to keep in mind is that this is all relative to the user. for instance, Ed had a week where he fell asleep before midnight each night so we wouldn't adjust sleep_isMidnight. same with isTraveling. and sleep_bedtime_start_delta. his numbers may be low relative to the larger population so we'll need a way to check against this to decide if we should adjust those variables (or we just iterate and adjust through all)

In [55]:
#Ed's userID is TE2CPSSWP4QUGFAJQZ5FHITIKPNCCICX
ed_data = mod_df[mod_df['sleep_user_id'] == 'TE2CPSSWP4QUGFAJQZ5FHITIKPNCCICX']
#ed_test = ed_data[ed_data['sleep_summary_date'] == max(ed_data['sleep_summary_date'])].filter(my_features)

In [56]:
features = [f for f in my_variables if f != 'sleep_summary_date']
ed_test = ed_data[ed_data['sleep_summary_date'] == '2019-10-03'].filter(features)

In [57]:
#--predicting Ed's score
features = [f for f in my_variables if f != 'sleep_summary_date']
rf_regr = RandomForestRegressor()
rf_regr.fit(train[features],train['target_score'])
rf_reg_pred = rf_regr.predict(test[features])
rf_regr.predict(ed_test)

array([72.1])

In [58]:
ed_data[ed_data['sleep_summary_date'] == '2019-10-10'].filter(['sleep_score'])

Unnamed: 0,sleep_score
15667,73.714286


In [107]:
#----testing variations

features = [f for f in my_variables if f != 'sleep_summary_date']
ed_test = ed_data[ed_data['sleep_summary_date'] == '2019-10-03'].filter(features)
ed_test

Unnamed: 0,sleep_score,sleep_afterMidnight,sleep_onset_latency,sleep_duration,sleep_score_disturbances,sleep_temperature_deviation,sleep_bedtime_start_delta,sleep_routine_score,activity_cal_total,activity_steps,activity_score_meet_daily_targets,activity_met_min_medium_plus,activity_score_training_frequency,activity_score_training_volume,activity_routine_score,activity_score_move_every_hour,activity_score_stay_active,afterwake_exercise_min,beforesleep_exercise_min,is_traveling
4256,72.0,0.0,767.142857,28302.857143,58.714286,0.01,-4805.571429,0.136449,2362.142857,7230.571429,1.0,36.285714,1.0,33.0,0.251539,97.857143,69.714286,70.0,52.857143,0.0


In [108]:
#prediction without feature variation
print(rf_regr.predict(ed_test.filter(features)),
    ed_data[ed_data['sleep_summary_date'] == '2019-10-10'].filter(['sleep_score']))

[69.97142857]       sleep_score
4263    73.714286


In [96]:
#what happens when routine is improved (to 0)
ed_test['activity_routine_score'] = 0
ed_test['sleep_routine_score']  = 0
rf_regr.predict(ed_test.filter(features))

array([71.18571429])

<h3>Iterative, Contextual Approach to Yielding Recommendations that Increase Avg Weekly Predicted Sleep Score</h3>

- generate base prediction based on actual feature values and model
- iterate through each 'actionable' feature checking against feature context and adjust feature in favorable direction, holding all else constant
- rerun model's prediction and store resulting score,feature, and adjustment
- choose top 3 features that yield prediction score higher than base prediction

#### Context
- sleep onset_latency - a latency of about 15 minutes gives best score (900 seconds)
- sleep score_disturbances - 90 + considered good
- sleep before midnight = 0
- sleep and activity routine of zero would be considered good (conditioned on certain activity and sleep behaviors)
- 'activity_cal_total',
- 'activity_steps',
- activity.score_meet_daily_targets - 95 is good
- training frequency and volume - 95 is good

In [91]:
def recommender(model,user_data,actionable_features):
    base_prediction = model.predict(user_data)[0]
    adj_predictions = []
    for a in actionable_features:
        #reset the copy on each iteration
        user_data_copy = user_data.copy()
        if a in {'sleep_bedtime_start_delta','sleep_routine_score','activity_routine_score'}:
            #all features above have an ideal target of 0
            user_data_copy[a] = 0
            adj_prediction = model.predict(user_data_copy)[0]
            if adj_prediction > base_prediction:
                adj_predictions.append([a,adj_prediction])

        elif a in {'sleep_score_disturbances','activity_score_training_frequency','activity_score_training_volume'}:
            #all features above have an ideal target of 95+
            user_data_copy[a] = 95
            adj_prediction = model.predict(user_data_copy)[0]
            if adj_prediction > base_prediction:
                adj_predictions.append([a,adj_prediction])
        elif a in {'activity_steps'}:
            #1 mile is approximately 2000 steps (will increment avg by 2000)
            user_data_copy[a] = user_data_copy[a].values[0] + 2000
            adj_prediction = model.predict(user_data_copy)[0]   
            if adj_prediction > base_prediction:
                adj_predictions.append([a,adj_prediction])

        elif a == 'sleep_afterMidnight':
            if user_data_copy[a].values[0] > 0:
                user_data_copy[a] = 0
                adj_prediction = model.predict(user_data_copy)[0]   
                if adj_prediction > base_prediction:
                    adj_predictions.append([a,adj_prediction])
        else:
            if a == 'sleep_onset_latency':
                user_data_copy[a] = 900    
                adj_prediction = model.predict(user_data_copy)[0]   
                if adj_prediction > base_prediction:
                    adj_predictions.append([a,adj_prediction])
                    
    return adj_predictions


In [59]:
base_prediction = rf_regr.predict(ed_test)[0]

In [69]:
actionable_features = [
 'sleep_afterMidnight',
 'sleep_onset_latency',
 'sleep_score_disturbances',
 'sleep_bedtime_start_delta',
 'sleep_routine_score',
'activity_cal_total',
'activity_routine_score',
'activity_steps',
'activity_score_training_frequency',
'activity_score_training_volume'   
]

In [94]:
recommender(model = rf_regr,user_data = ed_test,actionable_features = actionable_features)

[['sleep_score_disturbances', 75.77142857142857],
 ['activity_routine_score', 72.68571428571428],
 ['activity_score_training_frequency', 72.2]]

#### Additional Notes to Ignore
does it make sense to include meet_daily_targets in the consistency score and/or weight it equally among the others?
if a user has virtually no difference in frequency, volumne, and recovery in a given week but has a large delta in meeting target, then that might imply an aggressive change in goals for the week. If they don't hit their "goal" but get the same frequency/volume, then how is sleep quality impacte if at all? 

Maybe its easier to define routine with respect to exercise consistency instead of goal setting consistency...they're kinda separate things. What I'm interested in is if exercise consistency can be a signal for sleep quality.

We can also define consistency in dummy variable terms. if frequency is 95 or greater than we know the user got 100 minutes of medium or high intensity activity on at least three days during past seven days. we have a known behavior that we can be prescriptive against. Same goes for volume and recovery. 



i feel like it would because consistent goals are part of routine;there's routine and progress. however, if someone has a large difference in goals wow and doesn't adjust training volume/frequency it may not necessarily be a bad thing

In [None]:
# 10 predictions for a given user vs actuals using random forest
last_predictable_entry = max(ed_data['sleep_summary_date']) - pd.DateOffset(7)
for i in np.arange(10):
    base_date = last_predictable_entry - pd.DateOffset(i)
    test_date = ed_data[ed_data['sleep_summary_date'] == base_date].filter(features)
    predicted = rf_regr.predict(test_date)
    actual = ed_data[ed_data['sleep_summary_date'] == base_date + pd.DateOffset(7)].filter(['sleep_score']).values[0]
    print(predicted)
    print(actual)
    print("---")
    