In [3]:
import pickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
import operator

In [4]:
def adjust_to_thirty_minute(steps):
    """Aggregate a step-count series into 30-minute totals.

    Missing values are counted as zero steps, the timezone is removed from the
    index (local wall-clock time is kept), and the counts are summed per
    30-minute bin.

    Parameters
    ----------
    steps : pandas.Series
        Step counts indexed by a DatetimeIndex.

    Returns
    -------
    pandas.Series
        Total steps per 30-minute bin on a timezone-naive index.
    """
    naive_steps = steps.fillna(0).tz_localize(None)
    # '30min' is the supported spelling of the deprecated '30T' offset alias.
    return naive_steps.resample('30min').sum().fillna(0)

In [5]:
def get_processed(file_name, root='../../../../Volumes/dav/HeartSteps/pooling_rl_shared_data/processed/'):
    """Load one pickled artifact from the processed-data directory.

    Parameters
    ----------
    file_name : str
        Base name of the pickle, without the ``.pkl`` extension.
    root : str, optional
        Directory prefix, including its trailing separator. Defaults to the
        location this notebook was written against; pass another prefix to
        read the same artifacts from a different place.

    Returns
    -------
    object
        Whatever object was pickled into the file.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use on trusted files.
    with open('{}{}.pkl'.format(root, file_name), 'rb') as f:
        return pickle.load(f)

In [8]:
# Load the merged data once; later cells index it by key (e.g. merged[1]).
merged = get_processed('merged_est')

In [68]:
# List the columns of the frame stored under key 1.
merged[1].columns

Index(['avail', 'decision.utime', 'jbsteps10', 'location.category',
       'location.category.response', 'sedentary.width', 'send', 'send.active',
       'send.sedentary', 'steps', 'study.day', 'user', 'weather.condition',
       'weather.condition.response', 'window.utime'],
      dtype='object')

In [69]:
def get_target_indices_sent(df):
    """Return the index labels of rows where `send.active` and `avail` are both True.

    Rows where either flag is missing (NaN) are excluded, because NaN never
    compares equal to True.
    """
    was_sent = df['send.active'] == True
    was_available = df['avail'] == True
    return df.index[(was_sent & was_available).values]

In [70]:
def get_target_indices_notsent(df):
    """Return the index labels of rows where `send.active` is False and `avail` is True.

    Rows where either flag is missing (NaN) are excluded, because NaN compares
    equal to neither False nor True.
    """
    not_sent = df['send.active'] == False
    was_available = df['avail'] == True
    return df.index[(not_sent & was_available).values]

In [11]:
def get_near_targets(targets):
    """Snap timestamps to the nearest 30-minute mark and drop the timezone.

    Parameters
    ----------
    targets : pandas.DatetimeIndex
        Timestamps to round.

    Returns
    -------
    pandas.DatetimeIndex
        Timezone-naive timestamps rounded to 30-minute resolution, keeping the
        local wall-clock time.
    """
    # '30min' is the supported spelling of the deprecated '30T' offset alias.
    return targets.round('30min').tz_localize(None)

In [16]:
# Index labels of merged[1] where both send.active and avail are True.
targets = get_target_indices_sent(merged[1])

In [88]:
# Inspect the raw availability flags; NaN entries fail the `== True` filters used above.
merged[1].avail

2015-07-22 09:00:00-05:00      NaN
2015-07-22 09:05:00-05:00      NaN
2015-07-22 09:10:00-05:00      NaN
2015-07-22 09:15:00-05:00      NaN
2015-07-22 09:20:00-05:00      NaN
2015-07-22 09:25:00-05:00      NaN
2015-07-22 09:30:00-05:00      NaN
2015-07-22 09:35:00-05:00      NaN
2015-07-22 09:40:00-05:00      NaN
2015-07-22 09:45:00-05:00      NaN
2015-07-22 09:50:00-05:00      NaN
2015-07-22 09:55:00-05:00      NaN
2015-07-22 10:00:00-05:00      NaN
2015-07-22 10:05:00-05:00      NaN
2015-07-22 10:10:00-05:00      NaN
2015-07-22 10:15:00-05:00      NaN
2015-07-22 10:20:00-05:00      NaN
2015-07-22 10:25:00-05:00      NaN
2015-07-22 10:30:00-05:00      NaN
2015-07-22 10:35:00-05:00      NaN
2015-07-22 10:40:00-05:00      NaN
2015-07-22 10:45:00-05:00      NaN
2015-07-22 10:50:00-05:00      NaN
2015-07-22 10:55:00-05:00      NaN
2015-07-22 11:00:00-05:00      NaN
2015-07-22 11:05:00-05:00      NaN
2015-07-22 11:10:00-05:00      NaN
2015-07-22 11:15:00-05:00      NaN
2015-07-22 11:20:00-

In [55]:
# Round the selected timestamps to the nearest half hour and drop the timezone.
# NOTE(review): this rebinds `targets`, so the cell relies on the earlier assignment having run first.
targets = get_near_targets(targets)

In [89]:
def measure_responsivity(df):
    """Compare 30-minute step totals at not-sent versus sent decision times.

    Steps are summed into 30-minute bins. Each available decision time is
    rounded to the nearest bin, and the bin totals are averaged separately for
    rows where `send.active` is False (control) and True (intervention).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `steps`, `send.active` and `avail` columns on a
        timezone-aware DatetimeIndex.

    Returns
    -------
    float
        Mean control steps minus mean intervention steps. NaN if either group
        has no decision times.

    Raises
    ------
    KeyError
        If a rounded decision time falls outside the range of step bins.
    """
    steps = adjust_to_thirty_minute(df['steps'])
    targets_intervention = get_near_targets(get_target_indices_sent(df))
    targets_control = get_near_targets(get_target_indices_notsent(df))

    def _mean_steps(targets):
        # An empty group has no mean; return NaN directly rather than relying
        # on numpy's empty-slice warning.
        if len(targets) == 0:
            return np.nan
        return np.array([steps[t] for t in targets]).mean()

    # NOTE(review): the sign convention is control minus intervention, so a
    # negative value means more steps in the intervention bins — confirm intended.
    average_steps_control = _mean_steps(targets_control)
    average_steps_intervention = _mean_steps(targets_intervention)
    return average_steps_control - average_steps_intervention

In [50]:
def make_feature_matrix():
    """Assemble the raw per-participant feature rows from the processed pickles.

    Returns
    -------
    tuple
        ``(rows, feature_names, pids)``: `rows` holds one list of raw feature
        values per participant, `feature_names` labels the columns in order,
        and `pids` lists the participant ids in row order (id 4 is excluded).
    """
    other = get_processed('other_activity_no_missing')
    agg_consc = get_processed('agg_conscientiousness_measures')
    agg_selfeff = get_processed('agg_self_efficacy_measures')
    agg_actchoice = get_processed('agg_activity_choice_measures')

    # Column order comes from participant 1's record.
    # A hand-picked subset can be substituted here, e.g.
    # ['vigact.days','vigact.hrs','vigact.min','modact.days'].
    other_keys = list(other[1].keys())
    feature_names = other_keys + ['conscientiousness', 'self_efficacy', 'activity_choice']

    pids = [p for p in other.keys() if p != 4]

    rows = [
        [other[p][key] for key in other_keys]
        + [agg_consc[p], agg_selfeff[p], agg_actchoice[p]]
        for p in pids
    ]
    return rows, feature_names, pids

In [71]:
# X: raw feature rows, fn: feature names (column order of X), pids: participant ids (row order of X).
X,fn,pids = make_feature_matrix()

In [72]:
def fix(x):
    """Convert one raw feature row into numeric values.

    Conversion rules, applied per element:

    * the strings ``'no'`` and ``'X'`` become 0 and ``'yes'`` becomes 1;
    * any other string is parsed with ``float``; if it parses to NaN the
      original string is printed and NaN is kept;
    * a non-string that is None or NaN becomes 0;
    * every other non-string value is kept unchanged.

    Parameters
    ----------
    x : iterable
        Raw feature values for one participant.

    Returns
    -------
    list
        The converted values, in the same order as `x`.

    Raises
    ------
    ValueError
        If a string other than the recognised flags cannot be parsed as a float.
    """
    flag_values = {'no': 0, 'yes': 1, 'X': 0}
    cleaned = []
    for value in x:
        # isinstance also catches str subclasses such as numpy.str_.
        if isinstance(value, str):
            if value in flag_values:
                cleaned.append(flag_values[value])
            else:
                number = float(value)
                if pd.isnull(number):
                    # Surface string-encoded NaNs so they can be investigated.
                    print(value)
                cleaned.append(number)
        elif value is None or pd.isnull(float(value)):
            # Missing numeric answers are imputed as 0.
            cleaned.append(0)
        else:
            cleaned.append(value)
    return cleaned

In [73]:
# Convert every raw row to numeric values with fix().
# NOTE(review): this rebinds X, replacing the raw rows from make_feature_matrix().
X = [fix(x) for x in X]

In [74]:
# Standardize each feature column (sklearn's scale centers to zero mean and scales to unit variance).
X_scaled = preprocessing.scale(X)

In [75]:
def get_training_data_responsivity(pids):
    """Compute the responsivity score for every participant with merged data.

    Parameters
    ----------
    pids : iterable
        Participant ids to score; ids missing from the merged data are skipped.

    Returns
    -------
    dict
        Maps each scored participant id to its `measure_responsivity` value.
    """
    merged = get_processed('merged_est')
    return {p: measure_responsivity(merged[p]) for p in pids if p in merged}

In [90]:
# NOTE(review): despite its name, `steps` maps pid -> responsivity score (a difference of mean steps), not raw step counts.
steps = get_training_data_responsivity(pids)

In [91]:
# Display the per-participant responsivity scores.
steps

{1: -96.267646211466456,
 2: 20.297450980392142,
 3: -61.0575022461815,
 5: 42.964102564102561,
 6: 88.638012780869929,
 7: -168.61578947368423,
 8: 26.35720720720721,
 9: 58.320740740740732,
 10: 67.403999999999996,
 11: -63.317460317460331,
 13: 56.156060606060606,
 14: -38.36750543647095,
 15: 15.817910447761193,
 17: -12.796590909090895,
 18: -139.64550687559958,
 19: 34.794871794871796,
 21: 26.531747333880219,
 22: 70.631442241968557,
 23: -14.502094679514045,
 25: 10.741269841269855,
 27: 13.967090707964601,
 28: -52.969924812030058,
 30: -72.249404761904771,
 31: -114.36675824175822,
 32: 33.692339544513459,
 33: -36.143735460285825,
 34: 94.821724137931042,
 35: -43.407258064516128,
 37: 200.44698469846981,
 39: 4.0193464052287595,
 40: 7.5163934426229417,
 41: -39.583877551020407,
 42: 48.174685620557682,
 44: 40.020671834625318,
 46: -41.576876617773962,
 48: -3.4239033693579159}

In [92]:
# Map each participant id to its row position in the feature matrix.
pid_lookup = {pid: row for row, pid in enumerate(pids)}

In [93]:
def make_matrix_for_first_analysis(steps,pid_lookup,X):
    """Pair each participant's feature row with that participant's target value.

    Parameters
    ----------
    steps : dict
        Maps participant id to the target value.
    pid_lookup : dict
        Maps participant id to its row position in `X`.
    X : sequence
        Feature rows, indexable by row position.

    Returns
    -------
    tuple
        ``(rows, targets)`` as two lists aligned in the iteration order of `steps`.
    """
    rows = [X[pid_lookup[pid]] for pid in steps]
    targets = list(steps.values())
    return rows, targets
        

In [94]:
# bx: scaled feature rows, by: responsivity targets, aligned one-to-one per scored participant.
bx,by = make_matrix_for_first_analysis(steps,pid_lookup,X_scaled)

In [95]:
# Standardized copy of the targets.
# NOTE(review): the fit cell in this notebook uses the unscaled `by`, not `by_scaled` — confirm which is intended.
by_scaled = preprocessing.scale(by)

In [98]:
# Small extra-trees ensemble; random_state is fixed so repeated fits give the same importances.
# NOTE(review): max_features=9 is hard-coded and must not exceed the number of columns in bx.
et = ExtraTreesRegressor(n_estimators=10, max_features=9,
                                       random_state=0)

In [99]:
# Fit on the raw (unscaled) responsivity targets.
et.fit(bx, by)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=9, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
          oob_score=False, random_state=0, verbose=0, warm_start=False)

In [100]:
# Per-feature importances, in the column order of the fitted feature matrix.
et.feature_importances_

array([ 0.06065006,  0.24453863,  0.05362615,  0.10904874,  0.20753303,
        0.04530203,  0.11239996,  0.08887171,  0.07802969])

In [101]:
def nice_dict(feats,fns):
    """Pair feature scores with their names and rank them, highest score first.

    Parameters
    ----------
    feats : sequence of float
        One score per feature.
    fns : sequence of str
        Feature names, in the same order as `feats`.

    Returns
    -------
    list of tuple
        ``(name, score)`` pairs sorted by descending score.
    """
    score_by_name = {fns[position]: score for position, score in enumerate(feats)}
    return sorted(score_by_name.items(), key=lambda pair: pair[1], reverse=True)

In [102]:
# Rank the fitted importances by feature name, most important first.
nice_dict(et.feature_importances_,fn)

[('vigact.min', 0.24453863473151088),
 ('modact.days', 0.2075330274268051),
 ('conscientiousness', 0.11239996378748159),
 ('vigact.days', 0.10904873946666131),
 ('self_efficacy', 0.088871705516369817),
 ('activity_choice', 0.078029685916945882),
 ('fittracker', 0.060650057909543496),
 ('fitapp', 0.053626154288689151),
 ('vigact.hrs', 0.045302030955992792)]

In [49]:
# Reference ranking labelled "all features": a literal list of (feature, importance) pairs.
# It appears to be kept for comparison with the ranking computed above; evaluating the cell only echoes it.
[('vigact.days', 0.19843104958349361),
 ('walk10.days', 0.11353117626506644),
 ('walk.hrs', 0.094853610237206579),
 ('self_efficacy', 0.085161291471744477),
 ('fitapp', 0.063369097644263489),
 ('activity_choice', 0.061838498369279862),
 ('vigact.min', 0.059730112353548262),
 ('walk.min', 0.057417412352888511),
 ('sit.hrs', 0.057075377938251112),
 ('conscientiousness', 0.045379791484796816),
 ('modact.hrs', 0.04413522387124863),
 ('modact.days', 0.043017162012806712),
 ('modact.min', 0.036358893126501507),
 ('fittracker', 0.027887931739907938),
 ('vigact.hrs', 0.011813371548996105),
 ('sit.min', 0.0)]

[('vigact.days', 0.1984310495834936),
 ('walk10.days', 0.11353117626506644),
 ('walk.hrs', 0.09485361023720658),
 ('self_efficacy', 0.08516129147174448),
 ('fitapp', 0.06336909764426349),
 ('activity_choice', 0.06183849836927986),
 ('vigact.min', 0.05973011235354826),
 ('walk.min', 0.05741741235288851),
 ('sit.hrs', 0.05707537793825111),
 ('conscientiousness', 0.045379791484796816),
 ('modact.hrs', 0.04413522387124863),
 ('modact.days', 0.04301716201280671),
 ('modact.min', 0.03635889312650151),
 ('fittracker', 0.027887931739907938),
 ('vigact.hrs', 0.011813371548996105),
 ('sit.min', 0.0)]