In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
import operator

In [2]:
def adjust_to_thirty_minute(steps):
    """Aggregate a step-count series into 30-minute totals.

    Missing values are treated as zero steps, timezone information is
    dropped from the index, and the values are summed per 30-minute bin.
    Bins with no observations end up as 0.

    Parameters
    ----------
    steps : pandas.Series
        Step counts indexed by a timezone-aware DatetimeIndex
        (``tz_localize(None)`` is applied to it).

    Returns
    -------
    pandas.Series
        Summed steps indexed by timezone-naive 30-minute bin labels.
    """
    # '30min' is the same frequency as the legacy '30T' alias, which newer
    # pandas releases deprecate.
    return steps.fillna(0).tz_localize(None).resample('30min').sum().fillna(0)

In [3]:
def get_processed(file_name, root='../../../../Volumes/dav/HeartSteps/pooling_rl_shared_data/processed/'):
    """Load a pickled object from the processed-data directory.

    Parameters
    ----------
    file_name : str
        Base name of the pickle file, without the ``.pkl`` extension.
    root : str, optional
        Directory containing the pickle files. Defaults to the shared
        volume path this notebook was written against; pass another
        directory to run against a different copy of the data.

    Returns
    -------
    object
        Whatever object was stored in ``<root>/<file_name>.pkl``.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist under ``root``.
    """
    import os

    # os.path.join tolerates a root with or without a trailing separator.
    path = os.path.join(root, '{}.pkl'.format(file_name))
    with open(path, 'rb') as f:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        return pickle.load(f)

In [4]:
# Load the pickled 'merged_est' object; later cells index it by participant
# id (e.g. merged[1]) to get that participant's DataFrame.
merged = get_processed('merged_est')

In [24]:
 # Preview participant 1's step series aggregated into 30-minute totals.
 adjust_to_thirty_minute(merged[1]['steps'])

2015-07-22 09:00:00       0.0
2015-07-22 09:30:00      19.0
2015-07-22 10:00:00     453.0
2015-07-22 10:30:00     874.0
2015-07-22 11:00:00       0.0
2015-07-22 11:30:00     968.0
2015-07-22 12:00:00     445.0
2015-07-22 12:30:00      57.0
2015-07-22 13:00:00     565.0
2015-07-22 13:30:00     414.0
2015-07-22 14:00:00       0.0
2015-07-22 14:30:00     292.0
2015-07-22 15:00:00     379.0
2015-07-22 15:30:00     170.0
2015-07-22 16:00:00     235.0
2015-07-22 16:30:00     392.0
2015-07-22 17:00:00     132.0
2015-07-22 17:30:00     550.0
2015-07-22 18:00:00    3494.0
2015-07-22 18:30:00     360.0
2015-07-22 19:00:00     259.0
2015-07-22 19:30:00     505.0
2015-07-22 20:00:00     487.0
2015-07-22 20:30:00       0.0
2015-07-22 21:00:00       0.0
2015-07-22 21:30:00       0.0
2015-07-22 22:00:00       0.0
2015-07-22 22:30:00       0.0
2015-07-22 23:00:00       0.0
2015-07-22 23:30:00       0.0
                        ...  
2015-09-15 02:00:00       0.0
2015-09-15 02:30:00       0.0
2015-09-15

In [25]:
# Show which columns are available in participant 1's frame.
merged[1].columns

Index(['avail', 'decision.utime', 'jbsteps10', 'location.category',
       'location.category.response', 'sedentary.width', 'send', 'send.active',
       'send.sedentary', 'steps', 'study.day', 'user', 'weather.condition',
       'weather.condition.response', 'window.utime'],
      dtype='object')

In [52]:
def get_target_indices(df):
    """Return the index labels of rows where both 'send' and 'avail' are True.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'send' and 'avail' columns.

    Returns
    -------
    pandas.Index
        Index labels of the rows satisfying both conditions.
    """
    # Explicit comparison with True so that missing values count as False.
    was_sent = df['send'] == True
    was_available = df['avail'] == True
    return df.loc[was_sent & was_available].index

In [53]:
def get_near_targets(targets):
    """Round timestamps to the nearest 30 minutes and drop their timezone.

    Parameters
    ----------
    targets : pandas.DatetimeIndex
        Timezone-aware timestamps (``tz_localize(None)`` is applied).

    Returns
    -------
    pandas.DatetimeIndex
        Timezone-naive timestamps rounded to a 30-minute grid.
    """
    # '30min' is the same frequency as the legacy '30T' alias, which newer
    # pandas releases deprecate.
    return targets.round('30min').tz_localize(None)

In [54]:
# Index labels of participant 1's rows where 'send' and 'avail' are both True.
targets = get_target_indices(merged[1])

In [55]:
# Snap those timestamps to the 30-minute grid and drop the timezone.
# NOTE(review): `targets` does not appear to be used by later visible cells;
# measure_responsivity recomputes its own targets.
targets = get_near_targets(targets)

In [56]:
def measure_responsivity(df):
    """Compute a relative step change around each target timestamp in ``df``.

    Steps are first aggregated into 30-minute bins. For every target
    timestamp (rows with 'send' and 'avail' both True, rounded to the nearest
    30 minutes) the score is::

        (steps_after - steps_before) / (steps_before + 1)

    where ``steps_after`` is the bin labelled with the rounded timestamp and
    ``steps_before`` is the bin 30 minutes earlier. The ``+ 1`` keeps the
    denominator non-zero when the earlier bin holds zero steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'steps', 'send' and 'avail' columns.

    Returns
    -------
    list
        One score per target timestamp, in target order.

    Notes
    -----
    Bins are looked up by label, so a target whose bin (or preceding bin) is
    absent from the aggregated series raises a lookup error.
    """
    half_hour = pd.DateOffset(minutes=30)
    binned = adjust_to_thirty_minute(df['steps'])
    send_times = get_near_targets(get_target_indices(df))

    scores = []
    for send_time in send_times:
        steps_before = binned[send_time - half_hour]
        steps_after = binned[send_time]
        scores.append((steps_after - steps_before) / (steps_before + 1))
    return scores

In [57]:
def make_feature_matrix():
    """Assemble one raw feature row per participant from the processed files.

    Each row holds every field of the participant's 'other_activity' entry
    (in the key order of participant 1's entry), followed by the aggregated
    conscientiousness, self-efficacy and activity-choice values.

    Returns
    -------
    tuple
        ``(rows, feature_names, pids)`` where ``rows[i]`` is the feature list
        for ``pids[i]`` and ``feature_names`` labels the columns.
    """
    other = get_processed('other_activity')
    agg_consc = get_processed('agg_conscientiousness_measures')
    agg_selfeff = get_processed('agg_self_efficacy_measures')
    agg_actchoice = get_processed('agg_activity_choice_measures')

    # Participant 1's entry defines the column order of the activity fields.
    other_keys = list(other[1].keys())
    # Participant 4 is left out. NOTE(review): the reason is not recorded here.
    pids = [p for p in other.keys() if p != 4]

    feature_names = other_keys + ['conscientiousness', 'self_efficacy', 'activity_choice']
    rows = [
        [other[p][key] for key in other_keys]
        + [agg_consc[p], agg_selfeff[p], agg_actchoice[p]]
        for p in pids
    ]
    return rows, feature_names, pids

In [58]:
# Raw feature rows (X), their column names (fn) and the participant ids
# (pids) in row order.
X,fn,pids = make_feature_matrix()

In [59]:
def fix(x):
    """Convert one raw feature row into numeric values.

    Rules applied to each element:

    * the strings 'no' and 'X' become 0, and 'yes' becomes 1;
    * any other string is parsed with ``float``;
    * a value that is (or parses to) NaN becomes 0;
    * every other non-string value is kept unchanged.

    String handling uses ``isinstance`` so that ``str`` subclasses such as
    ``numpy.str_`` are decoded like plain strings.

    Parameters
    ----------
    x : iterable
        Raw feature values (strings and/or numbers).

    Returns
    -------
    list
        The cleaned values, in input order.

    Raises
    ------
    ValueError
        If a string is neither a known code nor parseable as a float.
    """
    coded = {'no': 0, 'yes': 1, 'X': 0}
    to_return = []
    for i in x:
        if isinstance(i, str):
            if i in coded:
                to_return.append(coded[i])
                continue
            value = float(i)
            if pd.isnull(value):
                # Keep reporting the offending text, but zero it like any
                # other missing value so no NaN reaches the feature matrix.
                print(i)
                value = 0
        elif pd.isnull(float(i)):
            value = 0
        else:
            value = i
        to_return.append(value)
    return to_return

In [60]:
# Clean every raw feature row into numeric values.
X = list(map(fix, X))

In [61]:
# Standardize the feature matrix with sklearn's preprocessing.scale
# (column-wise by default).
X_scaled = preprocessing.scale(X)

In [62]:
def get_training_data_responsivity(pids):
    """Compute responsivity scores for each participant that has merged data.

    Parameters
    ----------
    pids : iterable
        Participant ids to process; ids missing from the 'merged_est' data
        are skipped.

    Returns
    -------
    dict
        Maps participant id to the list returned by ``measure_responsivity``
        for that participant's frame.
    """
    merged = get_processed('merged_est')
    return {p: measure_responsivity(merged[p]) for p in pids if p in merged}

In [63]:
# Per-participant lists of responsivity scores, keyed by participant id.
# Despite the name, `steps` holds relative step changes, not raw counts.
steps = get_training_data_responsivity(pids)

In [64]:
# Map each participant id to its row position in the feature matrix.
pid_lookup = {pid: position for position, pid in enumerate(pids)}

In [65]:
def make_matrix_for_first_analysis(steps,pid_lookup,X):
    """Expand participant-level features to one row per response value.

    Parameters
    ----------
    steps : dict
        Maps participant id to an iterable of response values.
    pid_lookup : dict
        Maps participant id to that participant's row position in ``X``.
    X : sequence
        Participant-level feature rows.

    Returns
    -------
    tuple
        ``(rows, responses)``: two equally long lists in which ``rows[k]`` is
        the feature row of the participant that produced ``responses[k]``.
    """
    # Pair every response with its participant's feature row, then split the
    # pairs into two parallel lists.
    pairs = [
        (X[pid_lookup[pid]], response)
        for pid, responses in steps.items()
        for response in responses
    ]
    rows = [row for row, _ in pairs]
    targets = [response for _, response in pairs]
    return rows, targets
        

In [66]:
# One (scaled feature row, responsivity score) pair per score in `steps`.
bx,by = make_matrix_for_first_analysis(steps,pid_lookup,X_scaled)

In [67]:
# Standardized copy of the targets.
# NOTE(review): the fit in the visible cells uses the unscaled `by`, so
# `by_scaled` appears to be unused — confirm.
by_scaled = preprocessing.scale(by)

In [68]:
# Extra-trees regressor: 10 trees, max_features=4 per split, fixed seed so
# the fitted importances are reproducible.
et = ExtraTreesRegressor(n_estimators=10, max_features=4,
                                       random_state=0)

In [69]:
# Fit on the expanded feature rows against the unscaled responsivity scores.
et.fit(bx, by)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
          oob_score=False, random_state=0, verbose=0, warm_start=False)

In [70]:
# Fitted importance of each feature, in the column order of X (see `fn`).
et.feature_importances_

array([ 0.0540918 ,  0.0147709 ,  0.06356462,  0.06453754,  0.        ,
        0.21675013,  0.06666962,  0.22181772,  0.03908551,  0.03956535,
        0.03088215,  0.01191712,  0.04423204,  0.04326198,  0.03354569,
        0.05530785])

In [71]:
def nice_dict(feats,fns):
    """Pair each score with its name and rank the pairs by score, highest first.

    Parameters
    ----------
    feats : sequence
        Scores, e.g. feature importances.
    fns : sequence
        Names; ``fns[i]`` labels ``feats[i]``.

    Returns
    -------
    list
        ``(name, score)`` tuples sorted by descending score. If a name occurs
        more than once, only its last score is kept.
    """
    by_name = {fns[position]: score for position, score in enumerate(feats)}
    return sorted(by_name.items(), key=lambda pair: pair[1], reverse=True)

In [72]:
# Pair each importance with its feature name, most important first.
nice_dict(et.feature_importances_,fn)

[('modact.days', 0.22181771811080728),
 ('vigact.days', 0.21675012670050489),
 ('modact.min', 0.066669621005768342),
 ('walk.min', 0.064537536784083374),
 ('fittracker', 0.063564617812372978),
 ('activity_choice', 0.05530784657340123),
 ('sit.hrs', 0.054091803132829464),
 ('walk10.days', 0.044232038469906962),
 ('conscientiousness', 0.043261977675293391),
 ('vigact.hrs', 0.039565346463468001),
 ('vigact.min', 0.039085506011996687),
 ('self_efficacy', 0.033545692398085088),
 ('modact.hrs', 0.030882147506498054),
 ('walk.hrs', 0.014770899660899551),
 ('fitapp', 0.011917121694084662),
 ('sit.min', 0.0)]

16