In [299]:
import pickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
import operator

In [2]:
def adjust_to_thirty_minute(steps):
    """Aggregate a step-count series into 30-minute totals.

    Parameters
    ----------
    steps : pandas.Series
        Step counts on a DatetimeIndex. Missing values are counted as 0.

    Returns
    -------
    pandas.Series
        Timezone-naive series with one row per 30-minute bin, holding the
        sum of the steps in that bin (0 for bins without observations).
    """
    # '30min' replaces the alias '30T', which recent pandas releases deprecate;
    # 'min' is accepted by old and new versions alike.
    return steps.fillna(0).tz_localize(None).resample('30min').sum().fillna(0)

In [3]:
def get_processed(file_name, root='../../../../Volumes/dav/HeartSteps/pooling_rl_shared_data/processed/'):
    """Load a processed pickle file by its base name.

    Parameters
    ----------
    file_name : str
        Name of the file without the '.pkl' extension.
    root : str, optional
        Directory that holds the processed pickles. Defaults to the shared
        volume path this notebook was written against; pass another
        directory to run the notebook on a different machine.

    Returns
    -------
    object
        Whatever object was stored in the pickle.

    Notes
    -----
    pickle.load can execute arbitrary code embedded in the file, so only
    point this at data you trust.
    """
    import os

    # os.path.join tolerates a root given with or without a trailing separator.
    path = os.path.join(root, '{}.pkl'.format(file_name))
    with open(path, 'rb') as f:
        return pickle.load(f)

In [4]:
# Load the 'merged_est' pickle; later cells index it by an integer key (merged[1]).
merged = get_processed('merged_est')

In [277]:
def get_target_indices(df):
    """Return the index labels of rows where both 'send.active' and 'avail' equal True."""
    was_sent = df['send.active'].eq(True)
    was_available = df['avail'].eq(True)
    selected_rows = df.loc[was_sent & was_available]
    return selected_rows.index

In [80]:
def get_near_targets(targets):
    """Round timestamps to the nearest 30 minutes and drop their timezone.

    Parameters
    ----------
    targets : pandas.DatetimeIndex
        Timestamps to round.

    Returns
    -------
    pandas.DatetimeIndex
        Timezone-naive timestamps on 30-minute boundaries, in the same order
        as the input.
    """
    # '30min' replaces the alias '30T', which recent pandas releases deprecate;
    # 'min' is accepted by old and new versions alike.
    return targets.round('30min').tz_localize(None)

In [24]:
# Exploratory check on the entry stored under key 1: index labels where both
# 'send.active' and 'avail' are True.
targets = get_target_indices(merged[1])

In [25]:
# Rebinds 'targets' to the 30-minute-rounded, timezone-naive version; the
# unrounded index is no longer available under this name afterwards.
targets = get_near_targets(targets)

In [315]:
def measure_responsivity(df):
    """Relative change in 30-minute step totals around each target time.

    Target times are the rows selected by get_target_indices, rounded to
    30-minute boundaries by get_near_targets. For each target t the step
    total of the bin labelled t is compared with the bin labelled
    t - 30 minutes:

        (steps[t] - steps[t - 30min]) / (steps[t - 30min] + 1)

    The +1 keeps the denominator non-zero when the earlier bin holds 0 steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'steps' column plus the 'send.active' and 'avail'
        columns read by get_target_indices, on a DatetimeIndex.

    Returns
    -------
    list of float
        One value per target, in target order (empty if there are no targets).
    """
    steps = adjust_to_thirty_minute(df['steps'])
    targets = get_near_targets(get_target_indices(df))
    window = pd.Timedelta(minutes=30)
    # A rounded target (or the bin before it) can fall just outside the
    # resampled range, e.g. at the very start or end of the recording.
    # Plain label lookup raised KeyError there; such bins are counted as
    # 0 steps, consistent with how empty bins inside the range are treated.
    steps_after = steps.reindex(targets, fill_value=0).values
    steps_before = steps.reindex(targets - window, fill_value=0).values
    return list((steps_after - steps_before) / (steps_before + 1))

In [279]:
def make_feature_matrix():
    """Assemble the per-id baseline feature rows.

    Loads four processed pickles ('other_activity' plus the aggregated
    conscientiousness, self-efficacy and activity-choice measures) and builds
    one row per id found in 'other_activity', skipping id 4.

    Returns
    -------
    tuple
        (rows, feature_names, pids): rows[k] holds the feature values for
        pids[k], ordered like feature_names.
    """
    other = get_processed('other_activity')
    conscientiousness = get_processed('agg_conscientiousness_measures')
    self_efficacy = get_processed('agg_self_efficacy_measures')
    activity_choice = get_processed('agg_activity_choice_measures')

    # The column order is taken from the record stored under key 1.
    other_keys = list(other[1].keys())
    # Id 4 is deliberately left out of the matrix.
    pids = [pid for pid in other.keys() if pid != 4]

    feature_names = other_keys + ['conscientiousness', 'self_efficacy', 'activity_choice']
    rows = [
        [other[pid][key] for key in other_keys]
        + [conscientiousness[pid], self_efficacy[pid], activity_choice[pid]]
        for pid in pids
    ]
    return rows, feature_names, pids

In [280]:
# X: one raw feature row per id in pids; fn: the matching feature names.
X,fn,pids = make_feature_matrix()

In [281]:
def fix(x):
    """Convert one raw feature row into numeric values.

    Conversion rules, applied per element:

    * the strings 'no' and 'X' become 0, 'yes' becomes 1;
    * any other string is parsed with float();
    * missing values (None, NaN, or a string that parses to NaN) become 0;
    * every other value is kept unchanged.

    Parameters
    ----------
    x : iterable
        Raw feature values for one row.

    Returns
    -------
    list
        Converted values, in the same order as the input.

    Raises
    ------
    ValueError
        If a string is neither a known code nor parseable as a float.
    """
    coded = {'no': 0, 'yes': 1, 'X': 0}
    to_return = []
    for i in x:
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses such as numpy string scalars.
        if isinstance(i, str):
            if i in coded:
                to_return.append(coded[i])
                continue
            i = float(i)
        # Missing values are zeroed uniformly, whether they arrived as a
        # number, as None, or as a string such as 'nan'.
        to_return.append(0 if pd.isnull(i) else i)
    return to_return

In [282]:
# Rebinds X: every raw row is replaced by its numeric version from fix().
X = [fix(x) for x in X]

In [283]:
# Scale the feature matrix with sklearn's preprocessing.scale (default arguments).
X_scaled = preprocessing.scale(X)

In [284]:
def get_training_data_responsivity(pids):
    """Compute responsivity values for every requested id that has merged data.

    Loads the 'merged_est' pickle and runs measure_responsivity on the entry
    of each id in `pids`; ids missing from the pickle are skipped.

    Returns
    -------
    dict
        Maps id -> list returned by measure_responsivity for that id.
    """
    merged = get_processed('merged_est')
    return {
        pid: measure_responsivity(merged[pid])
        for pid in pids
        if pid in merged
    }

In [321]:
# NOTE: despite its name, 'steps' maps each id to a list of responsivity
# values (the output of measure_responsivity), not to raw step counts.
steps = get_training_data_responsivity(pids)

In [322]:
# Map each id to its row position in X / X_scaled (rows were built in pids order).
pid_lookup = {pids[i]:i for i in range(len(pids))}

In [323]:
def make_matrix_for_first_analysis(steps,pid_lookup,X):
    """Expand per-id features into one training row per observed value.

    Parameters
    ----------
    steps : dict
        Maps id -> iterable of target values for that id.
    pid_lookup : dict
        Maps id -> row position in X.
    X : sequence
        Feature rows, indexable by the positions in pid_lookup.

    Returns
    -------
    tuple of list
        (rows, values): rows[k] is the feature row of the id that produced
        values[k].
    """
    # Single pass so each feature row is looked up once per value,
    # exactly when that value is visited.
    paired = [
        (X[pid_lookup[pid]], value)
        for pid, values in steps.items()
        for value in values
    ]
    feature_rows = [row for row, _ in paired]
    target_values = [value for _, value in paired]
    return feature_rows, target_values
        

In [324]:
# bx: the id's scaled feature row, repeated once per responsivity value;
# by: the responsivity values themselves.
bx,by = make_matrix_for_first_analysis(steps,pid_lookup,X_scaled)

In [325]:
# NOTE(review): by_scaled is not used by any cell shown here; the model
# below is fit on the unscaled 'by'.
by_scaled = preprocessing.scale(by)

In [326]:
# NOTE(review): max_features=16 is hard-coded and matches the 16 feature names
# in the importance listing; update it if the feature set changes.
# random_state=0 fixes the seed of the regressor.
et = ExtraTreesRegressor(n_estimators=10, max_features=16,
                                       random_state=0)

In [327]:
# Fit on the unscaled responsivity values; all rows that belong to the same id
# carry identical feature vectors.
et.fit(bx, by)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=16, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
          oob_score=False, random_state=0, verbose=0, warm_start=False)

In [328]:
# Raw importances, positionally aligned with the feature-name list fn.
et.feature_importances_

array([ 0.01873823,  0.03948614,  0.14745533,  0.05488882,  0.06392793,
        0.        ,  0.27674769,  0.03747015,  0.04742855,  0.01946472,
        0.02847807,  0.01255071,  0.04731204,  0.06980254,  0.07513629,
        0.0611128 ])

In [329]:
def nice_dict(feats,fns):
    """Pair each score with its name and rank the pairs from highest to lowest.

    Parameters
    ----------
    feats : sequence
        Scores; feats[i] belongs to fns[i].
    fns : sequence
        Names, at least as long as feats.

    Returns
    -------
    list of tuple
        (name, score) pairs sorted by descending score.
    """
    score_by_name = {fns[position]: score for position, score in enumerate(feats)}
    ranked = list(score_by_name.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [330]:
# Pair each importance with its feature name, listed from most to least important.
nice_dict(et.feature_importances_,fn)

[('modact.days', 0.27674768512445341),
 ('modact.min', 0.14745533328974098),
 ('self_efficacy', 0.075136294115016072),
 ('conscientiousness', 0.069802538259117711),
 ('vigact.days', 0.063927925620420725),
 ('activity_choice', 0.061112796416797711),
 ('walk10.days', 0.054888816054769154),
 ('walk.min', 0.047428553292749247),
 ('modact.hrs', 0.047312044351381456),
 ('sit.hrs', 0.039486144703674543),
 ('vigact.min', 0.037470146926060224),
 ('fittracker', 0.028478071720180027),
 ('walk.hrs', 0.01946471689100062),
 ('fitapp', 0.018738226984113766),
 ('vigact.hrs', 0.012550706250524343),
 ('sit.min', 0.0)]

16