In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
import operator

In [2]:
def adjust_to_thirty_minute(steps):
    """Aggregate a step-count series into 30-minute totals.

    Missing readings are treated as zero steps, timezone information is
    removed from the index, and the values are summed per 30-minute bin.

    Parameters
    ----------
    steps : pandas.Series
        Step counts indexed by a DatetimeIndex (needed by ``tz_localize``
        and ``resample``).

    Returns
    -------
    pandas.Series
        One summed value per 30-minute bin, on a timezone-naive index,
        with no missing values.
    """
    # '30min' is the supported spelling of the 30-minute frequency; the 'T'
    # alias used previously is deprecated in recent pandas releases.
    return steps.fillna(0).tz_localize(None).resample('30min').sum().fillna(0)

In [3]:
def get_processed(file_name, root='../../../../Volumes/dav/HeartSteps/pooling_rl_shared_data/processed/'):
    """Load one processed pickle file and return its contents.

    Parameters
    ----------
    file_name : str
        Base name of the file, without the ``.pkl`` extension.
    root : str, optional
        Directory prefix that is prepended verbatim to ``file_name``; it must
        end with a path separator. Defaults to the shared HeartSteps
        ``processed/`` directory used throughout this notebook.

    Returns
    -------
    object
        Whatever object was pickled in ``<root><file_name>.pkl``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist (for example, the volume is not mounted).
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at files from a trusted source.
    with open('{}{}.pkl'.format(root,file_name),'rb') as f:
        return pickle.load(f)

In [4]:
def make_feature_matrix():
    """Assemble one raw feature row per participant.

    Reads the 'other_activity' answers and the three aggregated measures
    (conscientiousness, self-efficacy, activity choice) via get_processed().

    Returns
    -------
    tuple
        ``(rows, feature_names, pids)``: ``rows[i]`` holds the values for
        ``pids[i]`` in the column order given by ``feature_names``.
    """
    activity = get_processed('other_activity')
    conscientiousness = get_processed('agg_conscientiousness_measures')
    self_efficacy = get_processed('agg_self_efficacy_measures')
    activity_choice = get_processed('agg_activity_choice_measures')

    # The activity columns are taken from the entry stored under key 1.
    other_keys = list(activity[1].keys())
    feature_names = other_keys + ['conscientiousness', 'self_efficacy', 'activity_choice']

    # The entry with key 4 is left out of the matrix.
    pids = [pid for pid in activity.keys() if pid != 4]

    rows = [
        [activity[pid][key] for key in other_keys]
        + [conscientiousness[pid], self_efficacy[pid], activity_choice[pid]]
        for pid in pids
    ]
    return rows, feature_names, pids

In [5]:
def fix(x):
    """Convert one raw feature row into plain numeric values.

    Conversion rules, applied per element:

    * the strings ``'no'`` and ``'X'`` become ``0`` and ``'yes'`` becomes ``1``;
    * any other string is parsed with ``float``;
    * missing values (``None``, NaN, or a string that parses to NaN)
      become ``0``;
    * every other value is passed through unchanged.

    Parameters
    ----------
    x : iterable
        Raw values for one row.

    Returns
    -------
    list
        The converted values, in the same order as ``x``.

    Raises
    ------
    ValueError
        If a string is neither a known code nor parseable as a float.
    """
    coded = {'no': 0, 'yes': 1, 'X': 0}
    cleaned = []
    for value in x:
        # isinstance (rather than an exact type check) also accepts str
        # subclasses such as numpy.str_.
        if isinstance(value, str):
            if value in coded:
                cleaned.append(coded[value])
                continue
            value = float(value)
        # pd.isnull covers None as well as NaN, so a missing value is zeroed
        # the same way whether it arrived as an object or as text.
        if pd.isnull(value):
            value = 0
        cleaned.append(value)
    return cleaned

In [6]:
# Raw feature rows, their column names, and the matching participant ids.
X,fn,pids = make_feature_matrix()

In [7]:
# Convert every row to numeric values with fix(). This rebinds X, so the raw
# rows from the previous cell are no longer available once this cell has run.
X = [fix(x) for x in X]

In [8]:
# Standardize the numeric feature rows with sklearn's preprocessing.scale.
X_scaled = preprocessing.scale(X)

In [9]:
def get_training_data_step_counts(pids):
    """Collect the 30-minute step series for the given participants.

    Loads 'merged_est' via get_processed() and, for every id in ``pids``
    that is present there, passes its 'steps' column through
    adjust_to_thirty_minute(). Ids missing from the merged data are skipped.

    Returns
    -------
    dict
        Participant id -> resampled step series.
    """
    merged = get_processed('merged_est')
    return {
        pid: adjust_to_thirty_minute(merged[pid]['steps'])
        for pid in pids
        if pid in merged
    }

In [10]:
# 30-minute step series per participant, keyed by participant id; ids without
# merged data are absent from this dict.
steps = get_training_data_step_counts(pids)

In [11]:
# Map each participant id to its position in pids (and so to its row in X).
pid_lookup = {pid: row for row, pid in enumerate(pids)}

In [12]:
def make_matrix_for_first_analysis(steps,pid_lookup,X):
    """Pair each participant's feature row with their mean step count.

    Parameters
    ----------
    steps : dict
        Participant id -> object exposing ``.mean()`` (the step series).
    pid_lookup : dict
        Participant id -> row index into ``X``.
    X : sequence
        Feature rows, indexable by the values of ``pid_lookup``.

    Returns
    -------
    tuple of list
        ``(features, targets)`` in the iteration order of ``steps``:
        ``features[i]`` is the row of ``X`` for the i-th participant and
        ``targets[i]`` is the mean of that participant's step series.
    """
    participants = list(steps.items())
    features = [X[pid_lookup[pid]] for pid, _ in participants]
    targets = [series.mean() for _, series in participants]
    return features, targets
        

In [13]:
# bx: scaled feature row per participant with step data;
# by: that participant's mean step count per 30-minute bin.
bx,by = make_matrix_for_first_analysis(steps,pid_lookup,X_scaled)

In [14]:
# Standardized copy of the targets.
# NOTE(review): no later cell visible here reads by_scaled (the model below is
# fit on the unscaled by) -- confirm whether this is still needed.
by_scaled = preprocessing.scale(by)

In [15]:
# NOTE(review): `selector` is not defined by any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. Presumably it was an RFECV
# fitted in a since-deleted cell -- restore that cell or remove this one.
selector.support_

NameError: name 'selector' is not defined

In [None]:
# Extra-trees regressor used for the feature-importance ranking below.
# max_features=16 equals the number of feature names listed in `fn`;
# random_state is fixed so repeated runs give the same forest.
et = ExtraTreesRegressor(
    n_estimators=10,
    max_features=16,
    random_state=0,
)

In [16]:
# Display the feature names, in the column order of the feature matrix.
fn

['walk.min',
 'vigact.days',
 'modact.min',
 'walk.hrs',
 'fittracker',
 'walk10.days',
 'vigact.min',
 'modact.days',
 'vigact.hrs',
 'sit.hrs',
 'sit.min',
 'fitapp',
 'modact.hrs',
 'conscientiousness',
 'self_efficacy',
 'activity_choice']

In [138]:
# Fit the forest on the scaled feature rows against the unscaled mean step counts.
et.fit(bx, by)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features=16, max_leaf_nodes=None, min_impurity_decrease=0.0,
          min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
          oob_score=False, random_state=0, verbose=0, warm_start=False)

In [139]:
def nice_dict(feats,fns):
    """Pair each score with its name and rank the pairs, highest score first.

    Parameters
    ----------
    feats : sequence of numbers
        Scores; ``feats[i]`` belongs to ``fns[i]``.
    fns : sequence
        Names, at least as long as ``feats``. If a name repeats, only the
        score at its last position is kept.

    Returns
    -------
    list of tuple
        ``(name, score)`` pairs sorted by descending score.
    """
    by_name = {}
    for position, score in enumerate(feats):
        by_name[fns[position]] = score
    ranked = list(by_name.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [140]:
# Rank the features by the fitted forest's importance scores, highest first.
nice_dict(et.feature_importances_,fn)

[('modact.days', 0.2306553858481179),
 ('walk.min', 0.16124223848078406),
 ('conscientiousness', 0.1192831377210601),
 ('modact.min', 0.10884339235448426),
 ('vigact.days', 0.09323766639934844),
 ('sit.hrs', 0.074809410299192927),
 ('modact.hrs', 0.047288073726061731),
 ('walk10.days', 0.043348642907086593),
 ('vigact.min', 0.028967733658822086),
 ('fitapp', 0.02594337986688728),
 ('self_efficacy', 0.017895939358585602),
 ('walk.hrs', 0.016548784920776705),
 ('vigact.hrs', 0.015086926300478717),
 ('activity_choice', 0.013704788527918973),
 ('fittracker', 0.0031444996303946066),
 ('sit.min', 0.0)]