# Model Personalization

### H1: The accuracy of a model trained at time, *t*, and predicting on time, *t+i*, will have diminishing predictive accuracy as *i* increases

In [2]:
import pandas as pd
import numpy as np
import time
import importlib.machinery
import sys
sys.path.append('/home/sac086/extrasensory/')
import extrasense as es
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
features_df = es.get_impersonal_data(leave_users_out=[], data_type="activity", labeled_only=False)

# remove nan rows
no_label_indeces = features_df.label.isnull()
features_df = features_df[~no_label_indeces]

timestamps = features_df.pop('timestamp')
label_source = features_df.pop("label_source")
labels = features_df.pop("label")
user_ids = features_df.pop("user_id")

# Step 1 : Find users who labeled a variety of things early on in their participation

In [3]:
users_df = es.get_impersonal_data(leave_users_out=[], data_type="activity", labeled_only=False)

In [4]:
from collections import Counter

In [5]:
# who contributed the most data?
activities = labels.unique()

rows = []
for user_id in es.user_ids:
    user_df = users_df[users_df['user_id'] == user_id]
    user_counts = Counter(user_df.label)
        
    row = {str(val) : user_counts[val] for val in activities}
    row['user id'] = user_id
    row['total'] = np.sum([val for val in user_counts.values()])
    # find number of days of participation too
    row['days participated'] = (user_df.timestamp.max() - user_df.timestamp.min()) / (3600 * 24)
    rows.append(row)

In [6]:
data_df = pd.DataFrame(rows)

In [7]:
data_df.head()

Unnamed: 0,BICYCLING,FIX_running,FIX_walking,LYING_DOWN,SITTING,STAIRS_-_GOING_DOWN,STAIRS_-_GOING_UP,days participated,total,user id
0,120,33,187,2882,2246,0,0,28.136505,6808,098A72A5-E3E5-4F54-A152-BBDA0DF7B694
1,0,7,190,1415,2253,7,7,2.987384,3960,0A986513-7828-4D53-AA1F-E02D6DF9561B
2,106,35,375,0,1671,0,0,6.98265,3090,0BFC35E2-4817-4865-BFA7-764742302A2D
3,0,0,1532,2855,3088,0,0,6.968553,7513,0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
4,0,1,158,979,1543,0,0,8.769792,2685,1155FF54-63D3-4AB2-9863-8385D0BD0A13


In [8]:
users_to_keep = data_df[data_df['days participated'] > 7]

In [9]:
users_to_keep

Unnamed: 0,BICYCLING,FIX_running,FIX_walking,LYING_DOWN,SITTING,STAIRS_-_GOING_DOWN,STAIRS_-_GOING_UP,days participated,total,user id
0,120,33,187,2882,2246,0,0,28.136505,6808,098A72A5-E3E5-4F54-A152-BBDA0DF7B694
4,0,1,158,979,1543,0,0,8.769792,2685,1155FF54-63D3-4AB2-9863-8385D0BD0A13
5,62,0,164,3583,1694,0,74,7.084676,8845,11B5EC4D-4133-4289-B475-4E737182A406
6,0,0,765,2040,3153,0,29,7.017407,6218,136562B6-95B2-483D-88DC-065F28409FD2
8,0,47,925,2779,3441,0,0,7.703831,7371,1DBB0F6F-1F81-4A50-9DF4-CD62ACFA4842
9,59,0,363,1037,1728,0,43,11.445544,4771,24E40C4C-A349-4F9F-93AB-01D00FB994AF
10,0,0,207,1439,1184,0,55,7.87309,4925,27E04243-B138-4F40-A164-F40B60165CF3
12,150,0,454,2380,1810,0,0,7.098634,6164,33A85C34-CFE4-4732-9E73-0A7AC861B27A
13,0,0,132,1720,1916,0,0,7.813542,5203,3600D531-0C55-44A7-AE95-A7A38519464E
15,0,0,1141,1429,3485,0,5,8.90765,6690,481F4DD2-7689-43B9-A2AA-C8772227162B


### For users that participated for more than one day, train a personal model at t=1 and predict at t=2, t=3, t=4, etc.

#### Use [XGBoost](Tuning XGBoost.ipynb)

In [11]:
norm_timestamps = es.get_normalized_timestamps(users_df)
users_df['norm_timestamps'] = norm_timestamps  

## One Person Test

In [12]:
test_user = "81536B0A-8DBF-4D8A-AC24-9543E2E4C8E0" # this user has a fair variety, but not perfect

In [13]:
impersonal_df = users_df[users_df['user_id'] != test_user]
personal_df = users_df[users_df['user_id'] == test_user]

### Personal Model

In [27]:
day_one_df = personal_df[personal_df['norm_timestamps'] < 1]

In [17]:
from collections import Counter

In [28]:
Counter(day_one_df.label)

Counter({nan: 152,
         'FIX_walking': 25,
         'LYING_DOWN': 450,
         'SITTING': 426,
         'BICYCLING': 83})

In [20]:
import xgboost as xgb

In [21]:
personal_clf = xgb.XGBClassifier()

In [29]:
y_day_one = day_one_df.pop("label")
label_source_day_one = day_one_df.pop("label_source")
timestamps_day_one = day_one_df.pop("timestamp")
norm_timestamps_day_one = day_one_df.pop("norm_timestamps")
__ = day_one_df.pop("user_id")

In [30]:
personal_scaler = StandardScaler()
personal_scaler.fit(day_one_df)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [31]:
X_day_one = personal_scaler.transform(day_one_df)

In [34]:
personal_clf.fit(X_day_one, np.array([str(y) for y in y_day_one]))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [36]:
day_two_df = personal_df[(personal_df['norm_timestamps'] < 2) &\
                         (personal_df['norm_timestamps'] > 1)]

In [37]:
y_day_two = day_two_df.pop("label")
label_source_day_two = day_two_df.pop("label_source")
timestamps_day_two = day_two_df.pop("timestamp")
norm_timestamps_day_two = day_two_df.pop("norm_timestamps")
__ = day_two_df.pop("user_id")

In [38]:
X_day_two = personal_scaler.transform(day_two_df)

In [39]:
y_day_two = [str(y) for y in y_day_two]

In [40]:
accuracy_score(y_day_two, personal_clf.predict(X_day_two))

0.47030185004868547