In [1]:
import pandas as pd
import numpy as np
import time
import importlib.machinery
es = importlib.machinery.SourceFileLoader('extrasense','/home/sac086/extrasensory/extrasense/extrasense.py').load_module()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Recreating Exp1

**Experimental Conditions**: Impersonal, Personal, Hybrid training data

**Experiment Output**: User ID, Method, Run number, Accuracy

**Sampling Method for personal data**: randomly sample the data, no stratification of sample classes

In [None]:
def run_experiment1(user_id, n_splits=5, test_size=100, random_state=None):
    """Score impersonal, personal and hybrid random forests for one user.

    An impersonal model is trained once on the frame returned by
    ``es.get_impersonal_data(user_id, ...)``.  The user's own data is then
    split ``n_splits`` times with ``ShuffleSplit`` (random sampling, no class
    stratification).  For every split a personal model (the user's training
    rows only) and a hybrid model (impersonal rows plus the user's training
    rows) are trained, and all three models are scored on the held-out rows.

    NOTE(review): presumably ``get_impersonal_data(user_id, ...)`` excludes the
    given user's own rows -- confirm against the extrasense module.

    Parameters
    ----------
    user_id
        Identifier passed through to the ``es`` data loaders.
    n_splits : int, default 5
        Number of random train/validation splits of the user's data.
    test_size : int or float, default 100
        Size of the validation part of each split, forwarded to ``ShuffleSplit``.
    random_state : int, RandomState instance or None, default None
        Forwarded to the splitter and to every ``RandomForestClassifier``.
        ``None`` keeps the original, unseeded behaviour.

    Returns
    -------
    list of dict
        Three rows per split (personal, hybrid, impersonal), each with the
        keys ``user_id``, ``method``, ``run_num`` and ``accuracy``.
    """
    personal_df = es.get_data_from_user_id(user_id, data_type="activity", labeled_only=True)
    personal_df.pop('timestamp')  # removed so it is not fed to the scaler/classifier
    # Separate the labels once, up front, instead of popping them from every slice.
    personal_labels = personal_df.pop("label")

    # train impersonal model
    impersonal_df = es.get_impersonal_data(user_id, data_type="activity", labeled_only=True)
    impersonal_df.pop('timestamp')
    impersonal_train_labels = impersonal_df.pop("label")

    # standard scale training
    impersonal_scaler = StandardScaler().fit(impersonal_df)
    impersonal_clf = RandomForestClassifier(random_state=random_state)
    impersonal_clf.fit(impersonal_scaler.transform(impersonal_df), impersonal_train_labels)

    # setup sampler: no train_size, so every row outside the validation part is used for training
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)

    rows = []
    for run_count, (train_ind, test_ind) in enumerate(rs.split(personal_df), start=1):
        personal_train_df = personal_df.iloc[train_ind]
        personal_train_labels = personal_labels.iloc[train_ind]

        val_df = personal_df.iloc[test_ind]
        val_labels = personal_labels.iloc[test_ind]

        hybrid_train_df = pd.concat([impersonal_df, personal_train_df])
        hybrid_train_labels = pd.concat([impersonal_train_labels, personal_train_labels])

        # Each model gets a scaler fitted on its own training data only.
        personal_scaler = StandardScaler().fit(personal_train_df)
        hybrid_scaler = StandardScaler().fit(hybrid_train_df)

        # build and predict personal model
        personal_clf = RandomForestClassifier(random_state=random_state)
        personal_clf.fit(personal_scaler.transform(personal_train_df), personal_train_labels)
        personal_predictions = personal_clf.predict(personal_scaler.transform(val_df))

        # build and predict hybrid model
        hybrid_clf = RandomForestClassifier(random_state=random_state)
        hybrid_clf.fit(hybrid_scaler.transform(hybrid_train_df), hybrid_train_labels)
        hybrid_predictions = hybrid_clf.predict(hybrid_scaler.transform(val_df))

        # impersonal predictions (model and scaler were fitted once, before the loop)
        impersonal_predictions = impersonal_clf.predict(impersonal_scaler.transform(val_df))

        # validate models
        personal_score = accuracy_score(val_labels, personal_predictions)
        hybrid_score = accuracy_score(val_labels, hybrid_predictions)
        impersonal_score = accuracy_score(val_labels, impersonal_predictions)

        print("\tRun #%s" % run_count)
        print("\tpersonal : %s" % personal_score)
        print("\thybrid : %s" % hybrid_score)
        print("\timpersonal : %s" % impersonal_score)
        print("\n")

        rows.append({"user_id": user_id,
                     "method": "personal",
                     "run_num": run_count,
                     "accuracy": personal_score})
        rows.append({"user_id": user_id,
                     "method": "hybrid",
                     "run_num": run_count,
                     "accuracy": hybrid_score})
        rows.append({"user_id": user_id,
                     "method": "impersonal",
                     "run_num": run_count,
                     "accuracy": impersonal_score})
    return rows

In [None]:
# Run the experiment for every known user, one after another, timing each user.
rows = []

for user_id in es.user_ids:
    print("Getting scores for %s" % user_id)
    started_at = time.time()
    rows.extend(run_experiment1(user_id))
    elapsed_minutes = (time.time() - started_at) / 60.
    print("\ttook %s minutes" % (elapsed_minutes))

In [None]:
# One row per (user, method, run): columns come from the dict keys built in run_experiment1.
scores_df = pd.DataFrame(rows)

In [None]:
# Persist the scores in the working directory so they can be reloaded without re-running the experiment.
scores_df.to_pickle('./scores_df.pickle')

In [None]:
# List the working directory (IPython shell alias) -- presumably to confirm the pickle was written.
ls -l

In [None]:
# Mean of the numeric columns (run_num, accuracy) over all personal-model runs.
# numeric_only=True skips the string columns (user_id, method); without it
# pandas >= 2.0 raises TypeError instead of silently dropping them.
scores_df[scores_df['method'] == 'personal'].mean(numeric_only=True)

In [None]:
# Mean of the numeric columns (run_num, accuracy) over all impersonal-model runs.
# numeric_only=True skips the string columns (user_id, method); without it
# pandas >= 2.0 raises TypeError instead of silently dropping them.
scores_df[scores_df['method'] == 'impersonal'].mean(numeric_only=True)

In [None]:
# Number of user ids exposed by the extrasense helper module.
len(es.user_ids)

In [None]:
# NOTE(review): in the visible cells `personal_df` is only assigned inside
# run_experiment1, so this raises NameError on a fresh kernel unless it is
# defined elsewhere -- confirm or remove this debugging cell.
personal_df.head()

In [None]:
# NOTE(review): in the visible cells `impersonal_df` is only assigned inside
# run_experiment1, so this raises NameError on a fresh kernel unless it is
# defined elsewhere -- confirm or remove this debugging cell.
impersonal_df.head()

# Trying Stratified Shuffle with parallelization

In [None]:
import ipyparallel as ipp

In [None]:
# Connect to the ipyparallel cluster (assumes one is already running -- TODO confirm how it is started).
c = ipp.Client()

In [None]:
# View over all engines of the client; used below to scatter, execute and gather.
dview = c[:]

In [None]:
# Make calls on this view blocking, so execute/gather below return only once the engines are done.
dview.block=True

In [None]:
# Split es.user_ids across the engines; each engine sees its share as the variable `user_ids`,
# which the remote driver loops below iterate over.
dview.scatter('user_ids', es.user_ids)

In [None]:
# Source code executed on every engine so the remote namespaces have the same
# imports (and the same `es` helper module) as the first cell of this notebook.
# NOTE(review): the absolute path to extrasense.py must exist on every engine host.
import_string = '''import pandas as pd
import numpy as np
import time
import importlib.machinery
es = importlib.machinery.SourceFileLoader('extrasense','/home/sac086/extrasensory/extrasense/extrasense.py').load_module()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler'''

In [None]:
# Run the import block on every engine in the view; `asr` holds the result of that execution.
asr = dview.execute(import_string)

In [None]:
%%px
def run_experiment1(user_id, stratified=False, n_splits=5, train_size=10,
                    test_size=100, random_state=None):
    """Score impersonal, personal and hybrid random forests for one user.

    The impersonal model is fitted once on ``es.get_impersonal_data(user_id, ...)``.
    The user's own rows are then split ``n_splits`` times into a small
    training part and a validation part.  On each split a personal model
    (user's training rows) and a hybrid model (impersonal rows plus user's
    training rows) are fitted, and all three models are scored on the
    validation rows with ``accuracy_score``.

    Parameters
    ----------
    user_id
        Identifier passed through to the ``es`` data loaders.
    stratified : bool, default False
        If true, split with ``StratifiedShuffleSplit`` on the user's labels;
        otherwise use a plain ``ShuffleSplit``.
    n_splits : int, default 5
        Number of train/validation splits.
    train_size : int or float, default 10
        Size of the personal training part, forwarded to the splitter.
    test_size : int or float, default 100
        Size of the validation part, forwarded to the splitter.
    random_state : int, RandomState instance or None, default None
        Forwarded to the splitter and to every ``RandomForestClassifier``.
        ``None`` keeps the original, unseeded behaviour.

    Returns
    -------
    list of dict
        Three rows per split (personal, hybrid, impersonal) with the keys
        ``user_id``, ``method``, ``run_num`` and ``accuracy``.

    Raises
    ------
    ValueError
        Propagated from scikit-learn, e.g. when the splitter cannot satisfy
        the requested sizes for this user's data; the notebook's remote
        driver loops catch it and skip the user.
    """
    personal_df = es.get_data_from_user_id(user_id, data_type="activity", labeled_only=True)
    personal_df.pop('timestamp')  # removed so it is not fed to the scaler/classifier
    personal_labels = personal_df.pop("label")

    # train impersonal model
    impersonal_df = es.get_impersonal_data(user_id, data_type="activity", labeled_only=True)
    impersonal_df.pop('timestamp')
    impersonal_train_labels = impersonal_df.pop("label")

    # standard scale training
    impersonal_scaler = StandardScaler().fit(impersonal_df)
    impersonal_clf = RandomForestClassifier(random_state=random_state)
    impersonal_clf.fit(impersonal_scaler.transform(impersonal_df), impersonal_train_labels)

    # setup sampler
    split_kwargs = dict(n_splits=n_splits, train_size=train_size,
                        test_size=test_size, random_state=random_state)
    if stratified:
        rs = StratifiedShuffleSplit(**split_kwargs)
        split_iterator = rs.split(personal_df, personal_labels)
    else:
        rs = ShuffleSplit(**split_kwargs)
        split_iterator = rs.split(personal_df)

    rows = []
    for run_count, (train_ind, test_ind) in enumerate(split_iterator, start=1):
        personal_train_df = personal_df.iloc[train_ind]
        personal_train_labels = personal_labels.iloc[train_ind]

        val_df = personal_df.iloc[test_ind]
        val_labels = personal_labels.iloc[test_ind]

        hybrid_train_df = pd.concat([impersonal_df, personal_train_df])
        hybrid_train_labels = pd.concat([impersonal_train_labels, personal_train_labels])

        # Each model gets a scaler fitted on its own training data only.
        personal_scaler = StandardScaler().fit(personal_train_df)
        hybrid_scaler = StandardScaler().fit(hybrid_train_df)

        # build and predict personal model
        personal_clf = RandomForestClassifier(random_state=random_state)
        personal_clf.fit(personal_scaler.transform(personal_train_df), personal_train_labels)
        personal_predictions = personal_clf.predict(personal_scaler.transform(val_df))

        # build and predict hybrid model
        hybrid_clf = RandomForestClassifier(random_state=random_state)
        hybrid_clf.fit(hybrid_scaler.transform(hybrid_train_df), hybrid_train_labels)
        hybrid_predictions = hybrid_clf.predict(hybrid_scaler.transform(val_df))

        # impersonal predictions (model and scaler were fitted once, before the loop)
        impersonal_predictions = impersonal_clf.predict(impersonal_scaler.transform(val_df))

        # validate models
        personal_score = accuracy_score(val_labels, personal_predictions)
        hybrid_score = accuracy_score(val_labels, hybrid_predictions)
        impersonal_score = accuracy_score(val_labels, impersonal_predictions)

        print("\tRun #%s" % run_count)
        print("\tpersonal : %s" % personal_score)
        print("\thybrid : %s" % hybrid_score)
        print("\timpersonal : %s" % impersonal_score)
        print("\n")

        # Same row order as before: personal, hybrid, impersonal.
        for method, score in (("personal", personal_score),
                              ("hybrid", hybrid_score),
                              ("impersonal", impersonal_score)):
            rows.append({"user_id": user_id,
                         "method": method,
                         "run_num": run_count,
                         "accuracy": score})
    return rows

In [None]:
# Without class stratification.
#
# The code in `command1` runs on every engine over that engine's share of
# `user_ids` (see the scatter cell).  A user whose run raises ValueError is
# skipped; the user id and the error are kept in the engine-side `errors` list
# and reported below, so skipped users no longer disappear silently.

command1 = '''
rows = []
errors = []

for user_id in user_ids:
    print("Getting scores for %s" % user_id)
    start = time.time()
    try:
        user_rows = run_experiment1(user_id)
    except ValueError as ve:
        errors.append((user_id, ve))
        continue
    finish = time.time()
    duration_in_minutes = (finish - start) / 60.
    print("\ttook %s minutes" % (duration_in_minutes))
    rows += user_rows
'''
asr = dview.execute(command1)
rows = dview.gather('rows')

# Bring the per-engine error lists back to the client and show what was skipped.
errors = dview.gather('errors')
if errors:
    print("%d user(s) skipped because of a ValueError:" % len(errors))
    for failed_user_id, error in errors:
        print("\t%s : %r" % (failed_user_id, error))

scores_df = pd.DataFrame(rows)
scores_df.to_pickle('./scores_df.pickle')

In [None]:
# With class stratification (run_experiment1 is called with stratified=True).
#
# The code in `command2` runs on every engine over that engine's share of
# `user_ids` (see the scatter cell).  A user whose run raises ValueError is
# skipped; the user id and the error are kept in the engine-side `errors` list
# and reported below, so skipped users no longer disappear silently.

command2 = '''
rows = []
errors = []

for user_id in user_ids:
    print("Getting scores for %s" % user_id)
    start = time.time()
    try:
        user_rows = run_experiment1(user_id, stratified=True)
    except ValueError as ve:
        errors.append((user_id, ve))
        continue
    finish = time.time()
    duration_in_minutes = (finish - start) / 60.
    print("\ttook %s minutes" % (duration_in_minutes))
    rows += user_rows
'''
asr = dview.execute(command2)
rows = dview.gather('rows')

# Bring the per-engine error lists back to the client and show what was skipped.
errors = dview.gather('errors')
if errors:
    print("%d user(s) skipped because of a ValueError:" % len(errors))
    for failed_user_id, error in errors:
        print("\t%s : %r" % (failed_user_id, error))

scores_df = pd.DataFrame(rows)
scores_df.to_pickle('./scores_df_stratified.pickle')

# Is the impersonal model really doing that well when it isn't even tuned?

In [2]:
# NOTE(review): called without a user_id here, unlike the calls inside run_experiment1 --
# presumably this returns the labeled activity data of all users; confirm against extrasense.
all_df = es.get_impersonal_data(data_type="activity", labeled_only=True)

In [3]:
# Preview the first rows to check which columns the combined frame contains.
all_df.head()

Unnamed: 0,user_id,raw_acc:magnitude_stats:mean,raw_acc:magnitude_stats:std,raw_acc:magnitude_stats:moment3,raw_acc:magnitude_stats:moment4,raw_acc:magnitude_stats:percentile25,raw_acc:magnitude_stats:percentile50,raw_acc:magnitude_stats:percentile75,raw_acc:magnitude_stats:value_entropy,raw_acc:magnitude_stats:time_entropy,...,raw_acc:3d:mean_z,raw_acc:3d:std_x,raw_acc:3d:std_y,raw_acc:3d:std_z,raw_acc:3d:ro_xy,raw_acc:3d:ro_xz,raw_acc:3d:ro_yz,label,label_source,timestamp
0,098A72A5-E3E5-4F54-A152-BBDA0DF7B694,1.019125,0.005488,-0.007333,0.012716,1.015712,1.01873,1.022642,1.534798,6.684597,...,-1.018555,0.003047,0.003574,0.005455,0.231009,0.160268,0.274546,SITTING,2.0,1438708000.0
1,098A72A5-E3E5-4F54-A152-BBDA0DF7B694,1.027085,0.04091,0.065024,0.109432,1.021727,1.024658,1.0276,0.444923,6.683851,...,-1.007858,0.129614,0.102002,0.10549,-0.538931,0.753075,-0.716014,SITTING,2.0,1438709000.0
2,098A72A5-E3E5-4F54-A152-BBDA0DF7B694,1.019498,0.034167,0.045501,0.098916,1.017684,1.019795,1.02274,0.493111,6.684059,...,-1.015779,0.03907,0.074888,0.037067,0.126114,0.093632,0.414424,SITTING,2.0,1438709000.0
3,098A72A5-E3E5-4F54-A152-BBDA0DF7B694,1.014249,0.00456,0.002866,0.010174,1.012491,1.013851,1.01585,1.387466,6.684602,...,-1.013756,0.003356,0.003058,0.004548,0.082769,-0.083428,0.14749,SITTING,2.0,1438709000.0
4,098A72A5-E3E5-4F54-A152-BBDA0DF7B694,1.01596,0.003355,0.003965,0.007191,1.013798,1.015754,1.017733,1.646399,6.684606,...,-1.015449,0.002772,0.002371,0.003354,0.062909,0.083451,-0.049604,SITTING,2.0,1438709000.0


# 1. Get a held-out group of users
# 2. Increase the number of users in the impersonal dataset
# 3. Accuracy on the held-out set should increase as more users are added