In [12]:
import pandas as pd
import numpy as np
import time
import importlib.machinery
es = importlib.machinery.SourceFileLoader('extrasense','/home/sac086/extrasensory/extrasense/extrasense.py').load_module()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Recreating Exp1

**Experimental Conditions**: Impersonal, Personal, Hybrid training data

**Experiment Output**: User ID, Method, Run number, Accuracy

**Sampling Method for personal data**: randomly sample the data, no stratification of sample classes

## Testing with and without stratification for one participant

In [13]:
# Load the experiment helpers under their own module name. The name 'extrasense'
# is already registered in sys.modules by the `es` loader above; load_module()
# reuses an existing entry and re-executes the file inside it, which would make
# `exp` and `es` the very same module object with a merged namespace.
# NOTE(review): if experimental_setups.py relied on names leaking in from
# extrasense.py through that shared namespace, it needs its own imports - confirm.
exp = importlib.machinery.SourceFileLoader('experimental_setups','/home/sac086/extrasensory/processes/experimental_setups.py').load_module()

In [14]:
# Run experiment 1 without class stratification for a single participant,
# once per personal training size, and collect the result rows.
training_sizes = [5,10,20,30,40,50,60]
user_id = es.user_ids[3]

rows = []
# Failed runs are recorded here instead of aborting the sweep. The list has to
# exist before the loop, otherwise the first ValueError becomes a NameError.
errors = []
for ts in training_sizes:
    print("Getting scores for %s" % user_id)
    start = time.time()
    try:
        user_rows = exp.run_experiment1(user_id, training_size=ts)
    except ValueError as ve:
        # Keep the exception for later inspection and move on to the next size.
        errors.append(ve)
        continue
    finish = time.time()
    duration_in_minutes = (finish - start) / 60.
    print("\ttook %s minutes" % (duration_in_minutes))
    rows += user_rows

Getting scores for 0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
getting impersonal data...
Scaling Impersonal...
Initializing ShuffleSplit()
Starting run #1
	Run #1
	personal : 0.63
	hybrid : 0.7
	impersonal : 0.75


Starting run #2


KeyboardInterrupt: 

In [15]:
# Run experiment 1 with class stratification for a single participant,
# once per personal training size, and collect the result rows.
training_sizes = [5,10,20,30,40,50,60]
user_id = es.user_ids[3]

rows_stratified = []
# Failed runs are recorded here instead of aborting the sweep. The list has to
# exist before the loop, otherwise the first ValueError becomes a NameError.
# It gets its own name (like rows_stratified) so it is not mixed up with the
# errors of the non-stratified sweep.
errors_stratified = []
for ts in training_sizes:
    print("Getting scores for %s" % user_id)
    start = time.time()
    try:
        user_rows = exp.run_experiment1(user_id, training_size=ts, stratified=True)
    except ValueError as ve:
        # Keep the exception for later inspection and move on to the next size.
        errors_stratified.append(ve)
        continue
    finish = time.time()
    duration_in_minutes = (finish - start) / 60.
    print("\ttook %s minutes" % (duration_in_minutes))
    rows_stratified += user_rows

Getting scores for 0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
getting impersonal data...
Scaling Impersonal...
Initializing StratifiedShuffleSplit()
Starting run #1
	Run #1
	personal : 0.53
	hybrid : 0.6
	impersonal : 0.6


Starting run #2
	Run #2
	personal : 0.46
	hybrid : 0.59
	impersonal : 0.6


Starting run #3
	Run #3
	personal : 0.22
	hybrid : 0.63
	impersonal : 0.61


Starting run #4
	Run #4
	personal : 0.62
	hybrid : 0.64
	impersonal : 0.68


Starting run #5
	Run #5
	personal : 0.52
	hybrid : 0.65
	impersonal : 0.67


	took 25.76958030462265 minutes
Getting scores for 0E6184E1-90C0-48EE-B25A-F1ECB7B9714E
getting impersonal data...
Scaling Impersonal...
Initializing StratifiedShuffleSplit()
Starting run #1
	Run #1
	personal : 0.71
	hybrid : 0.69
	impersonal : 0.67


Starting run #2


KeyboardInterrupt: 

# Testing parallelization

In [None]:
import ipyparallel as ipp

In [None]:
# Create the ipyparallel client through which the engines are reached.
c = ipp.Client()

In [None]:
# View over all engines known to the client.
dview = c[:]

In [None]:
# Make calls on the view blocking, so each call waits for the engines to finish.
dview.block=True

In [None]:
# Partition es.user_ids across the engines; each engine receives its share
# under the variable name 'user_ids'.
dview.scatter('user_ids', es.user_ids)

In [None]:
# Source text to be executed on the engines: it repeats the imports of this
# notebook's first cell so the engines have the same names (pd, np, time, es,
# the sklearn classes) available.
import_string = '''import pandas as pd
import numpy as np
import time
import importlib.machinery
es = importlib.machinery.SourceFileLoader('extrasense','/home/sac086/extrasensory/extrasense/extrasense.py').load_module()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler'''

In [None]:
# Execute the import block on every engine of the view; asr keeps the result
# object returned by the call.
asr = dview.execute(import_string)

In [None]:
%%px
def run_experiment1(user_id, training_size=100, learning_algo=es.weka_RF, stratified=False):
    """Score personal, hybrid and impersonal classifiers for a single user.

    An impersonal model is trained once on the data returned by
    es.get_impersonal_data with this user left out. The user's own labeled
    data is then split five times. On every split a personal model (the user's
    training rows only) and a hybrid model (impersonal rows plus the user's
    training rows) are trained, and all three models are scored with
    accuracy_score on the same held-out rows of the user.

    Parameters
    ----------
    user_id
        Identifier handed to the es data loaders.
    training_size
        Passed as train_size to the splitter (test_size is fixed at 100).
    learning_algo
        Zero-argument callable returning an object with fit and predict.
    stratified
        When true, StratifiedShuffleSplit is used with the user's labels,
        otherwise ShuffleSplit.

    Returns
    -------
    list of dict
        Three rows per split (methods "personal", "hybrid", "impersonal"),
        each with the keys user_id, method, run_num, training_size, accuracy.
    """
    # The user's own samples. Bookkeeping columns are dropped from the frame,
    # presumably leaving only feature columns - TODO confirm against es.
    own_df = es.get_data_from_user_id(user_id, data_type="activity", labeled_only=True)
    own_df.pop('timestamp')
    own_df.pop("label_source")
    own_labels = own_df.pop("label")

    # Everybody else's samples, used for the impersonal model.
    print("getting impersonal data...")
    others_df = es.get_impersonal_data(leave_users_out=[user_id], data_type="activity", labeled_only=True)
    others_df.pop('timestamp')
    others_labels = others_df.pop("label")
    others_df.pop('label_source')
    others_df.pop("user_id")

    # The impersonal model is fitted once and reused on every split.
    print("Scaling Impersonal...")
    others_scaler = StandardScaler()
    others_scaled = others_scaler.fit_transform(others_df)

    others_model = learning_algo()
    others_model.fit(others_scaled, others_labels)

    # Pick the splitter; only the stratified one is given the labels.
    splitter_cls = StratifiedShuffleSplit if stratified else ShuffleSplit
    sampler = splitter_cls(n_splits=5, train_size=training_size, test_size=100)
    split_args = (own_df, own_labels) if stratified else (own_df,)
    folds = sampler.split(*split_args)

    rows = []
    for run_count, (train_ind, test_ind) in enumerate(folds, start=1):
        print("Starting run #%s" % run_count)
        own_train_df, own_train_labels = own_df.iloc[train_ind], own_labels.iloc[train_ind]
        val_df, val_labels = own_df.iloc[test_ind], own_labels.iloc[test_ind]

        # Hybrid training set: impersonal rows followed by the user's training rows.
        mixed_train_df = pd.concat([others_df, own_train_df])
        mixed_train_labels = pd.concat([others_labels, own_train_labels])

        # Each model gets a scaler fitted on its own training set.
        own_scaler = StandardScaler()
        own_train_scaled = own_scaler.fit_transform(own_train_df)

        mixed_scaler = StandardScaler()
        mixed_train_scaled = mixed_scaler.fit_transform(mixed_train_df)

        # Personal model.
        own_model = learning_algo()
        own_model.fit(own_train_scaled, own_train_labels)
        own_predictions = own_model.predict(own_scaler.transform(val_df))

        # Hybrid model.
        mixed_model = learning_algo()
        mixed_model.fit(mixed_train_scaled, mixed_train_labels)
        mixed_predictions = mixed_model.predict(mixed_scaler.transform(val_df))

        # Impersonal model, already fitted above.
        others_predictions = others_model.predict(others_scaler.transform(val_df))

        personal_score = accuracy_score(val_labels, own_predictions)
        hybrid_score = accuracy_score(val_labels, mixed_predictions)
        impersonal_score = accuracy_score(val_labels, others_predictions)

        print("\tRun #%s" % run_count)
        print("\tpersonal : %s" % personal_score)
        print("\thybrid : %s" % hybrid_score)
        print("\timpersonal : %s" % impersonal_score)
        print("\n")

        # One result row per method, in the order personal, hybrid, impersonal.
        for method, score in (("personal", personal_score),
                              ("hybrid", hybrid_score),
                              ("impersonal", impersonal_score)):
            rows.append({"user_id" : user_id,
                         "method" : method,
                         "run_num" : run_count,
                         "training_size" : training_size,
                         "accuracy" : score})
    return rows

In [None]:
# Parallel sweep without class stratification.

# Code executed on every engine: each engine loops over its own share of
# user_ids and over all training sizes, accumulating result rows in `rows`
# and the ValueErrors of failed runs in `errors`.
command2 = '''
rows = []
errors = []
training_sizes = [5,10,20,30,40,50,60]
for user_id in user_ids:
    for ts in training_sizes:
        print("Getting scores for %s" % user_id)
        start = time.time()
        try:
            user_rows = run_experiment1(user_id, training_size=ts)
        except ValueError as ve:
            errors.append(ve)
            continue
        finish = time.time()
        duration_in_minutes = (finish - start) / 60.
        print("\ttook %s minutes" % (duration_in_minutes))
        rows += user_rows
'''
start = time.time()
asr = dview.execute(command2)
print("finished running processes")
# Collect the per-engine `rows` lists back into one list on the client.
rows = dview.gather('rows')
finish = time.time()
# Wall-clock time of the whole sweep; time.time() is in seconds, so divide by 3600.
print("Took %.3f hours" % ((finish - start) / 3600.))
scores_df = pd.DataFrame(rows)
scores_df.to_pickle('./scores_df.pickle')

In [None]:
# Parallel sweep with class stratification.

# Code executed on every engine: each engine loops over its own share of
# user_ids and over all training sizes, calling run_experiment1 with
# stratified=True. Result rows go to `rows`, ValueErrors of failed runs to `errors`.
command2 = '''
rows = []
errors = []
training_sizes = [5,10,20,30,40,50,60]
for user_id in user_ids:
    for ts in training_sizes:
        print("Getting scores for %s" % user_id)
        start = time.time()
        try:
            user_rows = run_experiment1(user_id, training_size=ts, stratified=True)
        except ValueError as ve:
            errors.append(ve)
            continue
        finish = time.time()
        duration_in_minutes = (finish - start) / 60.
        print("\ttook %s minutes" % (duration_in_minutes))
        rows += user_rows
'''
asr = dview.execute(command2)
# Collect the per-engine `rows` lists back into one list on the client.
# NOTE(review): the engine-side `errors` lists are never gathered, so failed
# runs are not visible here.
rows = dview.gather('rows')
# NOTE(review): this rebinds scores_df, replacing whatever an earlier cell stored in it.
scores_df = pd.DataFrame(rows)
scores_df.to_pickle('./scores_df_stratified.pickle')

In [None]:
# Preview the first rows of whichever scores_df was built last.
scores_df.head()

# Visualizing the results

In [1]:
ls -l ../results

total 336
-rw-rw-r--. 1 sac086 sac086 180186 Dec 13 04:24 2017-12-12_19_exp1_no_stratification.pickle
-rw-rw-r--. 1 sac086 sac086 161482 Dec 13 13:18 2017-12-12_19_exp1_with_stratification.pickle
drwxrwxr-x. 2 sac086 sac086     10 Dec 12 19:00 experiment1/


In [2]:
# Load the saved experiment-1 score tables (one with, one without class stratification).
# NOTE: unpickling can execute arbitrary code; only load pickle files you produced yourself.
not_stratified_scores_df = pd.read_pickle("../results/2017-12-12_19_exp1_no_stratification.pickle")
stratified_scores_df = pd.read_pickle("../results/2017-12-12_19_exp1_with_stratification.pickle")


## Results without Stratification

### Impersonal mean scores include only the validation runs from the round where the personal training size was 5, so that the impersonal method has the same number of trials as the other methods

In [3]:
# Per-user mean accuracy of the impersonal method, using only the rows
# recorded at training size 5 of the non-stratified results.
is_impersonal_at_5 = ((not_stratified_scores_df['method'] == 'impersonal')
                      & (not_stratified_scores_df['training_size'] == 5))
impersonal_at_5 = not_stratified_scores_df.loc[is_impersonal_at_5]

impersonal_mean_scores = []
for user_id in es.user_ids:
    # A user without matching rows yields NaN here.
    belongs_to_user = impersonal_at_5['user_id'] == user_id
    user_impersonal_mean = impersonal_at_5.loc[belongs_to_user, 'accuracy'].mean()
    impersonal_mean_scores.append(user_impersonal_mean)

In [4]:
# Summary of the non-stratified results: per-user mean accuracies for the
# personal and hybrid methods at each training size, plus the flat lists
# (scores and matching sizes) consumed by the box plot.
print("Impersonal: M=%.3f, SD=%.3f\n" % (np.mean(impersonal_mean_scores), np.std(impersonal_mean_scores)))

# NOTE(review): user_ids is assigned here but the loop below iterates es.user_ids.
user_ids = not_stratified_scores_df['user_id'].unique()
training_sizes = [5,10,20,30,40]

all_personal_scores, all_personal_sizes = [], []
all_hybrid_scores, all_hybrid_sizes = [], []

# Row filters that do not depend on the loop variables.
is_personal = not_stratified_scores_df['method'] == 'personal'
is_hybrid = not_stratified_scores_df['method'] == 'hybrid'

for ts in training_sizes:
    at_this_size = not_stratified_scores_df['training_size'] == ts
    personal_mean_scores = []
    hybrid_mean_scores = []

    for user_id in es.user_ids:
        from_user = not_stratified_scores_df['user_id'] == user_id
        user_personal_mean = not_stratified_scores_df.loc[from_user & is_personal & at_this_size, 'accuracy'].mean()
        user_hybrid_mean = not_stratified_scores_df.loc[from_user & is_hybrid & at_this_size, 'accuracy'].mean()
        personal_mean_scores.append(user_personal_mean)
        hybrid_mean_scores.append(user_hybrid_mean)

    print("Training Size : %s" % ts)
    print("\tPersonal: M=%.3f, SD=%.3f" % (np.mean(personal_mean_scores), np.std(personal_mean_scores)))
    print("\tHybrid: M=%.3f, SD=%.3f" % (np.mean(hybrid_mean_scores), np.std(hybrid_mean_scores)))

    # Every score is paired with the training size it was measured at.
    all_personal_scores.extend(personal_mean_scores)
    all_personal_sizes.extend([ts] * len(personal_mean_scores))

    all_hybrid_scores.extend(hybrid_mean_scores)
    all_hybrid_sizes.extend([ts] * len(hybrid_mean_scores))


Impersonal: M=0.600, SD=0.105

Training Size : 5
	Personal: M=0.564, SD=0.099
	Hybrid: M=0.620, SD=0.090
Training Size : 10
	Personal: M=0.618, SD=0.096
	Hybrid: M=0.630, SD=0.088
Training Size : 20
	Personal: M=0.657, SD=0.086
	Hybrid: M=0.647, SD=0.082
Training Size : 30
	Personal: M=0.679, SD=0.089
	Hybrid: M=0.664, SD=0.071
Training Size : 40
	Personal: M=0.695, SD=0.081
	Hybrid: M=0.670, SD=0.074


In [5]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go

In [9]:
# Box plot of per-user mean accuracy: one impersonal box at x=0 and one
# personal / hybrid box per training size.

# iplot needs offline notebook mode to be initialised first; the function was
# imported but never called. Calling it again is harmless.
init_notebook_mode()

# `x` must be array-like with one entry per score, so the impersonal scores are
# all given x=0 (no personal training data) instead of the scalar 0.
impersonal_trace = go.Box(y=impersonal_mean_scores,
                          x=[0] * len(impersonal_mean_scores),
                          boxpoints='all',
                          jitter=0.8,
                          pointpos=-1,
                          name="Impersonal")


personal_trace = go.Box(y=all_personal_scores,
                        x=all_personal_sizes,
                        boxpoints='all',
                        jitter=0.8,
                        pointpos=-1,
                        name="personal")

hybrid_trace = go.Box(y=all_hybrid_scores,
                      x=all_hybrid_sizes,
                      boxpoints='all',
                      jitter=0.8,
                      pointpos=-1,
                      name="hybrid")

data = [impersonal_trace, personal_trace, hybrid_trace]
# boxmode='group' places boxes that share an x value side by side.
layout = go.Layout(yaxis=dict(title='Accuracy', range=[0,1]),
                   xaxis=dict(title='Amount of personal training data'),
                   boxmode='group',
                   title="Accuracy Scores By Method Without Class Stratification"
                  )
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [7]:
# Display the current `fig` again (same figure object as built in the plotting cell).
iplot(fig)

## Results with Stratification

In [None]:
# Results with stratification: per-user mean accuracies for each method,
# computed from stratified_scores_df only.

user_ids = stratified_scores_df['user_id'].unique()
training_sizes = [5,10,20,30,40]

# The impersonal baseline has to come from the stratified runs as well. The list
# computed earlier in the notebook is built from the non-stratified frame, so it
# is recomputed here under its own name. As for the non-stratified results, only
# the rows recorded at training size 5 are used.
stratified_impersonal_mean_scores = []
for user_id in user_ids:
    user_impersonal_mean = stratified_scores_df[(stratified_scores_df['user_id'] == user_id) &\
          (stratified_scores_df['method'] == 'impersonal') &\
          (stratified_scores_df['training_size'] == 5)]['accuracy'].mean()
    stratified_impersonal_mean_scores.append(user_impersonal_mean)

# nanmean / nanstd equal mean / std when every user has rows; they only differ
# by skipping users whose runs are missing for a size (their mean is NaN),
# which would otherwise turn the whole summary into NaN.
print("Impersonal: M=%.3f, SD=%.3f\n" % (np.nanmean(stratified_impersonal_mean_scores),
                                         np.nanstd(stratified_impersonal_mean_scores)))

all_personal_scores = []
all_personal_sizes = []

all_hybrid_scores = []
all_hybrid_sizes = []
for ts in training_sizes:
    personal_mean_scores = []
    hybrid_mean_scores = []

    for user_id in user_ids:
        user_personal_mean = stratified_scores_df[(stratified_scores_df['user_id'] == user_id) &\
              (stratified_scores_df['method'] == 'personal') &\
              (stratified_scores_df['training_size'] == ts)]['accuracy'].mean()
        user_hybrid_mean = stratified_scores_df[(stratified_scores_df['user_id'] == user_id) &\
              (stratified_scores_df['method'] == 'hybrid') &\
              (stratified_scores_df['training_size'] == ts)]['accuracy'].mean()
        personal_mean_scores.append(user_personal_mean)
        hybrid_mean_scores.append(user_hybrid_mean)

    print("Training Size : %s" % ts)
    print("\tPersonal: M=%.3f, SD=%.3f" % (np.nanmean(personal_mean_scores), np.nanstd(personal_mean_scores)))
    print("\tHybrid: M=%.3f, SD=%.3f" % (np.nanmean(hybrid_mean_scores), np.nanstd(hybrid_mean_scores)))

    # Every score is paired with the training size it was measured at.
    all_personal_scores +=  personal_mean_scores
    all_personal_sizes += [ts] * len(personal_mean_scores)

    all_hybrid_scores += hybrid_mean_scores
    all_hybrid_sizes += [ts] * len(hybrid_mean_scores)
