# Visualize the results of each model in aggregate

In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly.graph_objs import *

In [2]:
import pandas as pd
import numpy as np

In [3]:
from wisdm import wisdm
wisdm.WISDM_DIR = wisdm.wisdm_v2_dataset_path
wisdm.WISDM_TRANSFORMED = wisdm.wisdm_transformed_v2
wisdm.set_data()

In [39]:
results = []
bad_user_ids = []
for user_id in wisdm.user_ids:
    try:
        user_results_df = pd.read_pickle("./results/experiment_08-23_train_v1_test_v2/"+user_id+".pickle")
        results.append(user_results_df)
    except FileNotFoundError as fnfe:
        print("%s not found : user may not have had enough labeled data" % user_id)
        bad_user_ids.append(user_id)
        pass
        
results_df = pd.concat(results).reset_index(drop=True)


1269 not found : user may not have had enough labeled data
1276 not found : user may not have had enough labeled data
1480 not found : user may not have had enough labeled data
1491 not found : user may not have had enough labeled data
1511 not found : user may not have had enough labeled data
1531 not found : user may not have had enough labeled data
1679 not found : user may not have had enough labeled data
1724 not found : user may not have had enough labeled data
1726 not found : user may not have had enough labeled data
1750 not found : user may not have had enough labeled data
1757 not found : user may not have had enough labeled data
1763 not found : user may not have had enough labeled data
1797 not found : user may not have had enough labeled data
1802 not found : user may not have had enough labeled data
1813 not found : user may not have had enough labeled data


In [5]:
results_df.describe()

Unnamed: 0,impersonal score Mean,impersonal score STD,k-run,personal + cluster score Mean,personal + cluster score STD,personal + impersonal score Mean,personal + impersonal score STD,personal score Mean,personal score STD,personal training data
count,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0
mean,0.535573,0.0,4.689562,0.962364,0.017383,0.999941,6.7e-05,0.933281,0.023368,48.364211
std,0.345584,0.0,2.802876,0.065569,0.032364,0.000691,0.000769,0.086413,0.035479,28.280103
min,0.0,0.0,0.0,0.322222,0.0,0.987302,0.0,0.288889,0.0,10.0
25%,0.208333,0.0,2.0,0.945455,0.0,1.0,0.0,0.904762,0.0,20.0
50%,0.555556,0.0,5.0,1.0,0.0,1.0,0.0,0.954545,0.0,50.0
75%,0.875,0.0,7.0,1.0,0.028748,1.0,0.0,1.0,0.04,70.0
max,1.0,0.0,9.0,1.0,0.489898,1.0,0.014815,1.0,0.221352,100.0


In [6]:
training_sizes = [10,20,30,40,50,60,70,80,90,100]

personal_means = {}
personal_plus_universal_means = {}
personal_plus_cluster_means = {}
universal_means = results_df['impersonal score Mean']

for ts in training_sizes:
    personal_scores = results_df[results_df['personal training data'] == ts]['personal score Mean']
    personal_means[ts] = personal_scores
    
    personal_plus_universal_scores = results_df[results_df['personal training data'] == ts]['personal + impersonal score Mean']
    personal_plus_universal_means[ts] = personal_plus_universal_scores
    
    personal_plus_cluster_scores = results_df[results_df['personal training data'] == ts]['personal + cluster score Mean']
    personal_plus_cluster_means[ts] = personal_plus_cluster_scores

In [7]:
universal_trace = Box(y=universal_means,
                      x=[0]*len(universal_means),
                      name="universal model",
                      boxpoints='suspectedoutliers',
                      )

data = [universal_trace]

personal_x = []
personal_plus_universal_x = []
personal_plus_cluster_x = []

personal_means_all = []
personal_plus_universal_means_all = []
personal_plus_cluster_means_all = []

for ts in training_sizes:
    personal_means_all += personal_means[ts].tolist()
    personal_x += [ts] * len(personal_means[ts])
    
    personal_plus_universal_means_all += personal_plus_universal_means[ts].tolist()
    personal_plus_universal_x += [ts] *len(personal_plus_universal_means[ts])
    
    personal_plus_cluster_means_all += personal_plus_cluster_means[ts].tolist()
    personal_plus_cluster_x += [ts] * len(personal_plus_cluster_means[ts])

personal_trace = Box(y=personal_means_all,
                     x=personal_x,
                     name="Personal",
                     boxpoints="suspectedoutliers")

data.append(personal_trace)

personal_plus_universal_trace = Box(y=personal_plus_universal_means_all,
                                    x=personal_plus_universal_x,
                                    name="Personal + Universal",
                                    boxpoints="suspectedoutliers")

data.append(personal_plus_universal_trace)

personal_plus_cluster_trace = Box(y=personal_plus_cluster_means_all,
                                    x=personal_plus_cluster_x,
                                    name="Personal + Cluster",
                                    boxpoints="suspectedoutliers")

data.append(personal_plus_cluster_trace)

layout = Layout(showlegend=True, boxmode='group')
fig = Figure(data=data, layout=layout)

iplot(fig, filename="boxplots_of_wisdm_models")

# By User

In [8]:
def plotScoresByUser(user_id):
    personal_means = {}
    personal_plus_universal_means = {}
    personal_plus_cluster_means = {}
    universal_means = results_df['impersonal score Mean']

    for ts in training_sizes:
        personal_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal score Mean']
        personal_means[ts] = personal_scores

        personal_plus_universal_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal + impersonal score Mean']
        personal_plus_universal_means[ts] = personal_plus_universal_scores

        personal_plus_cluster_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal + cluster score Mean']
        personal_plus_cluster_means[ts] = personal_plus_cluster_scores
    
    
    universal_trace = Box(y=universal_means,
                      x=[0]*len(universal_means),
                      name="universal model",
                      boxpoints='suspectedoutliers',
                      )

    data = [universal_trace]

    personal_x = []
    personal_plus_universal_x = []
    personal_plus_cluster_x = []

    personal_means_all = []
    personal_plus_universal_means_all = []
    personal_plus_cluster_means_all = []

    for ts in training_sizes:
        personal_means_all += personal_means[ts].tolist()
        personal_x += [ts] * len(personal_means[ts])

        personal_plus_universal_means_all += personal_plus_universal_means[ts].tolist()
        personal_plus_universal_x += [ts] *len(personal_plus_universal_means[ts])

        personal_plus_cluster_means_all += personal_plus_cluster_means[ts].tolist()
        personal_plus_cluster_x += [ts] * len(personal_plus_cluster_means[ts])

    personal_trace = Box(y=personal_means_all,
                         x=personal_x,
                         name="Personal",
                         boxpoints="suspectedoutliers")

    data.append(personal_trace)

    personal_plus_universal_trace = Box(y=personal_plus_universal_means_all,
                                        x=personal_plus_universal_x,
                                        name="Personal + Universal",
                                        boxpoints="suspectedoutliers")

    data.append(personal_plus_universal_trace)

    personal_plus_cluster_trace = Box(y=personal_plus_cluster_means_all,
                                        x=personal_plus_cluster_x,
                                        name="Personal + Cluster",
                                        boxpoints="suspectedoutliers")

    data.append(personal_plus_cluster_trace)

    layout = Layout(showlegend=True, boxmode='group')
    fig = Figure(data=data, layout=layout)

    iplot(fig, filename="boxplots_of_wisdm_models")

In [9]:
plotScoresByUser(wisdm.user_ids[0])

For this first user we observe the following.
* the personal+cluster approach almost always does better, on average.
* the personal+cluster appraoch never does as poorly as either the universal+cluster, universal+personal
* the personal+cluster appraoch offers GREAT improvements over other approaches around 20-40 samples

In [10]:
plotScoresByUser(wisdm.user_ids[1])

For the second user, we oddly enough see that personal models and the personal+cluster model both outperform the personal+universal model regardless of the amount of personal data and by a great deal

In [11]:
plotScoresByUser(wisdm.user_ids[2])

Here, we see a different story again.  The personal+universal model not only does well on average, it's worst case scenario is occasionally better than those of the personal or personal+cluster models.

# Select users who get the most from each kind of model

In [16]:
def getModelAccuracyMean(user_id, ts):
    # each list element at index, i, represents that model improvement over the best other model with training size[k]
    user_df = results_df[(results_df['test user'] == user_id) & \
                         (results_df['personal training data'] == ts)]
    personal_score_mean = user_df['personal score Mean'].mean()
    impersonal_score_mean = user_df['impersonal score Mean'].mean()
    personal_plus_impersonal_mean = user_df['personal + impersonal score Mean'].mean()
    personal_plus_cluster_mean = user_df['personal + cluster score Mean'].mean()

    #print("personal : %s" % personal_score_mean)
    #print("impersonal : %s" % impersonal_score_mean)
    #print("personal + impersonal : %s" % personal_plus_impersonal_mean)
    #print("personal + cluster : %s" % personal_plus_cluster_mean)
    mean_scores = {"personal" : personal_score_mean,
                   "impersonal" : impersonal_score_mean,
                   "personal + impersonal" : personal_plus_impersonal_mean,
                   "personal + cluster" : personal_plus_cluster_mean}
    return mean_scores

In [40]:
model_means_columns = ['user id', 'personal', 'impersonal', 'personal + impersonal', 'personal + cluster']
model_means = []

for user_id in wisdm.user_ids:
    if user_id not in bad_user_ids:
        mean_scores = getModelAccuracyMean(user_id, 10)
        mean_scores['user id'] = user_id
        model_means.append(mean_scores)
    
scores_df = pd.DataFrame(model_means, columns=model_means_columns)

In [41]:
scores_df.describe()

Unnamed: 0,personal,impersonal,personal + impersonal,personal + cluster
count,35.0,35.0,35.0,35.0
mean,0.892521,0.550193,0.999973,0.938451
std,0.113564,0.333881,0.000161,0.082904
min,0.519071,0.033333,0.999048,0.669349
25%,0.825247,0.250692,1.0,0.908635
50%,0.902373,0.543654,1.0,0.979248
75%,1.0,0.845933,1.0,1.0
max,1.0,1.0,1.0,1.0


In [42]:
users_benefit_from_personal = []
users_benefit_from_impersonal = []
users_benefit_from_personal_plus_impersonal = []
users_benefit_from_personal_plus_cluster = []

for ind, row in scores_df.iterrows():
    scores = [row['personal'], row['impersonal'], row['personal + impersonal'], row['personal + cluster']]
    best_model = np.argmax(scores)
    
    if best_model == 0:
        users_benefit_from_personal.append(row['user id'])
    elif best_model == 1:
        users_benefit_from_impersonal.append(row['user id'])
    elif best_model == 2:
        users_benefit_from_personal_plus_impersonal.append(row['user id'])
    elif best_model == 3:
        users_benefit_from_personal_plus_cluster.append(row['user id'])

In [43]:
print("Personal was best : %s" % len(users_benefit_from_personal))
print("Impersonal was best : %s" % len(users_benefit_from_impersonal))
print("Personal + Impersonal was best : %s" % len(users_benefit_from_personal_plus_impersonal))
print("Personal + Cluster was best : %s" % len(users_benefit_from_personal_plus_cluster))


Personal was best : 18
Impersonal was best : 0
Personal + Impersonal was best : 21
Personal + Cluster was best : 0


# Next : systematically increase the training size to understand how the best model changes as the available training data increases

In [49]:
def getBests(training_size):
    model_means_columns = ['user id', 'personal', 'impersonal', 'personal + impersonal', 'personal + cluster']
    model_means = []

    for user_id in wisdm.user_ids:
        if user_id not in bad_user_ids:
            mean_scores = getModelAccuracyMean(user_id, training_size)
            mean_scores['user id'] = user_id
            model_means.append(mean_scores)

    scores_df = pd.DataFrame(model_means, columns=model_means_columns)
    
    users_benefit_from_personal = []
    users_benefit_from_impersonal = []
    users_benefit_from_personal_plus_impersonal = []
    users_benefit_from_personal_plus_cluster = []

    for ind, row in scores_df.iterrows():
        scores = [row['personal'], row['impersonal'], row['personal + impersonal'], row['personal + cluster']]
        best_model = np.argmax(scores)

        if best_model == 0:
            users_benefit_from_personal.append(row['user id'])
        elif best_model == 1:
            users_benefit_from_impersonal.append(row['user id'])
        elif best_model == 2:
            users_benefit_from_personal_plus_impersonal.append(row['user id'])
        elif best_model == 3:
            users_benefit_from_personal_plus_cluster.append(row['user id'])
    return users_benefit_from_personal, users_benefit_from_impersonal, \
            users_benefit_from_personal_plus_impersonal, users_benefit_from_personal_plus_cluster

In [50]:
training_sizes = [10,20,30,40,50,60,70,80,90,100]

personal_bests = []
impersonal_bests = []
personal_impersonal_bests = []
personal_cluster_bests = []

for ts in training_sizes:
    personal, impersonal, personal_impersonal, personal_cluster = getBests(ts)
    
    personal_bests.append(personal)
    impersonal_bests.append(impersonal)
    personal_impersonal_bests.append(personal_impersonal)
    personal_cluster_bests.append(personal_cluster)
    
    print("Training Size : %s" % ts)
    print("\t personal : %s" % len(personal))
    print("\t impersonal : %s" % len(impersonal))
    print("\t personal + impersonal : %s" % len(personal_impersonal))
    print("\t personal + cluster : %s" % len(personal_cluster))

Training Size : 10
	 personal : 18
	 impersonal : 0
	 personal + impersonal : 21
	 personal + cluster : 0
Training Size : 20
	 personal : 18
	 impersonal : 0
	 personal + impersonal : 21
	 personal + cluster : 0
Training Size : 30
	 personal : 18
	 impersonal : 0
	 personal + impersonal : 21
	 personal + cluster : 0
Training Size : 40
	 personal : 18
	 impersonal : 0
	 personal + impersonal : 21
	 personal + cluster : 0
Training Size : 50
	 personal : 20
	 impersonal : 0
	 personal + impersonal : 19
	 personal + cluster : 0
Training Size : 60
	 personal : 20
	 impersonal : 0
	 personal + impersonal : 19
	 personal + cluster : 0
Training Size : 70
	 personal : 22
	 impersonal : 0
	 personal + impersonal : 17
	 personal + cluster : 0
Training Size : 80
	 personal : 23
	 impersonal : 0
	 personal + impersonal : 16
	 personal + cluster : 0
Training Size : 90
	 personal : 26
	 impersonal : 0
	 personal + impersonal : 13
	 personal + cluster : 0
Training Size : 100
	 personal : 28
	 imperson

In [63]:
personal_trace = Scatter(x=training_sizes,
                     y=[len(x) for x in personal_bests],
                     name="Personal")
impersonal_trace = Scatter(x=training_sizes,
                       y=[len(x) for x in impersonal_bests],
                       name="Impersonal")
personal_impersonal_trace = Scatter(x=training_sizes,
                       y=[len(x) for x in personal_impersonal_bests],
                       name="Personal + Impersonal")
personal_cluster_trace = Scatter(x=training_sizes,
                       y=[len(x) for x in personal_cluster_bests],
                       name="Personal + Cluster")

data = [personal_trace, impersonal_trace, personal_impersonal_trace, personal_cluster_trace]
layout=Layout(yaxis=dict(range=[0,40]))
fig = Figure(data=data, layout=layout)
iplot(fig, filename="user_model_bests")

In [68]:
for c in wisdm.data_df['class'].unique():
    print(c)

b'Standing'
b'Sitting'
b'Stairs'
b'LyingDown'
b'Walking'
b'Jogging'


In [65]:
wisdm.WISDM_DIR

'./datasets/WISDM_v2/'

In [71]:
from collections import Counter

In [72]:
Counter(wisdm.data_df['class'])

Counter({b'Jogging': 130,
         b'LyingDown': 619,
         b'Sitting': 1410,
         b'Stairs': 251,
         b'Standing': 840,
         b'Walking': 2185})

In [78]:
new_data_df = wisdm.data_df[wisdm.data_df['class'] != b'LyingDown']

In [79]:
Counter(new_data_df['class'])

Counter({b'Jogging': 130,
         b'Sitting': 1410,
         b'Stairs': 251,
         b'Standing': 840,
         b'Walking': 2185})

In [81]:
def make_labels_compatible(data_df):
    class_labels = data_df['class'].unique()

    if b'LyingDown' in class_labels:
        # remove rows with "LyingDown" as class
        new_data_df = data_df[data_df['class'] != b"LyingDown"]
        return new_data_df
    elif (b'Upstairs' in class_labels) or (b'Downstairs' in class_labels):
        new_data_df = data_df.copy()
        new_data_df['class'] = data_df['class'].replace(to_replace=[b'Upstairs', b'Downstairs'], value=[b'Stairs', b'Stairs'])
        return new_data_df

In [82]:
new_data_df = make_labels_compatible(wisdm.data_df)

In [83]:
new_data_df['class'].unique()

array([b'Standing', b'Sitting', b'Stairs', b'Walking', b'Jogging'], dtype=object)