# Visualize the results of each model in aggregate

[Design Considerations for the WISDM Smart Phone-based
Sensor Mining Architecture](http://www.cis.fordham.edu/wisdm/includes/files/Lockhart-Design-SensorKDD11.pdf)

[The Impact of Personalization on Smartphone-Based Activity Recognition](http://storm.cis.fordham.edu/~gweiss/papers/aaai12-workshop-personalization.pdf)

In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly.graph_objs import *

In [2]:
import pandas as pd
import numpy as np

In [3]:
from wisdm import wisdm
wisdm.WISDM_DIR = wisdm.wisdm_v2_dataset_path
wisdm.WISDM_TRANSFORMED = wisdm.wisdm_transformed_v2
wisdm.set_data()

In [19]:
results = []
bad_user_ids = []

for user_id in wisdm.user_ids:
    try:
        user_results_df = pd.read_pickle("./results/experiment_08-21_v2_dataset/"+user_id+".pickle")
        results.append(user_results_df)
    except FileNotFoundError as fnfe:
        print("%s not found : user may not have had enough labeled data" % user_id)
        bad_user_ids.append(user_id)
        pass
        
results_df = pd.concat(results).reset_index(drop=True)


1269 not found : user may not have had enough labeled data
1276 not found : user may not have had enough labeled data
1480 not found : user may not have had enough labeled data
1491 not found : user may not have had enough labeled data
1511 not found : user may not have had enough labeled data
1531 not found : user may not have had enough labeled data
1679 not found : user may not have had enough labeled data
1724 not found : user may not have had enough labeled data
1726 not found : user may not have had enough labeled data
1750 not found : user may not have had enough labeled data
1757 not found : user may not have had enough labeled data
1763 not found : user may not have had enough labeled data
1797 not found : user may not have had enough labeled data
1802 not found : user may not have had enough labeled data
1813 not found : user may not have had enough labeled data


In [5]:
results_df.describe()

Unnamed: 0,impersonal score Mean,impersonal score STD,k-run,personal + cluster score Mean,personal + cluster score STD,personal + impersonal score Mean,personal + impersonal score STD,personal score Mean,personal score STD,personal training data
count,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0,2213.0
mean,0.541627,0.0,4.689562,0.912099,0.029117,0.906989,0.030069,0.93242,0.023512,48.364211
std,0.3354,0.0,2.802876,0.122497,0.044476,0.127889,0.043685,0.087157,0.03654,28.280103
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311111,0.0,10.0
25%,0.25,0.0,2.0,0.88,0.0,0.875,0.0,0.901754,0.0,20.0
50%,0.535714,0.0,5.0,0.948148,0.007018,0.945455,0.008595,0.954545,0.0,50.0
75%,0.846154,0.0,7.0,1.0,0.044444,1.0,0.044536,1.0,0.04,70.0
max,1.0,0.0,9.0,1.0,0.489898,1.0,0.4,1.0,0.279942,100.0


In [6]:
training_sizes = [10,20,30,40,50,60,70,80,90,100]

personal_means = {}
personal_plus_universal_means = {}
personal_plus_cluster_means = {}
universal_means = results_df['impersonal score Mean']

for ts in training_sizes:
    personal_scores = results_df[results_df['personal training data'] == ts]['personal score Mean']
    personal_means[ts] = personal_scores
    
    personal_plus_universal_scores = results_df[results_df['personal training data'] == ts]['personal + impersonal score Mean']
    personal_plus_universal_means[ts] = personal_plus_universal_scores
    
    personal_plus_cluster_scores = results_df[results_df['personal training data'] == ts]['personal + cluster score Mean']
    personal_plus_cluster_means[ts] = personal_plus_cluster_scores

In [7]:
universal_trace = Box(y=universal_means,
                      x=[0]*len(universal_means),
                      name="universal model",
                      boxpoints='suspectedoutliers',
                      )

data = [universal_trace]

personal_x = []
personal_plus_universal_x = []
personal_plus_cluster_x = []

personal_means_all = []
personal_plus_universal_means_all = []
personal_plus_cluster_means_all = []

for ts in training_sizes:
    personal_means_all += personal_means[ts].tolist()
    personal_x += [ts] * len(personal_means[ts])
    
    personal_plus_universal_means_all += personal_plus_universal_means[ts].tolist()
    personal_plus_universal_x += [ts] *len(personal_plus_universal_means[ts])
    
    personal_plus_cluster_means_all += personal_plus_cluster_means[ts].tolist()
    personal_plus_cluster_x += [ts] * len(personal_plus_cluster_means[ts])

personal_trace = Box(y=personal_means_all,
                     x=personal_x,
                     name="Personal",
                     boxpoints="suspectedoutliers")

data.append(personal_trace)

personal_plus_universal_trace = Box(y=personal_plus_universal_means_all,
                                    x=personal_plus_universal_x,
                                    name="Personal + Universal",
                                    boxpoints="suspectedoutliers")

data.append(personal_plus_universal_trace)

personal_plus_cluster_trace = Box(y=personal_plus_cluster_means_all,
                                    x=personal_plus_cluster_x,
                                    name="Personal + Cluster",
                                    boxpoints="suspectedoutliers")

data.append(personal_plus_cluster_trace)

layout = Layout(showlegend=True, boxmode='group')
fig = Figure(data=data, layout=layout)

iplot(fig, filename="boxplots_of_wisdm_models")

# By User

In [8]:
def plotScoresByUser(user_id):
    personal_means = {}
    personal_plus_universal_means = {}
    personal_plus_cluster_means = {}
    universal_means = results_df['impersonal score Mean']

    for ts in training_sizes:
        personal_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal score Mean']
        personal_means[ts] = personal_scores

        personal_plus_universal_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal + impersonal score Mean']
        personal_plus_universal_means[ts] = personal_plus_universal_scores

        personal_plus_cluster_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal + cluster score Mean']
        personal_plus_cluster_means[ts] = personal_plus_cluster_scores
    
    
    universal_trace = Box(y=universal_means,
                      x=[0]*len(universal_means),
                      name="universal model",
                      boxpoints='suspectedoutliers',
                      )

    data = [universal_trace]

    personal_x = []
    personal_plus_universal_x = []
    personal_plus_cluster_x = []

    personal_means_all = []
    personal_plus_universal_means_all = []
    personal_plus_cluster_means_all = []

    for ts in training_sizes:
        personal_means_all += personal_means[ts].tolist()
        personal_x += [ts] * len(personal_means[ts])

        personal_plus_universal_means_all += personal_plus_universal_means[ts].tolist()
        personal_plus_universal_x += [ts] *len(personal_plus_universal_means[ts])

        personal_plus_cluster_means_all += personal_plus_cluster_means[ts].tolist()
        personal_plus_cluster_x += [ts] * len(personal_plus_cluster_means[ts])

    personal_trace = Box(y=personal_means_all,
                         x=personal_x,
                         name="Personal",
                         boxpoints="suspectedoutliers")

    data.append(personal_trace)

    personal_plus_universal_trace = Box(y=personal_plus_universal_means_all,
                                        x=personal_plus_universal_x,
                                        name="Personal + Universal",
                                        boxpoints="suspectedoutliers")

    data.append(personal_plus_universal_trace)

    personal_plus_cluster_trace = Box(y=personal_plus_cluster_means_all,
                                        x=personal_plus_cluster_x,
                                        name="Personal + Cluster",
                                        boxpoints="suspectedoutliers")

    data.append(personal_plus_cluster_trace)

    layout = Layout(showlegend=True, boxmode='group')
    fig = Figure(data=data, layout=layout)

    iplot(fig, filename="boxplots_of_wisdm_models")

In [9]:
plotScoresByUser(wisdm.user_ids[0])

For this first user we observe the following.
* the personal+cluster approach almost always does better, on average.
* the personal+cluster appraoch never does as poorly as either the universal+cluster, universal+personal
* the personal+cluster appraoch offers GREAT improvements over other approaches around 20-40 samples

In [10]:
plotScoresByUser(wisdm.user_ids[1])

For the second user, we oddly enough see that personal models and the personal+cluster model both outperform the personal+universal model regardless of the amount of personal data and by a great deal

In [11]:
plotScoresByUser(wisdm.user_ids[2])

Here, we see a different story again.  The personal+universal model not only does well on average, it's worst case scenario is occasionally better than those of the personal or personal+cluster models.

# Select users who get the most from each kind of model

In [12]:
test_df = results_df[results_df['test user']==wisdm.user_ids[0]]

In [13]:
test_df.tail()

Unnamed: 0,classifier,impersonal score Mean,impersonal score STD,k-run,personal + cluster score Mean,personal + cluster score STD,personal + impersonal score Mean,personal + impersonal score STD,personal score Mean,personal score STD,personal training data,test user
75,RF with Wiki Parameters,0.333333,0.0,9,0.72,0.026667,0.733333,0.0,0.96,0.03266,60,194
76,RF with Wiki Parameters,0.4,0.0,9,0.733333,0.0,0.733333,0.042164,0.973333,0.03266,70,194
77,RF with Wiki Parameters,0.333333,0.0,9,0.746667,0.026667,0.76,0.03266,1.0,0.0,80,194
78,RF with Wiki Parameters,0.266667,0.0,9,0.746667,0.026667,0.76,0.03266,1.0,0.0,90,194
79,RF with Wiki Parameters,0.333333,0.0,9,0.733333,0.0,0.773333,0.053333,1.0,0.0,100,194


In [14]:
def getModelAccuracyMean(user_id, ts):
    # each list element at index, i, represents that model improvement over the best other model with training size[k]
    user_df = results_df[(results_df['test user'] == user_id) & \
                         (results_df['personal training data'] == ts)]
    personal_score_mean = user_df['personal score Mean'].mean()
    impersonal_score_mean = user_df['impersonal score Mean'].mean()
    personal_plus_impersonal_mean = user_df['personal + impersonal score Mean'].mean()
    personal_plus_cluster_mean = user_df['personal + cluster score Mean'].mean()

    #print("personal : %s" % personal_score_mean)
    #print("impersonal : %s" % impersonal_score_mean)
    #print("personal + impersonal : %s" % personal_plus_impersonal_mean)
    #print("personal + cluster : %s" % personal_plus_cluster_mean)
    mean_scores = {"personal" : personal_score_mean,
                   "impersonal" : impersonal_score_mean,
                   "personal + impersonal" : personal_plus_impersonal_mean,
                   "personal + cluster" : personal_plus_cluster_mean}
    return mean_scores

In [24]:
model_means_columns = ['user id', 'personal', 'impersonal', 'personal + impersonal', 'personal + cluster']
model_means = []

for user_id in wisdm.user_ids:
    if user_id not in bad_user_ids:
        mean_scores = getModelAccuracyMean(user_id, 10)
        mean_scores['user id'] = user_id
        model_means.append(mean_scores)
    
scores_df = pd.DataFrame(model_means, columns=model_means_columns)

In [25]:
scores_df.describe()

Unnamed: 0,personal,impersonal,personal + impersonal,personal + cluster
count,35.0,35.0,35.0,35.0
mean,0.893113,0.572091,0.847205,0.849219
std,0.113876,0.316139,0.15455,0.147352
min,0.515945,0.0,0.333123,0.376981
25%,0.813862,0.329472,0.75842,0.781767
50%,0.89996,0.566574,0.88,0.86
75%,1.0,0.822562,1.0,0.985669
max,1.0,1.0,1.0,1.0


In [17]:
users_benefit_from_personal = []
users_benefit_from_impersonal = []
users_benefit_from_personal_plus_impersonal = []
users_benefit_from_personal_plus_cluster = []

for ind, row in scores_df.iterrows():
    scores = [row['personal'], row['impersonal'], row['personal + impersonal'], row['personal + cluster']]
    best_model = np.argmax(scores)
    
    if best_model == 0:
        users_benefit_from_personal.append(row['user_id'])
    elif best_model == 1:
        users_benefit_from_impersonal.append(row['user_id'])
    elif best_model == 2:
        users_benefit_from_personal_plus_impersonal.append(row['user_id'])
    elif best_model == 3:
        users_benefit_from_personal_plus_cluster.append(row['user_id'])

In [18]:
print("Personal was best : %s" % len(users_benefit_from_personal))
print("Impersonal was best : %s" % len(users_benefit_from_impersonal))
print("Personal + Impersonal was best : %s" % len(users_benefit_from_personal_plus_impersonal))
print("Personal + Cluster was best : %s" % len(users_benefit_from_personal_plus_cluster))


Personal was best : 47
Impersonal was best : 0
Personal + Impersonal was best : 6
Personal + Cluster was best : 1


In [22]:
def getBests(training_size):
    model_means_columns = ['user id', 'personal', 'impersonal', 'personal + impersonal', 'personal + cluster']
    model_means = []

    for user_id in wisdm.user_ids:
        if user_id not in bad_user_ids:
            mean_scores = getModelAccuracyMean(user_id, training_size)
            mean_scores['user id'] = user_id
            model_means.append(mean_scores)

    scores_df = pd.DataFrame(model_means, columns=model_means_columns)
    
    users_benefit_from_personal = []
    users_benefit_from_impersonal = []
    users_benefit_from_personal_plus_impersonal = []
    users_benefit_from_personal_plus_cluster = []

    for ind, row in scores_df.iterrows():
        scores = [row['personal'], row['impersonal'], row['personal + impersonal'], row['personal + cluster']]
        best_model = np.argmax(scores)

        if best_model == 0:
            users_benefit_from_personal.append(row['user id'])
        elif best_model == 1:
            users_benefit_from_impersonal.append(row['user id'])
        elif best_model == 2:
            users_benefit_from_personal_plus_impersonal.append(row['user id'])
        elif best_model == 3:
            users_benefit_from_personal_plus_cluster.append(row['user id'])
    return users_benefit_from_personal, users_benefit_from_impersonal, \
            users_benefit_from_personal_plus_impersonal, users_benefit_from_personal_plus_cluster

In [23]:
training_sizes = [10,20,30,40,50,60,70,80,90,100]

personal_bests = []
impersonal_bests = []
personal_impersonal_bests = []
personal_cluster_bests = []

for ts in training_sizes:
    personal, impersonal, personal_impersonal, personal_cluster = getBests(ts)
    
    personal_bests.append(personal)
    impersonal_bests.append(impersonal)
    personal_impersonal_bests.append(personal_impersonal)
    personal_cluster_bests.append(personal_cluster)
    
    print("Training Size : %s" % ts)
    print("\t personal : %s" % len(personal))
    print("\t impersonal : %s" % len(impersonal))
    print("\t personal + impersonal : %s" % len(personal_impersonal))
    print("\t personal + cluster : %s" % len(personal_cluster))

Training Size : 10
	 personal : 32
	 impersonal : 0
	 personal + impersonal : 6
	 personal + cluster : 1
Training Size : 20
	 personal : 31
	 impersonal : 1
	 personal + impersonal : 4
	 personal + cluster : 3
Training Size : 30
	 personal : 32
	 impersonal : 0
	 personal + impersonal : 5
	 personal + cluster : 2
Training Size : 40
	 personal : 31
	 impersonal : 1
	 personal + impersonal : 4
	 personal + cluster : 3
Training Size : 50
	 personal : 33
	 impersonal : 1
	 personal + impersonal : 2
	 personal + cluster : 3
Training Size : 60
	 personal : 32
	 impersonal : 1
	 personal + impersonal : 2
	 personal + cluster : 4
Training Size : 70
	 personal : 32
	 impersonal : 1
	 personal + impersonal : 2
	 personal + cluster : 4
Training Size : 80
	 personal : 33
	 impersonal : 1
	 personal + impersonal : 2
	 personal + cluster : 3
Training Size : 90
	 personal : 33
	 impersonal : 1
	 personal + impersonal : 2
	 personal + cluster : 3
Training Size : 100
	 personal : 33
	 impersonal : 0
	 

In [28]:
personal_trace = Scatter(x=training_sizes,
                     y=[len(x) for x in personal_bests],
                     name="Personal")
impersonal_trace = Scatter(x=training_sizes,
                       y=[len(x) for x in impersonal_bests],
                       name="Impersonal")
personal_impersonal_trace = Scatter(x=training_sizes,
                       y=[len(x) for x in personal_impersonal_bests],
                       name="Personal + Impersonal")
personal_cluster_trace = Scatter(x=training_sizes,
                       y=[len(x) for x in personal_cluster_bests],
                       name="Personal + Cluster")

data = [personal_trace, impersonal_trace, personal_impersonal_trace, personal_cluster_trace]
layout=Layout(yaxis=dict(range=[0,40]))
fig = Figure(data=data, layout=layout)
iplot(fig, filename="user_model_bests")