# Visualize the results of each model in aggregate

In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly.graph_objs import *

In [2]:
import pandas as pd
import numpy as np

In [3]:
from wisdm import wisdm
#wisdm.WISDM_DIR = wisdm.wisdm_v2_dataset_path
#wisdm.WISDM_TRANSFORMED = wisdm.wisdm_transformed_v2
wisdm.set_data()

In [6]:
results = []
for user_id in wisdm.user_ids:
    try:
        user_results_df = pd.read_pickle("./results/experiment_08-17/"+user_id+".pickle")
        results.append(user_results_df)
    except FileNotFoundError as fnfe:
        print("%s not found : user may not have had enough labeled data" % user_id)
        pass
        
results_df = pd.concat(results).reset_index(drop=True)


In [7]:
results_df.describe()

Unnamed: 0,impersonal score Mean,impersonal score STD,k-run,personal + cluster score Mean,personal + cluster score STD,personal + impersonal score Mean,personal + impersonal score STD,personal score Mean,personal score STD,personal training data
count,3203.0,3203.0,3203.0,3203.0,3203.0,3203.0,3203.0,3203.0,3203.0,3203.0
mean,0.759675,0.0,4.542304,0.887214,0.037908,0.895351,0.03536393,0.881412,0.03681312,52.425851
std,0.188025,0.0,2.860652,0.084171,0.030518,0.087472,0.0303017,0.088221,0.03197226,28.067305
min,0.0,0.0,0.0,0.45,0.0,0.288889,0.0,0.4,0.0,10.0
25%,0.727273,0.0,2.0,0.833333,0.021053,0.847059,1.110223e-16,0.833333,1.110223e-16,30.0
50%,0.8,0.0,5.0,0.894118,0.036364,0.909091,0.03333333,0.890909,0.03499271,50.0
75%,0.866667,0.0,7.0,0.953846,0.053452,0.96,0.04988877,0.944444,0.05345225,80.0
max,1.0,0.0,9.0,1.0,0.326599,1.0,0.2981424,1.0,0.2,100.0


In [8]:
training_sizes = [10,20,30,40,50,60,70,80,90,100]

personal_means = {}
personal_plus_universal_means = {}
personal_plus_cluster_means = {}
universal_means = results_df['impersonal score Mean']

for ts in training_sizes:
    personal_scores = results_df[results_df['personal training data'] == ts]['personal score Mean']
    personal_means[ts] = personal_scores
    
    personal_plus_universal_scores = results_df[results_df['personal training data'] == ts]['personal + impersonal score Mean']
    personal_plus_universal_means[ts] = personal_plus_universal_scores
    
    personal_plus_cluster_scores = results_df[results_df['personal training data'] == ts]['personal + cluster score Mean']
    personal_plus_cluster_means[ts] = personal_plus_cluster_scores

In [9]:
universal_trace = Box(y=universal_means,
                      x=[0]*len(universal_means),
                      name="universal model",
                      boxpoints='suspectedoutliers',
                      )

data = [universal_trace]

personal_x = []
personal_plus_universal_x = []
personal_plus_cluster_x = []

personal_means_all = []
personal_plus_universal_means_all = []
personal_plus_cluster_means_all = []

for ts in training_sizes:
    personal_means_all += personal_means[ts].tolist()
    personal_x += [ts] * len(personal_means[ts])
    
    personal_plus_universal_means_all += personal_plus_universal_means[ts].tolist()
    personal_plus_universal_x += [ts] *len(personal_plus_universal_means[ts])
    
    personal_plus_cluster_means_all += personal_plus_cluster_means[ts].tolist()
    personal_plus_cluster_x += [ts] * len(personal_plus_cluster_means[ts])

personal_trace = Box(y=personal_means_all,
                     x=personal_x,
                     name="Personal",
                     boxpoints="suspectedoutliers")

data.append(personal_trace)

personal_plus_universal_trace = Box(y=personal_plus_universal_means_all,
                                    x=personal_plus_universal_x,
                                    name="Personal + Universal",
                                    boxpoints="suspectedoutliers")

data.append(personal_plus_universal_trace)

personal_plus_cluster_trace = Box(y=personal_plus_cluster_means_all,
                                    x=personal_plus_cluster_x,
                                    name="Personal + Cluster",
                                    boxpoints="suspectedoutliers")

data.append(personal_plus_cluster_trace)

layout = Layout(showlegend=True, boxmode='group')
fig = Figure(data=data, layout=layout)

iplot(fig, filename="boxplots_of_wisdm_models")

# By User

In [10]:
def plotScoresByUser(user_id):
    personal_means = {}
    personal_plus_universal_means = {}
    personal_plus_cluster_means = {}
    universal_means = results_df['impersonal score Mean']

    for ts in training_sizes:
        personal_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal score Mean']
        personal_means[ts] = personal_scores

        personal_plus_universal_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal + impersonal score Mean']
        personal_plus_universal_means[ts] = personal_plus_universal_scores

        personal_plus_cluster_scores = results_df[(results_df['personal training data'] == ts) & \
                                     (results_df['test user'] == user_id)]['personal + cluster score Mean']
        personal_plus_cluster_means[ts] = personal_plus_cluster_scores
    
    
    universal_trace = Box(y=universal_means,
                      x=[0]*len(universal_means),
                      name="universal model",
                      boxpoints='suspectedoutliers',
                      )

    data = [universal_trace]

    personal_x = []
    personal_plus_universal_x = []
    personal_plus_cluster_x = []

    personal_means_all = []
    personal_plus_universal_means_all = []
    personal_plus_cluster_means_all = []

    for ts in training_sizes:
        personal_means_all += personal_means[ts].tolist()
        personal_x += [ts] * len(personal_means[ts])

        personal_plus_universal_means_all += personal_plus_universal_means[ts].tolist()
        personal_plus_universal_x += [ts] *len(personal_plus_universal_means[ts])

        personal_plus_cluster_means_all += personal_plus_cluster_means[ts].tolist()
        personal_plus_cluster_x += [ts] * len(personal_plus_cluster_means[ts])

    personal_trace = Box(y=personal_means_all,
                         x=personal_x,
                         name="Personal",
                         boxpoints="suspectedoutliers")

    data.append(personal_trace)

    personal_plus_universal_trace = Box(y=personal_plus_universal_means_all,
                                        x=personal_plus_universal_x,
                                        name="Personal + Universal",
                                        boxpoints="suspectedoutliers")

    data.append(personal_plus_universal_trace)

    personal_plus_cluster_trace = Box(y=personal_plus_cluster_means_all,
                                        x=personal_plus_cluster_x,
                                        name="Personal + Cluster",
                                        boxpoints="suspectedoutliers")

    data.append(personal_plus_cluster_trace)

    layout = Layout(showlegend=True, boxmode='group')
    fig = Figure(data=data, layout=layout)

    iplot(fig, filename="boxplots_of_wisdm_models")

In [11]:
plotScoresByUser(wisdm.user_ids[0])

For this first user we observe the following.
* the personal+cluster approach almost always does better, on average.
* the personal+cluster appraoch never does as poorly as either the universal+cluster, universal+personal
* the personal+cluster appraoch offers GREAT improvements over other approaches around 20-40 samples

In [12]:
plotScoresByUser(wisdm.user_ids[1])

For the second user, we oddly enough see that personal models and the personal+cluster model both outperform the personal+universal model regardless of the amount of personal data and by a great deal

In [13]:
plotScoresByUser(wisdm.user_ids[2])

Here, we see a different story again.  The personal+universal model not only does well on average, it's worst case scenario is occasionally better than those of the personal or personal+cluster models.

# Select users who get the most from each kind of model

In [14]:
test_df = results_df[results_df['test user']==wisdm.user_ids[0]]

In [15]:
test_df.tail()

Unnamed: 0,classifier,impersonal score Mean,impersonal score STD,k-run,personal + cluster score Mean,personal + cluster score STD,personal + impersonal score Mean,personal + impersonal score STD,personal score Mean,personal score STD,personal training data,test user
95,RF with Wiki Parameters,0.666667,0.0,9,0.916667,0.052705,0.9,0.062361,0.866667,0.040825,60,33
96,RF with Wiki Parameters,0.75,0.0,9,0.916667,0.052705,0.866667,0.084984,0.9,0.062361,70,33
97,RF with Wiki Parameters,0.75,0.0,9,0.95,0.040825,0.883333,0.040825,0.966667,0.040825,80,33
98,RF with Wiki Parameters,0.666667,0.0,9,0.933333,0.062361,0.933333,0.062361,0.933333,0.033333,90,33
99,RF with Wiki Parameters,0.666667,0.0,9,1.0,0.0,0.966667,0.066667,0.95,0.040825,100,33


In [14]:
def getModelComparisons(user_id):
    # each list element at index, i, represents that model improvement over the best other model with training size[k]
    personal_plus_universal_improvements = []
    personal_plus_cluster_improvements = []
    personal_improvements = []

    for ts in training_sizes:
        # get array for scores for a model where each element at index, k, represents the k-th run
        ts_df = 

SyntaxError: invalid syntax (<ipython-input-14-c3eadb5c1f55>, line 9)

# junk

In [15]:
x = (-31/23)

def eff(x):
    return (x**3) - x + 1

def eff_prime(x):
    return 3*(x**2) - 1

x - (eff(x)/eff_prime(x))

-1.325200398950907