# ToDo :

* ~~consolidate code from notebook into WISDM helper methods file~~
* ~~rewrite kfolds process so that we have more precise control over the size of a fold (as oppose to just the number of folds)~~
* ~~run experiment that directly compares model using ALL general data + active data to model using ONLY general data from nearest cluster + active data~~
* analyze/visualize clusters (is there a better algorithm? is there a better k for the k-means?) 
* perhaps compare with using the WORST cluster, or using ONLY the personal data
    * for each size of active data
        * for each algorithm (also ensemble algorithm?)
            * personal only
            * universal only
            * personal + ALL universal
            * personal + best cluster universal
            * personal + worst cluster universal
* Run experiment trained on v1.1 applied to v2.0 data


In [1]:
from wisdm import wisdm
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.cluster import KMeans
from scipy.stats import mode

import warnings
import time

In [2]:
wisdm.set_data(version='2', make_compatible=True)

# Test Universal Impersonal Model Pipeline

In [3]:
start = time.time()

In [None]:
rf_results = []

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for user_id in wisdm.user_ids:
        test_set = wisdm.get_user_set(user_id)
        test_set = wisdm.remove_all_nan(test_set)

        test_labels = np.array([t.decode("utf-8") for t in test_set['class'].as_matrix()])
        test_features = test_set.as_matrix(columns=[test_set.columns[1:-1]])

        #print("%s labels, %s features" % (len(test_labels), len(test_features)))
        # training features & labels
        training_set = wisdm.data_df[wisdm.data_df['user'] != user_id]
        training_set = wisdm.remove_all_nan(training_set)
        training_labels = np.array([t.decode("utf-8") for t in training_set['class'].as_matrix()])
        training_features = training_set.as_matrix(columns=[test_set.columns[1:-1]])
        
        try:
            # normalize features
            scaler = StandardScaler().fit(training_features)
            scaled_train_x = scaler.transform(training_features)
            scaled_test_x = scaler.transform(test_features)

            clf = wisdm.weka_RF()
            clf.fit(scaled_train_x, training_labels)
            predictions = clf.predict(scaled_test_x)

            score = accuracy_score(test_labels, predictions)
            print("Score for %s: %.3f" % (user_id, score))
            rf_results.append(score)
        except ValueError as ve:
            if "while a minimum of 1 is required by StandardScaler" in ve.args[0]:
                print("Not enough data for user #%s" % user_id)
                continue
            else:
                raise ve

print("RF results : M=%.5f, SD=%.5f" % (np.mean(rf_results), np.std(rf_results)))

In [None]:
finish = time.time()
print("Took about %s seconds" % (finish - start))

# Setup Parallelization

In [2]:
import ipyparallel as ipp

In [3]:
c = ipp.Client()

In [4]:
dview = c[:]

In [5]:
%%px --local
import sys
sys.path.append("/home/sac086/wisdm_model_personalization/")
import warnings
import os
from wisdm import wisdm
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.cluster import KMeans
from scipy.stats import mode
from collections import Counter
import time

In [6]:
%%px --local

wisdm.set_data(version="2")

# Cross-Validation with exact number of data points

### Experiment Attributes : 
* amount of training set from individual end user
* amount of training set from impersonal data (other users)
    * "ALL" all other data
    * "closest cluster" only data from the closest cluster
    * "furthest cluster" only data from the furthest cluster
    * "All - furthest cluster" all other data EXCEPT data from furthest cluster
* test user id
* algorithm
* algorithm parameters



In [7]:
%%px --local
def personal_model(active_features, active_labels, test_features=None, test_labels=None):
    scaler = StandardScaler().fit(active_features)
    scaled_train_x = scaler.transform(active_features)
    
    rfc_clf = wisdm.weka_RF()
    rfc_clf.fit(scaled_train_x, active_labels)
    
    if test_features is None:
        return rfc_clf
    
    scaled_test_x = scaler.transform(test_features)
    predictions = rfc_clf.predict(scaled_test_x)
    score = accuracy_score(test_labels, predictions)
    return score

In [68]:
%%px --local

# Rewrite these so that if there are no test_features or labels, 
# then return the fit model itself
def impersonal_model(impersonal_features, impersonal_labels, test_features, test_labels):
    scaler = StandardScaler().fit(impersonal_features)
    scaled_train_x = scaler.transform(impersonal_features)
    
    rfc_clf = wisdm.weka_RF()
    rfc_clf.fit(scaled_train_x, impersonal_labels)
    
    if test_features is None:
        return rfc_clf
    scaled_test_x = scaler.transform(test_features)

    
    predictions = rfc_clf.predict(scaled_test_x)
    score = accuracy_score(test_labels, predictions)

    return score

KeyboardInterrupt: 

In [9]:
%%px --local

def universal_plus_personal_model(personal_features, personal_labels,
                                  universal_features, universal_labels,
                                  test_features, test_labels):
    personal_plus_universal_features = np.vstack((personal_features, universal_features))
    personal_plus_universal_labels = np.hstack((personal_labels, universal_labels))

    scaler = StandardScaler().fit(personal_plus_universal_features)
    scaled_train_x = scaler.transform(personal_plus_universal_features)
    scaled_test_x = scaler.transform(test_features)

    rfc_clf = wisdm.weka_RF()

    rfc_clf.fit(scaled_train_x, personal_plus_universal_labels)
    predictions = rfc_clf.predict(scaled_test_x)
    score = accuracy_score(test_labels, predictions)
    return score

In [10]:
%%px --local

def cluster_plus_personal_model(personal_features, personal_labels,
                                  universal_features, universal_labels,
                                  test_features, test_labels, KM, clusters):
    cluster_predictions = KM.predict(personal_features)
    closest_cluster = mode(cluster_predictions).mode[0]

    cluster_data_indeces = [i for i in range(len(clusters)) if clusters[i] == closest_cluster]
    cluster_features = universal_features[cluster_data_indeces]
    cluster_labels = universal_labels[cluster_data_indeces]

    training_features = np.vstack((personal_features, cluster_features))
    training_labels = np.hstack((personal_labels, cluster_labels))

    scaler = StandardScaler().fit(training_features)
    scaled_train_x = scaler.transform(training_features)
    scaled_test_x = scaler.transform(test_features)

    rfc_clf = wisdm.weka_RF()

    rfc_clf.fit(scaled_train_x, training_labels)
    predictions = rfc_clf.predict(scaled_test_x)
    score = accuracy_score(test_labels, predictions)
    return score

In [11]:
%%px --local
experiment_output_path = "/home/sac086/wisdm_model_personalization/results/experiment_08-24_train_v2/"
if not os.path.exists(experiment_output_path):
    os.makedirs(experiment_output_path)

In [18]:
%%px --local
training_sizes = [10,20,30,40,50,60,70,80,90,100]

def all_models():
    scores = []
    err = None
    training_sizes = [10,20,30,40,50,60,70,80,90,100]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for ind, user_id in enumerate(user_ids):
            user_scores_df = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = wisdm.get_user_set(user_id)
            personal_set = wisdm.remove_all_nan(personal_set)
            
            if len(personal_set) < 40:
                print("User does not have enough labeled data")
                continue

            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].as_matrix()])
            personal_features = personal_set.as_matrix(columns=[personal_set.columns[1:-1]])

            # What is the distribution of labels for this participant?
            personal_labels_distribution = Counter(personal_labels)
            print("\tHas %s labels : " % len(personal_labels))
            for label_key, number in personal_labels_distribution.items():
                print("\t\t%s:%s" % (label_key, number))
            print("\n")
            # training features & labels
            universal_set = wisdm.data_df[wisdm.data_df['user'] != user_id]
            universal_set = wisdm.remove_all_nan(universal_set)
            universal_labels = np.array([t.decode("utf-8") for t in universal_set['class'].as_matrix()])
            universal_features = universal_set.as_matrix(columns=[universal_set.columns[1:-1]])

            # get k-means clusters
            number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

            KM = KMeans(n_clusters=number_of_clusters)
            clusters = KM.fit_predict(universal_features)
            k = 10

            skf = StratifiedKFold(n_splits=k)

            k_run = 0
            try:
                for active_index, test_index in skf.split(personal_features, personal_labels):
                    print("\tRunning Fold #%s\n" % k_run)
                    # data set available for active labeling from the individual
                    all_active_features = personal_features[active_index]
                    all_active_labels = personal_labels[active_index]


                    # held out test set from individual
                    test_features = personal_features[test_index]
                    test_labels = personal_labels[test_index]

                    # iterate through size of training data
                    for training_size in training_sizes:
                        # initialize score holders
                        personal_model_scores = []
                        universal_model_scores = []
                        personal_plus_all_scores = []
                        personal_plus_cluster_scores = []

                        # run universal model
                        universal_model_score = universal_model(universal_features, universal_labels,
                                                                test_features, test_labels)
                        universal_model_scores.append(universal_model_score)

                        sss = StratifiedShuffleSplit(n_splits=5, train_size=training_size)

                        splits = sss.split(all_active_features, all_active_labels)

                        try:
                            for split_num, split_tup in enumerate(splits):
                                sampled_active_index, __ = split_tup
                                sampled_active_features = personal_features[sampled_active_index]
                                sampled_active_labels = personal_labels[sampled_active_index]

                                # run personal model
                                personal_score = personal_model(sampled_active_features, sampled_active_labels, test_features, test_labels)
                                personal_model_scores.append(personal_score)

                                # run personal + universal
                                personal_plus_all_score = universal_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        universal_features, universal_labels,
                                                                                        test_features, test_labels)
                                personal_plus_all_scores.append(personal_plus_all_score)

                                # run personal + cluster
                                personal_plus_cluster_score = cluster_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        universal_features, universal_labels,
                                                                                        test_features, test_labels, KM, clusters)
                                personal_plus_cluster_scores.append(personal_plus_cluster_score)
                        except ValueError as ve:
                            print("Error with training size while trying to split personal data")
                            print("Message : %s" % ve.args[0])
                            err = ve
                            if "Reduce test_size and/or train_size" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "should be smaller than the number of samples" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "The least populated class in y has only 1 member" in ve.args[0]:
                                print("continuing...")
                                continue
                            else:
                                raise(ve)

                        row = {"test user" : user_id,
                               "k-run" : k_run,
                           "classifier" : "RF with Wiki Parameters",
                           "personal training data" : training_size,
                           "personal score Mean" : np.mean(personal_model_scores),
                           "personal score STD" : np.std(personal_model_scores),
                           "impersonal score Mean" : np.mean(universal_model_scores),
                           "impersonal score STD" : np.std(universal_model_scores),
                           "personal + impersonal score Mean" : np.mean(personal_plus_all_scores),
                           "personal + impersonal score STD" : np.std(personal_plus_all_scores),
                           "personal + cluster score Mean" : np.mean(personal_plus_cluster_scores),
                           "personal + cluster score STD" : np.std(personal_plus_cluster_scores)
                           }
                        print("\tamount of personal data : %s row" % training_size)
                        print("\tpersonal model score : M=%.3f, SD=%.3f" % (row["personal score Mean"], row["personal score STD"]))
                        print("\tuniversal model score : M=%.3f, SD=%.3f" % (row["impersonal score Mean"], row["impersonal score STD"]))
                        print("\tpersonal + ALL universal : M=%.3f, SD=%.3f" % (row["personal + impersonal score Mean"], row["personal + impersonal score STD"]))
                        print("\tpersonal + CLUSTER universal : M=%.3f, SD=%.3f" % (row["personal + cluster score Mean"], row["personal + cluster score STD"]))
                        print("\n")
                        scores.append(row)
                        user_scores_df.append(row)
                    k_run += 1
            except ValueError as ve:
                if "Cannot have number of splits n_splits" in ve.args[0]:
                    print("Skipping this k-fold because there is not enough data...")
                    continue
                else:
                    raise ve
            user_scores_df = pd.DataFrame(user_scores_df)
            user_scores_df.to_pickle("/home/sac086/wisdm_model_personalization/results/experiment_08-24_v2_dataset/"+user_id+".pickle")

    scores_df = pd.DataFrame(scores)

In [13]:
scatter_result = dview.scatter("user_ids", wisdm.user_ids)

In [None]:
start = time.time()
dview.block = True
results = dview.execute("all_models()")
finish = time.time()
print("Finished all models in %s seconds" % (finish - start))

In [None]:
results.display_outputs()

# Train on v1.1, run on v2.0

In [14]:
%%px --local
experiment_output_path = "/home/sac086/wisdm_model_personalization/results/experiment_08-24_train_v1_test_v2/"
if not os.path.exists(experiment_output_path):
    os.makedirs(experiment_output_path)

In [51]:
%%px --local
training_sizes = [10,20,30,40,50,60,70,80,90,100]
k = 10

def all_models():
    scores = []
    err = None
    training_sizes = [10,20,30,40,50,60,70,80,90,100]
    
    # Train model with v1.1 data and get clusterings
    wisdm.set_data(version='1', make_compatible=True)
    
    data_df_v1 = wisdm.remove_all_nan(wisdm.data_df)
    user_ids_v1 = wisdm.user_ids
    
    impersonal_labels = np.array([t.decode("utf-8") for t in data_df_v1['class'].as_matrix()])
    impersonal_features = data_df_v1.as_matrix(columns=[data_df_v1.columns[1:-1]])
    
    # train an impersonal model
    impersonal_scaler = StandardScaler().fit(impersonal_features)
    scaled_train_x = impersonal_scaler.transform(impersonal_features)

    rfc_clf = wisdm.weka_RF()
    rfc_clf.fit(scaled_train_x, impersonal_labels)
    
    # create clusters
    number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

    KM = KMeans(n_clusters=number_of_clusters)
    clusters = KM.fit_predict(scaled_train_x) # NOTE!!! <--- Really not sure if I should scale these before clustering
    
    # reset data back to v2.0
    wisdm.set_data(version="2", make_compatible=True)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for ind, user_id in enumerate(user_ids):
            user_scores_df = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = wisdm.get_user_set(user_id)
            personal_set = wisdm.remove_all_nan(personal_set)
            print("%s personal samples" % len(personal_set))

            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].as_matrix()])
            personal_features = personal_set.as_matrix(columns=[personal_set.columns[1:-1]])

            # What is the distribution of labels for this participant?
            #personal_labels_distribution = Counter(personal_labels)
            #print("\tHas %s labels : " % len(personal_labels))
            #for label_key, number in personal_labels_distribution.items():
            #    print("\t\t%s:%s" % (label_key, number))
            #print("\n")
            
            skf = StratifiedKFold(n_splits=k)

            k_run = 0
            try:
                for active_index, test_index in skf.split(personal_features, personal_labels):
                    #print("\tRunning Fold #%s\n" % k_run)
                    # data set available for active labeling from the individual
                    all_active_features = personal_features[active_index]
                    all_active_labels = personal_labels[active_index]

                    # held out test set from individual
                    test_features = personal_features[test_index]
                    test_labels = personal_labels[test_index]

                    # iterate through size of training data
                    for training_size in training_sizes:
                        # initialize score holders
                        personal_model_scores = []
                        universal_model_scores = []
                        personal_plus_all_scores = []
                        personal_plus_cluster_scores = []

                        # run universal model
                        impersonal_scaled_test_x = impersonal_scaler.transform(test_features)
                        #universal_model_score = universal_model(universal_features, universal_labels,
                        #                                        impersonal_scaled_test_x, test_labels)
                        universal_model_score = accuracy_score(test_labels, rfc_clf.predict(impersonal_scaled_test_x))
                        universal_model_scores.append(universal_model_score)

                        sss = StratifiedShuffleSplit(n_splits=5, train_size=training_size)

                        splits = sss.split(all_active_features, all_active_labels)

                        try:
                            for split_num, split_tup in enumerate(splits):
                                sampled_active_index, __ = split_tup
                                sampled_active_features = personal_features[sampled_active_index]
                                sampled_active_labels = personal_labels[sampled_active_index]

                                # run personal model
                                personal_score = personal_model(sampled_active_features, sampled_active_labels, test_features, test_labels)
                                personal_model_scores.append(personal_score)

                                # run personal + universal
                                personal_plus_all_score = universal_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        impersonal_features, impersonal_labels,
                                                                                        test_features, test_labels)
                                personal_plus_all_scores.append(personal_plus_all_score)

                                # run personal + cluster
                                personal_plus_cluster_score = cluster_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        impersonal_features, impersonal_labels,
                                                                                        test_features, test_labels, KM, clusters)
                                personal_plus_cluster_scores.append(personal_plus_cluster_score)
                        except ValueError as ve:
                            print("Error with training size while trying to split personal data")
                            print("Message : %s" % ve.args[0])
                            err = ve
                            if "Reduce test_size and/or train_size" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "should be smaller than the number of samples" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "The least populated class in y has only 1 member" in ve.args[0]:
                                print("continuing...")
                                continue
                            else:
                                raise(ve)

                        row = {"test user" : user_id,
                               "k-run" : k_run,
                           "classifier" : "RF with Wiki Parameters",
                           "personal training data" : training_size,
                           "personal score Mean" : np.mean(personal_model_scores),
                           "personal score STD" : np.std(personal_model_scores),
                           "impersonal score Mean" : np.mean(universal_model_scores),
                           "impersonal score STD" : np.std(universal_model_scores),
                           "personal + impersonal score Mean" : np.mean(personal_plus_all_scores),
                           "personal + impersonal score STD" : np.std(personal_plus_all_scores),
                           "personal + cluster score Mean" : np.mean(personal_plus_cluster_scores),
                           "personal + cluster score STD" : np.std(personal_plus_cluster_scores)
                           }
                        print("\tamount of personal data : %s row" % training_size)
                        print("\tpersonal model score : M=%.3f, SD=%.3f" % (row["personal score Mean"], row["personal score STD"]))
                        print("\tuniversal model score : M=%.3f, SD=%.3f" % (row["impersonal score Mean"], row["impersonal score STD"]))
                        print("\tpersonal + ALL universal : M=%.3f, SD=%.3f" % (row["personal + impersonal score Mean"], row["personal + impersonal score STD"]))
                        print("\tpersonal + CLUSTER universal : M=%.3f, SD=%.3f" % (row["personal + cluster score Mean"], row["personal + cluster score STD"]))
                        print("\n")
                        scores.append(row)
                        user_scores_df.append(row)
                    k_run += 1
            except ValueError as ve:
                if "Cannot have number of splits n_splits" in ve.args[0]:
                    print("Skipping this k-fold (%s) because there is not enough data..." % k_run)
                    continue
                elif "while a minimum of 1 is required" in ve.args[0]:
                    print("Skipping this k-fold because there is not enough data...")
                    continue
                else:
                    raise ve
            user_scores_df = pd.DataFrame(user_scores_df)
            user_scores_df.to_pickle(experiment_output_path+user_id+".pickle")

    scores_df = pd.DataFrame(scores)

In [20]:
scatter_result = dview.scatter("user_ids", wisdm.user_ids)

In [21]:
start = time.time()
dview.block = True
results = dview.execute("all_models()")
finish = time.time()
print("Finished all models in %s minutes" % ((finish - start) / 60.)

Finished all models in 1690.7465000152588 seconds


In [22]:
results.display_outputs()

[stdout:0] 
Running user #0: 194
168 personal samples
	amount of personal data : 10 row
	personal model score : M=0.884, SD=0.070
	universal model score : M=0.737, SD=0.000
	personal + ALL universal : M=0.979, SD=0.026
	personal + CLUSTER universal : M=0.968, SD=0.026


	amount of personal data : 20 row
	personal model score : M=0.895, SD=0.033
	universal model score : M=0.737, SD=0.000
	personal + ALL universal : M=0.958, SD=0.021
	personal + CLUSTER universal : M=0.916, SD=0.026


	amount of personal data : 30 row
	personal model score : M=0.905, SD=0.021
	universal model score : M=0.737, SD=0.000
	personal + ALL universal : M=0.958, SD=0.039
	personal + CLUSTER universal : M=0.926, SD=0.042


	amount of personal data : 40 row
	personal model score : M=0.958, SD=0.039
	universal model score : M=0.737, SD=0.000
	personal + ALL universal : M=0.979, SD=0.026
	personal + CLUSTER universal : M=0.947, SD=0.033


	amount of personal data : 50 row
	personal model score : M=0.937, SD=0.039
	u

#  Active Learning Experiments (Uncertainty Sampling with Random Forest)

Ideas for experiments:
1. Simply vary the uncertainty threshold to see how this affects the number of labels elicited and the result accuracies
2. Same as #1 but incorporate samples with label of certainty into training set.
3. Same as #2, but also propogate labels to unlabeled sample set and incorporate into training

### Active Learning Experiment 1

Uses a stream sampling technique in which a threshold for how far the most probable class label can be from a uniform distribution is used to determine model certainty

In [56]:
%%px --local
experiment_name = "experiment_08-29_train_v1_test_v2_active1/"

description = '''
Training and calibrating probability estimations on the WISDM v1.1 dataset. 
Probability estimation was done with stream sampling where those samples whose most probable class label
was did not exceed some threshold (where the threshold is some amount greater than the uniform probability for the class) were
actively labeled.  Testing on WISDM 2.0 with one participant held out'''


experiment_output_path = "/home/sac086/wisdm_model_personalization/results/" + experiment_name
if not os.path.exists(experiment_output_path):
    os.makedirs(experiment_output_path)
with open(experiment_output_path + "README.md", "w") as fOut:
    fOut.write(description)

In [57]:
%%px --local
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss, brier_score_loss

In [58]:
%%px --local
def uncertainty_stream_sampling(features, model, uncertainty_threshold):
    '''returns two-tuple with predictions as the first element and
       indeces of uncertain predictions as the second'''
    probabilities = model.predict_proba(features)
    uniform_prob = 1 / len(model.classes_)

    uncertain_indeces = []
    predictions = []
    for ind, prob in enumerate(probabilities):
        max_prob = np.max(prob)
        predictions.append(prob.argmax())
        if uncertainty_threshold > (max_prob - uniform_prob): # if the most probable class isn't much more probable than a uniform probability
            uncertain_indeces.append(ind)

    return predictions, uncertain_indeces

In [59]:
%%px --local
#training_sizes = [10,20,30,40,50,60,70,80,90,100]
uncertainty_thresholds = np.arange(0.2, 0.8, 0.05)
k = 10

def active_learning_experiment1():
    scores = []
    err = None
    training_sizes = [10,20,30,40,50,60,70,80,90,100]
    
    # Train model with v1.1 data and get clusterings
    wisdm.set_data(version='1', make_compatible=True)
    
    data_df_v1 = wisdm.remove_all_nan(wisdm.data_df)
    user_ids_v1 = wisdm.user_ids
    
    impersonal_labels = np.array([t.decode("utf-8") for t in data_df_v1['class'].as_matrix()])
    impersonal_features = data_df_v1.as_matrix(columns=[data_df_v1.columns[1:-1]])
    
    # train an impersonal model
    impersonal_scaler = StandardScaler().fit(impersonal_features)
    scaled_train_x = impersonal_scaler.transform(impersonal_features)

    rfc_clf = wisdm.weka_RF()
    sig_clf = CalibratedClassifierCV(rfc_clf, method='sigmoid')
    sig_clf.fit(scaled_train_x, impersonal_labels)
    
    # create clusters
    number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

    KM = KMeans(n_clusters=number_of_clusters)
    clusters = KM.fit_predict(scaled_train_x) # NOTE!!! <--- Really not sure if I should scale these before clustering
    
    # reset data back to v2.0
    wisdm.set_data(version="2", make_compatible=True)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for ind, user_id in enumerate(user_ids):
            user_scores_df = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = wisdm.get_user_set(user_id)
            personal_set = wisdm.remove_all_nan(personal_set)
            print("%s personal samples" % len(personal_set))
            
            if len(personal_set) < 40:
                print("User does not have enough labeled data")
                continue

            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].as_matrix()])
            personal_features = personal_set.as_matrix(columns=[personal_set.columns[1:-1]])

            # What is the distribution of labels for this participant?
            #personal_labels_distribution = Counter(personal_labels)
            #print("\tHas %s labels : " % len(personal_labels))
            #for label_key, number in personal_labels_distribution.items():
            #    print("\t\t%s:%s" % (label_key, number))
            #print("\n")
            
            skf = StratifiedKFold(n_splits=k)

            k_run = 0
            
            for active_index, test_index in skf.split(personal_features, personal_labels):
                #print("\tRunning Fold #%s\n" % k_run)
                # data set available for active labeling from the individual
                all_active_features = personal_features[active_index]
                all_active_labels = personal_labels[active_index]


                # held out test set from individual
                test_features = personal_features[test_index]
                test_labels = personal_labels[test_index]

                personal_model_scores = []
                universal_model_scores = []
                personal_plus_all_scores = []
                personal_plus_cluster_scores = []

                # run universal model
                impersonal_scaled_test_x = impersonal_scaler.transform(test_features)
                universal_model_score = accuracy_score(test_labels, sig_clf.predict(impersonal_scaled_test_x))
                universal_model_scores.append(universal_model_score)

                for thresh in uncertainty_thresholds:
                    # determine active samples from uncertainty sampling
                    predictions, sampled_active_index = uncertainty_stream_sampling(all_active_features, sig_clf, thresh)
                    print("Threshold at %s, %s actively sampled labels" % (thresh, len(sampled_active_index)))
                    sampled_active_features = personal_features[sampled_active_index]
                    sampled_active_labels = personal_labels[sampled_active_index]
                    
                    if len(sampled_active_labels) < 1:
                        row = {"test user" : user_id,
                           "k-run" : k_run,
                       "classifier" : "RF with Wiki Parameters",
                       "personal training data" : len(sampled_active_index),
                       "uncertainty threshold" : thresh,
                       "personal score Mean" : np.nan,
                       "personal score STD" : np.nan,
                       "impersonal score Mean" : universal_model_score,
                       "impersonal score STD" : 0.0,
                       "personal + impersonal score Mean" :np.nan,
                       "personal + impersonal score STD" : np.nan,
                       "personal + cluster score Mean" : np.nan,
                       "personal + cluster score STD" : np.nan
                       }
                        scores.append(row)
                        user_scores_df.append(row)
                        continue

                    # make predictions
                    personal_score = personal_model(sampled_active_features, sampled_active_labels, test_features, test_labels)
                    personal_model_scores.append(personal_score)

                    # run personal + universal
                    personal_plus_all_score = universal_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                            impersonal_features, impersonal_labels,
                                                                            test_features, test_labels)
                    personal_plus_all_scores.append(personal_plus_all_score)

                    # run personal + cluster
                    personal_plus_cluster_score = cluster_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                            impersonal_features, impersonal_labels,
                                                                            test_features, test_labels, KM, clusters)
                    personal_plus_cluster_scores.append(personal_plus_cluster_score)
                    row = {"test user" : user_id,
                           "k-run" : k_run,
                       "classifier" : "RF with Wiki Parameters",
                       "personal training data" : len(sampled_active_index),
                       "uncertainty threshold" : thresh,
                       "personal score Mean" : np.mean(personal_model_scores),
                       "personal score STD" : np.std(personal_model_scores),
                       "impersonal score Mean" : np.mean(universal_model_scores),
                       "impersonal score STD" : np.std(universal_model_scores),
                       "personal + impersonal score Mean" : np.mean(personal_plus_all_scores),
                       "personal + impersonal score STD" : np.std(personal_plus_all_scores),
                       "personal + cluster score Mean" : np.mean(personal_plus_cluster_scores),
                       "personal + cluster score STD" : np.std(personal_plus_cluster_scores)
                       }
                    print("\tamount of personal data : %s samples" % len(sampled_active_index))
                    print("\tUncertainty threshold : %s" % thresh)
                    print("\tpersonal model score : M=%.3f, SD=%.3f" % (row["personal score Mean"], row["personal score STD"]))
                    print("\tuniversal model score : M=%.3f, SD=%.3f" % (row["impersonal score Mean"], row["impersonal score STD"]))
                    print("\tpersonal + ALL universal : M=%.3f, SD=%.3f" % (row["personal + impersonal score Mean"], row["personal + impersonal score STD"]))
                    print("\tpersonal + CLUSTER universal : M=%.3f, SD=%.3f" % (row["personal + cluster score Mean"], row["personal + cluster score STD"]))
                    print("\n")
                    scores.append(row)
                    user_scores_df.append(row)
                k_run += 1
            user_scores_df = pd.DataFrame(user_scores_df)
            user_scores_df.to_pickle(experiment_output_path+user_id+".pickle")

In [60]:
scatter_result = dview.scatter("user_ids", wisdm.user_ids)

In [61]:
start = time.time()
dview.block = True
results = dview.execute("active_learning_experiment1()")
finish = time.time()
print("Finished all models in %s minutes" % ((finish - start) / 60.))

Finished all models in 8.557817057768505 minutes


In [62]:
results.display_outputs()

[stdout:0] 
Running user #0: 194
168 personal samples
Threshold at 0.2, 0 actively sampled labels
Threshold at 0.25, 0 actively sampled labels
Threshold at 0.3, 24 actively sampled labels
	amount of personal data : 24 samples
	Uncertainty threshold : 0.3
	personal model score : M=0.368, SD=0.000
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=0.684, SD=0.000
	personal + CLUSTER universal : M=0.579, SD=0.000


Threshold at 0.35, 42 actively sampled labels
	amount of personal data : 42 samples
	Uncertainty threshold : 0.35
	personal model score : M=0.474, SD=0.105
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=0.737, SD=0.053
	personal + CLUSTER universal : M=0.553, SD=0.026


Threshold at 0.4, 56 actively sampled labels
	amount of personal data : 56 samples
	Uncertainty threshold : 0.4
	personal model score : M=0.509, SD=0.099
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=0.789, SD=0.086
	personal + CLUSTER un

### Active Learning 2
Uses an uncertainty sampling technique which looks at a pool of candidates and asks for a label for the *n* samples it is least confident in its prediction for.  While this technique allows for better control over the number of samples we allow it to query for, it also requires that we look at an entire pool of examples first instead of one by one as they are sampled. If the query needs to happen as the sample occurs, such as is likely necessary with activity recognition or most applications of context-aware computing, then this technique will need to be modified. We have a few ideas of how that can be done :

* If there is a way to determine how far back the user can remember their context, the samples from most recent until that point can be considered the pool of potential actively labeled samples.
* in activity recognition, there is a strong relationship between the activity at a point in time and the activity at the next point in time. 

In [63]:
%%px --local
experiment_name = "experiment_08-29_train_v1_test_v2_active2/"

description = '''
Training and calibrating probability estimations on the WISDM v1.1 dataset. 
Probability estimation was done using the n least certain examples for a given set of 
potentially actively labeled samples.  Testing on WISDM 2.0 with one participant held out'''


experiment_output_path = "/home/sac086/wisdm_model_personalization/results/" + experiment_name
if not os.path.exists(experiment_output_path):
    os.makedirs(experiment_output_path)
with open(experiment_output_path + "README.md", "w") as fOut:
    fOut.write(description)

In [64]:
%%px --local
def uncertainty_pool_sampling(features, model, number_of_samples):
    '''returns two-tuple with predictions as the first element and
       indeces of uncertain predictions as the second'''
    probabilities = model.predict_proba(features)
    uniform_prob = 1 / len(model.classes_)

    uniform_diffs = []
    predictions = []
    for ind, prob in enumerate(probabilities):
        max_prob = np.max(prob)
        predictions.append(prob.argmax())
        uniform_diff = max_prob - uniform_prob
        uniform_diffs.append(uniform_diff)
    
    uncertain_indeces = np.argsort(uniform_diffs)[:number_of_samples]

    return predictions, uncertain_indeces

In [65]:
%%px --local
training_sizes = [10,20,30,40,50,60,70,80,90,100]
k = 10

def active_learning_experiment2():
    scores = []
    err = None
    training_sizes = [10,20,30,40,50,60,70,80,90,100]
    
    # Train model with v1.1 data and get clusterings
    wisdm.set_data(version='1', make_compatible=True)
    
    data_df_v1 = wisdm.remove_all_nan(wisdm.data_df)
    user_ids_v1 = wisdm.user_ids
    
    impersonal_labels = np.array([t.decode("utf-8") for t in data_df_v1['class'].as_matrix()])
    impersonal_features = data_df_v1.as_matrix(columns=[data_df_v1.columns[1:-1]])
    
    # train an impersonal model
    impersonal_scaler = StandardScaler().fit(impersonal_features)
    scaled_train_x = impersonal_scaler.transform(impersonal_features)

    rfc_clf = wisdm.weka_RF()
    rfc_clf = wisdm.weka_RF()
    sig_clf = CalibratedClassifierCV(rfc_clf, method='sigmoid')
    sig_clf.fit(scaled_train_x, impersonal_labels)
    rfc_clf.fit(scaled_train_x, impersonal_labels)
    
    # create clusters
    number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

    KM = KMeans(n_clusters=number_of_clusters)
    clusters = KM.fit_predict(scaled_train_x) # NOTE!!! <--- Really not sure if I should scale these before clustering
    
    # reset data back to v2.0
    wisdm.set_data(version="2", make_compatible=True)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for ind, user_id in enumerate(user_ids):
            user_scores_df = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = wisdm.get_user_set(user_id)
            personal_set = wisdm.remove_all_nan(personal_set)
            print("%s personal samples" % len(personal_set))

            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].as_matrix()])
            personal_features = personal_set.as_matrix(columns=[personal_set.columns[1:-1]])

            # What is the distribution of labels for this participant?
            #personal_labels_distribution = Counter(personal_labels)
            #print("\tHas %s labels : " % len(personal_labels))
            #for label_key, number in personal_labels_distribution.items():
            #    print("\t\t%s:%s" % (label_key, number))
            #print("\n")
            
            skf = StratifiedKFold(n_splits=k)

            k_run = 0
            try:
                for active_index, test_index in skf.split(personal_features, personal_labels):
                    #print("\tRunning Fold #%s\n" % k_run)
                    # data set available for active labeling from the individual
                    all_active_features = personal_features[active_index]
                    all_active_labels = personal_labels[active_index]


                    # held out test set from individual
                    test_features = personal_features[test_index]
                    test_labels = personal_labels[test_index]

                    # iterate through size of training data
                    for training_size in training_sizes:
                        # initialize score holders
                        personal_model_scores = []
                        universal_model_scores = []
                        personal_plus_all_scores = []
                        personal_plus_cluster_scores = []

                        # run universal model
                        impersonal_scaled_test_x = impersonal_scaler.transform(test_features)
                        #universal_model_score = universal_model(universal_features, universal_labels,
                        #                                        impersonal_scaled_test_x, test_labels)
                        universal_model_score = accuracy_score(test_labels, sig_clf.predict(impersonal_scaled_test_x))
                        universal_model_scores.append(universal_model_score)

                        try:
                            predictions, sampled_active_index = uncertainty_stream_sampling(all_active_features, sig_clf, training_size)
                            sampled_active_features = personal_features[sampled_active_index]
                            sampled_active_labels = personal_labels[sampled_active_index]

                            # run personal model
                            personal_score = personal_model(sampled_active_features, sampled_active_labels, test_features, test_labels)
                            personal_model_scores.append(personal_score)

                            # run personal + universal
                            personal_plus_all_score = universal_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                    impersonal_features, impersonal_labels,
                                                                                    test_features, test_labels)
                            personal_plus_all_scores.append(personal_plus_all_score)

                            # run personal + cluster
                            personal_plus_cluster_score = cluster_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                    impersonal_features, impersonal_labels,
                                                                                    test_features, test_labels, KM, clusters)
                            personal_plus_cluster_scores.append(personal_plus_cluster_score)
                        except ValueError as ve:
                            print("Error with training size while trying to split personal data")
                            print("Message : %s" % ve.args[0])
                            err = ve
                            if "Reduce test_size and/or train_size" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "should be smaller than the number of samples" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "The least populated class in y has only 1 member" in ve.args[0]:
                                print("continuing...")
                                continue
                            else:
                                raise(ve)

                        row = {"test user" : user_id,
                               "k-run" : k_run,
                           "classifier" : "RF with Wiki Parameters",
                           "personal training data" : training_size,
                           "personal score Mean" : np.mean(personal_model_scores),
                           "personal score STD" : np.std(personal_model_scores),
                           "impersonal score Mean" : np.mean(universal_model_scores),
                           "impersonal score STD" : np.std(universal_model_scores),
                           "personal + impersonal score Mean" : np.mean(personal_plus_all_scores),
                           "personal + impersonal score STD" : np.std(personal_plus_all_scores),
                           "personal + cluster score Mean" : np.mean(personal_plus_cluster_scores),
                           "personal + cluster score STD" : np.std(personal_plus_cluster_scores)
                           }
                        print("\tamount of personal data : %s row" % training_size)
                        print("\tpersonal model score : M=%.3f, SD=%.3f" % (row["personal score Mean"], row["personal score STD"]))
                        print("\tuniversal model score : M=%.3f, SD=%.3f" % (row["impersonal score Mean"], row["impersonal score STD"]))
                        print("\tpersonal + ALL universal : M=%.3f, SD=%.3f" % (row["personal + impersonal score Mean"], row["personal + impersonal score STD"]))
                        print("\tpersonal + CLUSTER universal : M=%.3f, SD=%.3f" % (row["personal + cluster score Mean"], row["personal + cluster score STD"]))
                        print("\n")
                        scores.append(row)
                        user_scores_df.append(row)
                    k_run += 1
            except ValueError as ve:
                if "Cannot have number of splits n_splits" in ve.args[0]:
                    print("Skipping this k-fold (%s) because there is not enough data..." % k_run)
                    continue
                elif "while a minimum of 1 is required" in ve.args[0]:
                    print("Skipping this k-fold because there is not enough data...")
                    continue
                else:
                    raise ve
            user_scores_df = pd.DataFrame(user_scores_df)
            user_scores_df.to_pickle(experiment_output_path+user_id+".pickle")

    scores_df = pd.DataFrame(scores)

In [66]:
start = time.time()
dview.block = True
results = dview.execute("active_learning_experiment2()")
finish = time.time()
print("Finished all models in %s minutes" % ((finish - start) / 60.))

Finished all models in 8.499967368443807 minutes


In [67]:
results.display_outputs()

[stdout:0] 
Running user #0: 194
168 personal samples
	amount of personal data : 10 row
	personal model score : M=1.000, SD=0.000
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=1.000, SD=0.000
	personal + CLUSTER universal : M=1.000, SD=0.000


	amount of personal data : 20 row
	personal model score : M=1.000, SD=0.000
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=1.000, SD=0.000
	personal + CLUSTER universal : M=1.000, SD=0.000


	amount of personal data : 30 row
	personal model score : M=1.000, SD=0.000
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=1.000, SD=0.000
	personal + CLUSTER universal : M=1.000, SD=0.000


	amount of personal data : 40 row
	personal model score : M=1.000, SD=0.000
	universal model score : M=0.632, SD=0.000
	personal + ALL universal : M=1.000, SD=0.000
	personal + CLUSTER universal : M=1.000, SD=0.000


	amount of personal data : 50 row
	personal model score : M=1.000, SD=0.000
	u

# Same experiments without stratified iterations (what happens with class imbalance?)

In [None]:
description = '''
This experiment utilizes the same validation pipeline as experiments 1,2, and 3 but does not stratify the personal data.
This is done to reflect the idea that each individual may in reality have a different distribution of activity classes that
we won't know in advance.
'''

In [112]:
experiment_name = "experiment_08-28_train_v1_test_v2_unstratified/"
experiment_output_path = "/home/sac086/wisdm_model_personalization/results/" + experiment_name
if not os.path.exists(experiment_output_path):
    os.makedirs(experiment_output_path)
with open(experiment_output_path + "README.md", "w") as fOut:
    fOut.write(description)

In [120]:
%%px --local
training_sizes = [10,20,30,40,50,60,70,80,90,100]
k = 10

def all_models_nonstratified():
    scores = []
    err = None
    training_sizes = [10,20,30,40,50,60,70,80,90,100]
    
    # Train model with v1.1 data and get clusterings
    wisdm.set_data(version='1', make_compatible=True)
    
    data_df_v1 = wisdm.remove_all_nan(wisdm.data_df)
    user_ids_v1 = wisdm.user_ids
    
    impersonal_labels = np.array([t.decode("utf-8") for t in data_df_v1['class'].as_matrix()])
    impersonal_features = data_df_v1.as_matrix(columns=[data_df_v1.columns[1:-1]])
    
    # train an impersonal model
    impersonal_scaler = StandardScaler().fit(impersonal_features)
    scaled_train_x = impersonal_scaler.transform(impersonal_features)

    rfc_clf = wisdm.weka_RF()
    rfc_clf.fit(scaled_train_x, impersonal_labels)
    
    # create clusters
    number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

    KM = KMeans(n_clusters=number_of_clusters)
    clusters = KM.fit_predict(scaled_train_x) # NOTE!!! <--- Really not sure if I should scale these before clustering
    
    # reset data back to v2.0
    wisdm.set_data(version="2", make_compatible=True)
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for ind, user_id in enumerate(user_ids):
            user_scores_df = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = wisdm.get_user_set(user_id)
            personal_set = wisdm.remove_all_nan(personal_set)
            print("%s personal samples" % len(personal_set))

            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].as_matrix()])
            personal_features = personal_set.as_matrix(columns=[personal_set.columns[1:-1]])

            # What is the distribution of labels for this participant?
            #personal_labels_distribution = Counter(personal_labels)
            #print("\tHas %s labels : " % len(personal_labels))
            #for label_key, number in personal_labels_distribution.items():
            #    print("\t\t%s:%s" % (label_key, number))
            #print("\n")
            
            kf = KFold(n_splits=k, shuffle=True)

            k_run = 0
            try:
                for active_index, test_index in kf.split(personal_features, personal_labels):
                    #print("\tRunning Fold #%s\n" % k_run)
                    # data set available for active labeling from the individual
                    all_active_features = personal_features[active_index]
                    all_active_labels = personal_labels[active_index]


                    # held out test set from individual
                    test_features = personal_features[test_index]
                    test_labels = personal_labels[test_index]

                    # iterate through size of training data
                    for training_size in training_sizes:
                        # initialize score holders
                        personal_model_scores = []
                        universal_model_scores = []
                        personal_plus_all_scores = []
                        personal_plus_cluster_scores = []

                        # run universal model
                        impersonal_scaled_test_x = impersonal_scaler.transform(test_features)
                        #universal_model_score = universal_model(universal_features, universal_labels,
                        #                                        impersonal_scaled_test_x, test_labels)
                        universal_model_score = accuracy_score(test_labels, rfc_clf.predict(impersonal_scaled_test_x))
                        universal_model_scores.append(universal_model_score)

                        ss = ShuffleSplit(n_splits=5, train_size=training_size)

                        splits = ss.split(all_active_features, all_active_labels)

                        try:
                            for split_num, split_tup in enumerate(splits):
                                sampled_active_index, __ = split_tup
                                sampled_active_features = personal_features[sampled_active_index]
                                sampled_active_labels = personal_labels[sampled_active_index]

                                # run personal model
                                personal_score = personal_model(sampled_active_features, sampled_active_labels, test_features, test_labels)
                                personal_model_scores.append(personal_score)

                                # run personal + universal
                                personal_plus_all_score = universal_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        impersonal_features, impersonal_labels,
                                                                                        test_features, test_labels)
                                personal_plus_all_scores.append(personal_plus_all_score)

                                # run personal + cluster
                                personal_plus_cluster_score = cluster_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        impersonal_features, impersonal_labels,
                                                                                        test_features, test_labels, KM, clusters)
                                personal_plus_cluster_scores.append(personal_plus_cluster_score)
                        except ValueError as ve:
                            print("Error with training size while trying to split personal data")
                            print("Message : %s" % ve.args[0])
                            err = ve
                            if "Reduce test_size and/or train_size" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "should be smaller than the number of samples" in ve.args[0]:
                                print("continuing...")
                                continue
                            elif "The least populated class in y has only 1 member" in ve.args[0]:
                                print("continuing...")
                                continue
                            else:
                                raise(ve)

                        row = {"test user" : user_id,
                               "k-run" : k_run,
                           "classifier" : "RF with Wiki Parameters",
                           "personal training data" : training_size,
                           "personal score Mean" : np.mean(personal_model_scores),
                           "personal score STD" : np.std(personal_model_scores),
                           "impersonal score Mean" : np.mean(universal_model_scores),
                           "impersonal score STD" : np.std(universal_model_scores),
                           "personal + impersonal score Mean" : np.mean(personal_plus_all_scores),
                           "personal + impersonal score STD" : np.std(personal_plus_all_scores),
                           "personal + cluster score Mean" : np.mean(personal_plus_cluster_scores),
                           "personal + cluster score STD" : np.std(personal_plus_cluster_scores)
                           }
                        print("\tamount of personal data : %s row" % training_size)
                        print("\tpersonal model score : M=%.3f, SD=%.3f" % (row["personal score Mean"], row["personal score STD"]))
                        print("\tuniversal model score : M=%.3f, SD=%.3f" % (row["impersonal score Mean"], row["impersonal score STD"]))
                        print("\tpersonal + ALL universal : M=%.3f, SD=%.3f" % (row["personal + impersonal score Mean"], row["personal + impersonal score STD"]))
                        print("\tpersonal + CLUSTER universal : M=%.3f, SD=%.3f" % (row["personal + cluster score Mean"], row["personal + cluster score STD"]))
                        print("\n")
                        scores.append(row)
                        user_scores_df.append(row)
                    k_run += 1
            except ValueError as ve:
                if "Cannot have number of splits n_splits" in ve.args[0]:
                    print("Skipping this k-fold (%s) because there is not enough data..." % k_run)
                    continue
                elif "while a minimum of 1 is required" in ve.args[0]:
                    print("Skipping this k-fold because there is not enough data...")
                    continue
                else:
                    raise ve
            user_scores_df = pd.DataFrame(user_scores_df)
            user_scores_df.to_pickle(experiment_output_path+user_id+".pickle")

    scores_df = pd.DataFrame(scores)

In [121]:
scatter_result = dview.scatter("user_ids", wisdm.user_ids)

In [124]:
start = time.time()
dview.block = True
results = dview.execute("all_models()")
finish = time.time()
print("Finished all models in %s minutes" % ((finish - start) / 60.))

Finished all models in 28.40054277976354 minutes


In [126]:
results.result()

[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ]