# ToDo :

* ~~consolidate code from notebook into WISDM helper methods file~~
* rewrite the k-folds process so that we have more precise control over the size of a fold (as opposed to just the number of folds)
* run experiment that directly compares model using ALL general data + active data to model using ONLY general data from nearest cluster + active data
* analyze/visualize clusters (is there a better algorithm? is there a better k for the k-means?) 
* perhaps compare with using the WORST cluster, or using ONLY the personal data
    * for each size of active data
        * for each algorithm (also ensemble algorithm?)
            * personal only
            * universal only
            * personal + ALL universal
            * personal + best cluster universal
            * personal + worst cluster universal


In [1]:
from wisdm import wisdm
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.cluster import KMeans
from scipy.stats import mode

In [2]:
# Point the wisdm helper module at the v2 dataset directory and its transformed
# file, then ask it to load the data.
# NOTE(review): set_data() presumably populates wisdm.data_df / wisdm.user_ids,
# which later cells read — confirm in the wisdm module.
wisdm.WISDM_DIR = wisdm.wisdm_v2_dataset_path
wisdm.WISDM_TRANSFORMED = wisdm.wisdm_transformed_v2

wisdm.set_data()

# Test Universal Impersonal Model Pipeline

In [3]:
import warnings

In [4]:
import time
# Wall-clock reference point; a later cell subtracts it from time.time()
# to report how long the sequential universal-model run took.
start = time.time()

In [9]:
# Leave-one-user-out evaluation of the universal (impersonal) model:
# for each user, train on every *other* user's rows and test on that user's rows.
rf_results = []

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for user_id in wisdm.user_ids:
        # held-out user: test features & labels
        test_set = wisdm.remove_all_nan(wisdm.get_user_set(user_id))

        # Every column between the first and the last one is treated as a feature.
        # The same column labels are used for the training frame so that train and
        # test matrices are guaranteed to line up.
        feature_columns = test_set.columns[1:-1]

        # Labels are stored as bytes; decode them to str.
        test_labels = np.array([t.decode("utf-8") for t in test_set['class'].values])
        # DataFrame.as_matrix() was removed in pandas 1.0; column selection + .values
        # is the equivalent that works on old and new pandas alike.
        test_features = test_set[feature_columns].values

        # training features & labels (all rows that do not belong to the test user)
        training_set = wisdm.remove_all_nan(wisdm.data_df[wisdm.data_df['user'] != user_id])
        training_labels = np.array([t.decode("utf-8") for t in training_set['class'].values])
        training_features = training_set[feature_columns].values

        # normalize features using statistics from the training rows only
        scaler = StandardScaler().fit(training_features)
        scaled_train_x = scaler.transform(training_features)
        scaled_test_x = scaler.transform(test_features)

        clf = wisdm.weka_RF()
        clf.fit(scaled_train_x, training_labels)
        predictions = clf.predict(scaled_test_x)

        score = accuracy_score(test_labels, predictions)
        print("Score for %s: %.3f" % (user_id, score))
        rf_results.append(score)

print("RF results : M=%.5f, SD=%.5f" % (np.mean(rf_results), np.std(rf_results)))

Score for 194: 0.482
Score for 998: 0.637
Score for 1097: 0.306
Score for 1104: 0.619
Score for 1117: 0.269
Score for 1205: 1.000
Score for 1238: 0.670
Score for 1246: 1.000
Score for 1247: 0.000
Score for 1253: 0.833
Score for 1269: 0.000
Score for 1274: 1.000
Score for 1276: 0.000
Score for 1277: 1.000
Score for 1280: 0.000
Score for 1319: 0.692
Score for 1320: 0.352
Score for 1477: 0.086
Score for 1480: 0.000
Score for 1491: 0.000
Score for 1511: 1.000
Score for 1512: 1.000
Score for 1518: 1.000
Score for 1531: 1.000
Score for 1554: 0.917
Score for 1559: 0.830
Score for 1603: 0.480
Score for 1676: 0.291
Score for 1679: 0.000
Score for 1683: 1.000
Score for 1696: 1.000
Score for 1703: 0.812
Score for 1707: 0.135
Score for 1723: 0.333
Score for 1724: 0.000
Score for 1726: 0.667
Score for 1742: 0.284
Score for 1745: 0.000
Score for 1750: 0.167
Score for 1757: 1.000
Score for 1758: 0.769
Score for 1759: 0.724
Score for 1761: 0.455
Score for 1763: 0.000
Score for 1774: 0.724
Score for 17

In [10]:
# Report the wall-clock time elapsed since `start` was recorded.
finish = time.time()
print("Took about %s seconds" % (finish - start))

Took about 23.12292218208313 seconds


In [11]:
# Number of per-user accuracy scores collected by the loop above.
len(rf_results)

54

# Parallelized Universal Model

In [5]:
import ipyparallel as ipp

In [7]:
# Connect to the ipyparallel cluster using the default connection settings.
c = ipp.Client()

In [8]:
# List the ids of the engines this client can see.
c.ids

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31]

In [9]:
# View spanning every engine of the client; used for all remote calls below.
dview = c[:]

In [10]:
# Partition the user ids across the engines; each engine receives its own slice
# under the global name `user_ids`, which all_models() iterates over.
scatter_result = dview.scatter("user_ids", wisdm.user_ids)

In [11]:
# Inspect the slice of user ids held by each engine.
dview['user_ids']

[array(['194', '998'], dtype=object),
 array(['1097', '1104'], dtype=object),
 array(['1117', '1205'], dtype=object),
 array(['1238', '1246'], dtype=object),
 array(['1247', '1253'], dtype=object),
 array(['1269', '1274'], dtype=object),
 array(['1276', '1277'], dtype=object),
 array(['1280', '1319'], dtype=object),
 array(['1320', '1477'], dtype=object),
 array(['1480', '1491'], dtype=object),
 array(['1511', '1512'], dtype=object),
 array(['1518', '1531'], dtype=object),
 array(['1554', '1559'], dtype=object),
 array(['1603', '1676'], dtype=object),
 array(['1679', '1683'], dtype=object),
 array(['1696', '1703'], dtype=object),
 array(['1707', '1723'], dtype=object),
 array(['1724', '1726'], dtype=object),
 array(['1742', '1745'], dtype=object),
 array(['1750', '1757'], dtype=object),
 array(['1758', '1759'], dtype=object),
 array(['1761', '1763'], dtype=object),
 array(['1774'], dtype=object),
 array(['1775'], dtype=object),
 array(['1778'], dtype=object),
 array(['1793'], dtype=obj

In [12]:
def add_wisdm_helpers_path(path="/home/sac086/hcml_chapter/wisdm_model_personalization/notebooks/"):
    """Make the directory containing the `wisdm` helper package importable.

    Intended to be shipped to the ipyparallel engines so that
    `from wisdm import wisdm` resolves there as well.

    Parameters
    ----------
    path : str, optional
        Directory to add to ``sys.path``. Defaults to the notebook directory
        that was previously hard-coded.

    The call is idempotent: the directory is appended only if it is not
    already on ``sys.path``, so re-running the cell does not pile up duplicates.
    """
    # Imported inside the function so the function is self-contained when it is
    # executed on a remote engine.
    import sys
    if path not in sys.path:
        sys.path.append(path)

In [13]:
# Run add_wisdm_helpers_path on every engine in the view. The call is
# asynchronous and the returned AsyncResult is not waited on here.
dview.apply_async(add_wisdm_helpers_path)

<AsyncResult: add_wisdm_helpers_path>

In [14]:
%%px --local
import warnings
import os
from wisdm import wisdm
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.cluster import KMeans
from scipy.stats import mode

In [15]:
# Perform the imports both locally and on every engine in the view, so the
# names used by the model functions exist in the engines' namespaces.
with dview.sync_imports():
    import warnings
    import os
    from wisdm import wisdm
    import numpy as np
    import pandas as pd

    from sklearn.metrics import accuracy_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
    from sklearn.cluster import KMeans
    from scipy.stats import mode

importing os on engine(s)
importing wisdm from wisdm on engine(s)
importing numpy on engine(s)
importing pandas on engine(s)
importing accuracy_score from sklearn.metrics on engine(s)
importing StandardScaler from sklearn.preprocessing on engine(s)
importing StratifiedKFold,StratifiedShuffleSplit from sklearn.model_selection on engine(s)
importing KMeans from sklearn.cluster on engine(s)
importing mode from scipy.stats on engine(s)


In [17]:
# Switch the view to blocking mode for the calls that follow.
dview.block=True

In [18]:
def init(dataset_dir="/home/sac086/wisdm_model_personalization/datasets/WISDM_v2/",
         transformed_file="WISDM_at_v2.0_transformed_FIXED.arff"):
    """Configure the `wisdm` helper module in the current process and load the data.

    Meant to be run on each ipyparallel engine so that every engine has the
    dataset loaded before the experiment starts.

    Parameters
    ----------
    dataset_dir : str, optional
        Directory assigned to ``wisdm.WISDM_DIR``. Defaults to the WISDM v2 directory.
    transformed_file : str, optional
        File name assigned to ``wisdm.WISDM_TRANSFORMED``. Defaults to the
        transformed WISDM v2 file.

    To run against WISDM v1 instead, pass
    ``dataset_dir="/home/sac086/wisdm_model_personalization/datasets/WISDM_v1/"`` and
    ``transformed_file="WISDM_ar_v1.1_transformed_FIXED.arff"``.

    Returns None; prints "Done" once ``wisdm.set_data()`` has finished.
    """
    wisdm.WISDM_DIR = dataset_dir
    wisdm.WISDM_TRANSFORMED = transformed_file

    wisdm.set_data()
    print("Done")

In [19]:
# Run init() on every engine so each one loads the dataset; init() returns
# None, hence the list of None values in the output.
dview.apply(init)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

# Cross-Validation with exact number of data points

### Experiment Attributes : 
* amount of training set from individual end user
* amount of training set from impersonal data (other users)
    * "ALL" all other data
    * "closest cluster" only data from the closest cluster
    * "furthest cluster" only data from the furthest cluster
    * "All - furthest cluster" all other data EXCEPT data from furthest cluster
* test user id
* algorithm
* algorithm parameters



In [20]:
%%px --local
def personal_model(active_features, active_labels, test_features, test_labels):
    """Train on the user's own labelled samples only and score on the held-out set.

    The features are standardized with statistics fitted on ``active_features``;
    the same transform is then applied to ``test_features``. The classifier is
    whatever ``wisdm.weka_RF()`` builds.

    Returns the accuracy of the predictions against ``test_labels``.
    """
    normalizer = StandardScaler().fit(active_features)
    train_x = normalizer.transform(active_features)
    test_x = normalizer.transform(test_features)

    classifier = wisdm.weka_RF()
    classifier.fit(train_x, active_labels)

    return accuracy_score(test_labels, classifier.predict(test_x))

In [21]:
%%px --local

def universal_model(universal_features, universal_labels, test_features, test_labels):
    """Train on other users' data only and score on the held-out personal set.

    Standardization statistics come from ``universal_features`` and are reused
    for ``test_features``. The classifier is built by ``wisdm.weka_RF()``.

    Returns the accuracy of the predictions against ``test_labels``.
    """
    fitted_scaler = StandardScaler().fit(universal_features)
    x_train, x_test = (fitted_scaler.transform(matrix)
                       for matrix in (universal_features, test_features))

    model = wisdm.weka_RF()
    model.fit(x_train, universal_labels)
    predicted = model.predict(x_test)

    return accuracy_score(test_labels, predicted)

In [22]:
%%px --local

def universal_plus_personal_model(personal_features, personal_labels,
                                  universal_features, universal_labels,
                                  test_features, test_labels):
    """Train on the personal samples stacked on top of ALL universal samples.

    The personal rows come first in the combined training set, followed by the
    universal rows. Features are standardized with statistics fitted on the
    combined training matrix; the same transform is applied to the test matrix.
    The classifier is built by ``wisdm.weka_RF()``.

    Returns the accuracy of the predictions against ``test_labels``.
    """
    combined_x = np.vstack((personal_features, universal_features))
    combined_y = np.hstack((personal_labels, universal_labels))

    standardizer = StandardScaler().fit(combined_x)
    combined_x_scaled = standardizer.transform(combined_x)
    test_x_scaled = standardizer.transform(test_features)

    forest = wisdm.weka_RF()
    forest.fit(combined_x_scaled, combined_y)

    return accuracy_score(test_labels, forest.predict(test_x_scaled))

In [23]:
%%px --local

def cluster_plus_personal_model(personal_features, personal_labels,
                                  universal_features, universal_labels,
                                  test_features, test_labels, KM, clusters):
    """Train on the personal samples plus the universal samples of the closest cluster.

    Parameters
    ----------
    personal_features, personal_labels
        Labelled samples from the target user.
    universal_features, universal_labels
        Samples from all other users (numpy arrays, indexed row-wise).
    test_features, test_labels
        Held-out samples from the target user used for scoring.
    KM
        Fitted clustering model exposing ``predict``; its labels are expected to be
        non-negative integers (as produced by ``KMeans``).
    clusters
        Cluster label of every row of ``universal_features`` (same length and order).

    The "closest" cluster is chosen by majority vote: each personal sample is
    assigned to a cluster by ``KM.predict`` and the most frequent label wins
    (ties resolve to the smallest label).

    Returns the accuracy of the predictions against ``test_labels``.
    """
    cluster_predictions = np.asarray(KM.predict(personal_features))
    # np.bincount(...).argmax() yields the most frequent label, with the same
    # smallest-label tie-breaking as scipy.stats.mode, but does not depend on the
    # shape of mode()'s result, which changed across SciPy versions
    # (`mode(...).mode[0]` fails once `.mode` is returned as a scalar).
    closest_cluster = np.bincount(cluster_predictions).argmax()

    # Boolean mask instead of a Python-level scan over every universal row.
    in_closest_cluster = np.asarray(clusters) == closest_cluster
    cluster_features = universal_features[in_closest_cluster]
    cluster_labels = universal_labels[in_closest_cluster]

    training_features = np.vstack((personal_features, cluster_features))
    training_labels = np.hstack((personal_labels, cluster_labels))

    # Standardize with statistics from the training rows only.
    scaler = StandardScaler().fit(training_features)
    scaled_train_x = scaler.transform(training_features)
    scaled_test_x = scaler.transform(test_features)

    rfc_clf = wisdm.weka_RF()

    rfc_clf.fit(scaled_train_x, training_labels)
    predictions = rfc_clf.predict(scaled_test_x)
    score = accuracy_score(test_labels, predictions)
    return score

In [24]:
%%px --local

from collections import Counter

In [25]:
%%px --local
# Numbers of personal (active) samples to train with, evaluated in this order.
training_sizes = [10,20,30,40,50,60,70,80,90,100]

# ValueError message fragments raised by StratifiedShuffleSplit when a user does not
# have enough (or well enough distributed) data for the requested training size.
# Such a training size is skipped instead of aborting the whole experiment.
_SKIPPABLE_SPLIT_ERRORS = (
    "Reduce test_size and/or train_size",
    "should be smaller than the number of samples",
    "The least populated class in y has only 1 member",
)


def _features_and_labels(frame):
    """Split a WISDM frame into ``(features, labels)`` numpy arrays.

    Labels are read from the 'class' column and decoded from bytes to str.
    Features are every column between the first and the last one.
    """
    labels = np.array([raw.decode("utf-8") for raw in frame['class'].values])
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works everywhere.
    features = frame.iloc[:, 1:-1].values
    return features, labels


def all_models():
    """Run the personal / universal / combined model comparison for every user in ``user_ids``.

    Relies on names that must already exist in the executing namespace:
    ``user_ids`` (on the engines this is the slice scattered from the notebook),
    ``training_sizes``, ``Counter``, and the four model functions
    ``personal_model``, ``universal_model``, ``universal_plus_personal_model``
    and ``cluster_plus_personal_model``.

    For each user:
      * the user's rows form the personal set, all other users' rows the universal set;
      * the universal set is clustered with k-means (4 clusters);
      * the personal set is split into 10 stratified folds; in each fold the held-out
        part is the test set and the rest is the pool of "active" samples;
      * for each training size, 5 stratified random samples of that size are drawn
        from the active pool and the personal, personal + universal and
        personal + closest-cluster models are scored on the test set.

    One result row per (user, fold, training size) is collected. Each user's rows are
    pickled to the results directory as ``<user_id>.pickle``.

    Returns
    -------
    pandas.DataFrame
        All result rows of every user processed by this call.

    NOTE(review): KMeans and StratifiedShuffleSplit are not seeded, so results are
    not reproducible run to run.
    """
    scores = []

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for ind, user_id in enumerate(user_ids):
            user_scores = []
            print("Running user #%s: %s" % (ind, user_id))
            personal_set = wisdm.remove_all_nan(wisdm.get_user_set(user_id))
            personal_features, personal_labels = _features_and_labels(personal_set)

            # What is the distribution of labels for this participant?
            personal_labels_distribution = Counter(personal_labels)
            print("\tHas %s labels : " % len(personal_labels))
            for label_key, number in personal_labels_distribution.items():
                print("\t\t%s:%s" % (label_key, number))
            print("\n")

            # training features & labels from every other user
            universal_set = wisdm.remove_all_nan(wisdm.data_df[wisdm.data_df['user'] != user_id])
            universal_features, universal_labels = _features_and_labels(universal_set)

            # get k-means clusters
            number_of_clusters = 4 # the higher this number is, the smaller we should expect each cluster to be

            KM = KMeans(n_clusters=number_of_clusters)
            clusters = KM.fit_predict(universal_features)
            k = 10

            skf = StratifiedKFold(n_splits=k)

            k_run = 0
            try:
                for active_index, test_index in skf.split(personal_features, personal_labels):
                    print("\tRunning Fold #%s\n" % k_run)
                    # data set available for active labeling from the individual
                    all_active_features = personal_features[active_index]
                    all_active_labels = personal_labels[active_index]

                    # held out test set from individual
                    test_features = personal_features[test_index]
                    test_labels = personal_labels[test_index]

                    # The universal model depends only on the universal data and the
                    # fold's test set, not on the personal training size, so it is
                    # trained once per fold instead of once per training size.
                    universal_model_score = universal_model(universal_features, universal_labels,
                                                            test_features, test_labels)

                    # iterate through size of training data
                    for training_size in training_sizes:
                        # initialize score holders
                        personal_model_scores = []
                        universal_model_scores = [universal_model_score]
                        personal_plus_all_scores = []
                        personal_plus_cluster_scores = []

                        sss = StratifiedShuffleSplit(n_splits=5, train_size=training_size)

                        try:
                            # The split is lazy: size-related ValueErrors surface on the
                            # first iteration, which is why the loop lives inside the try.
                            for sampled_active_index, __ in sss.split(all_active_features, all_active_labels):
                                # The indices are relative to the ACTIVE subset. Indexing the
                                # full personal arrays with them (as was done before) could pull
                                # held-out test rows into the training sample.
                                sampled_active_features = all_active_features[sampled_active_index]
                                sampled_active_labels = all_active_labels[sampled_active_index]

                                # run personal model
                                personal_score = personal_model(sampled_active_features, sampled_active_labels,
                                                                test_features, test_labels)
                                personal_model_scores.append(personal_score)

                                # run personal + universal
                                personal_plus_all_score = universal_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                        universal_features, universal_labels,
                                                                                        test_features, test_labels)
                                personal_plus_all_scores.append(personal_plus_all_score)

                                # run personal + cluster
                                personal_plus_cluster_score = cluster_plus_personal_model(sampled_active_features, sampled_active_labels,
                                                                                          universal_features, universal_labels,
                                                                                          test_features, test_labels, KM, clusters)
                                personal_plus_cluster_scores.append(personal_plus_cluster_score)
                        except ValueError as ve:
                            message = str(ve)
                            print("Error with training size while trying to split personal data")
                            print("Message : %s" % message)
                            if any(fragment in message for fragment in _SKIPPABLE_SPLIT_ERRORS):
                                # Not enough personal data for this training size: skip it.
                                print("continuing...")
                                continue
                            raise

                        row = {"test user" : user_id,
                               "k-run" : k_run,
                               "classifier" : "RF with Wiki Parameters",
                               "personal training data" : training_size,
                               "personal score Mean" : np.mean(personal_model_scores),
                               "personal score STD" : np.std(personal_model_scores),
                               "impersonal score Mean" : np.mean(universal_model_scores),
                               "impersonal score STD" : np.std(universal_model_scores),
                               "personal + impersonal score Mean" : np.mean(personal_plus_all_scores),
                               "personal + impersonal score STD" : np.std(personal_plus_all_scores),
                               "personal + cluster score Mean" : np.mean(personal_plus_cluster_scores),
                               "personal + cluster score STD" : np.std(personal_plus_cluster_scores)
                               }
                        print("\tamount of personal data : %s row" % training_size)
                        print("\tpersonal model score : M=%.3f, SD=%.3f" % (row["personal score Mean"], row["personal score STD"]))
                        print("\tuniversal model score : M=%.3f, SD=%.3f" % (row["impersonal score Mean"], row["impersonal score STD"]))
                        print("\tpersonal + ALL universal : M=%.3f, SD=%.3f" % (row["personal + impersonal score Mean"], row["personal + impersonal score STD"]))
                        print("\tpersonal + CLUSTER universal : M=%.3f, SD=%.3f" % (row["personal + cluster score Mean"], row["personal + cluster score STD"]))
                        print("\n")
                        scores.append(row)
                        user_scores.append(row)
                    k_run += 1
            except ValueError as ve:
                if "Cannot have number of splits n_splits" in str(ve):
                    # Too few samples for k folds: move on to the next user
                    # (nothing is pickled for this one).
                    print("Skipping this k-fold because there is not enough data...")
                    continue
                raise
            user_scores_df = pd.DataFrame(user_scores)
            user_scores_df.to_pickle("/home/sac086/wisdm_model_personalization/results/experiment_08-21_v2_dataset/"+user_id+".pickle")

    # Previously built and discarded; returning it lets callers inspect the rows
    # without re-reading the pickles.
    scores_df = pd.DataFrame(scores)
    return scores_df

In [26]:
import time

In [27]:
# Restart the wall-clock timer for the parallel experiment.
start = time.time()

In [28]:
# Blocking mode, so the timing cell further down measures the whole run.
dview.block = True

In [29]:
# Run all_models() on every engine; each engine iterates over its own
# scattered `user_ids` slice.
results = dview.execute("all_models()")

In [30]:
# Collect the per-engine outcome of the execute call.
results.result()

[, , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ]

In [31]:
# Report the wall-clock time of the parallel run (measured from `start`).
finish = time.time()
print("Finished all models in %s seconds" % (finish - start))

Finished all models in 1566.3238887786865 seconds


# Parallelizing to run on WISDM_2.0