# Figure out if StratifiedKFold folds consistently on subsequent calls

In [1]:
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, accuracy_score
from sklearn.preprocessing import StandardScaler

In [2]:
from wisdm import wisdm
import numpy as np
import pandas as pd
from collections import Counter

# How good is this classifier at knowing when to be confident?

In [3]:
# Load WISDM data version "1" (the "lab" set, per the section headings further down)
# and drop NaN entries via the project helper.
wisdm.set_data(version="1", make_compatible=True)
data_df = wisdm.remove_all_nan(wisdm.data_df)
# Class labels are stored as bytes; decode them to str.
impersonal_labels = np.array([t.decode("utf-8") for t in data_df['class'].values])
# Feature matrix: every column except the first and the last one.
# `.values` replaces the deprecated `DataFrame.as_matrix`.
impersonal_features = data_df[data_df.columns[1:-1]].values

In [4]:
# Compare raw vs. calibrated class probabilities of the random forest under 3-fold
# stratified cross-validation, scoring each class one-vs-rest with the Brier score.
skf = StratifiedKFold(n_splits=3)
fold_num = 0
folds = []  # one independent record per (fold, class label)
prediction_probs = []
prediction_probs_sigmoid = []
prediction_probs_isotonic = []
for train_ind, test_ind in skf.split(impersonal_features, impersonal_labels):
    fold_num += 1
    print("Fold #%s : " % fold_num)

    # Fit the scaler on the training split only, then reuse it on the test split.
    scaler = StandardScaler()
    train_X = scaler.fit_transform(impersonal_features[train_ind])
    train_Y = impersonal_labels[train_ind]
    train_counts = Counter(train_Y)

    test_X = scaler.transform(impersonal_features[test_ind])
    test_Y = impersonal_labels[test_ind]
    test_counts = Counter(test_Y)

    # Uncalibrated baseline classifier.
    clf = wisdm.weka_RF()
    clf.fit(train_X, train_Y)
    prob_pos_clf = clf.predict_proba(test_X)
    predictions = clf.predict(test_X)
    prediction_probs.append(prob_pos_clf)

    baseline_score = accuracy_score(test_Y, predictions)
    print("Accuracy Score : %.3f" % baseline_score)

    # Calibrated variants; each is fitted with cv=3 on the training split.
    sigmoid_clf = CalibratedClassifierCV(clf, cv=3, method='sigmoid')
    sigmoid_clf.fit(train_X, train_Y)
    prob_pos_sigmoid = sigmoid_clf.predict_proba(test_X)
    prediction_probs_sigmoid.append(prob_pos_sigmoid)

    isotonic_clf = CalibratedClassifierCV(clf, cv=3, method='isotonic')
    isotonic_clf.fit(train_X, train_Y)
    prob_pos_isotonic = isotonic_clf.predict_proba(test_X)
    prediction_probs_isotonic.append(prob_pos_isotonic)

    class_labels = clf.classes_
    # Column i of predict_proba corresponds to class_labels[i].
    for column_ind, label in enumerate(class_labels):
        print("Class Label : %s, Training : %.3f, Testing :%.3f" % (
            label,
            train_counts[label] / float(len(train_Y)),
            test_counts[label] / float(len(test_Y))))

        clf_score = brier_score_loss(test_Y, prob_pos_clf[:, column_ind], pos_label=label)
        print("\tWithout Calibration : %1.3f" % clf_score)

        sigmoid_score = brier_score_loss(test_Y, prob_pos_sigmoid[:, column_ind], pos_label=label)
        print("\tSigmoid Calibration : %1.3f" % sigmoid_score)

        isotonic_score = brier_score_loss(test_Y, prob_pos_isotonic[:, column_ind], pos_label=label)
        print("\tIsotonic Calibration : %1.3f" % isotonic_score)

        # Build a fresh dict per class. Previously a single dict was mutated and
        # appended repeatedly, so every entry of `folds` for a fold aliased the same
        # object and only kept the scores of the last class.
        folds.append({"fold": fold_num,
                      "label": label,
                      "test labels": test_Y,
                      "train labels": train_Y,
                      "brier_score_no_cal": clf_score,
                      "brier_score_sigmoid": sigmoid_score,
                      "brier_score_isotonic": isotonic_score})

        print("\n")

Fold #1 : 
Accuracy Score : 0.856
Class Label : Jogging, Training : 0.326, Testing :0.326
	Without Calibration : 0.018
	Sigmoid Calibration : 0.017
	Isotonic Calibration : 0.016


Class Label : Sitting, Training : 0.027, Testing :0.027
	Without Calibration : 0.006
	Sigmoid Calibration : 0.007
	Isotonic Calibration : 0.006


Class Label : Stairs, Training : 0.221, Testing :0.221
	Without Calibration : 0.092
	Sigmoid Calibration : 0.085
	Isotonic Calibration : 0.085


Class Label : Standing, Training : 0.015, Testing :0.015
	Without Calibration : 0.005
	Sigmoid Calibration : 0.006
	Isotonic Calibration : 0.005


Class Label : Walking, Training : 0.410, Testing :0.410
	Without Calibration : 0.088
	Sigmoid Calibration : 0.083
	Isotonic Calibration : 0.083


Fold #2 : 
Accuracy Score : 0.824
Class Label : Jogging, Training : 0.326, Testing :0.326
	Without Calibration : 0.049
	Sigmoid Calibration : 0.051
	Isotonic Calibration : 0.052


Class Label : Sitting, Training : 0.027, Testing :0.027


In [5]:
# Load WISDM data version "2" (the "field" set, per the section headings further down)
# and drop NaN entries via the project helper.
wisdm.set_data(version="2", make_compatible=True)
data_df = wisdm.remove_all_nan(wisdm.data_df)
# Class labels are stored as bytes; decode them to str.
impersonal_labels = np.array([t.decode("utf-8") for t in data_df['class'].values])
# Feature matrix: every column except the first and the last one.
# `.values` replaces the deprecated `DataFrame.as_matrix`.
impersonal_features = data_df[data_df.columns[1:-1]].values

In [17]:
# Same calibration comparison as above, run on the currently loaded data, additionally
# pooling the per-fold hard predictions and ground truth for the plots that follow.
skf = StratifiedKFold(n_splits=3)
fold_num = 0
folds = []  # one independent record per (fold, class label)
prediction_probs = []
prediction_probs_sigmoid = []
prediction_probs_isotonic = []
all_predictions = []
all_ground_truth = []
for train_ind, test_ind in skf.split(impersonal_features, impersonal_labels):
    fold_num += 1
    print("Fold #%s : " % fold_num)

    # Fit the scaler on the training split only, then reuse it on the test split.
    scaler = StandardScaler()
    train_X = scaler.fit_transform(impersonal_features[train_ind])
    train_Y = impersonal_labels[train_ind]
    train_counts = Counter(train_Y)

    test_X = scaler.transform(impersonal_features[test_ind])
    test_Y = impersonal_labels[test_ind]
    test_counts = Counter(test_Y)

    # Uncalibrated baseline classifier.
    clf = wisdm.weka_RF()
    clf.fit(train_X, train_Y)
    prob_pos_clf = clf.predict_proba(test_X)
    predictions = clf.predict(test_X)
    all_predictions.append(predictions)
    all_ground_truth.append(test_Y)
    prediction_probs.append(prob_pos_clf)

    baseline_score = accuracy_score(test_Y, predictions)
    print("Accuracy Score : %.3f" % baseline_score)

    # Calibrated variants; each is fitted with cv=3 on the training split.
    sigmoid_clf = CalibratedClassifierCV(clf, cv=3, method='sigmoid')
    sigmoid_clf.fit(train_X, train_Y)
    prob_pos_sigmoid = sigmoid_clf.predict_proba(test_X)
    prediction_probs_sigmoid.append(prob_pos_sigmoid)

    isotonic_clf = CalibratedClassifierCV(clf, cv=3, method='isotonic')
    isotonic_clf.fit(train_X, train_Y)
    prob_pos_isotonic = isotonic_clf.predict_proba(test_X)
    prediction_probs_isotonic.append(prob_pos_isotonic)

    class_labels = clf.classes_
    # Column i of predict_proba corresponds to class_labels[i].
    for column_ind, label in enumerate(class_labels):
        print("Class Label : %s, Training : %.3f, Testing :%.3f" % (
            label,
            train_counts[label] / float(len(train_Y)),
            test_counts[label] / float(len(test_Y))))

        clf_score = brier_score_loss(test_Y, prob_pos_clf[:, column_ind], pos_label=label)
        print("\tWithout Calibration : %1.3f" % clf_score)

        sigmoid_score = brier_score_loss(test_Y, prob_pos_sigmoid[:, column_ind], pos_label=label)
        print("\tSigmoid Calibration : %1.3f" % sigmoid_score)

        isotonic_score = brier_score_loss(test_Y, prob_pos_isotonic[:, column_ind], pos_label=label)
        print("\tIsotonic Calibration : %1.3f" % isotonic_score)

        # Build a fresh dict per class. Previously a single dict was mutated and
        # appended repeatedly, so every entry of `folds` for a fold aliased the same
        # object and only kept the scores of the last class.
        folds.append({"fold": fold_num,
                      "label": label,
                      "test labels": test_Y,
                      "train labels": train_Y,
                      "brier_score_no_cal": clf_score,
                      "brier_score_sigmoid": sigmoid_score,
                      "brier_score_isotonic": isotonic_score})

        print("\n")

Fold #1 : 
Accuracy Score : 0.503
Class Label : Jogging, Training : 0.027, Testing :0.027
	Without Calibration : 0.009
	Sigmoid Calibration : 0.016
	Isotonic Calibration : 0.016


Class Label : Sitting, Training : 0.293, Testing :0.292
	Without Calibration : 0.294
	Sigmoid Calibration : 0.202
	Isotonic Calibration : 0.215


Class Label : Stairs, Training : 0.052, Testing :0.052
	Without Calibration : 0.064
	Sigmoid Calibration : 0.047
	Isotonic Calibration : 0.044


Class Label : Standing, Training : 0.175, Testing :0.174
	Without Calibration : 0.083
	Sigmoid Calibration : 0.100
	Isotonic Calibration : 0.092


Class Label : Walking, Training : 0.454, Testing :0.454
	Without Calibration : 0.329
	Sigmoid Calibration : 0.249
	Isotonic Calibration : 0.308


Fold #2 : 
Accuracy Score : 0.713
Class Label : Jogging, Training : 0.027, Testing :0.027
	Without Calibration : 0.011
	Sigmoid Calibration : 0.016
	Isotonic Calibration : 0.014


Class Label : Sitting, Training : 0.293, Testing :0.293


In [6]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [20]:
# Pool the per-fold hard predictions into one flat array (fold order preserved).
pred = np.hstack(tuple(all_predictions))

In [21]:
# Pool the per-fold ground-truth labels the same way, so positions line up with `pred`.
gt = np.hstack(tuple(all_ground_truth))

In [22]:
# Split the pooled sample positions by whether the prediction matched the ground truth.
correct_pred_indeces = [ind for ind, p in enumerate(pred) if p == gt[ind]]
wrong_pred_indeces = [ind for ind, p in enumerate(pred) if not (p == gt[ind])]

In [23]:
# Number of pooled test samples whose prediction matched the ground truth.
len(correct_pred_indeces)

2830

In [24]:
# Number of pooled test samples whose prediction did not match the ground truth.
len(wrong_pred_indeces)

1986

In [25]:
# Stack the per-fold probability matrices row-wise so rows line up with `pred` / `gt`.
# NOTE(review): this rebinds `prediction_probs` from a list of arrays to a single array.
prediction_probs = np.vstack(prediction_probs)

In [26]:
# Probability rows of the correctly / incorrectly predicted samples respectively.
correct_probs = prediction_probs[correct_pred_indeces]
wrong_probs = prediction_probs[wrong_pred_indeces]

In [27]:
# Class names in the column order of predict_proba, taken from the most recently fitted `clf`.
group_labels = list(clf.classes_)
group_labels

['Jogging', 'Sitting', 'Stairs', 'Standing', 'Walking']

In [38]:
# Distribution of the predicted 'Jogging' probability, split by whether the final
# prediction for the sample was correct.
# Uses its own label list instead of rebinding `group_labels`, which another cell
# expects to hold the class names.
outcome_labels = ['correct', 'wrong']
# Look the column up by name rather than hard-coding its position.
jogging_col = list(clf.classes_).index('Jogging')
hist_data = [correct_probs[:, jogging_col], wrong_probs[:, jogging_col]]
fig = ff.create_distplot(hist_data, outcome_labels, bin_size=.05)
# Update the existing layout in place; replacing it wholesale would discard the
# axis configuration that create_distplot sets up.
fig['layout'].update(title='Jogging')
iplot(fig)

In [39]:
# Distribution of the predicted 'Sitting' probability, split by whether the final
# prediction for the sample was correct.
# Uses its own label list instead of rebinding `group_labels`, which another cell
# expects to hold the class names.
outcome_labels = ['correct', 'wrong']
# Look the column up by name rather than hard-coding its position.
sitting_col = list(clf.classes_).index('Sitting')
hist_data = [correct_probs[:, sitting_col], wrong_probs[:, sitting_col]]
fig = ff.create_distplot(hist_data, outcome_labels, bin_size=.05)
# Update the existing layout in place; replacing it wholesale would discard the
# axis configuration that create_distplot sets up.
fig['layout'].update(title='Sitting')
iplot(fig)

In [32]:
# Per-class distribution of predicted probabilities among the misclassified samples.
# Derive the class names here instead of relying on whatever `group_labels` currently
# holds: other cells rebind it to ['correct', 'wrong'], which would silently limit
# this plot to the first two classes under the wrong legend.
class_names = list(clf.classes_)
hist_data = [wrong_probs[:, col_ind] for col_ind in range(len(class_names))]
fig = ff.create_distplot(hist_data, class_names, bin_size=.05)
fig['layout'].update(title='Predicted class probabilities of misclassified samples')
iplot(fig)

# Impersonal Lab on Lab vs. Field on Field vs. Lab on Field

In [15]:
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

## Lab on Lab

In [53]:
# Leave-one-user-out evaluation on data version "1": for every user, train on all
# other users and test on that user's samples.
wisdm.set_data(version="1", make_compatible=True)
data_df = wisdm.remove_all_nan(wisdm.data_df)
result_rows1 = []
class_labels = [l.decode("utf-8") for l in data_df['class'].unique()]
ignored_users1 = []  # users skipped for having fewer than 20 samples
# Features are every column except the first and the last one.
feature_columns = data_df.columns[1:-1]

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = data_df[data_df['user'] == user_id]

    if len(user_df) < 20:
        ignored_users1.append(user_id)
        continue

    # `.values` replaces the deprecated `DataFrame.as_matrix`.
    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].values])
    personal_features = user_df[feature_columns].values
    personal_label_counts = Counter(personal_labels)

    impersonal_df = data_df[data_df['user'] != user_id]
    impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].values])
    impersonal_features = impersonal_df[feature_columns].values

    clf = wisdm.weka_RF()
    # Scaling statistics come from the training users only.
    impersonal_scaler = StandardScaler().fit(impersonal_features)
    scaled_train_X = impersonal_scaler.transform(impersonal_features)
    scaled_test_X = impersonal_scaler.transform(personal_features)

    clf.fit(scaled_train_X, impersonal_labels)
    predictions = clf.predict(scaled_test_X)
    prediction_probas = clf.predict_proba(scaled_test_X)
    accuracy = accuracy_score(personal_labels, predictions)
    print("\tAccuracy : %s" % accuracy)
    result_row = {"user_id" : user_id,
                  "predictions" : predictions,
                  "accuracy" : accuracy,
                  "truth" : personal_labels}
    result_rows1.append(result_row)
results_df1 = pd.DataFrame(result_rows1)

User : 33
	Accuracy : 0.859060402685
User : 17
	Accuracy : 0.519230769231
User : 20
	Accuracy : 0.835978835979
User : 29
	Accuracy : 0.837078651685
User : 13
	Accuracy : 0.942857142857
User : 15
	Accuracy : 0.992424242424
User : 6
	Accuracy : 0.945945945946
User : 27
	Accuracy : 0.682926829268
User : 36
	Accuracy : 0.913669064748
User : 18
	Accuracy : 0.838709677419
User : 32
	Accuracy : 0.848275862069
User : 35
	Accuracy : 0.486725663717
User : 11
	Accuracy : 0.770833333333
User : 16
	Accuracy : 0.872549019608
User : 5
	Accuracy : 0.907142857143
User : 10
	Accuracy : 0.794117647059
User : 28
	Accuracy : 0.934579439252
User : 26
	Accuracy : 0.915151515152
User : 14
	Accuracy : 0.769633507853
User : 24
	Accuracy : 0.879032258065
User : 12
	Accuracy : 0.886178861789
User : 23
	Accuracy : 0.860465116279
User : 4
	Accuracy : 0.923076923077
User : 30
	Accuracy : 0.25
User : 34
	Accuracy : 0.916666666667
User : 8
	Accuracy : 0.912408759124
User : 31
	Accuracy : 0.937799043062
User : 21
	Accu

## Field on Field

In [54]:
# Leave-one-user-out evaluation on data version "2": for every user, train on all
# other users and test on that user's samples.
wisdm.set_data(version="2", make_compatible=True)
data_df = wisdm.remove_all_nan(wisdm.data_df)
class_labels = [l.decode("utf-8") for l in data_df['class'].unique()]
result_rows2 = []

ignored_users2 = []  # users skipped for having fewer than 20 samples
# Features are every column except the first and the last one.
feature_columns = data_df.columns[1:-1]

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = data_df[data_df['user'] == user_id]

    if len(user_df) < 20:
        ignored_users2.append(user_id)
        continue

    # `.values` replaces the deprecated `DataFrame.as_matrix`.
    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].values])
    personal_features = user_df[feature_columns].values
    personal_label_counts = Counter(personal_labels)

    impersonal_df = data_df[data_df['user'] != user_id]
    impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].values])
    impersonal_features = impersonal_df[feature_columns].values

    clf = wisdm.weka_RF()
    # Scaling statistics come from the training users only.
    impersonal_scaler = StandardScaler().fit(impersonal_features)
    scaled_train_X = impersonal_scaler.transform(impersonal_features)
    scaled_test_X = impersonal_scaler.transform(personal_features)

    clf.fit(scaled_train_X, impersonal_labels)
    predictions = clf.predict(scaled_test_X)
    prediction_probas = clf.predict_proba(scaled_test_X)
    accuracy = accuracy_score(personal_labels, predictions)
    print("\tAccuracy : %s" % accuracy)
    result_row = {"user_id" : user_id,
                  "predictions" : predictions,
                  "accuracy" : accuracy,
                  "truth" : personal_labels}
    result_rows2.append(result_row)
results_df2 = pd.DataFrame(result_rows2)


User : 194
	Accuracy : 0.5
User : 998
	Accuracy : 0.636655948553
User : 1097
	Accuracy : 0.5
User : 1104
	Accuracy : 0.699074074074
User : 1117
	Accuracy : 0.318181818182
User : 1205
	Accuracy : 1.0
User : 1238
	Accuracy : 0.637404580153
User : 1246
	Accuracy : 1.0
User : 1247
User : 1253
	Accuracy : 0.848484848485
User : 1269
User : 1274
	Accuracy : 1.0
User : 1276
User : 1277
	Accuracy : 1.0
User : 1280
	Accuracy : 0.0
User : 1319
	Accuracy : 0.562949640288
User : 1320
	Accuracy : 0.391304347826
User : 1477
	Accuracy : 0.0905172413793
User : 1480
User : 1491
User : 1511
User : 1512
	Accuracy : 1.0
User : 1518
	Accuracy : 1.0
User : 1531
User : 1554
	Accuracy : 0.944444444444
User : 1559
	Accuracy : 0.872340425532
User : 1603
	Accuracy : 0.925287356322
User : 1676
	Accuracy : 0.393442622951
User : 1679
User : 1683
User : 1696
User : 1703
	Accuracy : 0.891719745223
User : 1707
	Accuracy : 0.142857142857
User : 1723
User : 1724
User : 1726
User : 1742
	Accuracy : 0.583333333333
User : 1

## Lab on Field

In [55]:
# Train one model on all of data version "1", then test it per user on data version "2".
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
# `.values` replaces the deprecated `DataFrame.as_matrix`.
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].values])
impersonal_features = impersonal_df[impersonal_df.columns[1:-1]].values
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
clf = wisdm.weka_RF()
clf.fit(scaled_train_X, impersonal_labels)

wisdm.set_data(version="2", make_compatible=True)
# Reload the test frame explicitly. The loop used to read whatever `data_df` an earlier
# cell had left behind, which is only the version "2" data if that cell ran last.
data_df = wisdm.remove_all_nan(wisdm.data_df)
result_rows3 = []
ignored_users3 = []  # users skipped for having fewer than 20 samples

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = data_df[data_df['user'] == user_id]

    if len(user_df) < 20:
        ignored_users3.append(user_id)
        continue

    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].values])
    personal_features = user_df[user_df.columns[1:-1]].values
    personal_label_counts = Counter(personal_labels)

    # Test features are scaled with the statistics of the training (version "1") data.
    scaled_test_X = impersonal_scaler.transform(personal_features)

    predictions = clf.predict(scaled_test_X)
    prediction_probas = clf.predict_proba(scaled_test_X)
    accuracy = accuracy_score(personal_labels, predictions)
    print("\tAccuracy : %s" % accuracy)
    result_row = {"user_id" : user_id,
                  "predictions" : predictions,
                  "accuracy" : accuracy,
                  "truth" : personal_labels}
    result_rows3.append(result_row)
results_df3 = pd.DataFrame(result_rows3)

User : 194
	Accuracy : 0.619047619048
User : 998
	Accuracy : 0.0
User : 1097
	Accuracy : 0.0
User : 1104
	Accuracy : 0.412037037037
User : 1117
	Accuracy : 0.424242424242
User : 1205
	Accuracy : 1.0
User : 1238
	Accuracy : 0.137404580153
User : 1246
	Accuracy : 1.0
User : 1247
User : 1253
	Accuracy : 0.560606060606
User : 1269
User : 1274
	Accuracy : 0.0
User : 1276
User : 1277
	Accuracy : 0.0
User : 1280
	Accuracy : 0.0
User : 1319
	Accuracy : 0.129496402878
User : 1320
	Accuracy : 0.608695652174
User : 1477
	Accuracy : 0.00862068965517
User : 1480
User : 1491
User : 1511
User : 1512
	Accuracy : 0.0815450643777
User : 1518
	Accuracy : 0.321428571429
User : 1531
User : 1554
	Accuracy : 0.0
User : 1559
	Accuracy : 0.0212765957447
User : 1603
	Accuracy : 0.778735632184
User : 1676
	Accuracy : 0.114754098361
User : 1679
User : 1683
User : 1696
User : 1703
	Accuracy : 0.331210191083
User : 1707
	Accuracy : 0.122448979592
User : 1723
User : 1724
User : 1726
User : 1742
	Accuracy : 0.4861111

# When we choose only our most confident predictions does the accuracy go up?

In [37]:
# Class-probability row of the first sample in `prediction_probas`.
# NOTE(review): `prediction_probas` is whatever the most recently run per-user loop left behind.
prediction_probas[0]

array([ 0.03,  0.37,  0.07,  0.52,  0.01])

In [38]:
# Column index of the most probable class for every sample.
prediction_probas.argmax(axis=1)

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 2, 2, 3, 3, 2, 2, 2, 3, 3, 2, 3, 1, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 3, 2, 1, 1, 3, 1, 3, 1, 1, 1, 3, 1, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3])

In [40]:
# Class names in the column order used by predict_proba.
clf.classes_

array(['Jogging', 'Sitting', 'Stairs', 'Standing', 'Walking'],
      dtype='<U8')

In [42]:
# Highest class probability per sample, used as the certainty of the prediction.
proba_max = prediction_probas.max(axis=1)

In [48]:
# Rank samples from least to most certain and keep the 10 at either extreme.
certainty_sort_ind = np.argsort(proba_max)
least_certain_ind, most_certain_ind = certainty_sort_ind[:10], certainty_sort_ind[-10:]

In [50]:
# Top class probability of the 10 least certain predictions.
prediction_probas[least_certain_ind].max(axis=1)

array([ 0.25,  0.26,  0.26,  0.26,  0.26,  0.27,  0.27,  0.27,  0.27,  0.27])

In [51]:
# Top class probability of the 10 most certain predictions.
prediction_probas[most_certain_ind].max(axis=1)

array([ 0.52,  0.52,  0.52,  0.53,  0.53,  0.53,  0.53,  0.54,  0.55,  0.55])

In [58]:
# lab on field
# Lab on field: train on all of data version "1", test per user on data version "2",
# and compare overall accuracy with the accuracy on the 10 most / least certain predictions.
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
# `.values` replaces the deprecated `DataFrame.as_matrix`.
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].values])
impersonal_features = impersonal_df[impersonal_df.columns[1:-1]].values
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
clf = wisdm.weka_RF()
clf.fit(scaled_train_X, impersonal_labels)

wisdm.set_data(version="2", make_compatible=True)
# Reload the test frame explicitly. The loop used to read whatever `data_df` an earlier
# cell had left behind, which is only the version "2" data if that cell ran last.
data_df = wisdm.remove_all_nan(wisdm.data_df)
result_rows4 = []
ignored_users4 = []  # users skipped for having fewer than 20 samples

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = data_df[data_df['user'] == user_id]

    if len(user_df) < 20:
        ignored_users4.append(user_id)
        continue

    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].values])
    personal_features = user_df[user_df.columns[1:-1]].values
    personal_label_counts = Counter(personal_labels)

    # Test features are scaled with the statistics of the training (version "1") data.
    scaled_test_X = impersonal_scaler.transform(personal_features)

    predictions = clf.predict(scaled_test_X)
    prediction_probas = clf.predict_proba(scaled_test_X)
    # Certainty of a prediction = its highest class probability.
    proba_max = np.max(prediction_probas, axis=1)
    certainty_sort_ind = np.argsort(proba_max)
    least_certain_ind = certainty_sort_ind[:10]
    most_certain_ind = certainty_sort_ind[-10:]

    most_certain_pred = predictions[most_certain_ind]
    least_certain_pred = predictions[least_certain_ind]

    most_certain_truth = personal_labels[most_certain_ind]
    least_certain_truth = personal_labels[least_certain_ind]

    score = accuracy_score(personal_labels, predictions)
    most_certain_score = accuracy_score(most_certain_truth, most_certain_pred)
    least_certain_score = accuracy_score(least_certain_truth, least_certain_pred)
    print("\tAccuracy : %s" % score)
    print("\tMost Certain Accuracy : %s" % most_certain_score)
    print("\tLeast Certain Accuracy : %s" % least_certain_score)
    result_row = {"user_id" : user_id,
                  "score" : score,
                  "most_certain" : most_certain_score,
                  "least_certain" : least_certain_score,
                 }
    result_rows4.append(result_row)

User : 194
	Accuracy : 0.744047619048
	Most Certain Accuracy : 0.8
	Least Certain Accuracy : 0.5
User : 998
	Accuracy : 0.0
	Most Certain Accuracy : 0.0
	Least Certain Accuracy : 0.0
User : 1097
	Accuracy : 0.0
	Most Certain Accuracy : 0.0
	Least Certain Accuracy : 0.0
User : 1104
	Accuracy : 0.601851851852
	Most Certain Accuracy : 1.0
	Least Certain Accuracy : 0.0
User : 1117
	Accuracy : 0.590909090909
	Most Certain Accuracy : 0.8
	Least Certain Accuracy : 0.0
User : 1205
	Accuracy : 1.0
	Most Certain Accuracy : 1.0
	Least Certain Accuracy : 1.0
User : 1238
	Accuracy : 0.511450381679
	Most Certain Accuracy : 1.0
	Least Certain Accuracy : 0.0
User : 1246
	Accuracy : 1.0
	Most Certain Accuracy : 1.0
	Least Certain Accuracy : 1.0
User : 1247
User : 1253
	Accuracy : 0.568181818182
	Most Certain Accuracy : 1.0
	Least Certain Accuracy : 0.0
User : 1269
User : 1274
	Accuracy : 0.0
	Most Certain Accuracy : 0.0
	Least Certain Accuracy : 0.0
User : 1276
User : 1277
	Accuracy : 0.0
	Most Certain

# TODO: Plot the results above to see whether the 10 most certain predictions even have a chance of being correct

# Are we good at assessing confidence when we add personal data?

In [None]:
def confidence_pipeline(version, output_path, user_ids, k=10, minimum_personal_samples=40, make_compatible=True):
    """Placeholder for the confidence-assessment pipeline (not implemented yet).

    The original cell contained only the signature, without a trailing colon or a
    body, which is a syntax error. The stub keeps the intended signature importable
    and fails loudly if it is called.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("confidence_pipeline has not been implemented yet")

# What is the distribution of prediction margins?

# Developing the pipeline to iteratively re-assess its confidence
* new pipeline should allow for control over the size of the uncertainty sampling batch in each iteration
* create folders which have dataframes separating each model for simplicity
* new results dataframe should include the IDs of the personal data that were sampled in each iteration

# Q1 : Does prediction certainty get better after each iteration?
# Q2 : Does prediction accuracy get better after each iteration?
# Q3 (Vague) : How should I choose a good uncertainty margin?

In [72]:
def uncertainty_pipeline1(version, output_path, user_ids, k=10, minimum_personal_samples=40, make_compatible=True):
    """Run the uncertainty experiments for every user and pickle one result frame per user.

    For each user in ``user_ids`` an impersonal model is trained on all other users'
    data; the user's own data is then split into ``k`` stratified folds and every fold
    is passed to ``uncertainty_experiments``.

    Args:
        version: data version passed to ``wisdm.set_data``.
        output_path: prefix of the pickle path; the file is ``output_path + <user_id> + ".pickle"``.
        user_ids: iterable of user ids to hold out one at a time.
        k: number of stratified folds of the personal data.
        minimum_personal_samples: users with fewer samples are skipped.
        make_compatible: forwarded to ``wisdm.set_data``.
    """
    # Local import: none of the notebook's import cells brings `warnings` into scope.
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Load the requested version and read the frame from `wisdm` directly, not from
        # a notebook-global `data_df` that may belong to another version.
        wisdm.set_data(version=version, make_compatible=make_compatible)
        all_data_df = wisdm.data_df

        for ind, user_id in enumerate(user_ids):  # hold one user out at a time
            user_results = []
            print("Running user #%s: %s" % (ind, user_id))
            # NOTE(review): replaces the undefined `get_user_set` helper with the same
            # per-user filter the other cells use. Confirm the helper did nothing more.
            personal_set = all_data_df[all_data_df['user'] == user_id]
            personal_set = wisdm.remove_all_nan(personal_set)

            print("%s personal samples" % len(personal_set))

            if len(personal_set) < minimum_personal_samples:
                print("User %s has less than %s labeled samples..." % (user_id, minimum_personal_samples))
                continue

            # `.values` replaces the deprecated `DataFrame.as_matrix`.
            personal_labels = np.array([t.decode("utf-8") for t in personal_set['class'].values])
            personal_features = personal_set[personal_set.columns[1:-1]].values

            # Impersonal data: every other user's samples.
            impersonal_set = all_data_df[all_data_df['user'] != user_id]
            impersonal_set = wisdm.remove_all_nan(impersonal_set)
            impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_set['class'].values])
            impersonal_features = impersonal_set[impersonal_set.columns[1:-1]].values

            # Train an impersonal model on scaled features.
            impersonal_scaler = StandardScaler().fit(impersonal_features)
            scaled_train_x = impersonal_scaler.transform(impersonal_features)

            rfc_clf = wisdm.weka_RF()
            rfc_clf.fit(scaled_train_x, impersonal_labels)

            # Sigmoid-calibrated variant for probability estimation.
            # NOTE(review): `prob_cal_clf` is fitted but never used or passed on below.
            prob_cal_cv_generator = StratifiedKFold(n_splits=3).split(impersonal_features, impersonal_labels)
            prob_cal_clf = CalibratedClassifierCV(rfc_clf, cv=prob_cal_cv_generator, method='sigmoid')
            prob_cal_clf.fit(scaled_train_x, impersonal_labels)

            # Split the personal data into a pool available for labeling and a held-out test fold.
            skf = StratifiedKFold(n_splits=k)
            k_run = 0

            for active_index, test_index in skf.split(personal_features, personal_labels):
                print("\tRunning Fold #%s\n" % k_run)
                # Pool available for active labeling from the individual.
                all_active_features = personal_features[active_index]
                all_active_labels = personal_labels[active_index]

                # Held-out test set from the individual.
                test_features = personal_features[test_index]
                test_labels = personal_labels[test_index]

                k_run_df = uncertainty_experiments(user_id, k_run,
                                                   impersonal_features, impersonal_labels,
                                                   all_active_features, all_active_labels,
                                                   test_features, test_labels,
                                                   impersonal_model=rfc_clf,
                                                   impersonal_scaler=impersonal_scaler)
                user_results.append(k_run_df)
                k_run += 1
            user_scores_df = pd.concat(user_results)
            # str() so that non-string user ids do not break the path concatenation.
            user_scores_df.to_pickle(output_path + str(user_id) + ".pickle")

In [81]:
def shuffle_data(features, labels):
    """Shuffle ``features`` and ``labels`` in place with the same permutation.

    The global NumPy RNG state is captured before the first shuffle and restored
    before the second, so both shuffles draw the same random sequence and rows stay
    paired with their labels.

    Args:
        features: sequence/array shuffled along its first axis (modified in place).
        labels: sequence/array of the same length (modified in place).

    Returns:
        The same ``(features, labels)`` objects, now shuffled.

    Raises:
        ValueError: if the two inputs differ in length. The shared-state trick only
            yields the same permutation for equal lengths, so a mismatch would
            otherwise silently misalign features and labels.
    """
    if len(features) != len(labels):
        raise ValueError("features and labels must have the same length (got %s and %s)"
                         % (len(features), len(labels)))
    saved_state = np.random.get_state()
    np.random.shuffle(features)
    # Rewind the RNG so the labels receive exactly the same permutation.
    np.random.set_state(saved_state)
    np.random.shuffle(labels)
    return features, labels

In [91]:
def should_active_sample_margin(feature, model, margin_size):
    """Decide whether a single sample is uncertain enough to request its label.

    A sample is considered uncertain when the gap between the two highest class
    probabilities predicted by ``model`` is smaller than ``margin_size``.

    Args:
        feature: one feature vector.
        model: fitted classifier exposing ``predict_proba``.
        margin_size: margin threshold.

    Returns:
        bool: True if the top-two probability margin is below ``margin_size``.
    """
    # predict_proba works on a batch: wrap the single sample and unwrap its single row.
    # (The original called the non-existent `predict_prob` and indexed the 2-D result
    # as if it were 1-D.)
    prob = np.asarray(model.predict_proba([feature]))[0]
    second_max_prob, max_prob = np.sort(prob)[-2:]
    return bool((max_prob - second_max_prob) < margin_size)

In [None]:
def margin_active_sampling(all_personal_features, model, margin_size):
    """Return the positions of all samples whose top-two probability margin is small.

    Args:
        all_personal_features: batch of feature vectors passed to ``model.predict_proba``.
        model: fitted classifier exposing ``predict_proba``.
        margin_size: a sample is selected when (highest - second highest class
            probability) is strictly below this value.

    Returns:
        list of int: indices into ``all_personal_features``, in ascending order.
    """
    probabilities = np.asarray(model.predict_proba(all_personal_features))
    if probabilities.shape[0] == 0:
        return []

    # Sort each row ascending so the last two columns are the runner-up and the winner.
    ranked = np.sort(probabilities, axis=1)
    margins = ranked[:, -1] - ranked[:, -2]
    return np.flatnonzero(margins < margin_size).tolist()

In [92]:
# Number of distinct values in the 'class' column of the currently loaded data.
len(wisdm.data_df['class'].unique())

5

In [None]:
@timeit
def uncertainty_experiments(user_id, k_run,
                            impersonal_features, impersonal_labels,
                            all_active_features, all_active_labels,
                            test_features, test_labels,
                            impersonal_model=None, impersonal_scaler=None):
    """Run one fold of the personalization experiments for a single user.

    Work in progress: the streaming loop is still a skeleton, and the training-size
    loop relies on names that are not defined in this notebook.

    NOTE(review): `timeit`, `training_sizes`, `random_sample_iterations`,
    `potential_active_features`, `potential_active_labels`, `KM`, `clusters`,
    `personal_model`, `impersonal_plus_personal_model`, `cluster_plus_personal_model`
    and `least_confident_active_sampling` must be provided elsewhere.
    `potential_active_*` presumably refers to `all_active_*`; confirm before running.

    Args:
        user_id: id of the held-out user (recorded in the result rows).
        k_run: index of the current fold (recorded in the result rows).
        impersonal_features, impersonal_labels: training data from all other users.
        all_active_features, all_active_labels: the user's samples available for labeling.
        test_features, test_labels: the user's held-out test fold.
        impersonal_model: classifier already fitted on the scaled impersonal data.
        impersonal_scaler: scaler fitted on the impersonal features.

    Returns:
        pandas.DataFrame with one row per evaluated training size.
    """
    # Previously `rows` was appended to without ever being created.
    rows = []

    # Shuffle the active pool (features and labels with the same permutation).
    all_active_features, all_active_labels = shuffle_data(all_active_features, all_active_labels)

    # Baseline: impersonal model on the held-out test set.
    # `predict` takes only the features; the stray `test_labels` argument was removed.
    scaled_test_X = impersonal_scaler.transform(test_features)
    baseline_predictions = impersonal_model.predict(scaled_test_X)
    baseline_score = accuracy_score(test_labels, baseline_predictions)

    for stream_ind in range(len(all_active_labels)):
        # Make a prediction on the incoming stream sample (`ind` was undefined here).
        feature = all_active_features[stream_ind]
        # The model was trained on scaled features, so scale the sample the same way.
        scaled_feature = impersonal_scaler.transform([feature])
        prediction = impersonal_model.predict(scaled_feature)

        # Determine how certain we are about the prediction
        # (the original line was the truncated `impersonal_model.pred`).
        pred_prob = impersonal_model.predict_proba(scaled_feature)

        # TODO: if we are certain, make the prediction and add to the training set
        # TODO: if we are not certain, get the label and add it to the training set
        # TODO: make predictions on held out test set
        # TODO: record results

    # The impersonal model's test accuracy does not depend on the training size, so it
    # is taken from the baseline above instead of being recomputed per iteration.
    impersonal_model_score = baseline_score

    for ts in training_sizes:
        # initialize score holders
        random_personal_scores = []
        random_personal_plus_all_scores = []
        random_personal_plus_cluster_scores = []
        #random_gc_scores = []

        for run in range(random_sample_iterations):
            # get random samples
            try:
                random_active_indeces = np.random.choice(len(potential_active_features), ts, replace=False)
            except ValueError as ve:
                if """Cannot take a larger sample than population when 'replace=False'""" in ve.args[0]:
                    continue
                # Any other ValueError is unexpected: do not fall through with stale
                # or undefined indices.
                raise
            sampled_active_features = potential_active_features[random_active_indeces]
            sampled_active_labels = potential_active_labels[random_active_indeces]

            # run personal model
            random_personal_score = personal_model(sampled_active_features, sampled_active_labels,
                                                   test_features=test_features, test_labels=test_labels)
            random_personal_scores.append(random_personal_score)

            # run personal + universal
            random_personal_plus_all_score = impersonal_plus_personal_model(
                sampled_active_features, sampled_active_labels,
                impersonal_features, impersonal_labels,
                test_features=test_features, test_labels=test_labels)
            random_personal_plus_all_scores.append(random_personal_plus_all_score)

            # run personal + cluster
            random_personal_plus_cluster_score = cluster_plus_personal_model(
                sampled_active_features, sampled_active_labels,
                impersonal_features, impersonal_labels,
                KM, clusters,
                test_features=test_features, test_labels=test_labels)
            random_personal_plus_cluster_scores.append(random_personal_plus_cluster_score)

            # run garcia-ceja personalization approach (currently disabled)
            #random_gc_score = garcia_ceja_model(sampled_active_features, sampled_active_labels,
            #                                    impersonal_features, impersonal_labels,
            #                                    test_features=test_features, test_labels=test_labels)
            #random_gc_scores.append(random_gc_score)

        # least certain samples
        try:
            least_certain_active_indeces = least_confident_active_sampling(potential_active_features, impersonal_model, ts)
        except ValueError as ve:
            if "The number of personal samples provided" in ve.args[0]:
                print("Can't evaluate participant #%s with %s personal labels..."%(user_id, ts))
                continue
            # Any other ValueError is unexpected: re-raise instead of swallowing it.
            raise
        sampled_active_features = potential_active_features[least_certain_active_indeces]
        sampled_active_labels = potential_active_labels[least_certain_active_indeces]

        # run personal model
        least_certain_personal_score = personal_model(sampled_active_features, sampled_active_labels,
                                                      test_features=test_features, test_labels=test_labels)

        # run personal + universal
        least_certain_personal_plus_all_score = impersonal_plus_personal_model(
            sampled_active_features, sampled_active_labels,
            impersonal_features, impersonal_labels,
            test_features=test_features, test_labels=test_labels)

        # run personal + cluster
        least_certain_personal_plus_cluster_score = cluster_plus_personal_model(
            sampled_active_features, sampled_active_labels,
            impersonal_features, impersonal_labels,
            KM, clusters,
            test_features=test_features, test_labels=test_labels)

        # run garcia-ceja personalization approach (currently disabled)
        #least_certain_gc_score = garcia_ceja_model(sampled_active_features, sampled_active_labels,
        #                                           impersonal_features, impersonal_labels,
        #                                           test_features=test_features, test_labels=test_labels)

        row = {"test user" : user_id,
               "k-run" : k_run,
               "classifier" : "RF with Wiki Parameters",
               "personal training data" : ts,
               "random personal score Mean" : np.mean(random_personal_scores),
               "impersonal score Mean" : impersonal_model_score,
               "random personal + impersonal score Mean" : np.mean(random_personal_plus_all_scores),
               "random personal + cluster score Mean" : np.mean(random_personal_plus_cluster_scores),
               #"random Garcia-Ceja MM Mean" : np.mean(random_gc_scores),
               "least_certain personal score Mean" : least_certain_personal_score,
               "least_certain personal + impersonal score Mean" : least_certain_personal_plus_all_score,
               "least_certain personal + cluster score Mean" : least_certain_personal_plus_cluster_score,
               #"least_certain Garcia-Ceja MM Mean" : least_certain_gc_score,
               }
        print("\tamount of personal data : %s row" % ts)
        print("\trandom personal model score : M=%.3f, SD=%.3f" % (row["random personal score Mean"], np.std(random_personal_scores)))
        print("\timpersonal model score : M=%.3f" % (row["impersonal score Mean"]))
        print("\trandom personal + ALL Impersonal : M=%.3f, SD=%.3f" % (row["random personal + impersonal score Mean"], np.std(random_personal_plus_all_scores)))
        print("\trandom personal + CLUSTER Impersonal : M=%.3f, SD=%.3f" % (row["random personal + cluster score Mean"], np.std(random_personal_plus_cluster_scores)))
        #print("\trandom GC MM M=%.3f, SD=%.3f" % (row["random Garcia-Ceja MM Mean"], np.std(random_gc_scores)))
        print("\tleast_certain personal model score : %.3f" % (row["least_certain personal score Mean"]))
        print("\tleast_certain personal + ALL Impersonal : %.3f" % (row["least_certain personal + impersonal score Mean"]))
        print("\tleast_certain personal + CLUSTER Impersonal : %.3f" % (row["least_certain personal + cluster score Mean"]))
        #print("\tleast_certain GC MM %.3f" % (row["least_certain Garcia-Ceja MM Mean"]))
        print("\n")
        rows.append(row)

    user_scores_df = pd.DataFrame(rows)
    return user_scores_df