In [2]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, accuracy_score
from sklearn.preprocessing import StandardScaler

In [3]:
from wisdm import wisdm
import random
import numpy as np
import pandas as pd
from collections import Counter

In [4]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Lab on Field

In [5]:
wisdm.set_data(version="1", make_compatible=True)
len(wisdm.data_df)

5418

In [6]:
Counter(wisdm.data_df['class'])

Counter({b'Jogging': 1625,
         b'Sitting': 306,
         b'Stairs': 1160,
         b'Standing': 246,
         b'Walking': 2081})

In [7]:
wisdm.set_data(version="2", make_compatible=True)
len(wisdm.data_df)

4816

In [8]:
Counter(wisdm.data_df['class'])

Counter({b'Jogging': 130,
         b'Sitting': 1410,
         b'Stairs': 251,
         b'Standing': 840,
         b'Walking': 2185})

In [9]:
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].as_matrix()])
impersonal_features = impersonal_df.as_matrix(columns=[impersonal_df.columns[1:-1]])
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
impersonal_clf = wisdm.weka_RF()
impersonal_clf.fit(scaled_train_X, impersonal_labels)
impersonal_clf.set_params(n_estimators=10000, n_jobs=28)

wisdm.set_data(version="2", make_compatible=True)
result_rows = []
ignored_users = []

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = wisdm.data_df[wisdm.data_df['user'] == user_id]
    
    if len(user_df) < 40:
        ignored_users.append(user_id)
        continue
    
    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].as_matrix()])
    personal_features = user_df.as_matrix(columns=[user_df.columns[1:-1]])
    personal_label_counts = Counter(personal_labels)
    
    sss = StratifiedShuffleSplit(n_splits=4, test_size=30, train_size=10)
    
    #skf = StratifiedKFold(n_splits=3)
    
    personal_accs = []
    impersonal_accs = []
    hybrid_accs = []
    
    try:
        for train_ind, test_ind in sss.split(personal_features, personal_labels):
            active_labels = personal_labels[train_ind]
            active_features = personal_features[train_ind]

            test_labels = personal_labels[test_ind]
            test_features = personal_features[test_ind]
            
            #print("Active Labels : %s" % Counter(active_labels))
            #print("Test Labels : %s" % Counter(test_labels))

            # personal accuracy
            personal_scaler = StandardScaler().fit(active_features)
            scaled_personal_features = personal_scaler.transform(active_features)
            scaled_test_features = personal_scaler.transform(test_features)

            personal_clf = wisdm.weka_RF().fit(scaled_personal_features, active_labels)
            personal_clf.set_params(n_estimators=10000, n_jobs=28)

            personal_predictions = personal_clf.predict(scaled_test_features)
            personal_accuracy = accuracy_score(test_labels, personal_predictions)
            personal_accs.append(personal_accuracy)
            
            # impersonal accuracy
            scaled_test_features = impersonal_scaler.transform(test_features)
            impersonal_predictions = impersonal_clf.predict(scaled_test_features)
            impersonal_accuracy = accuracy_score(test_labels, impersonal_predictions)
            impersonal_accs.append(impersonal_accuracy)
            
            # hybrid accuracy
            hybrid_labels = np.hstack((active_labels, impersonal_labels))
            hybrid_features = np.vstack((active_features, impersonal_features))

            
            # shuffle features and labels
            c = list(zip(hybrid_labels, hybrid_features))
            random.shuffle(c)
            hybrid_labels, hybrid_features = zip(*c)
            hybrid_scaler = StandardScaler().fit(hybrid_features)
            scaled_hybrid_features = hybrid_scaler.transform(hybrid_features)
            scaled_test_features = hybrid_scaler.transform(test_features)

            hybrid_clf = wisdm.weka_RF()
            hybrid_clf.set_params(n_estimators=10000, n_jobs=28)
            hybrid_clf.fit(scaled_hybrid_features, hybrid_labels)
            hybrid_predictions = hybrid_clf.predict(scaled_test_features)
            hybrid_accuracy = accuracy_score(test_labels, hybrid_predictions)
            hybrid_accs.append(hybrid_accuracy)
    except ValueError as ve:
        if "The least populated class" in ve.args[0]:
            print("\tNot enough labeled data for %s" % user_id)
            ignored_users.append(user_id)
            continue
        else:
            raise ve
    print("\t Personal Accuracy : M=%.3f, SD=%.3f" % (np.mean(personal_accs), np.std(personal_accs)))
    print("\t Impersonal Accuracy : M=%.3f, SD=%.3f" % (np.mean(impersonal_accs), np.std(impersonal_accs)))
    print("\t Hybrid Accuracy : M=%.3f, SD=%.3f" % (np.mean(hybrid_accs), np.std(hybrid_accs)))
    result_row = {"user_id" : user_id,
                   "personal accuracies M" : np.mean(personal_accs),
                   "personal accuracies SD" : np.std(personal_accs),
                   "impersonal accuracies M" : np.mean(impersonal_accs),
                   "impersonal accuracies SD" : np.std(impersonal_accs),
                   "hybrid accuracies M" : np.mean(hybrid_accs),
                   "hybrid accuracies SD" : np.std(hybrid_accs)
                 }
    result_rows.append(result_row)
results_df_10 = pd.DataFrame(result_rows)

User : 194
	 Personal Accuracy : M=0.883, SD=0.037
	 Impersonal Accuracy : M=0.650, SD=0.087
	 Hybrid Accuracy : M=0.883, SD=0.037
User : 998
	 Personal Accuracy : M=1.000, SD=0.000
	 Impersonal Accuracy : M=0.000, SD=0.000
	 Hybrid Accuracy : M=1.000, SD=0.000
User : 1097
User : 1104
	 Personal Accuracy : M=0.850, SD=0.050
	 Impersonal Accuracy : M=0.450, SD=0.050
	 Hybrid Accuracy : M=0.825, SD=0.064
User : 1117
	 Personal Accuracy : M=0.900, SD=0.000
	 Impersonal Accuracy : M=0.417, SD=0.050
	 Hybrid Accuracy : M=0.875, SD=0.028
User : 1205
User : 1238
	 Personal Accuracy : M=0.883, SD=0.017
	 Impersonal Accuracy : M=0.067, SD=0.041
	 Hybrid Accuracy : M=0.900, SD=0.041
User : 1246
	 Personal Accuracy : M=1.000, SD=0.000
	 Impersonal Accuracy : M=1.000, SD=0.000
	 Hybrid Accuracy : M=1.000, SD=0.000
User : 1247
User : 1253
	 Personal Accuracy : M=0.967, SD=0.000
	 Impersonal Accuracy : M=0.533, SD=0.000
	 Hybrid Accuracy : M=0.967, SD=0.000
User : 1269
User : 1274
	 Personal Accurac

In [10]:
print("Impersonal Accuracies M=%.3f, SD=%.3f" % (results_df_10['impersonal accuracies M'].mean(), results_df_10['impersonal accuracies M'].std()))
print("Personal Accuracies M=%.3f, SD=%.3f" % (results_df_10['personal accuracies M'].mean(), results_df_10['personal accuracies M'].std()))
print("Hybrid Accuracies M=%.3f, SD=%.3f" % (results_df_10['hybrid accuracies M'].mean(), results_df_10['hybrid accuracies M'].std()))

Impersonal Accuracies M=0.345, SD=0.289
Personal Accuracies M=0.913, SD=0.097
Hybrid Accuracies M=0.926, SD=0.091


In [11]:
ignored_users

['1097',
 '1205',
 '1247',
 '1269',
 '1276',
 '1277',
 '1280',
 '1480',
 '1491',
 '1511',
 '1518',
 '1531',
 '1554',
 '1679',
 '1683',
 '1696',
 '1723',
 '1724',
 '1726',
 '1745',
 '1750',
 '1757',
 '1758',
 '1761',
 '1763',
 '1797',
 '1802',
 '1813',
 '1814']

In [14]:
trace0 = go.Box(
    y=results_df_10['impersonal accuracies M'],
    name='Impersonal',
    marker=dict(
        color='red',
    ),
    boxpoints='all',
    jitter=0.3,
    pointpos=-0.5
)

trace1 = go.Box(
    y=results_df_10['personal accuracies M'],
    name='Personal (10 Samples)',
    marker=dict(
        color='blue',
    ),
    boxpoints='all',
    jitter=0.1,
    pointpos=-0.5
)

trace2 = go.Box(
    y=results_df_10['hybrid accuracies M'],
    name='Hybrid (10 Samples)',
    marker=dict(
        color='green',
    ),
    boxpoints='all',
    jitter=0.1,
    pointpos=-0.5
)
data = [trace0, trace1, trace2]
layout = go.Layout(yaxis=dict(title="Accuracy"), showlegend=False)
fig=go.Figure(data=data,layout=layout)
iplot(fig)

# How many users labeled instances representing all activities?

In [None]:
wisdm.set_data(version="2", make_compatible=True)
class_labels = [cl.decode("utf-8") for cl in wisdm.data_df['class'].unique()]
user_label_presence = {}

for user_id in wisdm.user_ids:
    user_df = wisdm.remove_all_nan(wisdm.data_df[wisdm.data_df['user'] == user_id])
    labels = [cl.decode("utf-8") for cl in user_df['class']]
    
    label_presence = []
    for cl in class_labels:
        if cl in labels:
            label_presence.append(True)
        else:
            label_presence.append(False)
    user_label_presence[user_id] = label_presence

users_with_balanced_data = []

for user_id in wisdm.user_ids:
    user_labels_arr = user_label_presence[user_id]
    has_all_data = all(l==True for l in user_labels_arr)
    #print("%s : %s" % (user_id, has_all_data))
    if has_all_data:
        users_with_balanced_data.append(user_id)

print("Users with balanced data : %s" % users_with_balanced_data)

In [None]:
results_df[results_df['user_id'].isin(users_with_balanced_data)]['impersonal accuracies M'].mean()

In [None]:
results_df[results_df['user_id'].isin(users_with_balanced_data)]['personal accuracies M'].mean()

# Same thing with 30 training samples instead of 10

In [15]:
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].as_matrix()])
impersonal_features = impersonal_df.as_matrix(columns=[impersonal_df.columns[1:-1]])
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
impersonal_clf = wisdm.weka_RF()
impersonal_clf.fit(scaled_train_X, impersonal_labels)
impersonal_clf.set_params(n_estimators=10000, n_jobs=28)

wisdm.set_data(version="2", make_compatible=True)
result_rows = []
ignored_users = []

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = wisdm.data_df[wisdm.data_df['user'] == user_id]
    
    if len(user_df) < 40:
        ignored_users.append(user_id)
        continue
    
    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].as_matrix()])
    personal_features = user_df.as_matrix(columns=[user_df.columns[1:-1]])
    personal_label_counts = Counter(personal_labels)
    
    sss = StratifiedShuffleSplit(n_splits=4, test_size=15, train_size=30)
    
    #skf = StratifiedKFold(n_splits=3)
    
    personal_accs = []
    impersonal_accs = []
    hybrid_accs = []
    
    try:
        for train_ind, test_ind in sss.split(personal_features, personal_labels):
            active_labels = personal_labels[train_ind]
            active_features = personal_features[train_ind]

            test_labels = personal_labels[test_ind]
            test_features = personal_features[test_ind]
            
            #print("Active Labels : %s" % Counter(active_labels))
            #print("Test Labels : %s" % Counter(test_labels))

            # personal accuracy
            personal_scaler = StandardScaler().fit(active_features)
            scaled_personal_features = personal_scaler.transform(active_features)
            scaled_test_features = personal_scaler.transform(test_features)

            personal_clf = wisdm.weka_RF().fit(scaled_personal_features, active_labels)
            personal_clf.set_params(n_estimators=10000, n_jobs=28)

            personal_predictions = personal_clf.predict(scaled_test_features)
            personal_accuracy = accuracy_score(test_labels, personal_predictions)
            personal_accs.append(personal_accuracy)
            
            # impersonal accuracy
            scaled_test_features = impersonal_scaler.transform(test_features)
            impersonal_predictions = impersonal_clf.predict(scaled_test_features)
            impersonal_accuracy = accuracy_score(test_labels, impersonal_predictions)
            impersonal_accs.append(impersonal_accuracy)
            
            # hybrid accuracy
            hybrid_labels = np.hstack((active_labels, impersonal_labels))
            hybrid_features = np.vstack((active_features, impersonal_features))

            
            # shuffle features and labels
            c = list(zip(hybrid_labels, hybrid_features))
            random.shuffle(c)
            hybrid_labels, hybrid_features = zip(*c)
            hybrid_scaler = StandardScaler().fit(hybrid_features)
            scaled_hybrid_features = hybrid_scaler.transform(hybrid_features)
            scaled_test_features = hybrid_scaler.transform(test_features)

            hybrid_clf = wisdm.weka_RF()
            hybrid_clf.set_params(n_estimators=10000, n_jobs=28)
            hybrid_clf.fit(scaled_hybrid_features, hybrid_labels)
            hybrid_predictions = hybrid_clf.predict(scaled_test_features)
            hybrid_accuracy = accuracy_score(test_labels, hybrid_predictions)
            hybrid_accs.append(hybrid_accuracy)
    except ValueError as ve:
        if "The least populated class" in ve.args[0]:
            print("\tNot enough labeled data for %s" % user_id)
            ignored_users.append(user_id)
            continue
        else:
            raise ve
    print("\t Personal Accuracy : M=%.3f, SD=%.3f" % (np.mean(personal_accs), np.std(personal_accs)))
    print("\t Impersonal Accuracy : M=%.3f, SD=%.3f" % (np.mean(impersonal_accs), np.std(impersonal_accs)))
    print("\t Hybrid Accuracy : M=%.3f, SD=%.3f" % (np.mean(hybrid_accs), np.std(hybrid_accs)))
    result_row = {"user_id" : user_id,
                   "personal accuracies M" : np.mean(personal_accs),
                   "personal accuracies SD" : np.std(personal_accs),
                   "impersonal accuracies M" : np.mean(impersonal_accs),
                   "impersonal accuracies SD" : np.std(impersonal_accs),
                   "hybrid accuracies M" : np.mean(hybrid_accs),
                   "hybrid accuracies SD" : np.std(hybrid_accs)
                 }
    result_rows.append(result_row)
results_df_30 = pd.DataFrame(result_rows)

User : 194
	 Personal Accuracy : M=0.950, SD=0.029
	 Impersonal Accuracy : M=0.600, SD=0.000
	 Hybrid Accuracy : M=0.933, SD=0.047
User : 998
	 Personal Accuracy : M=1.000, SD=0.000
	 Impersonal Accuracy : M=0.000, SD=0.000
	 Hybrid Accuracy : M=0.983, SD=0.029
User : 1097
User : 1104
	 Personal Accuracy : M=0.950, SD=0.029
	 Impersonal Accuracy : M=0.300, SD=0.145
	 Hybrid Accuracy : M=0.967, SD=0.033
User : 1117
	 Personal Accuracy : M=0.983, SD=0.029
	 Impersonal Accuracy : M=0.500, SD=0.075
	 Hybrid Accuracy : M=0.983, SD=0.029
User : 1205
User : 1238
	 Personal Accuracy : M=0.900, SD=0.033
	 Impersonal Accuracy : M=0.150, SD=0.029
	 Hybrid Accuracy : M=0.900, SD=0.033
User : 1246
	 Personal Accuracy : M=1.000, SD=0.000
	 Impersonal Accuracy : M=1.000, SD=0.000
	 Hybrid Accuracy : M=1.000, SD=0.000
User : 1247
User : 1253
	 Personal Accuracy : M=1.000, SD=0.000
	 Impersonal Accuracy : M=0.533, SD=0.000
	 Hybrid Accuracy : M=1.000, SD=0.000
User : 1269
User : 1274
	 Personal Accurac

In [16]:
print("Impersonal Accuracies M=%.3f, SD=%.3f" % (results_df_30['impersonal accuracies M'].mean(), results_df_30['impersonal accuracies M'].std()))
print("Personal Accuracies M=%.3f, SD=%.3f" % (results_df_30['personal accuracies M'].mean(), results_df_30['personal accuracies M'].std()))
print("Hybrid Accuracies M=%.3f, SD=%.3f" % (results_df_30['hybrid accuracies M'].mean(), results_df_30['hybrid accuracies M'].std()))

NameError: name 'results_df_30' is not defined

In [None]:
trace0 = go.Box(
    y=results_df_30['impersonal accuracies M'],
    name='Impersonal',
    marker=dict(
        color='red',
    ),
    boxpoints='all',
    jitter=0.3,
    pointpos=-0.5
)

trace1 = go.Box(
    y=results_df_30['personal accuracies M'],
    name='Personal',
    marker=dict(
        color='blue',
    ),
    boxpoints='all',
    jitter=0.1,
    pointpos=-0.5
)

trace2 = go.Box(
    y=results_df_30['hybrid accuracies M'],
    name='Hybrid',
    marker=dict(
        color='green',
    ),
    boxpoints='all',
    jitter=0.1,
    pointpos=-0.5
)
data = [trace0, trace1, trace2]
layout = go.Layout(yaxis=dict(title="Accuracy"))
fig=go.Figure(data=data,layout=layout)
iplot(fig)

In [None]:
personal_is_best_10 = []
impersonal_is_best_10 = []
hybrid_is_best_10 = []

for ind, row in results_df_10.iterrows():
    accs = [row['personal accuracies M'], row['impersonal accuracies M'], row['hybrid accuracies M']]
    #print(accs)
    best = np.argmax(accs)
    #print(best)
    if best == 0:
        personal_is_best_10.append(row['user_id'])
    if best == 1:
        impersonal_is_best_10.append(row['user_id'])
    if best == 2:
        hybrid_is_best_10.append(row['user_id'])
print("%s users get best from personal" % len(personal_is_best_10))
print("%s users get best from impersonal" % len(impersonal_is_best_10))
print("%s users get best from hybrid" % len(hybrid_is_best_10))

personal_is_best_30 = []
impersonal_is_best_30 = []
hybrid_is_best_30 = []

for ind, row in results_df_30.iterrows():
    accs = [row['personal accuracies M'], row['impersonal accuracies M'], row['hybrid accuracies M']]
    #print(accs)
    best = np.argmax(accs)
    #print(best)
    if best == 0:
        personal_is_best_30.append(row['user_id'])
    if best == 1:
        impersonal_is_best_30.append(row['user_id'])
    if best == 2:
        hybrid_is_best_30.append(row['user_id'])
print("%s users get best from personal" % len(personal_is_best_30))
print("%s users get best from impersonal" % len(impersonal_is_best_30))
print("%s users get best from hybrid" % len(hybrid_is_best_30))

trace1 = go.Bar(
    x=['10 samples', '30 samples'],
    y=[len(personal_is_best_10), len(personal_is_best_30)],
    marker=dict(color="blue"),
    name='Personal Data'
)

trace2 = go.Bar(
    x=['10 samples', '30 samples'],
    y=[len(hybrid_is_best_10), len(hybrid_is_best_30)],
    marker=dict(color="green"),
    name='Hybrid Data'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='stack',
    xaxis=dict(title="# of personal samples"),
    yaxis=dict(title="# of users")
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')


## Q : Is impersonal model uncertainty good enough to help our model?

# Method for class  balancing

In [None]:
def stratified_sample(all_labels, num_per_class):
    classes = list(set(all_labels))
    print(classes)
    sample_indeces = []
    
    for cl in classes:
        class_indeces = np.array([i for i in range(len(all_labels)) if all_labels[i] == cl])
        print(class_indeces)
        class_sample_ind = class_indeces[np.random.choice(len(class_indeces), num_per_class, replace=False)]
        sample_indeces += list(class_sample_ind)
    
    np.random.shuffle(sample_indeces)
    return sample_indeces