# Experiment 1

In [2]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss, accuracy_score, precision_recall_curve, average_precision_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler, label_binarize

In [3]:
from wisdm import wisdm
import random
import numpy as np
import pandas as pd
from collections import Counter
import time
from scipy import stats

In [4]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [5]:
# Settings shared by every random forest built in this notebook.
n_trees = 10000  # passed as n_estimators to each wisdm.weka_RF() classifier
n_cores = 30  # passed as n_jobs to each wisdm.weka_RF() classifier
oob_score = False  # NOTE(review): not referenced by any cell visible here — confirm it is still needed

In [6]:
def shuffle_rows(features, labels):
    """Reorder `features` and `labels` with one shared random permutation.

    A single permutation of the row indices is drawn with
    `np.random.permutation` and used to index both arrays, so every feature
    row stays paired with its label.

    Returns:
        tuple: (shuffled_features, shuffled_labels)
    """
    row_order = np.random.permutation(features.shape[0])
    shuffled_features = features[row_order]
    shuffled_labels = labels[row_order]
    return shuffled_features, shuffled_labels

In [7]:
def impersonal_pred(model, test_features):
    """Predict labels and class probabilities with an already-fitted model.

    The test rows are first transformed with the notebook-global
    `impersonal_scaler`, which must be defined before this is called.

    Args:
        model: fitted classifier exposing `predict` and `predict_proba`.
        test_features: feature rows to classify.

    Returns:
        tuple: (predictions, probabilities) for the scaled test rows.
    """
    scaled = impersonal_scaler.transform(test_features)
    return model.predict(scaled), model.predict_proba(scaled)

In [8]:
def personal_pred(personal_features, personal_labels, test_features):
    """Fit a fresh forest on one user's labelled rows and classify the test rows.

    A StandardScaler is fitted on the personal rows only and then applied to
    both the personal and the test rows. The forest comes from
    `wisdm.weka_RF()` and is configured with the notebook-global `n_trees`
    and `n_cores`.

    Returns:
        tuple: (predictions, probabilities) for `test_features`.
    """
    scaler = StandardScaler().fit(personal_features)
    train_x = scaler.transform(personal_features)
    test_x = scaler.transform(test_features)

    forest = wisdm.weka_RF()
    forest.set_params(n_estimators=n_trees, n_jobs=n_cores)
    forest.fit(train_x, personal_labels)

    return forest.predict(test_x), forest.predict_proba(test_x)

In [9]:
def hybrid_pred(impersonal_features, impersonal_labels,
                personal_features, personal_labels,
                test_features, number_of_samples = None,
                probabilities=None,
                sampling_function=None):
    """Train a forest on a hybrid training set and classify `test_features`.

    Args:
        impersonal_features, impersonal_labels: pooled training data.
        personal_features, personal_labels: the user's own labelled rows.
        test_features: rows to classify.
        number_of_samples: forwarded to `sampling_function` as `number`.
        probabilities: forwarded to `sampling_function`.
        sampling_function: optional callable
            `(features, labels, probabilities, number=...) -> (features, labels)`.
            When None, the personal and impersonal rows are stacked and
            shuffled. When given, the training set is exactly what it returns
            for the personal rows; the impersonal rows are not merged in that
            branch.

    Returns:
        tuple: (predictions, probabilities) for `test_features`.
    """
    if sampling_function is None:
        hybrid_labels = np.hstack((personal_labels, impersonal_labels))
        hybrid_features = np.vstack((personal_features, impersonal_features))
        hybrid_features, hybrid_labels = shuffle_rows(hybrid_features, hybrid_labels)
    else:
        hybrid_features, hybrid_labels = sampling_function(personal_features, personal_labels, probabilities, number=number_of_samples)

    # The scaler is fitted on the hybrid training rows and reused for the test rows.
    hybrid_scaler = StandardScaler().fit(hybrid_features)
    scaled_hybrid_features = hybrid_scaler.transform(hybrid_features)
    scaled_test_features = hybrid_scaler.transform(test_features)

    hybrid_clf = wisdm.weka_RF()
    hybrid_clf.set_params(n_estimators=n_trees, n_jobs=n_cores)
    hybrid_clf.fit(scaled_hybrid_features, hybrid_labels)

    hybrid_predictions = hybrid_clf.predict(scaled_test_features)
    # Fixed: this used to return the bound method `hybrid_clf.predict` rather
    # than the class probabilities for the test rows.
    hybrid_probabilities = hybrid_clf.predict_proba(scaled_test_features)

    return hybrid_predictions, hybrid_probabilities

In [10]:
def confidence_sample(features, labels, probabilities, number=None, top=False):
    """Select rows by the confidence of their most probable class.

    Rows are ranked by `max(probabilities, axis=1)` in ascending order, so
    the least confident rows come first.

    Args:
        features, labels: arrays indexed along axis 0 by the same rows.
        probabilities: 2-D array of class probabilities, one row per sample.
        number: how many rows to return; None returns every row in ranked order.
        top: when True return the `number` most confident rows, otherwise the
            `number` least confident rows.

    Returns:
        tuple: (selected_features, selected_labels), in ascending confidence order.
    """
    confidence_ranking = np.argsort(np.max(probabilities, axis=1))

    # Only None means "no limit"; `not number` also swallowed number == 0.
    if number is None:
        return features[confidence_ranking], labels[confidence_ranking]

    if top:
        # `[-0:]` would select everything, so an explicit zero yields an empty slice.
        selected = confidence_ranking[-number:] if number else confidence_ranking[:0]
    else:
        selected = confidence_ranking[:number]
    return features[selected], labels[selected]

In [10]:
#experiment setup
number_of_personal_samples = 10  # labelled rows per user used to train the personal / hybrid models
test_size = 30                   # rows per user held out for evaluation in every split
# A user needs enough rows for one training split plus one test split.
min_rows_per_user = number_of_personal_samples + test_size

# Fit the impersonal model on the version "1" data.
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].as_matrix()])
impersonal_features = impersonal_df.as_matrix(columns=[impersonal_df.columns[1:-1]])
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
impersonal_clf = wisdm.weka_RF()
impersonal_clf.set_params(n_estimators=n_trees, n_jobs=n_cores)

start=time.time()
print("Training...")
impersonal_clf.fit(scaled_train_X, impersonal_labels)
finished_training = time.time()
print("Finished Training in %s seconds" % (finished_training - start))

# Evaluate all three model types per user on the version "2" data.
wisdm.set_data(version="2", make_compatible=True)
result_rows = []
ignored_users = []
print("predicting...")
for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = wisdm.data_df[wisdm.data_df['user'] == user_id]

    if len(user_df) < min_rows_per_user:
        print("Not Enough Data, skipping...")
        ignored_users.append(user_id)
        continue

    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].as_matrix()])
    personal_features = user_df.as_matrix(columns=[user_df.columns[1:-1]])

    sss = StratifiedShuffleSplit(n_splits=4, test_size=test_size, train_size = number_of_personal_samples)

    personal_scores = []
    impersonal_scores = []
    hybrid_scores = []

    shuffle_count = 0
    try:
        for train_index, test_index in sss.split(personal_features, personal_labels):
            # data for personal model
            random_personal_features = personal_features[train_index]
            random_personal_labels = personal_labels[train_index]

            # create an active pool of everything not in the test set for active learning / hybrid model
            active_pool_mask = np.ones(personal_labels.shape, dtype=bool)
            active_pool_mask[test_index] = False
            active_pool_features = personal_features[active_pool_mask]
            active_pool_labels = personal_labels[active_pool_mask]

            # test set
            test_features = personal_features[test_index]
            test_labels = personal_labels[test_index]

            # build personal model and predict
            personal_predictions, _ = personal_pred(random_personal_features, random_personal_labels, test_features)
            personal_score = accuracy_score(test_labels, personal_predictions)
            personal_scores.append(personal_score)

            # build impersonal model and predict
            impersonal_predictions, _ = impersonal_pred(impersonal_clf, test_features)
            impersonal_score = accuracy_score(test_labels, impersonal_predictions)
            impersonal_scores.append(impersonal_score)

            # Impersonal confidence on the active pool. The model was fitted on
            # scaled features, so the pool goes through the same scaler first
            # (it was previously passed in unscaled).
            # NOTE(review): the default hybrid_pred call below does not consume
            # these probabilities; they only matter once a sampling_function is passed.
            impersonal_probabilities = impersonal_clf.predict_proba(
                impersonal_scaler.transform(active_pool_features))

            # build hybrid model and predict
            hybrid_predictions, _ = hybrid_pred(impersonal_features, impersonal_labels,
                                                random_personal_features, random_personal_labels,
                                                test_features)

            hybrid_score = accuracy_score(test_labels, hybrid_predictions)
            hybrid_scores.append(hybrid_score)
            print("\t impersonal acc : %.3f" % impersonal_score)
            print("\t personal acc : %.3f" % personal_score)
            print("\t hybrid acc : %.3f" % hybrid_score)
            print("")

            result_row = {"user_id" : user_id,
                          "shuffle" : shuffle_count,
                          "impersonal" : impersonal_score,
                          "personal" : personal_score,
                          "hybrid" : hybrid_score}
            result_rows.append(result_row)
            shuffle_count += 1
    except ValueError as ve:
        # Skip users for whom the stratified split fails with the
        # "least populated class" error; str(ve) is safe even if args is empty.
        if "The least populated class" in str(ve):
            print("\tNot enough labeled data for %s" % user_id)
            ignored_users.append(user_id)
            continue
        # Anything else is unexpected: re-raise with the original traceback.
        raise

finished_predicting = time.time()
print("Finished predicting in %s seconds" % (finished_predicting - finished_training))
print("Users without enough data : %s" % ignored_users)

results_df_exp1_10 = pd.DataFrame(result_rows)

Training...
Finished Training in 23.91435718536377 seconds
predicting...
User : 194
	 impersonal acc : 0.767
	 personal acc : 0.733
	 hybrid acc : 0.667

	 impersonal acc : 0.733
	 personal acc : 0.800
	 hybrid acc : 0.833

	 impersonal acc : 0.767
	 personal acc : 0.900
	 hybrid acc : 0.900

	 impersonal acc : 0.800
	 personal acc : 0.833
	 hybrid acc : 0.833

User : 998


KeyboardInterrupt: 

In [None]:
# Show the first rows of the 10-sample Experiment 1 results.
results_df_exp1_10.head()

In [None]:
# Persist the 10-sample Experiment 1 results next to the notebook.
results_df_exp1_10.to_pickle("exp1_results_10samples.pickle")

In [None]:
# Persist the 30-sample Experiment 1 results.
# NOTE(review): results_df_exp1_30 is only assigned by the "Experiment 1 with 30 samples"
# cell further down, so this raises NameError on a top-to-bottom run — move it after that cell.
results_df_exp1_30.to_pickle("exp1_results_30samples.pickle")

In [None]:
# NOTE(review): the target path is an empty string, which is not a usable file name.
# This looks like a leftover cell — give it a real path or remove it.
results_df_exp1_10.to_pickle("")

In [None]:
# Mean and standard deviation of the per-split accuracies, one line per model type.
for summary_format, column in (
    ("Impersonal Accuracies M=%.3f, SD=%.3f", 'impersonal'),
    ("Personal Accuracies M=%.3f, SD=%.3f", 'personal'),
    ("Hybrid Accuracies M=%.3f, SD=%.3f", 'hybrid'),
):
    accuracies = results_df_exp1_10[column]
    print(summary_format % (accuracies.mean(), accuracies.std()))

In [None]:
# Average each user's accuracy over the splits, then draw one box per model type.
mean_impersonal, mean_personal, mean_hybrid = [], [], []

for user_id in results_df_exp1_10['user_id'].unique():
    user_df = results_df_exp1_10[results_df_exp1_10['user_id']==user_id]
    mean_impersonal.append(user_df['impersonal'].mean())
    mean_personal.append(user_df['personal'].mean())
    mean_hybrid.append(user_df['hybrid'].mean())

trace0 = go.Box(y=mean_impersonal, name='Impersonal', marker=dict(color='red'),
                boxpoints='all', jitter=0.3, pointpos=-0.5)

trace1 = go.Box(y=mean_personal, name='Personal (10 Samples)', marker=dict(color='blue'),
                boxpoints='all', jitter=0.1, pointpos=-0.5)

trace2 = go.Box(y=mean_hybrid, name='Hybrid (10 Samples)', marker=dict(color='green'),
                boxpoints='all', jitter=0.1, pointpos=-0.5)

data = [trace0, trace1, trace2]
layout = go.Layout(showlegend=False, yaxis=dict(title="Accuracy"))
fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Experiment 1 with 30 samples

In [None]:
#experiment setup
# Fixed: this cell used to set number_of_personal_samples = 10 (twice), so the
# "30 samples" run silently repeated the 10-sample experiment.
number_of_personal_samples = 30  # labelled rows per user used to train the personal / hybrid models
test_size = 30                   # rows per user held out for evaluation in every split
# A user needs enough rows for one training split plus one test split; the old
# hard-coded 40 only matched the 10-sample configuration.
min_rows_per_user = number_of_personal_samples + test_size

# Fit the impersonal model on the version "1" data.
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].as_matrix()])
impersonal_features = impersonal_df.as_matrix(columns=[impersonal_df.columns[1:-1]])
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
impersonal_clf = wisdm.weka_RF()
impersonal_clf.set_params(n_estimators=n_trees, n_jobs=n_cores)

start=time.time()
print("Training...")
impersonal_clf.fit(scaled_train_X, impersonal_labels)
finished_training = time.time()
print("Finished Training in %s seconds" % (finished_training - start))

# Evaluate all three model types per user on the version "2" data.
wisdm.set_data(version="2", make_compatible=True)
result_rows = []
ignored_users = []
print("predicting...")
for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = wisdm.data_df[wisdm.data_df['user'] == user_id]

    if len(user_df) < min_rows_per_user:
        print("Not Enough Data, skipping...")
        ignored_users.append(user_id)
        continue

    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].as_matrix()])
    personal_features = user_df.as_matrix(columns=[user_df.columns[1:-1]])

    sss = StratifiedShuffleSplit(n_splits=4, test_size=test_size, train_size = number_of_personal_samples)

    personal_scores = []
    impersonal_scores = []
    hybrid_scores = []

    shuffle_count = 0
    try:
        for train_index, test_index in sss.split(personal_features, personal_labels):
            # data for personal model
            random_personal_features = personal_features[train_index]
            random_personal_labels = personal_labels[train_index]

            # create an active pool of everything not in the test set for active learning / hybrid model
            active_pool_mask = np.ones(personal_labels.shape, dtype=bool)
            active_pool_mask[test_index] = False
            active_pool_features = personal_features[active_pool_mask]
            active_pool_labels = personal_labels[active_pool_mask]

            # test set
            test_features = personal_features[test_index]
            test_labels = personal_labels[test_index]

            # build personal model and predict
            personal_predictions, _ = personal_pred(random_personal_features, random_personal_labels, test_features)
            personal_score = accuracy_score(test_labels, personal_predictions)
            personal_scores.append(personal_score)

            # build impersonal model and predict
            impersonal_predictions, _ = impersonal_pred(impersonal_clf, test_features)
            impersonal_score = accuracy_score(test_labels, impersonal_predictions)
            impersonal_scores.append(impersonal_score)

            # Impersonal confidence on the active pool. The model was fitted on
            # scaled features, so the pool goes through the same scaler first
            # (it was previously passed in unscaled).
            # NOTE(review): the default hybrid_pred call below does not consume
            # these probabilities; they only matter once a sampling_function is passed.
            impersonal_probabilities = impersonal_clf.predict_proba(
                impersonal_scaler.transform(active_pool_features))

            # build hybrid model and predict
            hybrid_predictions, _ = hybrid_pred(impersonal_features, impersonal_labels,
                                                random_personal_features, random_personal_labels,
                                                test_features)

            hybrid_score = accuracy_score(test_labels, hybrid_predictions)
            hybrid_scores.append(hybrid_score)
            print("\t impersonal acc : %.3f" % impersonal_score)
            print("\t personal acc : %.3f" % personal_score)
            print("\t hybrid acc : %.3f" % hybrid_score)
            print("")

            result_row = {"user_id" : user_id,
                          "shuffle" : shuffle_count,
                          "impersonal" : impersonal_score,
                          "personal" : personal_score,
                          "hybrid" : hybrid_score}
            result_rows.append(result_row)
            shuffle_count += 1
    except ValueError as ve:
        # Skip users for whom the stratified split fails with the
        # "least populated class" error; str(ve) is safe even if args is empty.
        if "The least populated class" in str(ve):
            print("\tNot enough labeled data for %s" % user_id)
            ignored_users.append(user_id)
            continue
        # Anything else is unexpected: re-raise with the original traceback.
        raise

finished_predicting = time.time()
print("Finished predicting in %s seconds" % (finished_predicting - finished_training))
print("Users without enough data : %s" % ignored_users)

results_df_exp1_30 = pd.DataFrame(result_rows)

In [None]:
# Average each user's accuracy over the splits, then draw one box per model type.
mean_impersonal, mean_personal, mean_hybrid = [], [], []

for user_id in results_df_exp1_30['user_id'].unique():
    user_df = results_df_exp1_30[results_df_exp1_30['user_id']==user_id]
    mean_impersonal.append(user_df['impersonal'].mean())
    mean_personal.append(user_df['personal'].mean())
    mean_hybrid.append(user_df['hybrid'].mean())

trace0 = go.Box(y=mean_impersonal, name='Impersonal', marker=dict(color='red'),
                boxpoints='all', jitter=0.3, pointpos=-0.5)

trace1 = go.Box(y=mean_personal, name='Personal (30 Samples)', marker=dict(color='blue'),
                boxpoints='all', jitter=0.1, pointpos=-0.5)

trace2 = go.Box(y=mean_hybrid, name='Hybrid (30 Samples)', marker=dict(color='green'),
                boxpoints='all', jitter=0.1, pointpos=-0.5)

data = [trace0, trace1, trace2]
layout = go.Layout(showlegend=False, yaxis=dict(title="Accuracy"))
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# Stacked bars: number of users for whom the personal vs. the hybrid model is best,
# for the 10-sample and 30-sample runs.
# NOTE(review): personal_is_best_10/30 and hybrid_is_best_10/30 are not defined in any
# cell visible here — confirm where they come from.
trace1 = go.Bar(x=['10 samples', '30 samples'],
                y=[len(personal_is_best_10), len(personal_is_best_30)],
                name='Personal Data',
                marker=dict(color="blue"))

trace2 = go.Bar(x=['10 samples', '30 samples'],
                y=[len(hybrid_is_best_10), len(hybrid_is_best_30)],
                name='Hybrid Data',
                marker=dict(color="green"))

data = [trace1, trace2]
layout = go.Layout(barmode='stack',
                   yaxis=dict(title="# of users"),
                   xaxis=dict(title="# of personal samples"))

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='stacked-bar')

# Experiment 2

In [11]:
#experiment setup
# Experiment 2: compare the impersonal model's accuracy on the samples it is most
# and least confident about, per user.

# Fit the impersonal model on the version "1" data.
wisdm.set_data(version="1", make_compatible=True)
impersonal_df = wisdm.remove_all_nan(wisdm.data_df)
impersonal_labels = np.array([t.decode("utf-8") for t in impersonal_df['class'].as_matrix()])
impersonal_features = impersonal_df.as_matrix(columns=[impersonal_df.columns[1:-1]])
impersonal_scaler = StandardScaler().fit(impersonal_features)
scaled_train_X = impersonal_scaler.transform(impersonal_features)
impersonal_clf = wisdm.weka_RF()
impersonal_clf.set_params(n_estimators=n_trees, n_jobs=n_cores)

start=time.time()
print("Training...")
impersonal_clf.fit(scaled_train_X, impersonal_labels)
finished_training = time.time()
print("Finished Training in %s seconds" % (finished_training - start))
# Switch the module-level dataset to version "2" for the per-user evaluation.
wisdm.set_data(version="2", make_compatible=True)
result_rows = []

# NOTE(review): number_of_personal_samples is not read anywhere in this cell.
number_of_personal_samples = 10
ignored_users = []
print("predicting...")

for user_id in wisdm.user_ids:
    print("User : %s" % user_id)
    user_df = wisdm.data_df[wisdm.data_df['user'] == user_id]
    
    # Users with fewer than 40 rows are skipped and recorded in ignored_users.
    if len(user_df) < 40:
        print("Not Enough Data, skipping...")
        ignored_users.append(user_id)
        continue
    
    personal_labels = np.array([t.decode("utf-8") for t in user_df['class'].as_matrix()])
    personal_features = user_df.as_matrix(columns=[user_df.columns[1:-1]])
    
    # Scale with the scaler fitted on the impersonal training data before predicting.
    scaled_personal_features = impersonal_scaler.transform(personal_features)
    
    impersonal_probabilities = impersonal_clf.predict_proba(scaled_personal_features)
    impersonal_predictions = impersonal_clf.predict(scaled_personal_features)
    
    # get ranking: indices sorted by the highest class probability, least confident first
    confidence_ranking = np.argsort(np.max(impersonal_probabilities, axis=1))
    
    ranked_predictions = impersonal_predictions[confidence_ranking]
    ranked_truth = personal_labels[confidence_ranking]
    
    # top30 / bottom30: accuracy on the 30 most / least confident samples.
    # For users with fewer than 60 rows these two slices overlap.
    result_row = {"user_id" : user_id,
                  "top30" : accuracy_score(ranked_truth[-30:], ranked_predictions[-30:]),
                  "bottom30" : accuracy_score(ranked_truth[:30], ranked_predictions[:30]),
                  "overall" : accuracy_score(personal_labels, impersonal_predictions)}
    result_rows.append(result_row)
    
finished_predicting = time.time()
print("Finished predicting in %s seconds" % (finished_predicting - finished_training))
print("Users without enough data : %s" % ignored_users)

results_df_exp2 = pd.DataFrame(result_rows)

Training...
Finished Training in 23.827834367752075 seconds
predicting...
User : 194
User : 998
User : 1097
Not Enough Data, skipping...
User : 1104
User : 1117
User : 1205
Not Enough Data, skipping...
User : 1238
User : 1246
User : 1247
Not Enough Data, skipping...
User : 1253
User : 1269
Not Enough Data, skipping...
User : 1274
User : 1276
Not Enough Data, skipping...
User : 1277
Not Enough Data, skipping...
User : 1280
Not Enough Data, skipping...
User : 1319
User : 1320
User : 1477
User : 1480
Not Enough Data, skipping...
User : 1491
Not Enough Data, skipping...
User : 1511
Not Enough Data, skipping...
User : 1512
User : 1518
Not Enough Data, skipping...
User : 1531
Not Enough Data, skipping...
User : 1554
Not Enough Data, skipping...
User : 1559
User : 1603
User : 1676
User : 1679
Not Enough Data, skipping...
User : 1683
Not Enough Data, skipping...
User : 1696
Not Enough Data, skipping...
User : 1703
User : 1707
User : 1723
Not Enough Data, skipping...
User : 1724
Not Enough Data

In [13]:
# Summary statistics of the per-user top30 / bottom30 / overall accuracies.
results_df_exp2.describe()

Unnamed: 0,bottom30,overall,top30
count,26.0,26.0,26.0
mean,0.188462,0.387026,0.592308
std,0.269311,0.307927,0.39579
min,0.0,0.0,0.0
25%,0.0,0.059854,0.141667
50%,0.0,0.446399,0.733333
75%,0.3,0.620896,0.958333
max,1.0,1.0,1.0


# Experiment 3

# Quick check on temporal signal

In [None]:
# NOTE(review): user_amounts is only assigned two cells below; on a fresh
# top-to-bottom run this cell raises NameError.
user_amounts

In [None]:
# NOTE(review): runs before the cell that builds user_amounts, and the same
# expression is repeated right after that cell.
np.mean(user_amounts)

In [None]:
# Row count per user, leaving out every user listed in users_over_longest_time.
# NOTE(review): users_over_longest_time is not defined in any cell visible here.
user_amounts = []

for user_id in wisdm.user_ids:
    if user_id in users_over_longest_time:
        continue
    user_df = wisdm.data_df[wisdm.data_df['user'] == user_id]
    user_amounts.append(len(user_df))

In [None]:
# Average number of rows per retained user.
np.mean(user_amounts)

In [None]:
# Load the pickled raw data frame from the local datasets folder.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
raw_df = pd.read_pickle('./datasets/WISDM_v2/all_raw_data.dataframe.pickle')

In [None]:
# Distinct user ids in the data set currently selected through wisdm.set_data.
study_user_ids = wisdm.data_df['user'].unique()

In [None]:
# Keep only the raw rows that belong to users in study_user_ids.
raw_study_df = raw_df[raw_df['user'].isin(study_user_ids)]

In [None]:
# Span between each study user's earliest and latest raw timestamp,
# in the same order as study_user_ids.
timespans = []

for user_id in study_user_ids:
    times = raw_study_df.loc[raw_study_df['user'] == user_id, 'timestamp']
    first, last = times.min(), times.max()
    span = last - first
    timespans.append(span)

In [None]:
# Pair every study user's recording timespan with their mean Experiment 1
# (10-sample) accuracies.
result_rows = []

for ind, user_id in enumerate(study_user_ids):
    user_results_df = results_df_exp1_10[results_df_exp1_10['user_id'] == user_id]
    if len(user_results_df) == 0:
        # user was skipped in Experiment 1, so there is nothing to average
        continue
    # timespans was built in the same order as study_user_ids
    timespan = timespans[ind]

    # Fixed: these used to read the stale global `user_df` left over from an
    # earlier cell instead of this user's result rows.
    personal_acc = user_results_df['personal'].mean()
    impersonal_acc = user_results_df['impersonal'].mean()
    hybrid_acc = user_results_df['hybrid'].mean()

    # The original loop computed the values but never stored them.
    result_rows.append({"user_id" : user_id,
                        "timespan" : timespan,
                        "personal" : personal_acc,
                        "impersonal" : impersonal_acc,
                        "hybrid" : hybrid_acc})
    
    

# Users with day1 to later ratio

In [None]:
# Per-user ratio values; entry i belongs to users[i] in the cell that follows.
early_to_late_ratio = [
    1.1685393258426966, 1.0, 1.0, 4.148148148148148, 1.0,
    0.33739837398373984, 2.0, 1.0, 1.121212121212121, 1.0,
    1.0, 0.13636363636363635, 1.0, 0.10294117647058823, 1.0,
    1.0, 1.0, 1.0, 1.0, 1.0,
    1.0, 1.0, 1.8666666666666667, 1.0, 0.0622876557191393,
    1.0, 1.0, 1.0, 0.184375, 2.025,
    0.5714285714285714, 1.0, 3.8947368421052633, 1.0, 0.43661971830985913,
    1.0, 1.0, 1.0, 1.0, 1.0,
    5.071428571428571, 0.304635761589404, 1.0, 1.0, 0.5119047619047619,
    1.0, 1.0,
]



In [None]:
# User ids (as strings) in the same order as early_to_late_ratio.
users = [
    '194', '998', '1104', '1117', '1205', '1238', '1246', '1247',
    '1253', '1269', '1274', '1277', '1280', '1319', '1320', '1477',
    '1480', '1491', '1511', '1512', '1518', '1531', '1554', '1559',
    '1603', '1676', '1679', '1683', '1696', '1703', '1707', '1724',
    '1742', '1745', '1750', '1757', '1758', '1759', '1761', '1763',
    '1774', '1775', '1778', '1793', '1797', '1799', '1802',
]

In [None]:
# Mean 10-sample accuracies per listed user, paired with that user's early/late ratio.
# Users without any result rows are left out of every list.
# NOTE(review): `users` holds string ids — confirm results_df_exp1_10['user_id'] is
# stored as strings too, otherwise the equality filter matches nothing.
personal_means, impersonal_means, hybrid_means = [], [], []
users_to_plot, ratio = [], []

for ind, user_id in enumerate(users):
    user_df = results_df_exp1_10[results_df_exp1_10['user_id'] == user_id]
    if not len(user_df) > 0:
        continue
    personal_means.append(user_df['personal'].mean())
    impersonal_means.append(user_df['impersonal'].mean())
    hybrid_means.append(user_df['hybrid'].mean())
    users_to_plot.append(user_id)
    ratio.append(early_to_late_ratio[ind])

In [None]:
# Kendall's tau between per-user personal-model accuracy and the early/late ratio,
# plus a least-squares line (ratio regressed on accuracy) over the scatter.
tau, p = stats.kendalltau(personal_means, ratio)
slope, intercept, r_value, p_value, std_err = stats.linregress(personal_means, ratio)
line = slope * np.array(personal_means) + intercept

personal_trace = go.Scatter(x=personal_means, y=ratio, mode="markers")
line_trace = go.Scatter(x=personal_means, y=line, mode='lines', name='Fit')
data = [personal_trace, line_trace]

title = 'Personal Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Ratio of labels < 1 hour to labels > 1 hour'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user impersonal-model accuracy and the early/late ratio,
# plus a least-squares line (ratio regressed on accuracy) over the scatter.
tau, p = stats.kendalltau(impersonal_means, ratio)
slope, intercept, r_value, p_value, std_err = stats.linregress(impersonal_means, ratio)
line = slope * np.array(impersonal_means) + intercept

impersonal_trace = go.Scatter(x=impersonal_means, y=ratio, mode="markers")
line_trace = go.Scatter(x=impersonal_means, y=line, mode='lines', name='Fit')
data = [impersonal_trace, line_trace]

title = 'impersonal Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Ratio of labels < 1 hour to labels > 1 hour'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user hybrid-model accuracy and the early/late ratio,
# plus a least-squares line (ratio regressed on accuracy) over the scatter.
tau, p = stats.kendalltau(hybrid_means, ratio)
slope, intercept, r_value, p_value, std_err = stats.linregress(hybrid_means, ratio)
line = slope * np.array(hybrid_means) + intercept

hybrid_trace = go.Scatter(x=hybrid_means, y=ratio, mode="markers")
line_trace = go.Scatter(x=hybrid_means, y=line, mode='lines', name='Fit')
data = [hybrid_trace, line_trace]

title = 'hybrid Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Ratio of labels < 1 hour to labels > 1 hour'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Per-user time-delta values; entry i belongs to time_deltas_users[i] in the cell that follows.
time_deltas = [
    23.221944444444443, 0.61361111111111111, 0.44083333333333335, 23.722777777777779,
    0.024722222222222222, 23.2575, 9.8166666666666664, 0.063888888888888884,
    15.842499999999999, 0.0025000000000000001, 0.065833333333333327, 4.402222222222222,
    0.033055555555555553, 20.708055555555557, 0.89583333333333337, 0.20694444444444443,
    0.046944444444444441, 0.03833333333333333, 0.01638888888888889, 0.84916666666666663,
    0.073888888888888893, 0.30055555555555558, 1.2597222222222222, 0.16972222222222222,
    23.955555555555556, 0.55583333333333329, 0.01361111111111111, 0.046944444444444441,
    2.3138888888888891, 5.2169444444444446, 4.7552777777777777, 0.16805555555555557,
    16.563055555555554, 0.030277777777777778, 15.786666666666667, 0.0080555555555555554,
    0.23833333333333334, 0.50916666666666666, 0.14444444444444443, 0.05527777777777778,
    22.757222222222222, 2.1077777777777778, 0.35249999999999998, 0.28083333333333332,
    5.371666666666667, 0.57666666666666666, 0.18361111111111111,
]

In [None]:
# User ids (as strings) in the same order as time_deltas.
time_deltas_users = [
    '194', '998', '1104', '1117', '1205', '1238', '1246', '1247',
    '1253', '1269', '1274', '1277', '1280', '1319', '1320', '1477',
    '1480', '1491', '1511', '1512', '1518', '1531', '1554', '1559',
    '1603', '1676', '1679', '1683', '1696', '1703', '1707', '1724',
    '1742', '1745', '1750', '1757', '1758', '1759', '1761', '1763',
    '1774', '1775', '1778', '1793', '1797', '1799', '1802',
]

In [None]:
# Mean 10-sample accuracies per listed user, paired with that user's time delta.
# Users without any result rows are left out of every list.
# NOTE(review): time_deltas_users holds string ids — confirm results_df_exp1_10['user_id']
# is stored as strings too, otherwise the equality filter matches nothing.
personal_means, impersonal_means, hybrid_means = [], [], []
users_to_plot, tds = [], []

for ind, user_id in enumerate(time_deltas_users):
    user_df = results_df_exp1_10[results_df_exp1_10['user_id'] == user_id]
    if not len(user_df) > 0:
        continue
    personal_means.append(user_df['personal'].mean())
    impersonal_means.append(user_df['impersonal'].mean())
    hybrid_means.append(user_df['hybrid'].mean())
    users_to_plot.append(user_id)
    tds.append(time_deltas[ind])

### Correlations

In [None]:
from scipy import stats

In [None]:
# Kendall's tau between per-user personal-model accuracy and the time deltas,
# plus a least-squares line (time delta regressed on accuracy) over the scatter.
tau, p = stats.kendalltau(personal_means, tds)
slope, intercept, r_value, p_value, std_err = stats.linregress(personal_means, tds)
line = slope * np.array(personal_means) + intercept

personal_trace = go.Scatter(x=personal_means, y=tds, mode="markers")
line_trace = go.Scatter(x=personal_means, y=line, mode='lines', name='Fit')
data = [personal_trace, line_trace]

title = 'personal Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Time Deltas'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user impersonal-model accuracy and the time deltas,
# plus a least-squares line (time delta regressed on accuracy) over the scatter.
tau, p = stats.kendalltau(impersonal_means, tds)
slope, intercept, r_value, p_value, std_err = stats.linregress(impersonal_means, tds)
line = slope * np.array(impersonal_means) + intercept

impersonal_trace = go.Scatter(x=impersonal_means, y=tds, mode="markers")
line_trace = go.Scatter(x=impersonal_means, y=line, mode='lines', name='Fit')
data = [impersonal_trace, line_trace]

title = 'impersonal Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Time Deltas'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user hybrid-model accuracy and the time deltas,
# plus a least-squares line (time delta regressed on accuracy) over the scatter.
tau, p = stats.kendalltau(hybrid_means, tds)
slope, intercept, r_value, p_value, std_err = stats.linregress(hybrid_means, tds)
line = slope * np.array(hybrid_means) + intercept

hybrid_trace = go.Scatter(x=hybrid_means, y=tds, mode="markers")
line_trace = go.Scatter(x=hybrid_means, y=line, mode='lines', name='Fit')
data = [hybrid_trace, line_trace]

title = 'hybrid Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Time Deltas'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

### Model with 30 personal samples for personal and hybrid models

In [None]:
# Mean 30-sample accuracies per listed user, paired with that user's early/late
# ratio and time delta. Users without any result rows are left out of every list.
personal_means = []
impersonal_means = []
hybrid_means = []

users_to_plot = []
ratio = []
# Fixed: tds is rebuilt here as well. The "Time Deltas" plots further down pair
# these 30-sample means with tds, which previously still held the entries
# filtered against the 10-sample results and could differ in length or order.
tds = []
for ind, user_id in enumerate(users):
    user_df = results_df_exp1_30[results_df_exp1_30['user_id'] == user_id]
    if len(user_df) > 0:
        personal_means.append(user_df['personal'].mean())
        impersonal_means.append(user_df['impersonal'].mean())
        hybrid_means.append(user_df['hybrid'].mean())
        users_to_plot.append(user_id)
        ratio.append(early_to_late_ratio[ind])
        # look the delta up by id instead of assuming both id lists share one order
        tds.append(time_deltas[time_deltas_users.index(user_id)])

In [None]:
# Kendall's tau between per-user personal-model accuracy (30-sample run) and the
# inverse of the early/late ratio, plus a least-squares line over the scatter.
# The inverse is computed once instead of three times.
inverse_ratio = 1. / np.array(ratio)

tau, p = stats.kendalltau(personal_means, inverse_ratio)

slope, intercept, r_value, p_value, std_err = stats.linregress(personal_means, inverse_ratio)

line = slope*np.array(personal_means)+intercept

line_trace = go.Scatter(x=personal_means,
                        y=line,
                        mode='lines',
                        name='Fit')

personal_trace = go.Scatter(x=personal_means,
                            y=inverse_ratio,
                            mode="markers")
data=[personal_trace, line_trace]

title = 'personal Model : tau=%.5f, p=%.5f' % (tau, p)
# Fixed: the axis title contained the pasted expression "(1. / np.array(ratio))";
# the plotted quantity is the inverse of "labels < 1 hour to labels > 1 hour".
layout=go.Layout(yaxis=dict(title='Ratio of labels > 1 hour to labels < 1 hour'),
                 xaxis=dict(title='Accuracy', range=[0,1]),
                 title=title)
fig=go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user impersonal-model accuracy (30-sample run) and the
# inverse of the early/late ratio, plus a least-squares line over the scatter.
# The inverse is computed once instead of three times.
inverse_ratio = 1. / np.array(ratio)

tau, p = stats.kendalltau(impersonal_means, inverse_ratio)

slope, intercept, r_value, p_value, std_err = stats.linregress(impersonal_means, inverse_ratio)

line = slope*np.array(impersonal_means)+intercept

line_trace = go.Scatter(x=impersonal_means,
                        y=line,
                        mode='lines',
                        name='Fit')

impersonal_trace = go.Scatter(x=impersonal_means,
                            y=inverse_ratio,
                            mode="markers")
data=[impersonal_trace, line_trace]

title = 'impersonal Model : tau=%.5f, p=%.5f' % (tau, p)
# Fixed: the axis title contained the pasted expression "(1. / np.array(ratio))";
# the plotted quantity is the inverse of "labels < 1 hour to labels > 1 hour".
layout=go.Layout(yaxis=dict(title='Ratio of labels > 1 hour to labels < 1 hour'),
                 xaxis=dict(title='Accuracy', range=[0,1]),
                 title=title)
fig=go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user hybrid-model accuracy (30-sample run) and the
# inverse of the early/late ratio, plus a least-squares line over the scatter.
# The inverse is computed once instead of three times.
inverse_ratio = 1. / np.array(ratio)

tau, p = stats.kendalltau(hybrid_means, inverse_ratio)

slope, intercept, r_value, p_value, std_err = stats.linregress(hybrid_means, inverse_ratio)

line = slope*np.array(hybrid_means)+intercept

line_trace = go.Scatter(x=hybrid_means,
                        y=line,
                        mode='lines',
                        name='Fit')

hybrid_trace = go.Scatter(x=hybrid_means,
                            y=inverse_ratio,
                            mode="markers")
data=[hybrid_trace, line_trace]

title = 'hybrid Model : tau=%.5f, p=%.5f' % (tau, p)
# Fixed: the axis title contained the pasted expression "(1. / np.array(ratio))";
# the plotted quantity is the inverse of "labels < 1 hour to labels > 1 hour".
layout=go.Layout(yaxis=dict(title='Ratio of labels > 1 hour to labels < 1 hour'),
                 xaxis=dict(title='Accuracy', range=[0,1]),
                 title=title)
fig=go.Figure(data=data, layout=layout)

iplot(fig)

### Time Deltas

In [None]:
# Kendall's tau between per-user personal-model accuracy and the time deltas,
# plus a least-squares line (time delta regressed on accuracy) over the scatter.
# NOTE(review): personal_means and tds must cover the same users in the same order —
# confirm tds was built against the same results frame as the means.
tau, p = stats.kendalltau(personal_means, tds)
slope, intercept, r_value, p_value, std_err = stats.linregress(personal_means, tds)
line = slope * np.array(personal_means) + intercept

personal_trace = go.Scatter(x=personal_means, y=tds, mode="markers")
line_trace = go.Scatter(x=personal_means, y=line, mode='lines', name='Fit')
data = [personal_trace, line_trace]

title = 'personal Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Time Deltas'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user impersonal-model accuracy and the time deltas,
# plus a least-squares line (time delta regressed on accuracy) over the scatter.
# NOTE(review): impersonal_means and tds must cover the same users in the same order —
# confirm tds was built against the same results frame as the means.
tau, p = stats.kendalltau(impersonal_means, tds)
slope, intercept, r_value, p_value, std_err = stats.linregress(impersonal_means, tds)
line = slope * np.array(impersonal_means) + intercept

impersonal_trace = go.Scatter(x=impersonal_means, y=tds, mode="markers")
line_trace = go.Scatter(x=impersonal_means, y=line, mode='lines', name='Fit')
data = [impersonal_trace, line_trace]

title = 'impersonal Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Time Deltas'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)

In [None]:
# Kendall's tau between per-user hybrid-model accuracy and the time deltas,
# plus a least-squares line (time delta regressed on accuracy) over the scatter.
# NOTE(review): hybrid_means and tds must cover the same users in the same order —
# confirm tds was built against the same results frame as the means.
tau, p = stats.kendalltau(hybrid_means, tds)
slope, intercept, r_value, p_value, std_err = stats.linregress(hybrid_means, tds)
line = slope * np.array(hybrid_means) + intercept

hybrid_trace = go.Scatter(x=hybrid_means, y=tds, mode="markers")
line_trace = go.Scatter(x=hybrid_means, y=line, mode='lines', name='Fit')
data = [hybrid_trace, line_trace]

title = 'hybrid Model : tau=%.5f, p=%.5f' % (tau, p)
layout = go.Layout(
    title=title,
    xaxis=dict(title='Accuracy', range=[0,1]),
    yaxis=dict(title='Time Deltas'),
)
fig = go.Figure(data=data, layout=layout)

iplot(fig)