## This notebook gives a general overview of the WISDM datasets

In [1]:
from wisdm import wisdm

In [2]:
# Offline-mode plotly helpers; iplot is what the plotting cells in this notebook call.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable inline plotly rendering. Per plotly's offline-mode docs, connected=True
# loads plotly.js from a CDN instead of embedding it, so rendering needs network access.
init_notebook_mode(connected=True)
# NOTE(review): wildcard import; no cell visible in this notebook appears to use a
# name from it — confirm before relying on or removing it.
from plotly.graph_objs import *

In [80]:
# Call set_data with its default arguments (no version / make_compatible given).
# Presumably this (re)populates wisdm.data_df and wisdm.user_ids with the v1.1
# data, per the section heading below — confirm in wisdm.set_data.
wisdm.set_data()

# Total data for v1.1

In [81]:
from collections import Counter

In [83]:
# Local import: plotly.figure_factory is otherwise first imported in a later cell,
# so on a fresh kernel (Restart & Run All) this cell failed with NameError on `ff`.
import plotly.figure_factory as ff

# Work on the frame returned by wisdm.remove_all_nan, then count its rows per user.
data_df = wisdm.remove_all_nan(wisdm.data_df)

labeled_data_counter = Counter(data_df['user'])

# Single trace: the per-user row counts, binned in steps of 2.
fig = ff.create_distplot([list(labeled_data_counter.values())], ["data size"], bin_size=2)
# Limit the x-axis to the 0-300 range.
fig['layout'].update(xaxis=dict(range=[0,300]))

iplot(fig, filename="Distribution of class prevalence by user")

# Understanding the distribution of class prevalence by participant

## V1.1

In [36]:
import plotly.figure_factory as ff

In [37]:
from collections import Counter

In [51]:
# Class labels are stored as bytes in the 'class' column; decode them once for display.
labels = [raw_label.decode('utf-8') for raw_label in wisdm.data_df['class'].unique()]

# Start every (user, label) pair at 0 so a user with no rows for a label still
# contributes a zero to that label's distribution.
user_class = {user_id: dict.fromkeys(labels, 0) for user_id in wisdm.user_ids}

# Iterate the two columns directly rather than via DataFrame.iterrows(), which
# builds a Series for every row; the resulting counts are the same.
for user_id, raw_label in zip(wisdm.data_df['user'], wisdm.data_df['class']):
    user_class[user_id][raw_label.decode('utf-8')] += 1

# One list per label, holding that label's count for each user in wisdm.user_ids order.
dist_data = [[user_class[user_id][label] for user_id in wisdm.user_ids]
             for label in labels]

fig = ff.create_distplot(dist_data, labels, bin_size=2)
iplot(fig, filename="Distribution of class prevalence by user")

# V1.1 Made Compatible With V2.0

In [76]:
# Call set_data again with make_compatible=True (version left at its default).
# Presumably this remaps the v1.1 data to match v2.0, per the section heading —
# confirm in wisdm.set_data.
wisdm.set_data(make_compatible=True)

In [53]:
# Class labels are stored as bytes in the 'class' column; decode them once for display.
labels = [raw_label.decode('utf-8') for raw_label in wisdm.data_df['class'].unique()]

# Start every (user, label) pair at 0 so a user with no rows for a label still
# contributes a zero to that label's distribution.
user_class = {user_id: dict.fromkeys(labels, 0) for user_id in wisdm.user_ids}

# Iterate the two columns directly rather than via DataFrame.iterrows(), which
# builds a Series for every row; the resulting counts are the same.
for user_id, raw_label in zip(wisdm.data_df['user'], wisdm.data_df['class']):
    user_class[user_id][raw_label.decode('utf-8')] += 1

# One list per label, holding that label's count for each user in wisdm.user_ids order.
dist_data = [[user_class[user_id][label] for user_id in wisdm.user_ids]
             for label in labels]

fig = ff.create_distplot(dist_data, labels, bin_size=2)
iplot(fig, filename="Distribution of class prevalence by user")

# V2.0 Dataset

In [85]:
# Call set_data with version="2"; presumably switches wisdm.data_df / wisdm.user_ids
# to the v2.0 dataset, per the section heading — confirm in wisdm.set_data.
wisdm.set_data(version="2")

In [87]:
# Frame returned by wisdm.remove_all_nan for the currently loaded dataset.
data_df = wisdm.remove_all_nan(wisdm.data_df)

# Row count per user, then the bare counts as the single distplot trace.
labeled_data_counter = Counter(data_df['user'])
rows_per_user = [n_rows for n_rows in labeled_data_counter.values()]

fig = ff.create_distplot([rows_per_user], ["data size"], bin_size=2)

# Limit the x-axis to the 0-300 range before rendering.
x_axis_settings = {"range": [0, 300]}
fig['layout'].update(xaxis=x_axis_settings)
iplot(fig, filename="Distribution of class prevalence by user")

In [58]:
# Class labels are stored as bytes in the 'class' column; decode them once for display.
labels = [raw_label.decode('utf-8') for raw_label in wisdm.data_df['class'].unique()]

# Start every (user, label) pair at 0 so a user with no rows for a label still
# contributes a zero to that label's distribution.
user_class = {user_id: dict.fromkeys(labels, 0) for user_id in wisdm.user_ids}

# Iterate the two columns directly rather than via DataFrame.iterrows(), which
# builds a Series for every row; the resulting counts are the same.
for user_id, raw_label in zip(wisdm.data_df['user'], wisdm.data_df['class']):
    user_class[user_id][raw_label.decode('utf-8')] += 1

# One list per label, holding that label's count for each user in wisdm.user_ids order.
dist_data = [[user_class[user_id][label] for user_id in wisdm.user_ids]
             for label in labels]

fig = ff.create_distplot(dist_data, labels, bin_size=2)
# Limit the x-axis to the 0-100 range.
fig['layout'].update(xaxis=dict(range=[0,100]))
iplot(fig, filename="Distribution of class prevalence by user")

## V2.0 dataset made compatible

In [59]:
# Call set_data with version="2" and make_compatible=True. Presumably loads the
# v2.0 data with its labels remapped for cross-version comparison — confirm in
# wisdm.set_data.
wisdm.set_data(version="2", make_compatible=True)

In [60]:
# Class labels are stored as bytes in the 'class' column; decode them once for display.
labels = [raw_label.decode('utf-8') for raw_label in wisdm.data_df['class'].unique()]

# Start every (user, label) pair at 0 so a user with no rows for a label still
# contributes a zero to that label's distribution.
user_class = {user_id: dict.fromkeys(labels, 0) for user_id in wisdm.user_ids}

# Iterate the two columns directly rather than via DataFrame.iterrows(), which
# builds a Series for every row; the resulting counts are the same.
for user_id, raw_label in zip(wisdm.data_df['user'], wisdm.data_df['class']):
    user_class[user_id][raw_label.decode('utf-8')] += 1

# One list per label, holding that label's count for each user in wisdm.user_ids order.
dist_data = [[user_class[user_id][label] for user_id in wisdm.user_ids]
             for label in labels]

fig = ff.create_distplot(dist_data, labels, bin_size=2)
# Limit the x-axis to the 0-100 range.
fig['layout'].update(xaxis=dict(range=[0,100]))
iplot(fig, filename="Distribution of class prevalence by user")

# Which users have the least data, and are they causing problems for training runs?

In [88]:
# Call set_data with its default arguments again, after the version="2" runs above.
# Presumably this switches wisdm.data_df back to the default (v1.1) dataset —
# confirm in wisdm.set_data.
wisdm.set_data()

In [110]:
# remove nan rows
data_df = wisdm.remove_all_nan(wisdm.data_df)

labeled_data_counter = Counter(data_df['user'])

# (user, count) pairs ordered by descending count; ties keep the Counter's
# insertion order, which is the same order the explicit sorted(...) call produced.
ranked_users = labeled_data_counter.most_common()

# Keep the parallel tuples available; fall back to empty tuples when no user
# remains, where unpacking zip(*[]) would raise ValueError.
sorted_user_ids, sorted_counts = zip(*ranked_users) if ranked_users else ((), ())

# Show at most 10 users per list. Clamping avoids an IndexError (and negative
# indices wrapping around) when fewer than 10 users are present.
n_shown = min(10, len(ranked_users))

print("Users with most labeled data : ")
for user_id, count in ranked_users[:n_shown]:
    print("\t%s : %s" % (user_id, count))

print("\n")

print("Users with least labeled data : ")
for user_id, count in ranked_users[len(ranked_users) - n_shown:]:
    print("\t%s : %s" % (user_id, count))

Users with most labeled data : 
	31 : 209
	19 : 194
	14 : 191
	20 : 189
	29 : 178
	13 : 175
	10 : 170
	34 : 168
	26 : 165
	27 : 164


Users with least labeled data : 
	22 : 113
	30 : 112
	2 : 109
	21 : 107
	28 : 107
	17 : 104
	16 : 102
	25 : 66
	9 : 64
	4 : 52


In [111]:
# Call set_data with version="2"; presumably switches wisdm.data_df to the v2.0
# dataset for the listing below — confirm in wisdm.set_data.
wisdm.set_data(version="2")

In [112]:
# remove nan rows
data_df = wisdm.remove_all_nan(wisdm.data_df)

labeled_data_counter = Counter(data_df['user'])

# (user, count) pairs ordered by descending count; ties keep the Counter's
# insertion order, which is the same order the explicit sorted(...) call produced.
ranked_users = labeled_data_counter.most_common()

# Keep the parallel tuples available; fall back to empty tuples when no user
# remains, where unpacking zip(*[]) would raise ValueError.
sorted_user_ids, sorted_counts = zip(*ranked_users) if ranked_users else ((), ())

# Show at most 10 users per list. Clamping avoids an IndexError (and negative
# indices wrapping around) when fewer than 10 users are present.
n_shown = min(10, len(ranked_users))

print("Users with most labeled data : ")
for user_id, count in ranked_users[:n_shown]:
    print("\t%s : %s" % (user_id, count))

print("\n")

print("Users with least labeled data : ")
for user_id, count in ranked_users[len(ranked_users) - n_shown:]:
    print("\t%s : %s" % (user_id, count))

Users with most labeled data : 
	1603 : 637
	1319 : 571
	1246 : 565
	998 : 311
	1238 : 264
	1512 : 233
	1477 : 232
	1104 : 218
	1775 : 186
	1799 : 184


Users with least labeled data : 
	1724 : 5
	1679 : 5
	1491 : 4
	1276 : 3
	1802 : 3
	1757 : 3
	1813 : 2
	1763 : 1
	1480 : 1
	1269 : 1
