# CPSC 599.83 - Project 1.2 - Interactive Graphs

*Tyler Gillson - 10170105*

In [55]:
# Render a script plus a toggle link that hides or shows the notebook's code
# inputs (elements matching `div.input`); they are hidden once the page is ready.
# NOTE(review): the script calls `$`, so it presumably relies on the notebook
# page providing jQuery — confirm for the front end in use.
from IPython.display import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<span style="color:red">The raw code for this notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.</span>''')

In [56]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import sklearn.metrics as skm
from sklearn import preprocessing

import plotly.figure_factory as ff
import plotly.graph_objs as go

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='white', color_codes=True, font_scale=1.3)

In [57]:
# Limit DataFrame rendering to at most 65 columns and 20 rows.
pd.set_option('display.max_columns', 65)
pd.set_option('display.max_rows', 20)

In [58]:
# Load the inputs: the tab-separated feature table produced in project 1.1,
# and the index file (read with the default comma separator).
all_data = pd.read_csv('transformed_data.csv', sep='\t')
dac_index = pd.read_csv('index.csv')

In [59]:
# The 'row_sum' column is not used in this notebook; remove it.
all_data = all_data.drop('row_sum', axis=1)

In [60]:
# Row-wise mean of each feature group, stored as a new '<group>_avg' column.
# The speed slices take every second column between their end labels.
err_avg = all_data.loc[:, 'error1':'error10'].mean(axis='columns')
all_data['error_avg'] = err_avg

sx_avg = all_data.loc[:, 'speedx1':'speedx10':2].mean(axis='columns')
all_data['speedx_avg'] = sx_avg

sy_avg = all_data.loc[:, 'speedy1':'speedy10':2].mean(axis='columns')
all_data['speedy_avg'] = sy_avg

p_avg = all_data.loc[:, 'pres1':'pres10'].mean(axis='columns')
all_data['pressure_avg'] = p_avg

ev_avg = all_data.loc[:, 'events1':'events10'].mean(axis='columns')
all_data['events_avg'] = ev_avg

o_avg = all_data.loc[:, 'orient1':'orient10'].mean(axis='columns')
all_data['orientation_avg'] = o_avg

In [61]:
import math

def reduce_features(df):
    """Collapse each speedx/speedy pair into a single speed magnitude.

    For every sample ``i`` in 1..10 the magnitude
    ``sqrt(speedx<i>**2 + speedy<i>**2)`` is stored in a new column ``spz<i>``.
    Every column from ``speedx1`` through ``speedy10`` (inclusive label slice)
    is then dropped, and the ten ``spz`` columns are placed in front of the
    remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``speedx1..speedx10`` and ``speedy1..speedy10``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``df``. ``df`` itself is
        not modified.
    """
    sample_numbers = range(1, 11)
    x_cols = ['speedx' + str(i) for i in sample_numbers]
    y_cols = ['speedy' + str(i) for i in sample_numbers]
    z_cols = ['spz' + str(i) for i in sample_numbers]

    # Vectorised over the whole frame instead of iterating row by row.
    magnitudes = np.hypot(df[x_cols].values.astype(float),
                          df[y_cols].values.astype(float))

    # Reuse df's index so the column-wise concat below lines rows up one-to-one
    # even when df does not carry a default 0..n-1 index.
    all_zvectors_df = pd.DataFrame(magnitudes, columns=z_cols, index=df.index)

    remaining = df.drop(columns=df.loc[:, 'speedx1':'speedy10'].columns)
    return pd.concat([all_zvectors_df, remaining], axis=1)

In [62]:
# Swap the speedx/speedy columns for their combined magnitudes (spz1..spz10).
all_data2 = reduce_features(all_data)

In [63]:
# Re-order all_data2: ID first, then the error columns, the speed magnitudes,
# and finally everything from 'pres1' through 'orient10'.
all_data2 = pd.concat(
    [
        all_data2['ID'],
        all_data2.loc[:, 'error1':'error10'],
        all_data2.loc[:, 'spz1':'spz10'],
        all_data2.loc[:, 'pres1':'orient10'],
    ],
    axis=1,
)

In [64]:
# Reduced view of the data: ID, the columns from 'eStart' through 'eRadius',
# and every column from 'error_avg' onwards (the per-group averages).
reduced_data = pd.concat(
    [
        all_data['ID'],
        all_data.loc[:, 'eStart':'eRadius'],
        all_data.loc[:, 'error_avg':],
    ],
    axis=1,
)

In [65]:
# One row per user: average every column within each ID. as_index=False keeps
# ID as a regular column and sort=False leaves the groups in first-seen order.
all_data_grouped = all_data2.groupby('ID', as_index=False, sort=False).mean()

In [66]:
def encode_labels(df, col):
    """Overwrite ``df[col]`` with integer label codes, in place.

    A new ``LabelEncoder`` is fitted on the column's current values and the
    column is replaced by the encoded result. Nothing is returned.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(df[col])
    df[col] = encoder.transform(df[col])

In [67]:
# Turn the ID column into integer codes (it is used below as the line colour value).
encode_labels(all_data_grouped, 'ID')

In [68]:
# Rescale every grouped feature with scikit-learn's RobustScaler. Centering is
# switched off, so values are only scaled, not shifted. ID is left out of the
# scaling and re-attached as the first column afterwards.
rs = preprocessing.RobustScaler(with_centering=False)
data_scaled = rs.fit_transform(all_data_grouped.drop('ID', axis=1))
all_data_scaled = pd.concat(
    [all_data_grouped['ID'], pd.DataFrame(data=data_scaled)],
    axis=1,
)
# Restore the original column names (relies on ID being the first column).
all_data_scaled.columns = all_data_grouped.columns

In [69]:
# Encode the ID column of the scaled frame. It was copied from
# all_data_grouped['ID'], which has already been label-encoded, so this
# presumably leaves the codes unchanged — verify before relying on it.
encode_labels(all_data_scaled, 'ID')

In [70]:
# Set up plotly's offline mode so figures render inside the notebook.
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

### Q2: Which combinations of features and visualization format can best illuminate the overlap between user profiles?

Each of these parallel coordinates plots can be easily updated to include different constraint ranges or column orderings.

To add a new constraint slider, simply copy the constraintrange definition and paste it into any feature's dictionary definition.

Similarly, any plot can be re-ordered by re-arranging its feature dictionary definitions.

#### Raw Data:

In [71]:
# Parallel coordinates plot for all of the unscaled data.
# The one-off features are written out as individual axis dictionaries. Each
# ten-sample feature group shares a single axis range, so its ten axes are
# generated from (label prefix, column prefix, range). To give one generated
# axis its own constraintrange, write that axis out explicitly instead.
data = [
    go.Parcoords(
        line=dict(
            color=all_data_grouped['ID'],
            colorscale='Jet',
            showscale=False,
            reversescale=False,
            cmin=0,
            cmax=200,
        ),
        hoverinfo=all_data_grouped['ID'],
        dimensions=(
            # Other:
            [
                dict(range=[-1, 1], constraintrange=[0, 1],
                     label='D', values=all_data_grouped['direction']),
                dict(range=[2, 24], label='eS', values=all_data_grouped['eStart']),
                dict(range=[5, 18], label='eC', values=all_data_grouped['eCenter']),
                dict(range=[1, 16], label='eR', values=all_data_grouped['eRadius']),
                dict(range=[0.002, 0.032], label='TD', values=all_data_grouped['timediff']),
            ]
            # Error, Speed Z, Pressure, Events, Orients (ten axes each, in that order):
            + [
                dict(range=list(axis_range),
                     label=tag + str(n),
                     values=all_data_grouped[stem + str(n)])
                for tag, stem, axis_range in (
                    ('E', 'error', (3, 26)),
                    ('Z', 'spz', (354, 6530)),
                    ('P', 'pres', (0, 1)),
                    ('V', 'events', (0, 0.9)),
                    ('O', 'orient', (0, 360)),
                )
                for n in range(1, 11)
            ]
        ),
    )
]

py.iplot(data, filename='parcoords_all_raw')

The difference in range across the features made the plot difficult to interpret. Additionally, the overlapping labels were extremely busy and confusing. So I used scikit-learn's RobustScaler to bring all of the features onto a comparable scale. Because centering was disabled (`with_centering=False`), each feature's median was left in place and its values were only divided by the feature's interquartile range. This linear rescaling preserves the shape of each feature's underlying statistical distribution while putting the features on a common scale.

#### Scaled Data:

In [72]:
# Parallel coordinates plot for all of the scaled data.
# Every axis uses the same [-1, 10] range. The one-off features are written out
# individually; the ten axes of each feature group are generated from
# (label prefix, column prefix). To give one generated axis its own
# constraintrange, write that axis out explicitly instead.
data = [
    go.Parcoords(
        line=dict(
            color=all_data_scaled['ID'],
            colorscale='Jet',
            showscale=False,
            reversescale=False,
            cmin=0,
            cmax=200,
        ),
        hoverinfo=all_data_scaled['ID'],
        dimensions=(
            # Other:
            [
                dict(range=[-1, 10], constraintrange=[1, 1.5],
                     label='eS', values=all_data_scaled['eStart']),
                dict(range=[-1, 10], label='eC', values=all_data_scaled['eCenter']),
                dict(range=[-1, 10], label='eR', values=all_data_scaled['eRadius']),
                dict(range=[-1, 10], label='D', values=all_data_scaled['direction']),
                dict(range=[-1, 10], label='TD', values=all_data_scaled['timediff']),
            ]
            # Error, Speed Z, Pressure, Events, Orients (ten axes each, in that order):
            + [
                dict(range=[-1, 10],
                     label=tag + str(n),
                     values=all_data_scaled[stem + str(n)])
                for tag, stem in (
                    ('E', 'error'),
                    ('Z', 'spz'),
                    ('P', 'pres'),
                    ('V', 'events'),
                    ('O', 'orient'),
                )
                for n in range(1, 11)
            ]
        ),
    )
]

py.iplot(data, filename='parcoords_all_scaled', image_width=1500, image_height=12)

Normalization of the data allowed for a cleaner presentation, but not necessarily any better insight into the relative density/overlap of users within or across feature groups. So next I decided to plot subsets of the overall dataset.

#### Pressure, Events, and Orientation Feature Groups:

In [73]:
# Parallel coordinates plot for raw pressure, events, and orientation data.
# The first axis of each group carries that group's constraintrange and is
# written out explicitly; samples 2..10 of the group are generated.
data = [
    go.Parcoords(
        line=dict(
            color=all_data_grouped['ID'],
            colorscale='Jet',
            showscale=False,
            reversescale=False,
            cmin=0,
            cmax=200,
        ),
        hoverinfo=all_data_grouped['ID'],
        dimensions=(
            # Pressure:
            [dict(range=[0, 1], constraintrange=[0.1, 0.3],
                  label='P1', values=all_data_grouped['pres1'])]
            + [dict(range=[0, 1], label='P' + str(n),
                    values=all_data_grouped['pres' + str(n)])
               for n in range(2, 11)]
            # Events:
            + [dict(range=[0, 0.9], constraintrange=[0.1, 0.3],
                    label='V1', values=all_data_grouped['events1'])]
            + [dict(range=[0, 0.9], label='V' + str(n),
                    values=all_data_grouped['events' + str(n)])
               for n in range(2, 11)]
            # Orients:
            + [dict(range=[0, 360], constraintrange=[30, 95],
                    label='O1', values=all_data_grouped['orient1'])]
            + [dict(range=[0, 360], label='O' + str(n),
                    values=all_data_grouped['orient' + str(n)])
               for n in range(2, 11)]
        ),
    )
]

py.iplot(data, filename='parcoords_p_ev_o_grouped', image_width=1500, image_height=12)

I decided to compare pressure, event size, and orientation because those feature groups had the least normal distributions (according to the analysis I did in part 1). Therefore I felt that there would be more opportunity for meaningful differentiation of users via filtering those values.

In [74]:
# Re-read the transformed table (this rebinds all_data2) so the separate
# speedx/speedy columns are available, then average per user and encode the IDs.
all_data2 = pd.read_csv('transformed_data.csv', sep='\t')
all_data2_grouped = all_data2.groupby('ID', as_index=False, sort=False).mean()
encode_labels(all_data2_grouped, 'ID')

#### Speed Feature Groups:

In [75]:
# Parallel coordinates plot for the raw per-sample X and Y speeds.
# All twenty axes share the [30, 5850] range; only X1 carries a constraintrange,
# so it is written out explicitly and the remaining axes are generated.
data = [
    go.Parcoords(
        line=dict(
            color=all_data2_grouped['ID'],
            colorscale='Jet',
            showscale=False,
            reversescale=False,
            cmin=0,
            cmax=200,
        ),
        hoverinfo=all_data2_grouped['ID'],
        dimensions=(
            # Speed X:
            [dict(range=[30, 5850], constraintrange=[5000, 5200],
                  label='X1', values=all_data2_grouped['speedx1'])]
            + [dict(range=[30, 5850], label='X' + str(n),
                    values=all_data2_grouped['speedx' + str(n)])
               for n in range(2, 11)]
            # Speed Y:
            + [dict(range=[30, 5850], label='Y' + str(n),
                    values=all_data2_grouped['speedy' + str(n)])
               for n in range(1, 11)]
        ),
    )
]

py.iplot(data, filename='parcoords_speeds_split_raw')

Being very normally distributed, the speed data clearly indicates that the majority of users draw circles at roughly the same speed. It also shows that people tend to draw their circles at a speed which remains consistent for the duration of the drawing event. As such, this data will be useful for differentiating certain outliers, but less useful for differentiating amongst the broader population.

#### Raw Data Averaged by Feature Group:

In [76]:
# Replace the IDs in reduced_data with integer codes (mutates reduced_data in place).
encode_labels(reduced_data, 'ID')

In [77]:
# One row per user: mean of every reduced feature within each (encoded) ID,
# keeping ID as a column and the groups in first-seen order.
reduced_data_grouped = reduced_data.groupby('ID', as_index=False, sort=False).mean()

In [78]:
# Parallel coordinates plot for reduced data:
data = [
    go.Parcoords(
        line = dict(color = reduced_data_grouped['ID'],
                   colorscale = 'Jet',
                   showscale = False,
                   reversescale = False,
                   cmin = 0,
                   cmax = 200),
        hoverinfo = reduced_data['ID'],
        dimensions = list([
            dict(range = [-1,1],
                 constraintrange = [0,0.2],
                 label = 'D', values = reduced_data_grouped['direction']),
            dict(range = [0,360],
                 label = 'oAvg', values = reduced_data_grouped['orientation_avg']),
            dict(range = [0,1],
                 label = 'pAvg', values = reduced_data_grouped['pressure_avg']),
            dict(range = [0,1],
                 label = 'evAvg', values = reduced_data_grouped['events_avg']), 
            dict(range = [2,25],
                 #constraintrange = [6,6.5],
                 label = 'eS', values = reduced_data_grouped['eStart']),
            dict(range = [5,20],
                 label = 'eC', values = reduced_data_grouped['eCenter']),
            dict(range = [1,16],
                 label = 'eR', values = reduced_data_grouped['eRadius']),
            dict(range = [3,20],
                 label = 'errAvg', values = reduced_data_grouped['error_avg']),
            dict(range = [250,3550],
                 label = 'sxAvg', values = reduced_data_grouped['speedx_avg']),
            dict(range = [240,3650],
                 label = 'syAvg', values = reduced_data_grouped['speedy_avg']),
            dict(range = [0,0.032],
                 label = 'TD', values = reduced_data_grouped['timediff']),
        ])
    )
]

py.iplot(data, filename = 'parcoords_all_reduced_grouped')

Ultimately I felt that the above plot was the most useful for assessing the overlap between user profiles. Using all 65 of the raw data features was simply too much for a human to coherently analyse.  After I collapsed each feature group into its average value I was able to obtain a view that contrasted each of the types of data being collected, without being overly busy.

### Q3: How much variance exists within each of the feature groups and unique features after the data has been aggregated by user ID? And how is the variance distributed?

In [79]:
# Enable autoreload in mode 1 and register the crossfilter module with it,
# so edits to crossfilter are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport crossfilter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [80]:
# The star import supplies the crossfilter names used in this notebook:
# load_resources (called here) and Crossfilter (used in later cells).
# NOTE(review): presumably no other names from the module are needed — if so,
# an explicit import list would be clearer; verify before changing.
from crossfilter import *
load_resources()

<IPython.core.display.Javascript object>

In [81]:
# Label each per-user row with its original ID value and drop the encoded ID column.
# NOTE(review): assumes the grouped rows (sort=False, first-seen order) line up
# one-to-one with all_data.ID.unique() — confirm.
reduced_data_grouped.index = all_data.ID.unique()
reduced_data_grouped = reduced_data_grouped.drop(columns='ID')

In [82]:
# Put the original (un-encoded) IDs back so rows can be selected by user name.
reduced_data['ID'] = all_data['ID']

### Individual User Crossfilters

In order to address the second half of my question I decided to use crossfilters. Below are two examples: one from Stan, a user with very average data; and one from Rafat, a user with much more aberrant data.

#### Stan:

In [83]:
# All rows recorded for the user 'stan', shown as a crossfilter over every
# column from 'eStart' onwards.
x = reduced_data[reduced_data['ID'].eq('stan')]
Crossfilter(x.loc[:, 'eStart':], width=310, height=160)

<crossfilter.Crossfilter at 0x110a17400>

#### Rafat:

In [84]:
# All rows recorded for the user 'rafat', shown as a crossfilter over every
# column from 'eStart' onwards.
y = reduced_data[reduced_data['ID'].eq('rafat')]
Crossfilter(y.loc[:, 'eStart':], width=310, height=160)

<crossfilter.Crossfilter at 0x10f91d358>

The Crossfilters make it clear that (a) individual users are not very consistent, and (b) device type is most likely having a large impact on the efficacy of the system. An example of (a) is that Stan draws roughly 50% of his circles in each direction. An example of (b) is that Rafat's phone recorded no pressure and events data at all, whereas Stan's phone did.

#### All usernames:

In [86]:
# List every distinct user ID present in reduced_data.
print(reduced_data.ID.unique())

['A3MC5OA9RXOOFH' 'akrishnan' 'amitkumar' 'anandhuvijayank' 'andrea'
 'antony' 'aprilgrooves' 'aprilgroves' 'Aron' 'Athena' 'audacity'
 'Avrahman' 'azul' 'beccatron' 'bingu' 'bluesea' 'broburns89' 'bt'
 'builder' 'bunny' 'c.96' 'calawampus' 'Casey' 'chakku' 'chrissy'
 'CirclePro' 'cnl5609' 'coco' 'Coggle' 'cookers1' 'COPIEDA5H767UHFV6JC'
 'cshahe1' 'dacram' 'Daniel' 'daniel' 'Daniel123' 'Dax' 'decoy' 'dezziboo'
 'dimple' 'DocNEMS' 'doctao' 'dreddlord' 'Eisenyx' 'Elamado' 'Elamadox'
 'fm114' 'froggymo' 'fuzzy' 'gerdau' 'gracecar' 'hailey' 'Halma' 'Hans'
 'hari' 'Harsh' 'haygoods123' 'hayley9191' 'highzenith' 'Ian' 'ibittyDan'
 'Igna' 'illy' 'indika' 'Indika' 'indreesh' 'jdub' 'jenn' 'jimi' 'jjzay'
 'jm' 'johannasmom' 'john' 'jojo' 'jordan' 'JPadawan11' 'kaliluv562'
 'Kaliluv562' 'karthik' 'khajamoin77' 'Kim' 'klyde' 'Km' 'kmm' 'knmr'
 'kotanikinya' 'krishnas' 'krsmre2002' 'krystaj' 'Krystaj' 'kumar'
 'kurikuri' 'kyo' 'lalyn' 'lambchop522' 'LSA' 'luis' 'Maahi' 'mackey519'
 'manas' 'manda

In [55]:
# Same code-toggle snippet as the notebook's first cell: renders a script and a
# link that hide or show the code inputs (elements matching `div.input`).
from IPython.display import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<span style="color:red">The raw code for this notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.</span>''')