In [18]:
# Feature engineering 
import pandas as pd
import numpy as np
import scipy as sp

# Machine learning
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Charts
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline


# Static params
DATA_FOLDER = 'Data/'
DATA_FILE = 'raw_data_fixed.txt'


class ActitrackerLR:
    ''' Logistic Regression models
        one for each class
    '''
    @staticmethod
    def train_models(X_train, Y_train, model_params):
        ''' Train models iteratively 
            for each class
        '''
        models = [] 
        for i in xrange(Y_train.shape[1]):
            model = LogisticRegression(**model_params)
            y = Y_train[:,i]
            model.fit(X_train, y)
            models.append(model)
        return models
    
    @staticmethod
    def make_predictions(X_test, models, num_classes=6):
        ''' Make predictions 
            for each class 
        '''
        predictions = np.zeros((X_test.shape[0], num_classes))
        for i, model in enumerate(models):
            p = model.predict_proba(X_test)
            predictions[:,i] = p[:,1]
        return predictions


def load_data():
    global actitracker
    actitracker = pd.read_csv(
        DATA_FOLDER+DATA_FILE ,
        sep=',' ,
        lineterminator=';' ,
        header=None ,
    )
    actitracker.columns = [
        'user' ,
        'activity' ,
        'timestamp' ,
        'x-accel' ,
        'y-accel' ,
        'z-accel' ,
        'NA' ,
    ]
    del actitracker['NA']


def create_sessions():
    global actitracker
    # re-calculate time in seconds
    actitracker['time_seconds'] = actitracker['timestamp']*10e-9

    # sort by user and time 
    actitracker = actitracker.sort_values(by=['user','time_seconds'])

    # create sessions
    session_length = 100
    actitracker['seq'] = xrange(actitracker.shape[0])
    actitracker['session'] = actitracker.\
                               groupby(['user','activity'])['seq'].\
                               apply(lambda x: x%session_length == 0).\
                               fillna(0).cumsum()


def gather_target_vars():
    global label_lookup
    # get session_labels 
    ohe = OneHotEncoder(sparse=False); le = LabelEncoder()
    labels = actitracker.groupby(['user','session'])['activity'].apply(lambda x: max(x))
    le_labels = le.fit_transform(labels)
    ohe_labels = ohe.fit_transform(le_labels.reshape(-1,1))
    label_lookup = { k: v for k,v in set((i, v) for i,v in np.vstack((le_labels,labels)).T) }
    
    # create target variables
    Y = pd.DataFrame(ohe_labels,index=labels.index)
    return Y,labels


get_label = np.vectorize(lambda x: label_lookup[x])


def feature_engineering():
    # group by user and session
    accel_cols = ['x-accel','y-accel','z-accel']
    g = actitracker.loc[:,accel_cols+['user','session']].groupby(['user','session'])

    # IQR function
    def iqr(x):
        ''' calculate IQR from array
        '''
        q75, q25 = np.percentile(x, [75,25])
        return q75-q25

    # calculate model cols 
    means = g[accel_cols].apply(lambda x: np.mean(x))
    sds = g[accel_cols].apply(lambda x: np.std(x))
    median_1 = g[accel_cols[0]].apply(lambda x: np.median(x))
    median_2 = g[accel_cols[1]].apply(lambda x: np.median(x))
    median_3 = g[accel_cols[2]].apply(lambda x: np.median(x))
    iqr_1 = g[accel_cols[0]].apply(lambda x: iqr(x))
    iqr_2 = g[accel_cols[1]].apply(lambda x: iqr(x))
    iqr_3 = g[accel_cols[2]].apply(lambda x: iqr(x))
    mins = g[accel_cols].apply(lambda x: np.min(x))
    maxs = g[accel_cols].apply(lambda x: np.max(x))
    kurtosis_1 = g[accel_cols[0]].apply(lambda x: sp.stats.kurtosis(x))
    kurtosis_2 = g[accel_cols[1]].apply(lambda x: sp.stats.kurtosis(x))
    kurtosis_3 = g[accel_cols[2]].apply(lambda x: sp.stats.kurtosis(x))
    skew_1 = g[accel_cols[0]].apply(lambda x: sp.stats.skew(x))
    skew_2 = g[accel_cols[1]].apply(lambda x: sp.stats.skew(x))
    skew_3 = g[accel_cols[2]].apply(lambda x: sp.stats.skew(x))
    percentiles = []
    for i in range(10,100,10):
        for e in range(1,4):
            percentiles.append(eval('g[accel_cols['+str(e-1)+']].apply(lambda x: sp.percentile(x,'+str(i)+'))'))

    # concat columns
    X = pd.concat([means,
                    sds,
                   median_1,
                   median_2,
                   median_3,
                   iqr_1,
                   iqr_2,
                   iqr_3,
                   mins,
                   maxs,
                   kurtosis_1,
                   kurtosis_2,
                   kurtosis_3,
                   skew_1,
                   skew_2,
                   skew_3,
                  ]+percentiles
                  ,axis=1)

    # Scale data
    ss = StandardScaler()
    X = ss.fit_transform(X)
    return X


def lr_evaluate_params(c_values):
    accuracies = []
    log_losses = []
    for c in c_values:
        params = {'C':c,'max_iter':1000,'tol':1e-8}
        models = lrmodel.train_models(X_train, Y_train, params)
        predictions = lrmodel.make_predictions(X_test, models, 6)
        accuracy = accuracy_score(np.argmax(Y_test, axis=1), np.argmax(predictions,axis=1))
        ll = log_loss(Y_test, predictions)
        accuracies.append(accuracy)
        log_losses.append(ll)
    evaluation = pd.DataFrame({'C':c_values,'accuracy':accuracies,'log_loss':log_losses})
    print evaluation
    return evaluation


def lr_param_charts(c_values, accuracies, log_losses):
    plt.figure(figsize=(14, 4))
    plt.subplot(1,2,1)
    plt.plot(np.log(c_values), accuracies, 'g')
    plt.title("Change in Accuracy with Decreasing Regularization")
    plt.xlabel("Log of Inv. Regularization Strength (C)")
    plt.ylabel("Accuracy")
    plt.subplot(1,2,2)
    plt.plot(np.log(c_values), log_losses, 'b')
    plt.title("Change in Log-loss with Decreasing Regularization")
    plt.xlabel("Log of Inv. Regularization Strength (C)")
    plt.ylabel("Log-loss")


def print_accuracy(true_category, pred_category):
    print 'Accuracy: {}'.format(accuracy_score(true_category, pred_category ))
    print 'Log-loss: {}'.format(log_loss(Y_test, predictions))


def analyze_errors(true_category, pred_category, get_label=get_label):
    errors = pred_category != true_category
    true_labels = get_label(true_category)
    base = np.ones(true_labels.shape)
    error_base = pd.DataFrame({'errors':errors,
                      'labels':true_labels,
                      'base':base})
    b = error_base.groupby('labels').sum()
    error_rates = pd.DataFrame((b['errors']/b['base']).sort_values(ascending=False), columns=['Error Rate'])
    error_rates['Total Session Counts'] = b['base']
    return error_rates


def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    classes = label_lookup.values()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [19]:
load_data()
create_sessions()
Y, labels = gather_target_vars()
X = feature_engineering()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y.as_matrix(), test_size=0.33, random_state=22)

In [21]:
labels.unique

<bound method Series.unique of user  session
1     1           Walking
      2           Walking
      3           Walking
      4           Walking
      5           Walking
      6           Walking
      7           Walking
      8           Walking
      9           Walking
      10          Walking
      11          Walking
      12          Walking
      13          Walking
      14          Walking
      15          Walking
      16          Walking
      17          Walking
      18          Walking
      19          Walking
      20          Walking
      21          Walking
      22          Walking
      23          Walking
      24          Walking
      25          Walking
      26          Walking
      27          Walking
      28          Walking
      29          Walking
      30          Walking
                   ...   
36    10954       Sitting
      10955       Sitting
      10956       Sitting
      10957       Sitting
      10958       Sitting
      10959       S

In [17]:
Y

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5
user,session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0,0,0,0,0,1
1,2,0,0,0,0,0,1
1,3,0,0,0,0,0,1
1,4,0,0,0,0,0,1
1,5,0,0,0,0,0,1
1,6,0,0,0,0,0,1
1,7,0,0,0,0,0,1
1,8,0,0,0,0,0,1
1,9,0,0,0,0,0,1
1,10,0,0,0,0,0,1


In [14]:
X_train[0]

array([ 1.4003247 , -0.3805544 , -0.17791853,  0.13834895, -0.67707807,
        0.1862355 ,  1.45689209, -0.45677004, -0.487276  ,  0.14323398,
       -0.67898883, -0.37901384,  0.80240762,  0.50062263,  0.08050633,
        1.41200524,  0.04279476,  0.8866245 , -0.16974649,  0.25476395,
        1.32464603,  0.19608871,  1.25351932,  1.9168293 ,  0.90290446,
        0.21110376,  0.13234236,  1.02353982,  0.09213821, -0.03568492,
        1.24188142, -0.09534324, -0.15783303,  1.3654427 , -0.27739236,
       -0.31567319,  1.45689209, -0.45677004, -0.487276  ,  1.43321083,
       -0.67439332, -0.65683225,  1.31815458, -0.72055132, -0.68426837,
        1.07560947, -0.72491893, -0.59336155,  1.09926409, -0.69184339,
       -0.47611452])

In [16]:
Y_train[0]

array([ 0.,  0.,  0.,  0.,  0.,  1.])

In [4]:
Y_train.shape

(7382, 6)

In [6]:
import tensorflow as tf
import shutil
import os.path

# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 100
display_step = 10

g = tf.Graph()

with g.as_default():
    # Create the model
    
    # tf Graph Input
    x = tf.placeholder(tf.float32, [None, 51]) # 3 inputs
    y = tf.placeholder(tf.float32, [None, 6]) # 6 classes

    # Set model weights
    W = tf.Variable(tf.zeros([51, 6]))
    b = tf.Variable(tf.zeros([6]))

    # Construct model
    pred = tf.nn.softmax(tf.matmul(x, W) + b) # Softmax

    # Minimize error using cross entropy
    cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
    # Gradient Descent
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

    # Initializing the variables
    init = tf.initialize_all_variables()
    
    
    sess = tf.Session()

    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        _, c = sess.run([optimizer, cost], feed_dict={x: X_train,y: Y_train})
        
        # Compute average loss
        #avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c)

    print "Optimization Finished!"

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy for 3000 examples
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print "Accuracy:", accuracy.eval({x: X_test, y: Y_test}, sess)

# Store variable
_W = W.eval(sess)
_b = b.eval(sess)


sess.close()

#Create new graph for exporting
g_2 = tf.Graph()
with g_2.as_default():
    # Reconstruct graph
    x_2 = tf.placeholder("float", [None, 51], name="input")
    W_2 = tf.constant(_W, name="constant_W")
    b_2 = tf.constant(_b, name="constant_b")
    y_2 = tf.nn.softmax(tf.matmul(x_2, W_2) + b_2, name="output")

    sess_2 = tf.Session()

    init_2 = tf.initialize_all_variables();
    sess_2.run(init_2)

    
    graph_def = g_2.as_graph_def()
    
    tf.train.write_graph(graph_def, 'Models','activityModelLR.pb', as_text=False)

    # Test trained model
    y__2 = tf.placeholder("float", [None, 6])
    correct_prediction_2 = tf.equal(tf.argmax(y_2, 1), tf.argmax(y__2, 1))
    accuracy_2 = tf.reduce_mean(tf.cast(correct_prediction_2, "float"))
    print(accuracy_2.eval({x_2: X_test, y__2: Y_test}, sess_2))

Epoch: 0010 cost= 1.565006971
Epoch: 0020 cost= 1.402346969
Epoch: 0030 cost= 1.296683788
Epoch: 0040 cost= 1.222968221
Epoch: 0050 cost= 1.168118477
Epoch: 0060 cost= 1.125343323
Epoch: 0070 cost= 1.090854168
Epoch: 0080 cost= 1.062356591
Epoch: 0090 cost= 1.038365960
Epoch: 0100 cost= 1.017859697
Epoch: 0110 cost= 1.000106812
Epoch: 0120 cost= 0.984566748
Epoch: 0130 cost= 0.970830500
Epoch: 0140 cost= 0.958582640
Epoch: 0150 cost= 0.947575808
Epoch: 0160 cost= 0.937612474
Epoch: 0170 cost= 0.928534746
Epoch: 0180 cost= 0.920215905
Epoch: 0190 cost= 0.912549317
Epoch: 0200 cost= 0.905449390
Epoch: 0210 cost= 0.898843169
Epoch: 0220 cost= 0.892671406
Epoch: 0230 cost= 0.886881351
Epoch: 0240 cost= 0.881430387
Epoch: 0250 cost= 0.876282036
Epoch: 0260 cost= 0.871404231
Epoch: 0270 cost= 0.866769373
Epoch: 0280 cost= 0.862355173
Epoch: 0290 cost= 0.858139038
Epoch: 0300 cost= 0.854103982
Epoch: 0310 cost= 0.850233912
Epoch: 0320 cost= 0.846514046
Epoch: 0330 cost= 0.842933714
Epoch: 034

In [26]:
import tensorflow as tf
import shutil
import os.path


# Parameters
learning_rate = 0.001
training_epochs = 2000
batch_size = 500
display_step = 5

# Network Parameters
n_hidden_1 = 200 # 1st layer number of features
n_hidden_2 = 200 # 2nd layer number of features
n_input = 51 # Number of inputs
n_classes = 6 # Number of classes

g = tf.Graph()
with g.as_default():
    # model inputs
    x = tf.placeholder("float", shape=[None, n_input])
    y = tf.placeholder("float", shape=[None, n_classes])
    
    # set model weights
    W_h1 = tf.Variable(tf.random_normal([n_input, n_hidden_1]))
    W_h2 = tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2]))
    W_out = tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
    
    # set model biases
    b1 = tf.Variable(tf.random_normal([n_hidden_1]))
    b2 = tf.Variable(tf.random_normal([n_hidden_2]))
    b_out = tf.Variable(tf.random_normal([n_classes]))
    
    # Construct Model
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, W_h1), b1)
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, W_h2), b2)
    layer_2 = tf.nn.relu(layer_2)
    # Output layer with linear activation
    pred = tf.matmul(layer_2, W_out) + b_out
    
    # Define loss and optimizer
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    
    # Initializing the variables
    init = tf.initialize_all_variables()
    
    sess = tf.Session()

    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        _, c = sess.run([optimizer, cost], feed_dict={x: X_train,y: Y_train})
        
        # Compute average loss
        #avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print "Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c)

    print "Optimization Finished!"

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy for 3000 examples
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print "Accuracy:", accuracy.eval({x: X_test, y: Y_test}, sess)

# Store Variable
_W_h1 = W_h1.eval(sess)
_W_h2 = W_h2.eval(sess)
_W_out =W_out.eval(sess)

_b1 = b1.eval(sess)
_b2 = b2.eval(sess)
_b_out = b_out.eval(sess)

sess.close()

# create a new graph for exporting
g_2 = tf.Graph()
with g_2.as_default():
    # Reconstruct Graph
    # model inputs
    x_2 = tf.placeholder("float", shape=[None, n_input], name="input")
    
    
    # set model weights
    W_2_h1 = tf.constant(_W_h1, name="constant_W_h1")
    W_2_h2 = tf.constant(_W_h2, name="constant_W_h2")
    W_2_out = tf.constant(_W_out, name="constant_W_out")
    
    # set model biases
    b_2_1 = tf.constant(_b1, name="constant_b1")
    b_2_2 = tf.constant(_b2, name="constant_b2")
    b_2_out = tf.constant(_b_out, name="constant_b_out")
    
    # Construct Model
    # Hidden layer with RELU activation
    layer_2_1 = tf.add(tf.matmul(x_2, W_2_h1), b_2_1)
    layer_2_1 = tf.nn.relu(layer_2_1)
    # Hidden layer with RELU activation
    layer_2_2 = tf.add(tf.matmul(layer_2_1, W_2_h2), b_2_2)
    layer_2_2 = tf.nn.relu(layer_2_2)
    
    # Output layer with linear activation
    y_2 = tf.nn.bias_add(tf.matmul(layer_2_2, W_2_out), b_2_out, name="output")
    
    #y_2.name = "output"
    
    sess_2 = tf.Session()

    init_2 = tf.initialize_all_variables();
    sess_2.run(init_2)

    
    graph_def = g_2.as_graph_def()
    
    tf.train.write_graph(graph_def, 'Models','activityModelMLP2.pb', as_text=False)

    # Test trained model
    y__2 = tf.placeholder("float", [None, 6])
    correct_prediction_2 = tf.equal(tf.argmax(y_2, 1), tf.argmax(y__2, 1))
    accuracy_2 = tf.reduce_mean(tf.cast(correct_prediction_2, "float"))
    print(accuracy_2.eval({x_2: X_test, y__2: Y_test}, sess_2))

    



Epoch: 0005 cost= 396.260162354
Epoch: 0010 cost= 276.588134766
Epoch: 0015 cost= 215.371566772
Epoch: 0020 cost= 165.162765503
Epoch: 0025 cost= 124.462135315
Epoch: 0030 cost= 103.507209778
Epoch: 0035 cost= 88.064643860
Epoch: 0040 cost= 77.016250610
Epoch: 0045 cost= 70.202415466
Epoch: 0050 cost= 64.329627991
Epoch: 0055 cost= 59.109596252
Epoch: 0060 cost= 54.552120209
Epoch: 0065 cost= 50.528736115
Epoch: 0070 cost= 46.903934479
Epoch: 0075 cost= 43.589862823
Epoch: 0080 cost= 40.719127655
Epoch: 0085 cost= 38.154396057
Epoch: 0090 cost= 35.920024872
Epoch: 0095 cost= 34.145431519
Epoch: 0100 cost= 32.502452850
Epoch: 0105 cost= 30.978788376
Epoch: 0110 cost= 29.559520721
Epoch: 0115 cost= 28.282278061
Epoch: 0120 cost= 27.093397141
Epoch: 0125 cost= 25.995040894
Epoch: 0130 cost= 25.040588379
Epoch: 0135 cost= 24.146463394
Epoch: 0140 cost= 23.320995331
Epoch: 0145 cost= 22.508140564
Epoch: 0150 cost= 21.761072159
Epoch: 0155 cost= 21.017499924
Epoch: 0160 cost= 20.350358963
Ep