# Importing dependencies

In [1]:
import array
import csv
import math
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dtaidistance import dtw
from fastdtw import fastdtw
from numpy import savetxt
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import MinMaxScaler

# Loading the dataset

In [3]:
# Raw accelerometer samples and the activity-label table for the lab recordings.
dataset = pd.read_csv("../dataset/Training/Lab/bigact_raw_lab_acc.csv")
label = pd.read_csv("../dataset/Training/Lab/labels_lab_2users.csv")

# Select the accelerometer axes by column name rather than by position, so an
# extra leading column (e.g. a saved index such as "Unnamed: 0") cannot shift
# the selection onto the wrong columns.
modified_dataset = dataset[['x', 'y', 'z']].values  # array containing only (x,y,z)
# Second column of the label table without its last row; the label-matching
# loop later in the notebook skips the last label row as well, so the two stay
# the same length.
Y = label.iloc[:-1, 1].values   # array containing only the labels


In [5]:
# Show the first five raw samples.
dataset.head(n=5)

Unnamed: 0,user_id,datetime,x,y,z
0,1,2018-07-25T14:58:46.247+1000,0.612,7.7,0.0
1,1,2018-07-25T14:55:30.396+1000,5.286,7.7,0.0
2,1,2018-07-25T14:55:30.402+1000,5.286,7.7,0.0
3,1,2018-07-25T14:58:46.247+1000,0.612,7.7,0.0
4,1,2018-07-25T14:55:30.396+1000,5.286,7.7,0.0


# Data processing

## Defining the variables

The $\color{red}{id\_datetime\_}$ variable looks like this:
<img src="images/id_datetime.png">
and the $\color{red}{label\_id\_datetime\_}$ variable looks like this:
<img src="images/label_id_datetime.png">

In [7]:
# Per-sample (timestamp string, user id) pairs taken from the raw dataset.
id_datetime = dataset[['datetime', 'user_id']].values
# Numeric buffer for the parsed samples, one row per sample and six columns:
# year, month, date, hour_minute, second, user_id (filled in by a later cell).
id_datetime_ = np.zeros((len(id_datetime), 6))

# Per-label start stamp, finish stamp and user id, kept as separate arrays and
# also stacked side by side as the columns of one 2-D array.
label_datetime_start = label['start'].values
label_datetime_finish = label['finish'].values
label_user_id = label['user_id'].values
label_id_datetime = np.column_stack(
    (label_datetime_start, label_datetime_finish, label_user_id)
)
# Numeric buffer for the parsed labels, one row per label and six columns:
# year, month, date, hour_minute_start, hour_minute_stop, user_id.
label_id_datetime_ = np.zeros((len(label), 6))

## Splitting year, month, date, hour-minute and second into a separate array

Assigning date and id to the $\color{red}{id\_datetime\_}$ variable

In [9]:
# Split every sample timestamp (fixed-width text such as
# "2018-07-25T14:58:46.247+1000") into numeric fields.  The slicing is applied
# to the whole column at once instead of one Python iteration per sample; the
# character positions are the same ones the per-row version used.
stamps = pd.Series(id_datetime[:, 0])

id_datetime_[:, 0] = stamps.str[0:4].astype(float).to_numpy()     # year
id_datetime_[:, 1] = stamps.str[5:7].astype(float).to_numpy()     # month
id_datetime_[:, 2] = stamps.str[8:10].astype(float).to_numpy()    # date
# let, hour = 10, minute = 20 then hour_minute = 1020
id_datetime_[:, 3] = (stamps.str[11:13] + stamps.str[14:16]).astype(int).to_numpy()
id_datetime_[:, 4] = stamps.str[17:23].astype(float).to_numpy()   # second, incl. fraction
id_datetime_[:, 5] = id_datetime[:, 1]                            # user_id

In [11]:
# First five parsed sample rows: year, month, date, hour_minute, second, user_id.
id_datetime_[:5]

array([[2.0180e+03, 7.0000e+00, 2.5000e+01, 1.4580e+03, 4.6247e+01,
        1.0000e+00],
       [2.0180e+03, 7.0000e+00, 2.5000e+01, 1.4550e+03, 3.0396e+01,
        1.0000e+00],
       [2.0180e+03, 7.0000e+00, 2.5000e+01, 1.4550e+03, 3.0402e+01,
        1.0000e+00],
       [2.0180e+03, 7.0000e+00, 2.5000e+01, 1.4580e+03, 4.6247e+01,
        1.0000e+00],
       [2.0180e+03, 7.0000e+00, 2.5000e+01, 1.4550e+03, 3.0396e+01,
        1.0000e+00]])

Assigning date and id to the $\color{red}{label\_id\_datetime\_}$ variable

In [14]:
def _split_label_stamp(stamp):
    """Return (month, date, year, hour, minute) as ints from a label timestamp.

    The stamp is read as consecutive groups of digits in month, date, year,
    hour, minute order, so one- and two-digit months, days and hours are all
    handled; fixed character offsets only work for a single-digit month with a
    two-digit day and hour.  Any further digit groups (e.g. seconds) are
    ignored.

    Raises:
        ValueError: if fewer than five digit groups are present.
    """
    fields = re.findall(r'\d+', str(stamp))
    if len(fields) < 5:
        raise ValueError("cannot parse label timestamp: %r" % (stamp,))
    month, date, year, hour, minute = (int(field) for field in fields[:5])
    return month, date, year, hour, minute


for i in range(len(label_id_datetime)):
    # Year, month and date are taken from the start stamp only; the finish
    # stamp contributes just its hour and minute.
    month, date, year, hour_start, minute_start = _split_label_stamp(label_id_datetime[i, 0])
    _, _, _, hour_stop, minute_stop = _split_label_stamp(label_id_datetime[i, 1])
    user_id = label_id_datetime[i, 2]
    # let, hour = 10, minute = 20 then hour_minute = 1020
    hour_minute_start = hour_start * 100 + minute_start
    hour_minute_stop = hour_stop * 100 + minute_stop
    label_id_datetime_[i] = (year, month, date, hour_minute_start, hour_minute_stop, user_id)

In [17]:
# First five parsed label rows: year, month, date, start, stop, user_id.
label_id_datetime_[:5]

array([[2.018e+03, 8.000e+00, 2.200e+01, 1.106e+03, 1.106e+03, 1.900e+01],
       [2.018e+03, 8.000e+00, 2.200e+01, 1.118e+03, 1.118e+03, 1.900e+01],
       [2.018e+03, 8.000e+00, 2.200e+01, 1.119e+03, 1.119e+03, 1.900e+01],
       [2.018e+03, 7.000e+00, 2.500e+01, 1.224e+03, 1.224e+03, 1.000e+00],
       [2.018e+03, 7.000e+00, 2.500e+01, 1.226e+03, 1.227e+03, 1.000e+00]])

## Taking label and the corresponding (x,y,z) into a separate list

The $\color{red}{label\_list\_}$ variable looks like this:
<img src="images/lable_list.png">
$\color{blue}{Warning:}$ on the full dataset this section takes 15-20 minutes to run and needs to be optimised.

In [22]:
# For every label row, collect the indices of the samples recorded by the same
# user on the same day whose hour_minute lies inside the label's
# [start, stop] window (both ends inclusive, whole-minute resolution).
#
# Each label is matched against all samples with one boolean mask instead of a
# Python loop over every sample; the selected indices and their order are the
# same.
sample_year = id_datetime_[:, 0]
sample_month = id_datetime_[:, 1]
sample_date = id_datetime_[:, 2]
sample_hour_minute = id_datetime_[:, 3]
sample_user_id = id_datetime_[:, 5]

matched_rows = []
for i in range(len(label_id_datetime) - 1):  # last label row is skipped, like Y
    year, month, date, hour_minute_start, hour_minute_stop, user_id = label_id_datetime_[i]
    in_window = (
        (sample_year == year)
        & (sample_month == month)
        & (sample_date == date)
        & (sample_hour_minute >= hour_minute_start)
        & (sample_hour_minute <= hour_minute_stop)
        & (sample_user_id == user_id)
    )
    matched_rows.append(np.flatnonzero(in_window).tolist())

# Store the per-label index lists in a 1-D object array.  np.array() on lists
# of different lengths raises on recent NumPy, and yields a 2-D array when all
# lists happen to share a length, so each slot is filled explicitly.
label_list = np.empty(len(matched_rows), dtype=object)
for i, rows in enumerate(matched_rows):
    label_list[i] = rows

# Column 0: list of sample indices, column 1: the label for that window.
label_list_ = np.concatenate((label_list.reshape(-1, 1), Y.reshape(-1, 1)), axis=1)

In [23]:
# First five (sample-index list, label) pairs.
label_list_[:5]

array([[list([499106, 499639, 500320, 500825, 500831, 500897, 500947, 500962, 500963, 500964, 500965, 500971, 501142, 501494, 501496, 501498, 509618, 509671, 509673, 510233, 510241, 510454, 510522, 510567, 510576, 510651, 510706, 510769, 510777, 510782, 510788, 510799, 510868, 511171, 514267, 514272, 514273, 514277, 514279, 514300, 514323, 514350, 515667, 516022, 518673, 518688, 518696, 519289, 520055, 520056, 520124, 520557, 520565, 521038, 521525, 521533, 522401, 522812, 525135, 525136, 525759, 526006, 526015, 526026, 526094, 526766, 527381, 527384, 530395, 530446, 530528, 530669, 530706, 530724, 530882, 530911, 531279, 612855, 793357, 793702, 808491, 809516, 809523, 810005, 811240, 811284, 811374, 811460, 812079, 812098, 812859, 812883, 813459, 813472, 813502, 814791, 814810, 816342, 816585, 816586, 816675, 817733, 817864, 817940, 818041, 818047, 818051, 818169, 818339, 818347, 819327, 819762, 819763, 820034, 821385, 821537, 823252, 823259, 823402, 823405, 823698, 823723, 823752, 82

## Count the number of datapoints

In [24]:
# Total number of sample indices collected over all label windows; an empty
# window simply contributes zero.
count = sum(len(label_list_[row][0]) for row in range(len(label_list)))

## The Final dataset

The final dataset, $\color{red}{X}$, looks like this:
<img src="images/final_data.png">

In [25]:
# One output row per matched sample: the three accelerometer values followed
# by the label of the window the sample fell into.
X = np.zeros((count, 4))
count_ = 0  # next free row of X
for window in range(len(label_list)):
    index_list = label_list_[window][0]
    for sample_index in index_list:
        sample = modified_dataset[sample_index]
        X[count_] = (sample[0], sample[1], sample[2], label_list_[window][1])
        count_ += 1

In [26]:
# First five rows of the final dataset: x, y, z, label.
X[:5]

array([[-1.8  ,  9.194,  0.211,  1.   ],
       [-1.8  ,  9.385,  0.193,  1.   ],
       [-7.7  ,  5.976,  0.037,  1.   ],
       [-9.5  ,  0.919,  0.113,  1.   ],
       [-9.5  ,  0.919,  0.094,  1.   ]])

## Saving the dataset as csv file

In [None]:
# Write the final dataset as comma-separated text.
np.savetxt('sample_data.csv', X, delimiter=',')