## Compute features for accelerometer only

In [29]:
import os
import pandas as pd
import numpy as np
from scipy.signal import find_peaks_cwt
import pickle
from pathlib import Path

in_dir = "data_3_4_2018"

trim_num_seconds = 10
acc_freq = 4
window_num_seconds = 4 #seconds
steps_per_sec = int(1000/acc_freq)
window_size = int(window_num_seconds*steps_per_sec)
window_step = 2 #seconds
window_jump_steps = int(window_step*steps_per_sec)

print("Window_size, Window_jump_steps: ", window_size, window_jump_steps)

def number_of_peaks(window):
    indexes = find_peaks_cwt(window, np.arange(1, len(window)))

    return len(indexes)

#this function assumes that records are evenly spaced
def trim_first_last_n_seconds(df, n, freq):
    if df.shape[0] < 6001:
        return None
    
    remove_indexes = list(range(0, int(n*1000/freq)))
    df = df.drop(remove_indexes)

    remove_indexes = list(range(df.shape[0] - int(n*1000/freq), df.shape[0]-1))
    df = df.drop(remove_indexes)
    
    return df
    

dfs_list = []
features = []
labels = []

pickle_file = Path("pickles/accelerometer_features.pickle")

if pickle_file.exists():
    print("Found pickle files for accelerometer")
    
    features = pickle.load(open("pickles/accelerometer_features.pickle", "rb"))
    labels = pickle.load(open("pickles/accelerometer_labels.pickle", "rb"))
    dfs_list = pickle.load(open("pickles/accelerometer_dfs_list.pickle", "rb"))
    
else:

    for root, dirs, files in os.walk(in_dir):
        path = root.split(os.sep)

        for f in files:
            print("/".join(path) + "/" + f)

            full_path = "/".join(path) + "/" + f

            if "gyroscope" in full_path:
                print("Skip ", full_path)
                continue

            df = pd.read_csv(full_path, header=None)

            print("Before trimming: ", df.shape)
            
            df = trim_first_last_n_seconds(df, trim_num_seconds, acc_freq)
            if df is None:
                print("Continuing")
                continue

            print("After trimming: ", df.shape)

            #Sample the data according to the size of the window with 50% overlap
            for index in range(0, df.shape[0]-window_size, window_jump_steps):
                indexes = list(range(index, index + int(window_size)))

                window = df.iloc[indexes,:]

                #Generate the features for this window
                mean_x = np.mean(window[1].tolist())
                mean_y = np.mean(window[2].tolist())
                mean_z = np.mean(window[3].tolist())

                var_x = np.var(window[1].tolist())
                var_y = np.var(window[2].tolist())
                var_z = np.var(window[3].tolist())

                num_peaks_x = number_of_peaks(window[1].tolist())
                num_peaks_y = number_of_peaks(window[2].tolist())
                num_peaks_z = number_of_peaks(window[3].tolist())            

                #Store the features
                window_feature = []
                window_feature.append(mean_x)
                window_feature.append(mean_y)
                window_feature.append(mean_z)
                window_feature.append(var_x)
                window_feature.append(var_y)
                window_feature.append(var_z)
                window_feature.append(num_peaks_x)
                window_feature.append(num_peaks_y)
                window_feature.append(num_peaks_z)

                features.append(window_feature)

                #Store the label
                labels.append(df[5].iloc[1])

            dfs_list.append(df)

    dfs = pd.concat(dfs_list)

    pickle.dump(features, open("pickles/accelerometer_features.pickle", "wb"))
    pickle.dump(labels, open("pickles/accelerometer_labels.pickle", "wb"))
    pickle.dump(dfs_list, open("pickles/accelerometer_dfs_list.pickle", "wb"))

features = np.asarray(features)
labels = np.asarray(labels)

Window_size, Window_jump_steps:  1000 500
Found pickle files for accelerometer


## Compute LSTM features for accelerometer

In [28]:
import pickle
from pathlib import Path
import os
import pandas as pd
import numpy as np

in_dir = "data_3_4_2018"

trim_num_seconds = 10
acc_freq = 4
window_num_seconds = 4 #seconds
steps_per_sec = int(1000/acc_freq)
window_size = int(window_num_seconds*steps_per_sec)
window_step = 2 #seconds
window_jump_steps = int(window_step*steps_per_sec)

print("Window_size, Window_jump_steps: ", window_size, window_jump_steps)

dfs_list = []
features = []
labels = []


#this function assumes that records are evenly spaced
def trim_first_last_n_seconds(df, n, freq):
    if df.shape[0] < 6001:
        return None
    
    remove_indexes = list(range(0, int(n*1000/freq)))
    df = df.drop(remove_indexes)

    remove_indexes = list(range(df.shape[0] - int(n*1000/freq), df.shape[0]-1))
    df = df.drop(remove_indexes)
    
    return df


pickle_file = Path("pickles/lstm_acc_features.pickle")

if pickle_file.exists():
    print("Found pickle files for LSTM acc and gyro")
    
    features = pickle.load(open("pickles/lstm_acc_features.pickle", "rb"))
    labels = pickle.load(open("pickles/lstm_acc_labels.pickle", "rb"))
    dfs_list = pickle.load(open("pickles/lstm_acc_dfs_list.pickle", "rb"))
    
else:

    for root, dirs, files in os.walk(in_dir):
        path = root.split(os.sep)

        for f in files:

            if 'accelerometer' in f:
                accelerometer_path = "/".join(path) + "/" + f
                print(accelerometer_path)

                df = pd.read_csv(accelerometer_path, header=None)

                print("Before trimming: ", df.shape)

                df = trim_first_last_n_seconds(df, trim_num_seconds, acc_freq)
                if df is None:
                    print("Continuing")
                    continue

                print("After trimming: ", df.shape)

                #Sample the data according to the size of the window with 50% overlap
                for index in range(0, df.shape[0]-window_size, window_jump_steps):
                    indexes = list(range(index, index + int(window_size)))

                    window = df.iloc[indexes, 1:df.shape[1]-2]

                    #Generate the features for this window            
                    features.append(np.asarray(window))

                    #Store the label
                    labels.append(df[5].iloc[0])

                dfs_list.append(df)

    dfs = pd.concat(dfs_list)
    
    pickle.dump(features, open("pickles/lstm_acc_features.pickle", "wb"))
    pickle.dump(labels, open("pickles/lstm_acc_labels.pickle", "wb"))
    pickle.dump(dfs_list, open("pickles/lstm_acc_dfs_list.pickle", "wb"))
    
    
features = np.asarray(features)
labels = np.asarray(labels)    

Window_size, Window_jump_steps:  1000 500
data_3_4_2018/0_1_android.sensor.accelerometer.data.csv
Before trimming:  (64964, 6)
After trimming:  (59965, 6)
data_3_4_2018/10_1_android.sensor.accelerometer.data.csv
Before trimming:  (64675, 6)
After trimming:  (59676, 6)
data_3_4_2018/11_1_android.sensor.accelerometer.data.csv
Before trimming:  (66938, 6)
After trimming:  (61939, 6)
data_3_4_2018/12_1_android.sensor.accelerometer.data.csv
Before trimming:  (60980, 6)
After trimming:  (55981, 6)
data_3_4_2018/13_1_android.sensor.accelerometer.data.csv
Before trimming:  (62506, 6)
After trimming:  (57507, 6)
data_3_4_2018/14_1_android.sensor.accelerometer.data.csv
Before trimming:  (65108, 6)
After trimming:  (60109, 6)
data_3_4_2018/15_1_android.sensor.accelerometer.data.csv
Before trimming:  (62588, 6)
After trimming:  (57589, 6)
data_3_4_2018/16_1_android.sensor.accelerometer.data.csv
Before trimming:  (82181, 6)
After trimming:  (77182, 6)
data_3_4_2018/17_1_android.sensor.accelerometer

## Compute LSTM features for accelerometer and gyroscope

In [26]:
import pickle
from pathlib import Path
import os
import pandas as pd
import numpy as np

in_dir = "data_3_4_2018"

trim_num_seconds = 10
acc_freq = 4
window_num_seconds = 4 #seconds
steps_per_sec = int(1000/acc_freq)
window_size = int(window_num_seconds*steps_per_sec)
window_step = 2 #seconds
window_jump_steps = int(window_step*steps_per_sec)

print("Window_size, Window_jump_steps: ", window_size, window_jump_steps)

dfs_list = []
features = []
labels = []


#this function assumes that records are evenly spaced
def trim_first_last_n_seconds(df, n, freq):
    if df.shape[0] < 6001:
        return None
    
    remove_indexes = list(range(0, int(n*1000/freq)))
    df = df.drop(remove_indexes)

    remove_indexes = list(range(df.shape[0] - int(n*1000/freq), df.shape[0]-1))
    df = df.drop(remove_indexes)
    
    return df

def combine_acc_gyro_data(df_acc, df_gyro):
    threshold = 10
    
    acc_index = 0
    gyro_index = 0
    
    acc_matrix = df_acc.as_matrix()
    gyro_matrix = df_gyro.as_matrix()
    
    combined_list = []
    
    while (acc_index < df_acc.shape[0]) and (gyro_index < df_gyro.shape[0]):
        #find next gyro_index within threshold and append the data
        
        acc_time = df_acc.iloc[acc_index][0]
        gyro_time = df_gyro.iloc[gyro_index][0]

        if (acc_time < gyro_time):
            while (acc_index < df_acc.shape[0]) and (gyro_time - acc_time > threshold):
                acc_time = acc_matrix[acc_index][0] #df_acc.iloc[acc_index][0]
                acc_index += 1
        else:
            while (gyro_index < df_gyro.shape[0]) and (acc_time - gyro_time > threshold):
                gyro_time = gyro_matrix[gyro_index][0] #df_gyro.iloc[gyro_index][0]
                gyro_index += 1
            
        combined_list.append(np.concatenate((acc_matrix[acc_index][:4], gyro_matrix[gyro_index][1:6]), axis=0))
        acc_index += 1
        gyro_index += 1
            
    return combined_list


pickle_file = Path("pickles/lstm_acc_gyro_features.pickle")

if pickle_file.exists():
    print("Found pickle files for LSTM acc and gyro")
    
    features = pickle.load(open("pickles/lstm_acc_gyro_features.pickle", "rb"))
    labels = pickle.load(open("pickles/lstm_acc_gyro_labels.pickle", "rb"))
    dfs_list = pickle.load(open("pickles/lstm_acc_gyro_dfs_list.pickle", "rb"))
    
else:

    for root, dirs, files in os.walk(in_dir):
        path = root.split(os.sep)

        for f in files:

            if 'accelerometer' in f:
                accelerometer_path = "/".join(path) + "/" + f
                print(accelerometer_path)

                first_ = f.find("_")
                g = f[0:first_] + "_4" + f[first_+2:]
                gyroscope_path = "/".join(path) + "/" + g.replace("accelerometer","gyroscope")
                print(gyroscope_path)

                df_acc = pd.read_csv(accelerometer_path, header=None)
                df_gyro = pd.read_csv(gyroscope_path, header=None)

                combined_list = combine_acc_gyro_data(df_acc, df_gyro)
                combined_numpy = np.array(combined_list)

                combined_df = pd.DataFrame(data=combined_numpy)

                print("Before trimming: ", combined_df.shape)

                combined_df = trim_first_last_n_seconds(combined_df, trim_num_seconds, acc_freq)
                if combined_df is None:
                    print("Continuing")
                    continue

                print("After trimming: ", combined_df.shape)

                #Sample the data according to the size of the window with 50% overlap
                for index in range(0, combined_df.shape[0]-window_size, window_jump_steps):
                    indexes = list(range(index, index + int(window_size)))

                    window = combined_df.iloc[indexes, 1:7] #,1:combined_df.shape[1]-2]

                    #Generate the features for this window            
                    features.append(np.asarray(window))

                    #Store the label
                    labels.append(combined_df[8].iloc[0])

                dfs_list.append(combined_df)

    dfs = pd.concat(dfs_list)
    
    pickle.dump(features, open("pickles/lstm_acc_gyro_features.pickle", "wb"))
    pickle.dump(labels, open("pickles/lstm_acc_gyro_labels.pickle", "wb"))
    pickle.dump(dfs_list, open("pickles/lstm_acc_gyro_dfs_list.pickle", "wb"))
    
    
features = np.asarray(features)
labels = np.asarray(labels)    

Window_size, Window_jump_steps:  1000 500
data_3_4_2018/0_1_android.sensor.accelerometer.data.csv
data_3_4_2018/0_4_android.sensor.gyroscope.data.csv
Before trimming:  (34142, 9)
After trimming:  (29143, 9)
data_3_4_2018/10_1_android.sensor.accelerometer.data.csv
data_3_4_2018/10_4_android.sensor.gyroscope.data.csv
Before trimming:  (34053, 9)
After trimming:  (29054, 9)
data_3_4_2018/11_1_android.sensor.accelerometer.data.csv
data_3_4_2018/11_4_android.sensor.gyroscope.data.csv
Before trimming:  (34366, 9)
After trimming:  (29367, 9)
data_3_4_2018/12_1_android.sensor.accelerometer.data.csv
data_3_4_2018/12_4_android.sensor.gyroscope.data.csv
Before trimming:  (31982, 9)
After trimming:  (26983, 9)
data_3_4_2018/13_1_android.sensor.accelerometer.data.csv
data_3_4_2018/13_4_android.sensor.gyroscope.data.csv
Before trimming:  (40277, 9)
After trimming:  (35278, 9)
data_3_4_2018/14_1_android.sensor.accelerometer.data.csv
data_3_4_2018/14_4_android.sensor.gyroscope.data.csv
Before trimming

In [81]:
import os
import pandas as pd
import numpy as np

def trim_first_last_n_seconds(df, n, freq):
    remove_indexes = list(range(0, int(n*1000/freq)))
    df = df.drop(remove_indexes)

    remove_indexes = list(range(df.shape[0] - int(n*1000/freq), df.shape[0]-1))
    df = df.drop(remove_indexes)
    
    return df
    
in_dir = "final"
dfs_list = []

features = []
labels = []

for root, dirs, files in os.walk(in_dir):
    path = root.split(os.sep)

    for f in files:
        print("/".join(path) + "/" + f)

        full_path = "/".join(path) + "/" + f
        df = pd.read_csv(full_path, header=None)
        
#         print("Before trimming: ", df.shape)
        df = trim_first_last_n_seconds(df, 10, 4)
#         print("After trimming: ", df.shape)
   
        #Sample the data according to the size of the window with 50% overlap
        freq = 4
        window_size = 4*1000
        for index in range(0, df.shape[0]-1000, 500):
            indexes = list(range(index, index + int(window_size/freq)))
            
            window = df.iloc[indexes,1:df.shape[1]-2]
            
            #Generate the features for this window            
            features.append(np.asarray(window))
            
            #Store the label
            labels.append(df[5].iloc[1])
        
        dfs_list.append(df)
        
dfs = pd.concat(dfs_list)

features = np.asarray(features)
labels = np.asarray(labels)

print(features.shape)
print(labels.shape)
print(features[0])
print(labels[0])

final/0_1_android.sensor.accelerometer.data.csv
final/10_1_android.sensor.accelerometer.data.csv
final/11_1_android.sensor.accelerometer.data.csv
final/12_1_android.sensor.accelerometer.data.csv
final/13_1_android.sensor.accelerometer.data.csv
final/14_1_android.sensor.accelerometer.data.csv
final/15_1_android.sensor.accelerometer.data.csv
final/16_1_android.sensor.accelerometer.data.csv
final/17_1_android.sensor.accelerometer.data.csv
final/18_1_android.sensor.accelerometer.data.csv
final/19_1_android.sensor.accelerometer.data.csv
final/1_1_android.sensor.accelerometer.data.csv
final/2_1_android.sensor.accelerometer.data.csv
final/3_1_android.sensor.accelerometer.data.csv
final/4_1_android.sensor.accelerometer.data.csv
final/5_1_android.sensor.accelerometer.data.csv
final/6_1_android.sensor.accelerometer.data.csv
final/7_1_android.sensor.accelerometer.data.csv
final/8_1_android.sensor.accelerometer.data.csv
final/9_1_android.sensor.accelerometer.data.csv
(2414, 1000, 3)
(2414,)
[[-2.6

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np
from sklearn.preprocessing import normalize

X = np.array(normalize(features))
y = np.array(labels)

kf = KFold(n_splits=5, random_state=None, shuffle=True)
kf.get_n_splits(X)

acc = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    model = LogisticRegression()
    
    model.fit(X_train, y_train)
    
    predicted_labels = model.predict(X_test)
    
    acc.append(accuracy_score(y_test, predicted_labels))
    
print("Accuracy across all folds: ", np.mean(acc))    

Accuracy across all folds:  0.7433358720487433


In [246]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, LSTMCell
from keras.layers import Dropout
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np
from sklearn.preprocessing import normalize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# X = np.array(normalize(features))
X = features

print(X.shape)

# print(len(y))

y = np.array(labels)
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

print(y[:5])

y = y.reshape(-1,1)
ohe = OneHotEncoder(sparse=False)
y = ohe.fit_transform(y)

print(y.shape)
# print(len(y))
# kf = KFold(n_splits=5)
# kf.get_n_splits(X)

acc = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# print(len(X_train))
# print(len(X_test))
# print(len(y_train))
# print(len(y_test))

# print(X_train[0].shape)
# print(len(y_train))

# X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
# X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
# y_train = y_train.reshape(y_train.shape[0], 1, 1)
# y_test = y_test.reshape(y_test.shape[0], 1, 1)

# print(X_train[0].shape)
# print(X_test.shape)
# print(y_train.shape)
# print(y_test.shape)

print(X_train.shape)
print(X_train[0].shape[0], X_train[0].shape[1])
print(y_train.shape)

model = Sequential()
model.add(LSTM(100, input_shape=(X_train[0].shape[0], X_train[0].shape[1]), return_sequences=True))
model.add(LSTM(100))
#     model.add(Dropout(0.2))
#     model.add(LSTM(50))
#     model.add(Dropout(0.2))
model.add(Dense(4, activation='softmax'))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(loss='mean_squared_error', optimizer='adam')

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

model.fit(X_train, y_train, batch_size=300, epochs=100)

#     predicted_labels = model.predict(X_test)

#     acc.append(accuracy_score(y_test, predicted_labels))


# print("Accuracy across all folds: ", np.mean(acc))    

(4232, 1000, 6)
[1 1 1 1 1]
(4232, 4)
(3808, 1000, 6)
1000 6
(3808, 4)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_91 (LSTM)               (None, 1000, 100)         42800     
_________________________________________________________________
lstm_92 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_71 (Dense)             (None, 4)                 404       
Total params: 123,604
Trainable params: 123,604
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.callbacks.History at 0x1d56dfd7ba8>

In [247]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

scores = model.evaluate(X_test, y_test, verbose=0)
predicted_labels = model.predict(X_test)

print("Accuracy: ", scores[1]*100)

def get_hot_value(my_list):
    max_val = max(my_list)
    return [int(item == max_val) for item in my_list]

hot_list = [get_hot_value(sublist) for sublist in predicted_labels]

t  = np.array(y_test)
p  = np.array(hot_list)
t = np.argmax(t, axis=1)
p = np.argmax(p, axis=1)

print("Precision: ", precision_score(t, p, average=None))


Accuracy:  78.77358490566037
Precision:  [0.61797753 0.71212121 0.86752137 0.82857143]


In [245]:
import os
import pandas as pd
import numpy as np

def trim_first_last_n_seconds(df, n, freq):
    if df.shape[0] < 6001:
        return None
    
    remove_indexes = list(range(0, int(n*1000/freq)))
    df = df.drop(remove_indexes)

    remove_indexes = list(range(df.shape[0] - int(n*1000/freq), df.shape[0]-1))
    df = df.drop(remove_indexes)
    
    return df

in_dir = "data_3_4_2018"
dfs_list = []

features = []
labels = []

for root, dirs, files in os.walk(in_dir):
    path = root.split(os.sep)

    for f in files:
        
        if 'accelerometer' in f:
            accelerometer_path = "/".join(path) + "/" + f
            print(accelerometer_path)
    
            first_ = f.find("_")
            g = f[0:first_] + "_4" + f[first_+2:]
            gyroscope_path = "/".join(path) + "/" + g.replace("accelerometer","gyroscope")
            print(gyroscope_path)
            
            df_acc = pd.read_csv(accelerometer_path, header=None)
            df_gyro = pd.read_csv(gyroscope_path, header=None)
            
            combined_list = combine_acc_gyro_data(df_acc, df_gyro)
            combined_numpy = np.array(combined_list)

            combined_df = pd.DataFrame(data=combined_numpy)
            
            print("Before trimming: ", combined_df.shape)

            combined_df = trim_first_last_n_seconds(combined_df, 10, 4)
            if combined_df is None:
                print("Continuing")
                continue
                
            print("After trimming: ", combined_df.shape)
            
            #Sample the data according to the size of the window with 50% overlap
            freq = 4
            window_size = 4*1000
            for index in range(0, combined_df.shape[0]-1000, 500):
                indexes = list(range(index, index + int(window_size/freq)))

                window = combined_df.iloc[indexes, 1:7] #,1:combined_df.shape[1]-2]

                #Generate the features for this window            
                features.append(np.asarray(window))

                #Store the label
                labels.append(combined_df[8].iloc[0])
        
            dfs_list.append(combined_df)
                    
dfs = pd.concat(dfs_list)

features = np.asarray(features)
labels = np.asarray(labels)
            

data_3_4_2018/0_1_android.sensor.accelerometer.data.csv
data_3_4_2018/0_4_android.sensor.gyroscope.data.csv
Before trimming:  (34142, 9)
After trimming:  (29143, 9)
data_3_4_2018/10_1_android.sensor.accelerometer.data.csv
data_3_4_2018/10_4_android.sensor.gyroscope.data.csv
Before trimming:  (34053, 9)
After trimming:  (29054, 9)
data_3_4_2018/11_1_android.sensor.accelerometer.data.csv
data_3_4_2018/11_4_android.sensor.gyroscope.data.csv
Before trimming:  (34366, 9)
After trimming:  (29367, 9)
data_3_4_2018/12_1_android.sensor.accelerometer.data.csv
data_3_4_2018/12_4_android.sensor.gyroscope.data.csv
Before trimming:  (31982, 9)
After trimming:  (26983, 9)
data_3_4_2018/13_1_android.sensor.accelerometer.data.csv
data_3_4_2018/13_4_android.sensor.gyroscope.data.csv
Before trimming:  (40277, 9)
After trimming:  (35278, 9)
data_3_4_2018/14_1_android.sensor.accelerometer.data.csv
data_3_4_2018/14_4_android.sensor.gyroscope.data.csv
Before trimming:  (34135, 9)
After trimming:  (29136, 9)
