In [1]:
import sys
import os
import csv
import itertools

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from scipy import signal
from scipy import ndimage
from multiprocessing import Pool

from multiprocessing.dummy import Pool as ThreadPool


In [2]:
# Activity labels. The position of a label in this list is the integer class id
# that gets written to the TFRecords (see `classes.index(...)`), so the order
# must stay stable between dataset creation and training.
classes = ['run', 'walk', 'stand', 'sit', 'sit-to-stand', 'stand-to-sit',
           'stair-up', 'stair-down', 'jump-one-leg', 'jump-two-leg', 'curve-left-step',
           'curve-right-step', 'curve-left-spin-Lfirst', 'curve-left-spin-Rfirst',
           'curve-right-spin-Lfirst', 'curve-right-spin-Rfirst', 'lateral-shuffle-left',
           'lateral-shuffle-right', 'v-cut-left-Lfirst', 'v-cut-left-Rfirst',
           'v-cut-right-Lfirst', 'v-cut-right-Rfirst']

# Sensor channel names. The position of a name in this list is used as the
# column index into a recording array (`data[:, sensors.index(name)]`).
sensors = ['EMG1', 'EMG2', 'EMG3', 'EMG4', 'Microphone',
           'ACC upper X', 'ACC upper Y', 'ACC upper Z', 'Goniometer X',
           'ACC lower X', 'ACC lower Y', 'ACC lower Z', 'Goniometer Y',
           'Gyro upper X', 'Gyro upper Y', 'Gyro upper Z',
           'Gyro lower X', 'Gyro lower Y', 'Gyro lower Z']

# Channels that are summarised by a sliding-window standard deviation.
variance_sensors = ['EMG1', 'EMG2', 'EMG3', 'EMG4', 'Microphone']

# Channels that are summarised by Gaussian smoothing.
smooth_sensors = ['ACC upper X', 'ACC upper Y', 'ACC upper Z', 'Goniometer X',
                  'ACC lower X', 'ACC lower Y', 'ACC lower Z', 'Goniometer Y',
                  'Gyro upper X', 'Gyro upper Y', 'Gyro upper Z',
                  'Gyro lower X', 'Gyro lower Y', 'Gyro lower Z']

# Root directory of the raw recordings. The original machine-specific location
# stays the default, but it can be overridden with the BBDC_DATA_PATH
# environment variable so the notebook runs on other machines.
# `os.path.join(..., "")` guarantees a trailing separator because callers build
# file names with plain string concatenation (`data_path + path`).
data_path = os.path.join(
    os.environ.get(
        "BBDC_DATA_PATH",
        "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bbdc_2019_Bewegungsdaten/",
    ),
    "",
)


In [3]:
def smooth(data, windowsize, std):
    """Smooth a 1-D signal with a normalised Gaussian kernel.

    data       = 1d-numpy array
    windowsize = number of taps of the Gaussian kernel
    std        = standard deviation of the Gaussian, in samples

    Returns the 'valid' part of the convolution, i.e. an array of length
    len(data) - windowsize + 1 when data is at least as long as the kernel.
    """
    # The window functions live in scipy.signal.windows; the top-level
    # `signal.gaussian` alias was deprecated and is gone in newer SciPy.
    kernel = signal.windows.gaussian(windowsize, std=std)
    # Normalise to unit sum so smoothing keeps the signal's scale.
    kernel /= np.sum(kernel)
    return np.convolve(data, kernel, 'valid')

def variance_filter(data, windowsize):
    """Sliding-window standard deviation, scaled so the largest value is 1.

    data       = 1d-numpy array
    windowsize = nominal window length; the window actually used around each
                 centre i is data[i-half:i+half] with half = windowsize//2,
                 i.e. 2*half samples.

    Returns an array with one entry per window centre
    (len(data) - 2*half entries), divided by its maximum.
    """
    half = windowsize // 2
    # Size the buffer from the centres the loop really visits. Using
    # len(data) - windowsize is one element short for odd window sizes and
    # made the last write run out of bounds.
    res = np.zeros(len(data) - 2 * half)
    for i in range(half, len(data) - half):
        res[i - half] = np.std(data[i - half:i + half])
    maxi = np.max(res)
    # A constant signal has zero deviation everywhere; avoid dividing by zero.
    if maxi == 0:
        maxi = 0.00001
    return res / maxi

def sample(data, num_samples):
    """
    Pick num_samples entries of data at evenly spaced positions.

    The positions run from the first to the last index (both included) and
    fractional positions are truncated towards zero.
    """
    last_index = data.shape[0] - 1
    positions = np.linspace(0, last_index, num_samples).astype(int)
    return data[positions]
    
def smooth_extractor(data, num_samples):
    """
    data = 1d-numpy array of length timesteps

    Smooths the signal (Gaussian kernel, 200 taps, std 50), subtracts the mean
    of the smoothed signal, divides by the maximum of the smoothed signal and
    returns num_samples evenly spaced values of the result.
    """
    smoothed = smooth(data, 200, 50)
    peak = smoothed.max()
    # Guard against division by zero for an all-zero maximum.
    scale = peak if peak != 0 else 0.00001
    centred = smoothed - smoothed.mean()
    return sample(centred / scale, num_samples)

def variance_extractor(data, num_samples):
    """
    data = 1d-numpy array of length timesteps

    Runs the sliding-window deviation filter (window 100) over the signal,
    smooths the outcome (Gaussian kernel, 100 taps, std 25) and returns
    num_samples evenly spaced values of it.
    """
    deviation = variance_filter(data, windowsize=100)
    smoothed_deviation = smooth(deviation, windowsize=100, std=25)
    return sample(smoothed_deviation, num_samples)


In [4]:
def recurrent_feature_extractor(data, num_samples):
    """
    data = 2d-numpy array of shape [timesteps, sensors]

    Returns a list with one sequence of num_samples values per sensor: first
    the variance features (in variance_sensors order), then the smoothed
    features (in smooth_sensors order). Raises ValueError if any feature
    contains NaN.
    """
    def column(sensor_name):
        # Channel lookup goes through the position of the name in `sensors`.
        return data[:, sensors.index(sensor_name)]

    features = [variance_extractor(column(name), num_samples)
                for name in variance_sensors]
    if np.isnan(np.array(features)).any():
        raise ValueError("Error in variance")

    features.extend(smooth_extractor(column(name), num_samples)
                    for name in smooth_sensors)
    if np.isnan(np.array(features)).any():
        raise ValueError("Error in smooth")

    return features

def threaded_recurrent_feature_extractor(data, num_samples):
    """
    data = 2d-numpy array of shape [timesteps, sensors]

    Thread-pool variant of recurrent_feature_extractor: the per-sensor
    extractors are mapped over a pool of 8 threads. Returns the list of
    variance features followed by the list of smoothed features. Raises
    ValueError if either group contains NaN.
    """
    variance_sequences = [data[:, sensors.index(sensor)] for sensor in variance_sensors]
    smooth_sequences = [data[:, sensors.index(sensor)] for sensor in smooth_sensors]

    pool = ThreadPool(8)
    try:
        var_results = pool.starmap(variance_extractor,
                                   zip(variance_sequences, itertools.repeat(num_samples)))

        if np.isnan(np.array(var_results)).any():
            raise ValueError("NaN after variance feature extraction")

        smo_results = pool.starmap(smooth_extractor,
                                   zip(smooth_sequences, itertools.repeat(num_samples)))

        if np.isnan(np.array(smo_results)).any():
            raise ValueError("NaN after smoothing variance extraction")
    finally:
        # Always release the worker threads, also when a NaN check (or an
        # extractor) raises; previously the pool was leaked on every error.
        pool.close()
        pool.join()

    return var_results + smo_results

In [5]:
def _float_feature(value):
    """Wraps a single float / double in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wraps a single bool / enum / int / uint in a tf.train.Feature holding an int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_example(featurelist, label):
    """
    Builds a serialized tf.Example from a list of feature sequences and a label.

    Every element of featurelist is a sequence (ndarray); element i is stored
    as a float list under the key 'feature<i>'. The label is stored as an
    int64 under the key 'label'. Returns the serialized message.
    """
    feature = {
        'feature' + str(position): tf.train.Feature(
            float_list=tf.train.FloatList(value=list(sequence)))
        for position, sequence in enumerate(featurelist)
    }
    feature['label'] = _int64_feature(label)

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()

In [6]:
def split_dataset(file, train_name, test_name, percentage=10):
    """
    Splits the file that contains the original dataset in two files, one for
    training and one for testing.

    file       = the original csv file
    train_name = path of the training csv to write, without the ".csv" suffix
    test_name  = path of the test csv to write, without the ".csv" suffix
    percentage = divisor for the test-set size: the test set receives
                 number_of_rows // percentage rows (the default 10 gives one
                 tenth of the rows)

    Every row ends up in exactly one of the two output files. Both files are
    written with the pandas default of including the row index as first column.
    """
    df = pd.read_csv(file)
    headers = list(df)
    files = df.values

    # Draw the test rows WITHOUT replacement. Sampling with replacement can
    # pick the same row several times, which duplicates rows in the test set
    # and leaves train + test with a different size than the original dataset.
    indices = np.random.choice(files.shape[0], size=files.shape[0]//percentage, replace=False)

    testset = np.take(files, indices, axis=0)
    files = np.delete(files, indices, axis=0)

    odf = pd.DataFrame(files)
    odf.columns = headers
    odf.to_csv(train_name+".csv")

    tdf = pd.DataFrame(testset)
    tdf.columns = headers
    tdf.to_csv(test_name+".csv")

In [15]:
def dataset_creator(in_file, outfile, num_samples=80):
    """
    Converts the recordings listed in a csv file into one TFRecords file.

    in_file     = csv file with (at least) the columns 'Datafile' and 'Label'
    outfile     = path of the file to write, without the ".tfrecords" suffix
    num_samples = number of values kept per sensor sequence (default 80)

    For every row whose label is one of `classes`, the recording at
    data_path + row['Datafile'] is loaded, run through
    recurrent_feature_extractor and written as one serialized tf.Example.
    Rows with any other label are skipped.
    """
    df = pd.read_csv(in_file)

    # TensorFlow 1.x exposes the writer as tf.python_io.TFRecordWriter; that
    # namespace does not exist in TensorFlow 2.x, where the writer is
    # tf.io.TFRecordWriter. Prefer the original location when it is there.
    record_io = getattr(tf, "python_io", None) or tf.io

    with record_io.TFRecordWriter(outfile+".tfrecords") as writer:

        for index, row in df.iterrows():
            # Progress message every 100 rows.
            if index % 100 == 0:
                print("Digesting",row['Datafile'])

            if row['Label'] not in classes:
                continue

            path = row['Datafile']
            data = pd.read_csv(data_path+path).values

            # The class id is the position of the label in `classes`.
            label = classes.index(row['Label'])

            # One sequence of num_samples values per sensor.
            extracted_featurelist = recurrent_feature_extractor(data, num_samples)

            # Pack the sequences and the label into a tf.train.Example,
            # serialize it to a string and append it to the tfrecords file.
            serialized_example = serialize_example(extracted_featurelist, label)

            writer.write(serialized_example)


In [8]:
# One-off train/test split of the original train.csv; writes rec_train.csv and
# rec_test.csv. Left commented out so that re-running this cell does not draw
# a new random split.
#split_dataset(data_path+"train.csv", "rec_train", "rec_test")

# TFRecords for the training part of the split.
dataset_creator("rec_train.csv",
                "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/recurrent/detail_rec_train")

# TFRecords for the held-out part of the split.
dataset_creator("rec_test.csv",
                "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/recurrent/detail_rec_test")


Digesting Subject02/Subject02_Aufnahme000.csv
Digesting Subject02/Subject02_Aufnahme112.csv
Digesting Subject02/Subject02_Aufnahme226.csv
Digesting Subject02/Subject02_Aufnahme337.csv
Digesting Subject03/Subject03_Aufnahme010.csv
Digesting Subject03/Subject03_Aufnahme120.csv
Digesting Subject03/Subject03_Aufnahme234.csv
Digesting Subject03/Subject03_Aufnahme344.csv
Digesting Subject04/Subject04_Aufnahme014.csv
Digesting Subject04/Subject04_Aufnahme128.csv
Digesting Subject04/Subject04_Aufnahme234.csv
Digesting Subject04/Subject04_Aufnahme342.csv
Digesting Subject05/Subject05_Aufnahme019.csv
Digesting Subject05/Subject05_Aufnahme136.csv
Digesting Subject05/Subject05_Aufnahme247.csv
Digesting Subject05/Subject05_Aufnahme352.csv
Digesting Subject06/Subject06_Aufnahme025.csv
Digesting Subject06/Subject06_Aufnahme130.csv
Digesting Subject06/Subject06_Aufnahme243.csv
Digesting Subject06/Subject06_Aufnahme353.csv
Digesting Subject07/Subject07_Aufnahme012.csv
Digesting Subject07/Subject07_Aufn

In [16]:
# TFRecords for the recordings listed in abgabe1.csv.
# NOTE(review): presumably the challenge submission set ("Abgabe") - confirm.
# dataset_creator only writes rows whose 'Label' is one of `classes`.
dataset_creator("abgabe1.csv",
                "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/recurrent/abgabe")



Digesting Subject01/Subject01_Aufnahme000.csv
Digesting Subject01/Subject01_Aufnahme100.csv
Digesting Subject01/Subject01_Aufnahme200.csv
Digesting Subject01/Subject01_Aufnahme300.csv
Digesting Subject01/Subject01_Aufnahme400.csv
Digesting Subject10/Subject10_Aufnahme060.csv
Digesting Subject10/Subject10_Aufnahme160.csv
Digesting Subject10/Subject10_Aufnahme260.csv
Digesting Subject10/Subject10_Aufnahme360.csv
Digesting Subject14/Subject14_Aufnahme042.csv
Digesting Subject14/Subject14_Aufnahme142.csv
Digesting Subject14/Subject14_Aufnahme242.csv
Digesting Subject14/Subject14_Aufnahme342.csv
Digesting Subject15/Subject15_Aufnahme002.csv
Digesting Subject15/Subject15_Aufnahme102.csv
Digesting Subject15/Subject15_Aufnahme202.csv
Digesting Subject15/Subject15_Aufnahme302.csv
Digesting Subject15/Subject15_Aufnahme402.csv


In [14]:
# TFRecords for the complete, unsplit train.csv.
# NOTE(review): the "80" in the output name presumably refers to the 80 samples
# kept per sensor sequence - confirm.
# NOTE(review): this cell carries execution count 14 while dataset_creator is
# defined in the cell with count 15, so the output stored below may stem from an
# earlier version of that function - re-run after a kernel restart to confirm.
dataset_creator("/Users/thomasklein/Projects/BremenBigDataChallenge2019/bbdc_2019_Bewegungsdaten/train.csv",
                "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/recurrent/full80")



1738
