In [1]:
import sys
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import signal
from scipy import ndimage
import csv

In [2]:
classes = ['run', 'walk', 'stand', 'sit', 'sit-to-stand', 'stand-to-sit', 
        'stair-up', 'stair-down', 'jump-one-leg', 'jump-two-leg', 'curve-left-step', 
        'curve-right-step', 'curve-left-spin-Lfirst', 'curve-left-spin-Rfirst', 
        'curve-right-spin-Lfirst', 'curve-right-spin-Rfirst', 'lateral-shuffle-left', 
        'lateral-shuffle-right','v-cut-left-Lfirst', 'v-cut-left-Rfirst', 'v-cut-right-Lfirst', 'v-cut-right-Rfirst']

sensors = ['EMG1', 'EMG2', 'EMG3', 'EMG4', 'Microphone', 'ACC upper X', 'ACC upper Y','ACC upper Z', 'Goniometer X',
          'ACC lower X', 'ACC lower Y', 'ACC lower Z', 'Goniometer Y', 'Gyro upper X', 'Gyro upper Y', 'Gyro upper Z',
          'Gyro lower X', 'Gyro lower Y', 'Gyro lower Z']

variance_sensors = ['EMG1', 'EMG2', 'EMG3', 'EMG4', 'Microphone']

smooth_sensors = ['ACC upper X', 'ACC upper Y','ACC upper Z', 'Goniometer X','ACC lower X', 'ACC lower Y', 
                  'ACC lower Z', 'Goniometer Y', 'Gyro upper X', 'Gyro upper Y', 'Gyro upper Z', 'Gyro lower X', 
                  'Gyro lower Y', 'Gyro lower Z']

data_path = "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bbdc_2019_Bewegungsdaten/"


In [3]:
def smooth(data, windowsize, std):
    kernel = signal.gaussian(windowsize, std=std)
    kernel /= np.sum(kernel)
    return np.convolve(data, kernel, 'valid')

def variance_filter(data, windowsize):
    half = windowsize//2
    res = np.zeros(data.shape[0]-windowsize)
    for i in range(half,len(data)-half):
        res[i-half] = np.std(data[i-half:i+half])
    maxi = np.max(res)
    if(maxi == 0):
        print("max:",maxi)
    return res / maxi

def sample(data, num_samples):
    samples = [int(sample) for sample in np.linspace(0, data.shape[0]-1, num_samples)]
    return data[samples]
    
def smooth_extractor(data, num_samples):
    """
    data = 1d-numpy array of length timestep:
    """
    smoothed = smooth(data,200,50)
    normalized = (smoothed-np.mean(smoothed))/np.max(smoothed)
    return sample(normalized, num_samples)

def variance_extractor(data, num_samples):
    """
    data = 1d-numpy array of length timesteps
    """
    var_data = smooth(variance_filter(data,windowsize=100),windowsize=100,std=25)
    return sample(var_data, num_samples)


def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [4]:
def recurrent_feature_extractor(data, num_samples):
    """
    data = 2d-numpy array of shape [timesteps, sensors]
    
    """
    features = []
        
    for sensor in variance_sensors:
        features.append(variance_extractor(data[:,sensors.index(sensor)], num_samples))
        
    if(np.isnan(np.array(features)).any()):
        raise ValueError("Error in variance")
        
    for sensor in smooth_sensors:
        features.append(smooth_extractor(data[:,sensors.index(sensor)], num_samples))
        
    if(np.isnan(np.array(features)).any()):
        raise ValueError("Error in smooth")
        
    return features

In [5]:
def serialize_example(featurelist, label):
    """
    Creates a tf.Example message from the list of features and the label, where
    every element in the featurelist is actually a sequence=ndarray
    """

    feature = {}
    for i in range(len(featurelist)):
        feature['feature'+str(i)] = tf.train.Feature(float_list=tf.train.FloatList(value=list(featurelist[i])))
        #_float_feature(featurelist[i])
    feature['label'] = _int64_feature(label)

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

In [6]:
def dataset_creator(in_file, outfile):
    """
    """
    
    df = pd.read_csv(in_file)
    featurelist = []
    
    with tf.python_io.TFRecordWriter(outfile+".tfrecords") as writer:
        
        for index, row in df.iterrows():
            if(index % 100 == 0):
                print(row['Datafile'])
            if(row['Label'] in classes):
                path = row['Datafile']
                data = pd.read_csv(data_path+path).values
                if np.isnan(data).any():
                    raise ValueError("Encountered NaN")

                label = classes.index(row['Label'])
                extracted_featurelist = recurrent_feature_extractor(data, 80)
                if np.isnan(np.array(extracted_featurelist)).any():
                    raise ValueError("Encountered NaN after processing")
                # this is where the fun begins: extracted_featurelist is a 19-element-list of numpy arrays of length 80
                # We need to get that into a tf.train.Example, which we can then serialize to string and 
                # write to a tfrecords-file.
                serialized_example = serialize_example(extracted_featurelist, label)

                writer.write(serialized_example)


In [7]:
dataset_creator(data_path+"train.csv",
                "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/recurrent/rec_features")



Subject02/Subject02_Aufnahme000.csv
Subject02/Subject02_Aufnahme100.csv
Subject02/Subject02_Aufnahme200.csv
Subject02/Subject02_Aufnahme300.csv
Subject02/Subject02_Aufnahme400.csv




ValueError: Encountered NaN after processing