In [1]:
import sys
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import signal
from scipy import ndimage
import csv

In [2]:
classes = ['run', 'walk', 'stand', 'sit', 'sit-to-stand', 'stand-to-sit', 
        'stair-up', 'stair-down', 'jump-one-leg', 'jump-two-leg', 'curve-left-step', 
        'curve-right-step', 'curve-left-spin-Lfirst', 'curve-left-spin-Rfirst', 
        'curve-right-spin-Lfirst', 'curve-right-spin-Rfirst', 'lateral-shuffle-left', 
        'lateral-shuffle-right','v-cut-left-Lfirst', 'v-cut-left-Rfirst', 'v-cut-right-Lfirst', 'v-cut-right-Rfirst']

sensors = ['EMG1', 'EMG2', 'EMG3', 'EMG4', 'Microphone', 'ACC upper X', 'ACC upper Y','ACC upper Z', 'Goniometer X',
          'ACC lower X', 'ACC lower Y', 'ACC lower Z', 'Goniometer Y', 'Gyro upper X', 'Gyro upper Y', 'Gyro upper Z',
          'Gyro lower X', 'Gyro lower Y', 'Gyro lower Z']

data_path = "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bbdc_2019_Bewegungsdaten/"

variance_analysis_params = {'EMG1':(100, 0.5, 20,  6),
                            'EMG2':(200, 0.5, 20, 10),
                            'EMG3':(200, 0.2, 20,  6),
                            'EMG4':(200, 0.5, 20,  6),
                            'Microphone': (200, 0.6, 20, 4),
                            'ACC upper X':(200, 0.2, 20, 6),
                            'ACC upper Y':(200, 0.5, 20, 5),
                            'ACC upper Z':(200, 0.5, 20, 5),
                            'Goniometer X':(200, 0.5, 20, 6),
                            'ACC lower X':(200, 0.2, 20, 6),
                            'ACC lower Y':(200, 0.2, 20, 6),
                            'ACC lower Z':(200, 0.2, 20, 6),
                            'Goniometer Y':(200, 0.5, 20, 6),
                            'Gyro upper X':(200, 0.5, 20, 6),
                            'Gyro upper Y':(200, 0.5, 20, 6),
                            'Gyro upper Z':(200, 0.5, 20, 6),
                            'Gyro lower X':(200, 0.5, 20, 6),
                            'Gyro lower Y':(200, 0.5, 20, 6),
                            'Gyro lower Z':(200, 0.5, 20, 6)}


In [8]:
def variance_filter(data, windowsize):
    """
    Filters a signal by sliding a window over it, under which the variance is calculated.
    At every time point t, the method outputs the variance of the original signal within a window around t
    """
    half = windowsize//2
    res = np.zeros(data.shape[0]-windowsize)
    for i in range(half,len(data)-half):
        res[i-half] = np.std(data[i-half:i+half])
    return res#/np.max(res)

def smooth(data, windowsize, std):
    kernel = signal.gaussian(windowsize, std=std)
    kernel /= np.sum(kernel)
    return np.convolve(data, kernel, 'valid')

def derivative_filter(data, windowsize, std):
    lowpass = smooth(data, windowsize, std)
    deriv = ndimage.sobel(lowpass, 0)
    return deriv
    

def peakfinder(data, h_thresh, w_thresh, n):
    """
    Finds peaks in a spectrum. Returns the first six peaks as 
    
    location1, location2, ...  location6
    height1,   height2,   ...  height6
    width1,    width2,    ...  width6
    
    The location is normalized so that the location of the first peak corresponds to zero.
    
    arguments: 
        data = 1d-array (of variance intensity values)
        h_thresh = the threshold for the minimum height a peak needs to have
        w_thresh = the threshold for the minimum width a peak needs to have
        n = the number of peaks to extract
    """
    #print(data)
    if h_thresh == None:
        h_thresh = np.mean(data)
        #print("max: ",np.max(data))
        #print("had to change height, new height:",h_thresh)
    locations, properties = signal.find_peaks(data, height=h_thresh, width=w_thresh)
    heights = properties['peak_heights']
    #widths = properties['width_heights']
    left = properties['left_ips']
    right = properties['right_ips']
    results = np.stack((locations,heights))#,left,right))
    #print(results)
    
    # if no peaks were found, return empty array
    if results.size == 0:
        #print("returning empty array")
        return np.zeros((2,n))
    
    # subtract location of first peak to shift everything (this doesn't make sense)
    #results[0,:] = results[0,:] - results[0,0]
    
    # sort descending by peak height
    results = results[:,(-results[1,:]).argsort()]
    
    # select biggest n peaks
    results = results[:,0:n]
    
    # rounding to 3 decimal places
    #results = np.round(results,3)

    # if not enough peaks found, fill up with zeros
    if results.shape[1] != n:
        results = np.pad(results,((0,0),(0,n-results.shape[1])),'constant', constant_values=0)
        
    return results

In [9]:
def variance_analysis(data, windowsize, h_thresh, w_thresh, n):
    """
    data = 1d-numpy array of shape [timesteps]
    """
    vardata = variance_filter(data, windowsize)
    res = peakfinder(vardata, h_thresh, w_thresh, 3) # instead of n
    return res.flatten()

def derivative_feature_extractor(data, sensor):
    derdata = derivative_filter(data, windowsize=200, std=50)
    res = peakfinder(derdata, h_thresh=None, w_thresh=20, n=5)
    return res.flatten()
    
def variance_feature_extractor(data, sensor):
    """
    Calls variance analysis with appropr
    """
    windowsize, h_thresh, w_thresh, n = variance_analysis_params[sensor]
    
    return variance_analysis(data, windowsize, h_thresh, w_thresh, n)

def extract_airborne(data):
    num_segments = 5
    
    airborne = data[:,4] # select airborne column
    airborne = np.abs(airborne - np.mean(airborne)) # center data around zero and take absolute value
    airborne /= np.max(airborne) # normalize between 0 and 1
    airborne = ndimage.filters.gaussian_filter1d(airborne, 40) # smooth data with gaussian
    segments = np.array_split(airborne, num_segments)
    
    maximums = np.array([np.max(seg) for seg in segments])
    stds = np.array([np.std(seg) for seg in segments])
    
    features = np.array([maximums, stds])
    return features.flatten()

In [10]:
def feature_extractor(data):
    """
    data = 2d-numpy array of shape [timesteps, sensors]
    
    """
    features = []
    #for idx,sensor in enumerate(['EMG1','Microphone']):
    #    features = features + list(variance_feature_extractor(data[:,idx], sensor))
        
    for idx,sensor in enumerate(['Goniometer X']):
        features = features + list(derivative_feature_extractor(data[:,idx], sensor))
    
    features + list(extract_airborne(data))
    
    for sensor in sensors:
        features.append(np.mean(data[:,sensors.index(sensor)]))
        features.append(np.std(data[:,sensors.index(sensor)]))
        
    #features.append(np.std(data[:,sensors.index('Goniometer X')]))
        
    return features

In [11]:
def dataset_creator(in_file, target_path, outfile):
    df = pd.read_csv(in_file)
    featurelist = []
    for index, row in df.iterrows():
        if(index % 100 == 0):
            print(row['Datafile'])
        if(row['Label'] in classes):
            path = row['Datafile']
            data = pd.read_csv(data_path+path).values
            features = feature_extractor(data)
            features.append(classes.index(row['Label']))
            featurelist.append(np.array(features))
    features = np.array(featurelist)
    print("Shape of features:",features.shape)
    features[:,0:-1] = (features[:,0:-1]-np.mean(features[:,0:-1],axis=0))/np.max(features[:,0:-1],axis=0)
    
    indices = np.random.randint(low=0, high=features.shape[0], size=features.shape[0]//10)
    testset = np.take(features, indices, axis=0)
    features = np.delete(features, indices, axis=0)
    
    if not os.path.exists(target_path):
        os.makedirs(target_path)
        
    odf = pd.DataFrame(features)
    odf.to_csv(target_path+outfile+".csv")
    
    tdf = pd.DataFrame(testset)
    tdf.to_csv(target_path+outfile+"_test.csv")

In [12]:
dataset_creator(data_path+"train.csv",
                "/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/cafeteria/",
                "minifeat")

Subject02/Subject02_Aufnahme000.csv
Subject02/Subject02_Aufnahme100.csv
Subject02/Subject02_Aufnahme200.csv
Subject02/Subject02_Aufnahme300.csv
Subject02/Subject02_Aufnahme400.csv
Subject03/Subject03_Aufnahme060.csv
Subject03/Subject03_Aufnahme160.csv
Subject03/Subject03_Aufnahme260.csv
Subject03/Subject03_Aufnahme360.csv
Subject04/Subject04_Aufnahme020.csv
Subject04/Subject04_Aufnahme120.csv
Subject04/Subject04_Aufnahme220.csv
Subject04/Subject04_Aufnahme320.csv
Subject04/Subject04_Aufnahme420.csv
Subject05/Subject05_Aufnahme082.csv
Subject05/Subject05_Aufnahme182.csv
Subject05/Subject05_Aufnahme282.csv
Subject05/Subject05_Aufnahme382.csv
Subject06/Subject06_Aufnahme044.csv
Subject06/Subject06_Aufnahme144.csv
Subject06/Subject06_Aufnahme244.csv
Subject06/Subject06_Aufnahme344.csv
Subject06/Subject06_Aufnahme444.csv
Subject07/Subject07_Aufnahme089.csv
Subject07/Subject07_Aufnahme189.csv
Subject07/Subject07_Aufnahme289.csv
Subject07/Subject07_Aufnahme389.csv
Subject08/Subject08_Aufnahme

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Subject16/Subject16_Aufnahme070.csv
Subject16/Subject16_Aufnahme170.csv
Subject17/Subject17_Aufnahme015.csv
Subject17/Subject17_Aufnahme115.csv
Subject17/Subject17_Aufnahme215.csv
Subject17/Subject17_Aufnahme315.csv
Subject17/Subject17_Aufnahme415.csv
Subject18/Subject18_Aufnahme075.csv
Subject18/Subject18_Aufnahme175.csv
Subject18/Subject18_Aufnahme275.csv
Subject18/Subject18_Aufnahme375.csv
Subject19/Subject19_Aufnahme039.csv
Subject19/Subject19_Aufnahme139.csv
Subject19/Subject19_Aufnahme239.csv
Subject19/Subject19_Aufnahme339.csv
Subject19/Subject19_Aufnahme439.csv
Shape of features: (6385, 31)


In [13]:
def _float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def convert_csv_to_tfrecords(file,outfile):
    csv = pd.read_csv(file).values

    def serialize_example(featurelist):
        """
        Creates a tf.Example message ready to be written to a file.
        """

        # Create a dictionary mapping the feature name to the type of list
        feature = {}
        for i in range(csv.shape[1]-1):
            feature['feature'+str(i)] = _float_feature(featurelist[i])
        feature['label'] = _int64_feature(int(featurelist[-1]))

        example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
        return example_proto.SerializeToString()


    with tf.python_io.TFRecordWriter(outfile+".tfrecords") as writer:
        for row in csv:
            example = serialize_example(row)
            writer.write(example)


In [14]:
convert_csv_to_tfrecords("/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/cafeteria/minifeat.csv","cafeteria/minifeat")

convert_csv_to_tfrecords("/Users/thomasklein/Projects/BremenBigDataChallenge2019/bigdatachallenge/cafeteria/minifeat_test.csv","cafeteria/minifeat_test")

