In [1]:
import multiprocessing as mp
import time

import matplotlib.pylab as plt
import numpy
import numpy as np
import pandas as pd
import s3fs
import scipy.stats

%matplotlib inline

In [2]:
def collect_featured_data_from_s3(path, fs):
    """Featurize every entry listed under ``path``, one after another.

    Parameters
    ----------
    path : str
        Location handed to ``fs.ls``; each entry it returns is treated as one
        capture session.
    fs : object
        Filesystem-like object providing ``ls``.

    Returns
    -------
    list of dict
        One feature dictionary (as produced by ``featurize``) per entry of
        ``fs.ls(path)``, in listing order.
    """

    def _featurize_session(session_dir):
        # Merge the session's partial JSON files, reduce them to a single
        # magnitude series, then turn that series into a feature dict.
        part_files = fs.ls(session_dir)
        magnitude = get_accleration_timeseries(combine_multiple_json(part_files))
        # The label spelling is runtime data and is kept exactly as-is.
        return featurize([magnitude, 'unkown', np.arange(0, 100, 10)])

    return [_featurize_session(session_dir) for session_dir in fs.ls(path)]
    
    

In [3]:
def mp_collect_featured_data_from_s3(path, fs, processes=8):
    """Featurize every entry listed under ``path`` using a process pool.

    The per-session time series are built serially in this process; only the
    ``featurize`` step is distributed over the pool.

    Parameters
    ----------
    path : str
        Location handed to ``fs.ls``; each entry it returns is treated as one
        capture session.
    fs : object
        Filesystem-like object providing ``ls``.
    processes : int, optional
        Number of worker processes (default 8, the previously hard-coded value).

    Returns
    -------
    list of dict
        One feature dictionary per entry of ``fs.ls(path)``, in listing order.

    Side effects
    ------------
    Prints the wall-clock time spent creating the pool and mapping, in minutes.
    """
    data_params = []
    for session_dir in fs.ls(path):
        bottom_dir = fs.ls(session_dir)
        series = get_accleration_timeseries(combine_multiple_json(bottom_dir))
        # The label spelling is runtime data and is kept exactly as-is.
        data_params.append((series, 'unkown', np.arange(0, 100, 10)))

    then = time.time()
    # The context manager tears the workers down even if featurize raises;
    # previously the pool was never closed and its processes were leaked.
    with mp.Pool(processes=processes) as pool:
        data = pool.map(featurize, data_params)
        elapsed = time.time() - then
    print(elapsed/60, "minutes")

    return data


In [5]:
def combine_multiple_json(bottom_dir, filesystem=None):
    """Merge a session's partial JSON files into one acceleration frame.

    Parameters
    ----------
    bottom_dir : iterable of str
        Paths of the partial JSON files to read.
    filesystem : object, optional
        Object providing ``open(path)`` that returns a readable, closable
        file handle. When omitted, the notebook-level ``fs`` is used, which
        is what the original one-argument form relied on.

    Returns
    -------
    pandas.DataFrame
        The first three columns of the flattened ``'motion'`` records.
        # presumably the x/y/z acceleration components; confirm against the data

    Raises
    ------
    KeyError
        If no file could be parsed (there is then no ``'motion'`` column).

    Notes
    -----
    Files that ``pd.read_json`` rejects with ``ValueError`` are skipped
    silently (best-effort behaviour carried over from the original).
    """
    if filesystem is None:
        filesystem = fs  # notebook-level filesystem handle

    frames = []
    for partial_json in bottom_dir:
        # ``with`` guarantees the handle is closed even when parsing fails;
        # the original skipped close() on the ``continue`` path.
        with filesystem.open(partial_json) as handle:
            try:  # TODO fix this try/except
                # Pass the handle itself rather than its raw bytes so pandas
                # performs the read/decoding.
                frames.append(pd.read_json(handle))
            except ValueError:
                continue

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append is quadratic and no longer exists in pandas 2).
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame([])

    # Prefer the public top-level json_normalize; fall back for old pandas.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    df = normalize(df['motion'])
    df = df.iloc[:, 0:3]

    return df
        
def get_accleration_timeseries(timeseries):
    """Collapse each row of ``timeseries`` to its Euclidean norm.

    Every element is squared, the squares are summed across each row
    (``axis=1``), and the square root of each row sum is taken.

    Returns a Series with one value per input row.
    """
    return timeseries.pow(2).sum(axis=1).apply(np.sqrt)

def featurize(params):
    """Turn one time series into a flat dictionary of summary features.

    Parameters
    ----------
    params : sequence
        ``(ts, label, bins)``, packed into a single argument so the function
        can be used directly with ``Pool.map``:
        ``ts`` is the series of values, ``label`` is copied verbatim into the
        result, and ``bins`` are the histogram bin edges.

    Returns
    -------
    dict
        Keys, in order: ``mean_over_median``, ``std_over_median``, ``length``,
        ``kurtosis``, ``binfrac_0`` .. ``binfrac_{k-1}`` (fraction of the
        binned values falling in each bin) and ``label``.
        Both ratios are ``0`` when the median is exactly 0. The bin fractions
        are NaN when no value falls inside ``bins``.

    Notes
    -----
    The histogram is computed with ``np.histogram``. The previous
    ``plt.hist`` call produced the same counts but also drew onto the current
    matplotlib figure on every call, including inside pool workers.
    """
    ts, label, bins = params[0], params[1], params[2]

    # NOTE(review): ts is handed to numpy unconverted; if it is a pandas
    # Series, np.std may use pandas' ddof convention -- confirm which is intended.
    mean = np.mean(ts)
    median = np.median(ts)
    std = np.std(ts)
    length = len(ts)
    kurtosis = scipy.stats.kurtosis(ts)

    counts, _edges = np.histogram(ts, bins=bins)
    total = counts.sum()
    if total:
        # normalize, i.e. fraction of entries in each bin
        fractions = counts / float(total)
    else:
        # Nothing landed in any bin: keep the NaN result the 0/0 division
        # used to give, without triggering the RuntimeWarning.
        fractions = np.full(len(counts), np.nan)

    if median == 0:
        # Avoid dividing by a zero median.
        mean_over_median = 0
        std_over_median = 0
    else:
        mean_over_median = mean/median
        std_over_median = std/median

    features = {'mean_over_median': mean_over_median,  # dimensionless
                'std_over_median': std_over_median,  # dimensionless
                'length': length,
                'kurtosis': kurtosis,  # already dimensionless by definition
                }

    for i, val in enumerate(fractions):
        features[f'binfrac_{i}'] = val

    features['label'] = label

    return features
    

In [6]:
# Location passed to fs.ls(); each entry listed under it is treated as one capture session.
path_to_data = 'cchase-rh-demo-4/mock-server-data/motions/2019-02-25'
# Must stay a notebook-level name: combine_multiple_json reads the global `fs` directly.
fs = s3fs.S3FileSystem()
# Featurize all sessions using the process pool; `x` is the returned list of feature dicts.
x = mp_collect_featured_data_from_s3(path_to_data,fs)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


0.0023487528165181478 minutes
