In [107]:
import json, os, csv, platform, zipfile, datetime
from glob import glob
import pandas as pd
import numpy as np
from pprint import pprint


DATA_ROOT = "irb"
SESSION_ID = "CHI2019"

def save_jsonfile(name, data):
    """Serialize `data` as JSON into the file `name` and print a confirmation.

    Arguments:
       name: path of the file to create (an existing file is overwritten)
       data: any object that json.dump can serialize
    """
    with open(name, 'w') as handle:
        json.dump(data, handle)
    print("File saved!", name)
    


In [58]:
# %load lib/cvxEDA.py
"""
______________________________________________________________________________

 File:                         cvxEDA.py
 Last revised:                 07 Nov 2015 r69
 ______________________________________________________________________________

 Copyright (C) 2014-2015 Luca Citi, Alberto Greco
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
 Foundation; either version 3 of the License, or (at your option) any later
 version.
 
 This program is distributed in the hope that it will be useful, but WITHOUT
 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 
 You may contact the author by e-mail (lciti@ieee.org).
 ______________________________________________________________________________

 This method was first proposed in:
 A Greco, G Valenza, A Lanata, EP Scilingo, and L Citi
 "cvxEDA: a Convex Optimization Approach to Electrodermal Activity Processing"
 IEEE Transactions on Biomedical Engineering, 2015
 DOI: 10.1109/TBME.2015.2474131

 If you use this program in support of published research, please include a
 citation of the reference above. If you use this code in a software package,
 please explicitly inform the end users of this copyright notice and ask them
 to cite the reference above in their published research.
 ______________________________________________________________________________
"""

import numpy as np
import cvxopt as cv
import cvxopt.solvers

def cvxEDA(y, delta, tau0=2., tau1=0.7, delta_knot=10., alpha=8e-4, gamma=1e-2,
           solver=None, options={'reltol':1e-9}):
    """CVXEDA Convex optimization approach to electrodermal activity processing

    This function implements the cvxEDA algorithm described in "cvxEDA: a
    Convex Optimization Approach to Electrodermal Activity Processing"
    (http://dx.doi.org/10.1109/TBME.2015.2474131, also available from the
    authors' homepages).

    Arguments:
       y: observed EDA signal (we recommend normalizing it: y = zscore(y))
       delta: sampling interval (in seconds) of y
       tau0: slow time constant of the Bateman function
       tau1: fast time constant of the Bateman function
       delta_knot: time between knots of the tonic spline function
       alpha: penalization for the sparse SMNA driver
       gamma: penalization for the tonic spline coefficients
       solver: sparse QP solver to be used, see cvxopt.solvers.qp
       options: solver options, see:
                http://cvxopt.org/userguide/coneprog.html#algorithm-parameters

    Returns (see paper for details):
       r: phasic component
       p: sparse SMNA driver of phasic component
       t: tonic component
       l: coefficients of tonic spline
       d: offset and slope of the linear drift term
       e: model residuals
       obj: value of objective function being minimized (eq 15 of paper)

    Notes:
       The return value is a generator expression over the seven results,
       so it can be unpacked or iterated exactly once, e.g.
       ``r, p, t, l, d, e, obj = cvxEDA(y, delta)``.

       While the solver runs, the global ``cv.solvers.options`` dict is
       cleared and replaced by `options`; the previous content is restored
       after the solver returns. There is no try/finally, so if the solver
       raises, the global options are left holding `options`.

       The default `options` dict is only read (passed to ``update``); it is
       never modified by this function.
    """

    n = len(y)
    y = cv.matrix(y)

    # bateman ARMA model
    a1 = 1./min(tau1, tau0) # a1 > a0
    a0 = 1./max(tau1, tau0)
    ar = np.array([(a1*delta + 2.) * (a0*delta + 2.), 2.*a1*a0*delta**2 - 8.,
        (a1*delta - 2.) * (a0*delta - 2.)]) / ((a1 - a0) * delta**2)
    ma = np.array([1., 2., 1.])

    # matrices for ARMA model
    # Rows 2..n-1 each receive the three AR (resp. MA) coefficients at
    # columns i, i-1 and i-2; rows 0 and 1 of A and M stay empty.
    i = np.arange(2, n)
    A = cv.spmatrix(np.tile(ar, (n-2,1)), np.c_[i,i,i], np.c_[i,i-1,i-2], (n,n))
    M = cv.spmatrix(np.tile(ma, (n-2,1)), np.c_[i,i,i], np.c_[i,i-1,i-2], (n,n))

    # spline
    # delta_knot_s is the knot spacing expressed in samples.
    delta_knot_s = int(round(delta_knot / delta))
    spl = np.r_[np.arange(1.,delta_knot_s), np.arange(delta_knot_s, 0., -1.)] # order 1
    # Convolving the triangular kernel with itself, then scaling its peak to 1.
    spl = np.convolve(spl, spl, 'full')
    spl /= max(spl)
    # matrix of spline regressors
    # `i` is reused here: it becomes a (len(spl), nB) grid of row indices, one
    # column per knot (a knot every delta_knot_s samples), centred on the knot.
    i = np.c_[np.arange(-(len(spl)//2), (len(spl)+1)//2)] + np.r_[np.arange(0, n, delta_knot_s)]
    nB = i.shape[1]
    j = np.tile(np.arange(nB), (len(spl),1))
    # `p` temporarily holds the spline values; it is overwritten with the
    # SMNA driver further down.
    p = np.tile(spl, (nB,1)).T
    # Keep only the spline samples whose row index falls inside [0, n).
    valid = (i >= 0) & (i < n)
    B = cv.spmatrix(p[valid], i[valid], j[valid])

    # trend
    # Two columns: a constant column of ones and a ramp 1/n, 2/n, ..., 1.
    C = cv.matrix(np.c_[np.ones(n), np.arange(1., n+1.)/n])
    nC = C.size[1]

    # Solve the problem:
    # .5*(M*q + B*l + C*d - y)^2 + alpha*sum(A,1)*p + .5*gamma*l'*l
    # s.t. A*q >= 0

    # Swap the module-wide solver options for the caller's `options`;
    # the saved copy is put back after the solve (see Notes in the docstring).
    old_options = cv.solvers.options.copy()
    cv.solvers.options.clear()
    cv.solvers.options.update(options)
    if solver == 'conelp':
        # Use conelp
        # z(m, n) builds an empty m-by-n sparse matrix; its parameter `n`
        # shadows the sample count only inside the lambda body.
        z = lambda m,n: cv.spmatrix([],[],[],(m,n))
        G = cv.sparse([[-A,z(2,n),M,z(nB+2,n)],[z(n+2,nC),C,z(nB+2,nC)],
                    [z(n,1),-1,1,z(n+nB+2,1)],[z(2*n+2,1),-1,1,z(nB,1)],
                    [z(n+2,nB),B,z(2,nB),cv.spmatrix(1.0, range(nB), range(nB))]])
        h = cv.matrix([z(n,1),.5,.5,y,.5,.5,z(nB,1)])
        c = cv.matrix([(cv.matrix(alpha, (1,n)) * A).T,z(nC,1),1,gamma,z(nB,1)])
        res = cv.solvers.conelp(c, G, h, dims={'l':n,'q':[n+2,nB+2],'s':[]})
        obj = res['primal objective']
    else:
        # Use qp
        # Unknown vector is stacked as [q; d; l] (sizes n, nC, nB), which is
        # the layout the slicing of res['x'] below relies on.
        Mt, Ct, Bt = M.T, C.T, B.T
        H = cv.sparse([[Mt*M, Ct*M, Bt*M], [Mt*C, Ct*C, Bt*C], 
                    [Mt*B, Ct*B, Bt*B+gamma*cv.spmatrix(1.0, range(nB), range(nB))]])
        f = cv.matrix([(cv.matrix(alpha, (1,n)) * A).T - Mt*y,  -(Ct*y), -(Bt*y)])
        # Inequality -A*q <= 0 on the first n unknowns, i.e. A*q >= 0.
        res = cv.solvers.qp(H, f, cv.spmatrix(-A.V, A.I, A.J, (n,len(f))),
                            cv.matrix(0., (n,1)), solver=solver)
        # The QP form leaves out the constant .5*y'*y of the expanded squared
        # residual, so it is added back to report the full objective.
        obj = res['primal objective'] + .5 * (y.T * y)
    # Restore the solver options that were active before this call.
    cv.solvers.options.clear()
    cv.solvers.options.update(old_options)

    # Split the solution vector: first n entries are q, the next nC are the
    # drift terms d, the last nB are the spline coefficients l.
    l = res['x'][-nB:]
    d = res['x'][n:n+nC]
    t = B*l + C*d
    q = res['x'][:n]
    p = A * q
    r = M * q
    e = y - r - t

    # Lazily converts each result to a flat numpy array (generator, not tuple).
    return (np.array(a).ravel() for a in (r, p, t, l, d, e, obj))


In [96]:
def process_e4(folder, verbose = True):
    '''
    Processes E4 zipfile
    Unzip all zip files (e4*.zip) in a folder.
    Append a suffix (user_id) to the extracted files.
    Delete zip file.

    The user id is the basename of `folder`. Each extracted file
    <stem><ext> is renamed to <stem>_<user><ext>, lower-cased; only the last
    extension is treated as the extension, so "my.data.CSV" becomes
    "my.data_<user>.csv". Directory entries inside the archive are left alone
    and the directory part of nested entries is kept unchanged.

    Arguments:
       folder: directory that holds the e4*.zip archives
       verbose: when True, print every unzip and rename step
    '''
    user = os.path.basename(folder)
    zips = glob(folder + "/e4*.zip")
    if len(zips) == 0:
        print("E4 processed")
    for z in zips:
        zip_full_path = os.path.realpath(z)
        directory_to_extract_to = os.path.dirname(zip_full_path)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_full_path, 'r') as zip_ref:
            zip_ref.extractall(directory_to_extract_to)
            files = zip_ref.namelist()
        if verbose:
            print("Unzipping", z)
        for file in files:
            # namelist() reports directories with a trailing slash; they get
            # no user suffix.
            if file.endswith("/"):
                continue
            subdir, basename = os.path.split(file)
            stem, ext = os.path.splitext(basename)
            # Only the file name is rewritten, so the (already extracted)
            # parent directory of a nested entry is still valid.
            prefixed = os.path.join(subdir, (stem + "_" + user + ext).lower())
            src = os.path.join(directory_to_extract_to, file)
            dest = os.path.join(directory_to_extract_to, prefixed)
            if verbose:
                print("Renaming", os.path.basename(src), os.path.basename(dest))
            os.rename(src, dest)
        os.remove(zip_full_path)

In [114]:
def process_log(folder):
    """Wrap each raw log file of a user folder and strip its start time from the name.

    Every file matching log*.json in `folder` whose stem has exactly three
    underscore-separated parts (<filetype>_<user_id>_<starttime>) is rewritten
    in place as {"data": <original content>} and renamed to
    <filetype>_<user_id>.json. Files with any other name shape are left
    untouched (they are still reported as processed).

    A log whose target name already exists is skipped with an error message
    instead of overwriting the existing file.

    Arguments:
       folder: user directory; its basename is the expected user id
    """
    user = os.path.basename(folder)
    logs = glob(folder + "/log*.json")
    if len(logs) == 0:
        print("ERROR: No log found for", user)
        return
    for log in logs:
        # splitext tolerates additional dots inside the stem.
        logname, ext = os.path.splitext(os.path.basename(log))
        parts = logname.split('_')
        if len(parts) == 3:
            filetype, user_id, _starttime = parts
            if user_id != user:
                print("ERROR: Log is incorrectly labeled/in wrong folder")
            name = os.path.join(os.path.dirname(log),
                                "_".join([filetype, user_id]) + ext)
            # Check before touching the file: os.rename would silently replace
            # an existing log on POSIX, and wrapping without renaming would
            # make the next run wrap the same file a second time.
            if os.path.exists(name):
                print("ERROR: Not overwriting existing log", name)
                continue
            with open(log, 'r+') as f:
                data = json.load(f)
                # Rewrite from the start, then cut off any leftover bytes.
                f.seek(0)
                json.dump({'data': data}, f)
                f.truncate()
                # Message kept from the original; only the {'data': ...}
                # wrapping happens here, the start time parsed from the
                # file name is not written into the log.
                print("Starttime appended to log file")
            os.rename(log, name)
        print("Log processed.", log)

In [28]:
def process_videos(folder):
    '''
    Moves video to a data directory and makes a metadata json with paths to all video resources.

    Every .mp4/.mov file directly inside `folder` is moved to `folder`/data
    (created when missing). The metadata returned by get_video_metadata for
    that data directory is then saved as videometadata_<user>.json in
    `folder`, where <user> is the basename of `folder`.

    Arguments:
       folder: user directory to process
    '''
    data_dir = folder + "/data"
    user = os.path.basename(folder)
    if not os.path.exists(data_dir):
        print("Making data directory")
        os.makedirs(data_dir)

    files = glob(folder + "/*.mp4") + glob(folder + "/*.mov")
    for src in files:
        os.rename(src, os.path.join(data_dir, os.path.basename(src)))

    data = get_video_metadata(data_dir)
    # Name the file after this folder's user instead of a fixed "111".
    save_jsonfile(os.path.join(folder, "videometadata_" + user + ".json"), data)
    
def get_video_metadata(folder):
    """Index the .mp4/.mov files directly inside `folder` by file type.

    The file type is the part of the file name before the first underscore.
    A video whose file name contains "opt" is stored under the 'opt' key of
    its type, any other video under 'raw'. When several files map to the same
    type and key, the one listed last by glob wins.

    Arguments:
       folder: directory to scan (not recursive)

    Returns:
       dict of the form {file_type: {'raw': path, 'opt': path}}; either inner
       key may be missing.
    """
    files = glob(folder + "/*.mp4") + glob(folder + "/*.mov")

    data = {}
    for video in files:
        name = os.path.basename(video)
        file_type = name.split("_")[0]
        # Test the file name only: an "opt" somewhere in the directory part of
        # the path must not turn every video into an optimized one.
        variant = 'opt' if "opt" in name else 'raw'
        data.setdefault(file_type, {})[variant] = video
    return data
#        

## Empatica Biosignals

In [29]:
def process_bio(folder):
    '''
    Converts every <filetype>_<user_id>.csv file of a user folder to json.

    For each csv file directly inside `folder` the content parsed by
    extract_info is merged with the E4_MANIFEST entry of its file type,
    tagged with user_id (basename of `folder`, as int) and session_id
    (SESSION_ID), saved next to the csv as <filetype>_<user_id>.json, and
    the csv is deleted.

    Relies on a module-level E4_MANIFEST mapping keyed by file type, which is
    not defined in the notebook cells shown here and has to exist before
    this is called.
    '''
    user = os.path.basename(folder)
    for csv_path in glob(folder + "/*.csv"):
        info = extract_info(csv_path)
        directory = os.path.dirname(csv_path)
        filename, ext = os.path.basename(csv_path).split('.')
        filetype, user_id = filename.split("_")
        # Manifest values take precedence over parsed values on key clashes.
        data = dict(info, **E4_MANIFEST[filetype])
        data['user_id'] = int(user)
        data['session_id'] = SESSION_ID
        json_name = "{}_{}.json".format(filetype, user_id)
        save_jsonfile(os.path.join(directory, json_name), data)
        os.remove(csv_path)
    print("Processed bio csv files to json.")

In [30]:
def extract_info(file):
    """Parse one biosignal csv file into a dict.

    The signal name is the part of the file name before the first "." and
    the first "_". Unless the name is "tags", the first two rows are read as
    the timestamp and the sampling rate (first column of each, truncated to
    int). The remaining rows become info['data']:
       "ibi": list of [float, float] (first two columns)
       "acc": list of [float, float, float] (first three columns)
       anything else: flat list of floats (first column)
    Blank rows in the data section are skipped.

    NOTE(review): every non-"tags" file is assumed to start with a timestamp
    row followed by a sampling-rate row; confirm that this also holds for
    "ibi" exports.

    Arguments:
       file: path of the csv file

    Returns:
       dict with 'name', 'data' and, except for "tags", 'timestamp' and
       'sampling_rate'.
    """
    info = {}
    name = os.path.basename(file).split('.')[0].split("_")[0]
    info['name'] = name
    # Number of leading columns kept per row; other signals keep one scalar.
    n_cols = {"ibi": 2, "acc": 3}.get(name)
    # newline='' is what the csv module asks for when opening files to read.
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        if name != "tags":
            info['timestamp'] = int(float(next(csvreader)[0]))
            info['sampling_rate'] = int(float(next(csvreader)[0]))
        # Empty or whitespace-only lines carry no sample; skipping them avoids
        # an IndexError/ValueError on otherwise valid files.
        rows = (row for row in csvreader if any(cell.strip() for cell in row))
        if n_cols is None:
            info['data'] = [float(row[0]) for row in rows]
        else:
            # Explicit indexing keeps the IndexError for rows that are too short.
            info['data'] = [[float(row[k]) for k in range(n_cols)] for row in rows]

    return info

In [31]:
def process_filenames(folder):
    '''
    Checks that all files are named <filetype>_<user_id>.<ext> and
    lower-cases the <filetype> part.

    Only regular files directly inside `folder` are looked at. A file whose
    user_id differs from the basename of `folder` is reported but still
    renamed. A file whose stem does not consist of exactly two
    underscore-separated parts is reported and left untouched, so one stray
    file no longer aborts the whole run.

    Arguments:
       folder: user directory to check
    '''
    user = os.path.basename(folder)
    for f in glob(folder + "/*"):
        if not os.path.isfile(f):
            continue
        directory, basename = os.path.split(f)
        # splitext copes with files without extension or with extra dots.
        name, ext = os.path.splitext(basename)
        parts = name.split("_")
        if len(parts) != 2:
            print("Invalid", f, 'expecting <filetype>_' + user, 'but got', name)
            continue
        filetype, user_id = parts
        if user != user_id:
            print("Invalid", f, 'expecting', user, 'but got', user_id)
        lname = "_".join([filetype.lower(), user_id]) + ext
        dst = os.path.join(directory, lname)
        # Nothing to do when the name already follows the convention.
        if dst != f:
            os.rename(f, dst)
    print("Processed files for correct naming conventions.")

In [32]:
def process_images(folder):
    """Move the images of a user folder into its data directory and index them.

    Every .jpg/.png file directly inside `folder` is moved to `folder`/data
    (created when missing). The metadata returned by get_img_metadata for
    that data directory is then saved as imagemetadata_<user>.json in
    `folder`, where <user> is the basename of `folder`.

    Arguments:
       folder: user directory to process
    """
    data_dir = folder + "/data"
    user = os.path.basename(folder)
    if not os.path.exists(data_dir):
        print("Making data directory")
        os.makedirs(data_dir)

    files = glob(folder + "/*.jpg") + glob(folder + "/*.png")
    for src in files:
        os.rename(src, os.path.join(data_dir, os.path.basename(src)))

    data = get_img_metadata(data_dir)
    # Name the file after this folder's user instead of a fixed "111".
    save_jsonfile(os.path.join(folder, "imagemetadata_" + user + ".json"), data)

In [33]:
def get_img_metadata(folder):
    """Group the image paths directly inside `folder` by file type.

    The file type is the part of the file name before the first underscore.
    .png files are listed before .jpg files.

    Arguments:
       folder: directory to scan (not recursive)

    Returns:
       dict mapping each file type to the list of matching image paths
    """
    data = {}
    for pattern in ("/*.png", "/*.jpg"):
        for img in glob(folder + pattern):
            file_type = os.path.basename(img).split("_")[0]
            data.setdefault(file_type, []).append(img)
    return data


In [34]:
def process_notes(folder):
    """Move the text notes of a user folder into its data directory and index them.

    Every .txt file directly inside `folder` is moved to `folder`/data
    (created when missing). The metadata returned by get_notes_metadata for
    that data directory is then saved as notesmetadata_<user>.json in
    `folder`, where <user> is the basename of `folder`.

    Arguments:
       folder: user directory to process
    """
    data_dir = folder + "/data"
    user = os.path.basename(folder)
    if not os.path.exists(data_dir):
        print("Making data directory")
        os.makedirs(data_dir)

    for src in glob(folder + "/*.txt"):
        os.rename(src, os.path.join(data_dir, os.path.basename(src)))

    data = get_notes_metadata(data_dir)
    # Name the file after this folder's user instead of a fixed "111".
    save_jsonfile(os.path.join(folder, "notesmetadata_" + user + ".json"), data)

In [35]:
def get_notes_metadata(folder):
    """Group the .txt paths directly inside `folder` by file type.

    The file type is the part of the file name before the first underscore.

    Arguments:
       folder: directory to scan (not recursive)

    Returns:
       dict mapping each file type to the list of matching note paths
    """
    data = {}
    for note in glob(folder + "/*.txt"):
        file_type = os.path.basename(note).split("_")[0]
        if file_type in data:
            data[file_type].append(note)
        else:
            data[file_type] = [note]
    return data


In [95]:
# NUKE THE UNOPTIMIZED VIDEO FILES
def nuke_vids(folder):
    """Delete the unoptimized videos directly inside `folder`.

    Every .mp4/.mov file whose file name does not contain "opt" is removed
    from disk. This cannot be undone.

    Arguments:
       folder: directory to clean (not recursive)
    """
    for video in glob(folder + "/*.mp4") + glob(folder + "/*.mov"):
        # Look at the file name only: an "opt" in a parent directory name
        # would otherwise protect every raw video from deletion.
        if "opt" not in os.path.basename(video):
            os.remove(video)

# Data Processing

In [115]:
# Run the whole preprocessing pipeline on every user directory under DATA_ROOT.
# `data` is a lazy filter object: it is consumed by the loop below and is
# empty afterwards.
data = filter(os.path.isdir, glob(DATA_ROOT + "/*"))
for user_folder in data:
    # `user` is assigned here but not used by any call in this cell.
    user = os.path.basename(user_folder)
    # Stage order: unpack the E4 archives, wrap/rename the logs, convert the
    # csv files to json, move videos/images/notes into <user_folder>/data and
    # write their metadata files.
    process_e4(user_folder)
    process_log(user_folder)
    process_bio(user_folder)
    process_videos(user_folder)
    process_images(user_folder)
    process_notes(user_folder)
    # Runs last, so it also checks the json files written by the stages above.
    process_filenames(user_folder)

E4 processed
Starttime appended to log file
Log processed. irb/111/log_111_1929292.json
Processed bio csv files to json.
File saved! irb/111/videometadata_111.json
File saved! irb/111/imagemetadata_111.json
File saved! irb/111/notesmetadata_111.json
Processed files for correct naming conventions.


In [110]:
# Destructive clean-up: for every user directory under DATA_ROOT, nuke_vids
# deletes the videos in <user_folder>/data that are not marked "opt".
# process_videos is what moves the videos into that data directory, so this
# cell belongs after the processing cell (note its lower execution count).
data = filter(os.path.isdir, glob(DATA_ROOT + "/*"))
for user_folder in data: 
    nuke_vids(os.path.join(user_folder, 'data'))