In [73]:
import json, os, csv, platform, zipfile, datetime, time
from glob import glob
import pandas as pd
import numpy as np
from pprint import pprint


# Root directory scanned by the batch cells; each sub-directory of it is handed to the
# process_* functions as one participant folder.
DATA_ROOT = "irb"
# Label that process_bio writes into every converted signal file under 'session_id'.
SESSION_ID = "CHI2019"

# Metadata for each known file type, keyed by the filename prefix (the text before the
# first "_"). process_bio merges the matching entry into every signal file it converts,
# and extract_solder_info uses the 'a' / 'g' / 'm' entries as the base of each per-axis
# file. Every entry has a display 'name'; 'description' and 'unit' are present only
# where listed below.
E4_MANIFEST = {
    'phasic': {
        'name': "Phasic GSR",
        'description': "Event-specific EDA"
    },
    'smna':{
        'name': "SMNA",
        'description': "sparse SMNA driver of phasic component"
    },
    'textchunk': {
        'name': "Text Chunk",
        'description': "Values correspond to typing behaviors.  # of consecutive characters typed."
    },
    'temp': {
        'name': "Temperature",
        'description': "Data from temperature sensor expressed degrees on the Celsius (°C) scale.",
        'unit': "celsius"
    },
    'tags':{
        'name': "Tags",
        'description': "Event mark times. Each row corresponds to a physical button press on the device; the same time as the status LED is first illuminated. The time is expressed as a unix timestamp in UTC and it is synchronized with initial time of the session indicated in the related data files from the corresponding session."
    },
    'acc':{
        'description': "Data from 3-axis accelerometer sensor. The accelerometer is configured to measure acceleration in the range [-2g, 2g]. Therefore the unit in this file is 1/64g. Data from x, y, and z axis are respectively in first, second, and third column.",
        'name': "3-Axis Accelerometer",
        'unit': "1/64g"
    }, 
    'eda':{
        'description': "Data from the electrodermal activity sensor expressed as microsiemens (μS).", 
        'name':"Electrodermal Activity",
        'unit': "μS"
    }, 
    'bvp':{
        'name': "Blood Volume Pulse (BVP) from PPG",
        'description': "Data from photoplethysmograph.",
    }, 
    'ibi':{
        'name': "IBI", 
        'description': "Time between individuals heart beats extracted from the BVP signal. No sample rate is needed for this file. The first column is the time (respect to the initial time) of the detected inter-beat interval expressed in seconds (s). The second column is the duration in seconds (s) of the detected inter-beat interval (i.e., the distance in seconds from the previous beat)."
    },
    'hr':{
        'name': "Heart rate", 
        'description': "Average heart rate extracted from the BVP signal.The first row is the initial time of the session expressed as unix timestamp in UTC. The second row is the sample rate expressed in Hz."
    }, 
    # The entries below carry a name only (no description or unit).
    'kinnunen':{
        'name': "Kinnunen codes", 
        'description': "From Kinnunen et al.; self-efficacy assessments x emotions; used as a closed-coding system. Addition of a failure stage."
    }, 
    'a':{
        'name': "soldering acc"
    }, 
    'g':{
         'name': "gyro"
    }, 
    'm':{
        'name': 'magnetometer'
    }
}
def save_jsonfile(name, data):
    """Write ``data`` as JSON to the path ``name`` and announce it on stdout.

    name -- destination path; an existing file is overwritten.
    data -- any object ``json.dump`` can serialize.
    """
    with open(name, 'w') as handle:
        json.dump(data, handle)
    print("File saved!", name)
    
def gen_save_file(folder, feature):
    """Return the JSON output path for ``feature`` inside ``folder``.

    The id is the last path component of ``folder``; the result has the form
    ``<folder>/<feature>_<id>.json``.
    """
    participant = os.path.basename(folder)
    filename = "_".join([feature, participant]) + ".json"
    return os.path.join(folder, filename)

In [90]:
def process_e4(folder, verbose = True):
    '''
    Processes the E4 zip files found directly inside a participant folder.

    For every ``e4*.zip`` in ``folder``:
      1. extract all members next to the archive,
      2. rename each extracted file to ``<lowercased stem>_<user_id><lowercased ext>``,
      3. delete the archive.

    folder  -- participant folder; its last path component is used as the user id.
    verbose -- when True, print every unzip / rename step.

    Prints "E4 processed" when no archive is found.
    '''
    user = os.path.basename(folder)
    zips = glob(folder + "/e4*.zip")
    if len(zips) == 0:
        print("E4 processed")
    for z in zips:
        zip_full_path = os.path.realpath(z)
        directory_to_extract_to = os.path.dirname(zip_full_path)
        # The context manager closes the archive even when extraction raises.
        with zipfile.ZipFile(zip_full_path, 'r') as zip_ref:
            zip_ref.extractall(directory_to_extract_to)
            files = zip_ref.namelist()
        if verbose:
            print("Unzipping", z)
        for file in files:
            if file.endswith("/"):
                # Directory entries have nothing to rename.
                continue
            if verbose:
                print(file)
            # Insert the suffix before the final extension only, and lower-case the
            # member name but not the user id, so the suffix matches the folder name.
            subdir, member = os.path.split(file)
            stem, ext = os.path.splitext(member)
            prefixed = os.path.join(subdir, stem.lower() + "_" + user + ext.lower())
            src = os.path.join(directory_to_extract_to, file)
            dest = os.path.join(directory_to_extract_to, prefixed)
            if verbose:
                print("Renaming", os.path.basename(src), os.path.basename(dest))
            os.rename(src, dest)
        os.remove(zip_full_path)

In [91]:
def process_log(folder):
    """Normalize the ``log*.json`` files of a participant folder in place.

    A log named ``<type>_<user>_<starttime>.json`` is expected to hold a JSON list
    of entries that each have a 'time' field. Every 'time' is converted to float and
    divided by 1000 (presumably milliseconds to seconds -- confirm with the logger),
    the list is re-written wrapped as ``{"data": [...]}``, and the file is renamed to
    ``<type>_<user>.json``. Logs whose name does not have exactly three
    underscore-separated parts are left untouched.

    folder -- participant folder; its last path component is the expected user id.
    """
    user = os.path.basename(folder)
    logs = glob(folder + "/log*.json")
    if len(logs) == 0:
        print("ERROR: No log found for", user)
        return
    for log in logs:
        # splitext strips only the final extension, so a start time that contains a
        # dot (e.g. log_722_1544.25.json) no longer breaks the two-value unpack.
        logname, ext = os.path.splitext(os.path.basename(log))
        parts = logname.split('_')
        if len(parts) == 3:
            filetype, user_id, starttime = parts
            if user_id != user:
                print("ERROR: Log is incorrectly labeled/in wrong folder")
            with open(log, 'r+') as f:
                data = json.load(f)
                for i, d in enumerate(data):
                    data[i]['time'] = float(d['time'])/ 1000
                # Rewind and truncate so the shorter/longer payload replaces the old one.
                f.seek(0)
                info = {}
                info['data'] = data
                json.dump(info, f)
                f.truncate()
                # NOTE(review): the message says the start time was appended, but
                # `starttime` is not stored anywhere and the rename below drops it
                # from the filename -- confirm whether it should be kept in `info`.
                print("Starttime appended to log file")

            name = "_".join([filetype, user_id]) + ext
            name = os.path.join(os.path.dirname(log), name)
            os.rename(log, name)
        print("Log processed.", log)

In [92]:
def process_videos(folder):
    '''
    Moves the ``*.mp4``, ``*.m4v`` and ``*.mov`` files of ``folder`` into
    ``<folder>/data`` (created on demand), then writes ``videometadata_<user>.json``
    in ``folder`` with whatever get_video_metadata reports for that data directory.
    '''
    target_dir = folder + "/data"
    participant = os.path.basename(folder)
    if not os.path.exists(target_dir):
        print("Making data directory")
        os.makedirs(target_dir)

    # Collect every match first, then move; the patterns are tried in this order.
    videos = [path
              for pattern in ("/*.mp4", "/*.m4v", "/*.mov")
              for path in glob(folder + pattern)]
    for path in videos:
        os.rename(path, os.path.join(target_dir, os.path.basename(path)))

    metadata = get_video_metadata(target_dir)
    save_jsonfile(os.path.join(folder, "videometadata_" + participant + ".json"), metadata)
    
def get_video_metadata(folder):
    """Index the video files in ``folder`` by file type and raw/optimized variant.

    A file counts as a video when its extension is .mp4, .mov or .m4v (any case).
    The file type is the part of the filename before the first underscore. A file
    whose *name* contains "opt" is stored under 'opt', any other under 'raw'; the
    directory part of the path is deliberately ignored for that test.

    folder -- directory to scan (not recursive).

    Returns a dict ``{file_type: {'raw': path, 'opt': path}}``; either inner key may
    be missing. When several files compete for one slot the last one in sorted path
    order wins.
    """
    video_exts = (".mp4", ".mov", ".m4v")
    data = {}
    # sorted() makes the result independent of the directory listing order.
    for video in sorted(glob(folder + "/*")):
        name = os.path.basename(video)
        if os.path.splitext(name)[1].lower() not in video_exts:
            continue
        file_type = name.split("_")[0]
        variant = 'opt' if "opt" in name else 'raw'
        data.setdefault(file_type, {})[variant] = video
    return data
#        

## Soldering data

In [93]:
def extract_solder_info(folder, file):
    """Convert one three-axis soldering sensor CSV into one JSON file per axis.

    The sensor name is the part of the filename before the first "_" and must be a
    key of E4_MANIFEST. The first CSV row is read as
    ``start timestamp, end timestamp, elapsed time``; every following non-blank row
    as ``x, y, z`` samples. For each axis a file ``<name>-<axis>_<user>.json`` is
    written into ``folder`` containing the manifest entry, the header values, the
    feature name and the samples.

    folder -- participant folder that receives the JSON files.
    file   -- path of the CSV to read.

    Returns the header dict (name, timestamp, end_timestamp, elapsed_time).
    """
    info = {}
    name = os.path.basename(file).split('.')[0].split("_")[0]
    info['name'] = name
    x = []
    y = []
    z = []
    # newline='' is what the csv module requires for files passed to csv.reader.
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        r = next(csvreader)
        info['timestamp'] = int(float(r[0]))
        info['end_timestamp'] = int(float(r[1]))
        info['elapsed_time'] = int(float(r[2]))

        print(time.ctime(info['timestamp']))
        print(time.ctime(info['end_timestamp']))
        # Presumably seconds shown as minutes -- confirm the unit of elapsed_time.
        print(info['elapsed_time']/60)

        for row in csvreader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline);
                # skipping them keeps the three axes aligned.
                continue
            x.append(float(row[0]))
            y.append(float(row[1]))
            z.append(float(row[2]))

    channels = {'x':x, 'y':y, 'z':z}

    for c in channels:
        feature = name + "-" + c

        # Header values override manifest keys; the name is then set to the feature.
        d = dict(E4_MANIFEST[name], **info)
        d["name"] = feature
        d["data"] = channels[c]

        savefile = gen_save_file(folder, feature)
        save_jsonfile(savefile, d)

    return info
def process_soldering(folder):
    """Convert, then delete, every ``a_*.csv``, ``g_*.csv`` and ``m_*.csv`` in ``folder``.

    Each file is passed to extract_solder_info before it is removed.
    """
    patterns = ("/a_*.csv", "/g_*.csv", "/m_*.csv")
    pending = [path for pattern in patterns for path in glob(folder + pattern)]
    for path in pending:
        extract_solder_info(folder, path)
        os.remove(path)

In [94]:
# One-off run on a single folder; the path is hard-coded and not derived from DATA_ROOT.
process_soldering("irb/722")

## Empatica Biosignals

In [107]:
def process_bio(folder):
    '''
    Converts every signal CSV in a participant folder to JSON.

    Each ``<type>_<user>.csv`` in ``folder`` is parsed with extract_info, merged with
    the E4_MANIFEST entry for ``<type>`` (manifest keys win, so 'name' becomes the
    display name), tagged with 'user_id' and 'session_id', saved next to the CSV as
    ``<type>_<user>.json`` and then deleted. Files of type "code" are left alone.
    CSVs that do not follow the naming scheme, or whose type is not in E4_MANIFEST,
    are reported and left untouched, so one stray file does not abort the folder
    half-way through.

    folder -- participant folder; its last path component must be an integer id
              (``int()`` raises ValueError otherwise).
    '''
    user = os.path.basename(folder)
    files = glob(folder + "/*.csv")
    print(files)
    for f in files:

        directory = os.path.dirname(f)
        filename, ext = os.path.splitext(os.path.basename(f))
        parts = filename.split("_")
        if len(parts) != 2:
            print("Skipping", f, "- expected <type>_<user>.csv")
            continue
        filetype, user_id = parts
        if filetype == "code":
            continue
        if filetype not in E4_MANIFEST:
            print("Skipping", f, "- unknown file type", filetype)
            continue
        print("INFO", f)
        info = extract_info(f)

        data = dict(info, **E4_MANIFEST[filetype])
        data['user_id'] = int(user)
        data['session_id'] = SESSION_ID
        save_jsonfile(os.path.join(directory, filetype + "_" + user_id + ".json"), data)
        # The CSV is removed only after its JSON replacement has been written.
        os.remove(f)
    print("Processed bio csv files to json.")

In [96]:
def extract_info(file):
    """Parse one E4 signal CSV into a dict.

    The signal name is the part of the filename before the first "_". Layout by name:
      * "tags": no header rows; every row is one value.
      * "ibi":  one header row (session start time) and NO sample-rate row; every
                data row is a ``[time, duration]`` pair.
      * "acc":  start-time row, sample-rate row, then ``[x, y, z]`` rows.
      * others: start-time row, sample-rate row, then one value per row.
    Blank lines are ignored.

    file -- path of the CSV to read.

    Returns a dict with 'name', 'data', 'timestamp' (absent for "tags") and
    'sampling_rate' (absent for "tags" and "ibi").
    """
    info = {}
    name = os.path.basename(file).split('.')[0].split("_")[0]
    info['name'] = name
    # newline='' is what the csv module requires for files passed to csv.reader.
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        if name != "tags":
            info['timestamp'] = int(float(next(csvreader)[0]))
            if name != "ibi":
                # IBI files carry no sample-rate row; reading one there would
                # swallow the first detected beat and store a bogus rate.
                info['sampling_rate'] = int(float(next(csvreader)[0]))
        # csv.reader yields [] for blank lines (e.g. a trailing newline).
        rows = [row for row in csvreader if row]

    if name == "ibi":
        info['data'] = [[float(row[0]), float(row[1])] for row in rows]
    elif name == "acc":
        info['data'] = [[float(row[0]), float(row[1]), float(row[2])] for row in rows]
    else:
        info['data'] = [float(row[0]) for row in rows]

    return info

In [97]:
def process_filenames(folder):
    '''
    Ensures all files have user_id as the second argument.

    Every regular file directly inside ``folder`` is expected to be named
    ``<type>_<user_id>.<ext>``. The ``<type>`` part is lower-cased by renaming the
    file; a user id that differs from the folder name is reported but kept. Files
    that do not have exactly two underscore-separated parts are reported and left
    unchanged.

    folder -- participant folder; its last path component is the expected user id.
    '''
    user = os.path.basename(folder)
    files = glob(folder + "/*")
    for f in files:
        if not os.path.isfile(f):
            continue
        directory = os.path.dirname(f)
        # splitext strips only the final extension, so extra dots in the name (or a
        # missing extension) cannot break the parse.
        name, ext = os.path.splitext(os.path.basename(f))
        print(name)
        parts = name.split("_")
        if len(parts) != 2:
            print("Invalid", f, 'expecting a <type>_<user_id> name; left unchanged')
            continue
        filetype, user_id = parts
        if user != user_id:
            print("Invalid", f, 'expecting', user, 'but got', user_id)
        lname = "_".join([filetype.lower(), user_id]) + ext
        dst = os.path.join(directory, lname)
        if dst != f:
            os.rename(f, dst)
    print("Processed files for correct naming conventions.")

In [98]:
def process_images(folder): 
    """Move the images of ``folder`` into ``<folder>/data`` and index them.

    Files matching ``*.jpg``, ``*.JPG`` or ``*.png`` are moved (the data directory
    is created on demand); the result of get_img_metadata for that directory is then
    saved as ``imagemetadata_<user>.json`` in ``folder``.
    """
    target_dir = folder + "/data"
    participant = os.path.basename(folder)
    if not os.path.exists(target_dir):
        print("Making data directory")
        os.makedirs(target_dir)

    # Collect every match first, then move; the patterns are tried in this order.
    images = [path
              for pattern in ("/*.jpg", "/*.JPG", "/*.png")
              for path in glob(folder + pattern)]
    for path in images:
        os.rename(path, os.path.join(target_dir, os.path.basename(path)))

    metadata = get_img_metadata(target_dir)
    save_jsonfile(os.path.join(folder, "imagemetadata_" + participant + ".json"), metadata)

In [99]:
def get_img_metadata(folder):
    """Group the image files in ``folder`` by file type.

    A file counts as an image when its extension is .png or .jpg in any letter case,
    so ``*.JPG`` files are listed as well. The file type is the part of the filename
    before the first underscore.

    folder -- directory to scan (not recursive).

    Returns a dict ``{file_type: [path, ...]}`` with each list in sorted path order.
    """
    image_exts = (".png", ".jpg")
    data = {}
    # sorted() makes the result independent of the directory listing order.
    for img in sorted(glob(folder + "/*")):
        name = os.path.basename(img)
        if os.path.splitext(name)[1].lower() not in image_exts:
            continue
        data.setdefault(name.split("_")[0], []).append(img)
    return data


In [100]:
def process_notes(folder): 
    """Move the ``*.txt`` notes of ``folder`` into ``<folder>/data`` and index them.

    The data directory is created on demand; the result of get_notes_metadata for it
    is saved as ``notesmetadata_<user>.json`` in ``folder``.
    """
    notes_dir = folder + "/data"
    if not os.path.exists(notes_dir):
        print("Making data directory")
        os.makedirs(notes_dir)

    for txt_path in glob(folder + "/*.txt"):
        os.rename(txt_path, os.path.join(notes_dir, os.path.basename(txt_path)))

    index = get_notes_metadata(notes_dir)
    index_name = "notesmetadata_" + os.path.basename(folder) + ".json"
    save_jsonfile(os.path.join(folder, index_name), index)

In [101]:
def get_notes_metadata(folder):
    """Group the ``*.txt`` files of ``folder`` by the filename part before the first "_".

    Returns a dict ``{file_type: [path, ...]}``.
    """
    grouped = {}
    for path in glob(folder + "/*.txt"):
        prefix = os.path.basename(path).split("_")[0]
        grouped.setdefault(prefix, []).append(path)
    return grouped


In [102]:
# NUKE THE UNOPTIMIZED VIDEO FILES
def nuke_vids(folder, docs_folder=None):
    """Delete the non-optimized videos in ``folder`` and the ``*.docx`` files beside it.

    DESTRUCTIVE. Every ``*.mp4`` / ``*.mov`` / ``*.m4v`` in ``folder`` whose *file
    name* does not contain "opt" is removed (directory names are ignored for that
    test). Afterwards every ``*.docx`` in ``docs_folder`` is removed.

    folder      -- directory holding the videos (the participant's data directory).
    docs_folder -- directory to clear of .docx files. Defaults to the parent of
                   ``folder``, i.e. the participant folder when ``folder`` is its
                   ``data`` sub-directory, so the function no longer relies on a
                   ``user_folder`` global left behind by a notebook loop.
    """
    if docs_folder is None:
        docs_folder = os.path.dirname(os.path.normpath(folder))

    files = []
    files.extend(glob(folder + "/*.mp4"))
    files.extend(glob(folder + "/*.mov"))
    files.extend(glob(folder + "/*.m4v"))

    print(files)
    for video in files:
        if "opt" not in os.path.basename(video):
            os.remove(video)

    # os.path.join keeps an empty docs_folder relative instead of producing "/*.docx".
    for f in glob(os.path.join(docs_folder, "*.docx")):
        os.remove(f)

# Data Processing

In [109]:
# Batch pipeline: run the processing steps on every directory directly under DATA_ROOT.
# The step order matters because several steps move or delete the files they handle.
data = filter(os.path.isdir, glob(DATA_ROOT + "/*"))
for user_folder in data:
    user = os.path.basename(user_folder)
    # Skip the folder named "datasets" (presumably not a participant folder).
    if user == "datasets":
        continue
    # Deletes the a_/g_/m_ CSVs it converts, so the *.csv glob in process_bio
    # no longer sees them.
    process_soldering(user_folder)
    # Unzip the e4*.zip archives before process_bio converts the folder's CSVs.
    process_e4(user_folder)
#     process_log(user_folder)
    process_bio(user_folder)
    # The next three steps move media and notes into <user_folder>/data.
    process_videos(user_folder)
    process_images(user_folder)
    process_notes(user_folder)
    # Runs last: it expects every remaining file to be named <type>_<user>.<ext>.
    process_filenames(user_folder)
    

E4 processed
['irb/1000/eda_1000.csv', 'irb/1000/temp_1000.csv', 'irb/1000/acc_1000.csv', 'irb/1000/hr_1000.csv', 'irb/1000/bvp_1000.csv']
INFO irb/1000/eda_1000.csv
File saved! irb/1000/eda_1000.json
INFO irb/1000/temp_1000.csv
File saved! irb/1000/temp_1000.json
INFO irb/1000/acc_1000.csv
File saved! irb/1000/acc_1000.json
INFO irb/1000/hr_1000.csv
File saved! irb/1000/hr_1000.json
INFO irb/1000/bvp_1000.csv
File saved! irb/1000/bvp_1000.json
Processed bio csv files to json.
Making data directory
File saved! irb/1000/videometadata_1000.json
File saved! irb/1000/imagemetadata_1000.json
File saved! irb/1000/notesmetadata_1000.json
g-x_1000
a-y_1000
eda_1000
videometadata_1000
notesmetadata_1000
m-y_1000
g-y_1000
a-x_1000
imagemetadata_1000
hr_1000
acc_1000
m-x_1000
tags_1000
g-z_1000
temp_1000
a-z_1000
m-z_1000
bvp_1000
Processed files for correct naming conventions.


In [None]:
# Destructive clean-up pass: call nuke_vids on the data/ sub-directory of every
# directory under DATA_ROOT.
# NOTE(review): unlike the processing loop above, "datasets" is not skipped here --
# confirm that is intended.
data = filter(os.path.isdir, glob(DATA_ROOT + "/*"))
for user_folder in data: 
    nuke_vids(os.path.join(user_folder, 'data'))
        
