In [206]:
import json, os, csv, platform, zipfile, datetime
from glob import glob
import pandas as pd
import numpy as np
from pprint import pprint


DATA_ROOT = "irb"
SESSION_ID = "CHI2019"

def save_jsonfile(name, data):
    '''
    Writes `data` as JSON to the file at `name` and prints the saved path.

    name: destination path; it is opened in write mode, so an existing file
          is replaced.
    data: any object the json module can serialize.
    '''
    with open(name, mode='w') as json_handle:
        json.dump(data, fp=json_handle)
    print("File saved!", name)
    
def gen_save_file(folder, feature):
    '''
    Builds the path `<folder>/<feature>_<user>.json`, where the user id is
    the last path component of `folder`.
    '''
    participant = os.path.basename(folder)
    filename = "_".join((feature, participant)) + ".json"
    return os.path.join(folder, filename)

In [207]:
def process_e4(folder, verbose = True):
    '''
    Processes E4 zipfiles.

    Unzips every `e4*.zip` archive found directly inside `folder`, renames
    each extracted file so that its lower-cased name carries the user id as a
    suffix in front of the extension (e.g. `ACC.csv` -> `acc_<user>.csv`),
    and finally deletes the archive.

    folder:  path of one participant's directory; its basename is the user id.
    verbose: when true, print every archive and every rename as it happens.
    '''
    user = os.path.basename(folder)
    zips = glob(folder + "/e4*.zip")
    if len(zips) == 0:
        print("E4 processed")
    for z in zips:
        zip_full_path = os.path.realpath(z)
        directory_to_extract_to = os.path.dirname(zip_full_path)
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_full_path, 'r') as zip_ref:
            zip_ref.extractall(directory_to_extract_to)
            files = zip_ref.namelist()
        if verbose:
            print("Unzipping", z)
        for file in files:
            # Archive members ending in "/" are directories, not files to rename.
            if file.endswith("/"):
                continue
            subdir, basename = os.path.split(file)
            # Only the final extension is split off, so a name such as
            # "a.b.csv" receives the suffix once ("a.b_<user>.csv") instead of
            # once per dot. Only the file's own name is lower-cased; the user
            # id is inserted exactly as it appears in the folder name.
            stem, ext = os.path.splitext(basename)
            prefixed = os.path.join(subdir, stem.lower() + "_" + user + ext.lower())
            src = os.path.join(directory_to_extract_to, file)
            dest = os.path.join(directory_to_extract_to, prefixed)
            if verbose:
                print("Renaming", os.path.basename(src), os.path.basename(dest))
            os.rename(src, dest)
        os.remove(zip_full_path)

In [208]:
def process_log(folder):
    '''
    Normalizes the log file(s) found directly inside `folder`.

    For every `log*.json` whose name has the form
    `<filetype>_<user>_<starttime>.json`:
      * each entry's 'time' value is converted to float and divided by 1000,
      * the entry list is wrapped as {'data': [...]} and written back,
      * the file is renamed to `<filetype>_<user>.json`.
    Logs whose name does not have three underscore-separated parts are left
    untouched, so re-running on an already renamed log changes nothing.

    If the renamed file already exists (e.g. two timestamped logs for the
    same user), the log is reported and skipped before it is modified, so
    neither file is overwritten.
    '''
    user = os.path.basename(folder)
    logs = glob(folder + "/log*.json")
    if len(logs) == 0:
        print("ERROR: No log found for", user)
        return
    for log in logs:
        # splitext keeps working when the name itself contains extra dots.
        logname, ext = os.path.splitext(os.path.basename(log))
        parts = logname.split('_')
        if len(parts) == 3:
            # NOTE(review): the start time parsed from the file name is not
            # stored anywhere and is lost on rename, despite the message
            # printed below - confirm whether it should be kept in the json.
            filetype, user_id, _starttime = parts
            if user_id != user:
                print("ERROR: Log is incorrectly labeled/in wrong folder")

            name = os.path.join(os.path.dirname(log), "_".join([filetype, user_id]) + ext)
            # Check for a collision before touching the file: rewriting first
            # and skipping the rename afterwards would leave a converted log
            # under its old name, which a later run could not parse.
            if os.path.exists(name):
                print("ERROR: Not overwriting existing log", name, "with", log)
                continue

            with open(log, 'r') as f:
                data = json.load(f)
            for entry in data:
                entry['time'] = float(entry['time']) / 1000
            with open(log, 'w') as f:
                json.dump({'data': data}, f)
            print("Starttime appended to log file")

            os.rename(log, name)
        print("Log processed.", log)

In [209]:
def process_videos(folder):
    '''
    Moves every .mp4/.mov file found directly inside `folder` into
    `<folder>/data` (created on demand) and saves the mapping returned by
    get_video_metadata for that directory as `videometadata_<user>.json`
    in `folder`.
    '''
    participant = os.path.basename(folder)
    video_dir = folder + "/data"
    if not os.path.exists(video_dir):
        print("Making data directory")
        os.makedirs(video_dir)

    # Collect both extensions up front, then move the files one by one.
    pending = glob(folder + "/*.mp4") + glob(folder + "/*.mov")
    for clip in pending:
        os.rename(clip, os.path.join(video_dir, os.path.basename(clip)))

    metadata = get_video_metadata(video_dir)
    metadata_path = os.path.join(folder, "videometadata_" + participant + ".json")
    save_jsonfile(metadata_path, metadata)
    
def get_video_metadata(folder):
    '''
    Maps each video type found in `folder` to the paths of its files.

    The type is the part of the file name before the first underscore. A file
    whose *name* contains "opt" is stored under 'opt', any other file under
    'raw'. Only the file name is inspected, so a directory called e.g. "/opt"
    somewhere in the path no longer makes every video count as optimized.
    If several files share a type and variant, the last one found wins.

    Returns a dict of the form {file_type: {'opt': path, 'raw': path}}, where
    each inner key is present only if such a file exists.
    '''
    files = glob(folder + "/*.mp4") + glob(folder + "/*.mov")

    data = {}
    for video in files:
        name = os.path.basename(video)
        file_type = name.split("_")[0]
        variant = 'opt' if "opt" in name else 'raw'
        data.setdefault(file_type, {})[variant] = video
    return data
#        

## Empatica Biosignals

In [210]:
def process_bio(folder):
    '''
    Converts every csv file directly inside `folder` to a json file.

    Each file must be named `<filetype>_<user_id>.csv`. Its parsed content
    (see extract_info) is merged with the E4_MANIFEST entry for the file
    type (manifest values win on key clashes), tagged with the integer user
    id taken from the folder name and with SESSION_ID, saved next to the csv
    as `<filetype>_<user_id>.json`, and the csv is then deleted.
    '''
    user = os.path.basename(folder)
    for csv_path in glob(folder + "/*.csv"):
        info = extract_info(csv_path)
        parent, leaf = os.path.split(csv_path)
        stem, _ext = leaf.split('.')
        filetype, user_id = stem.split("_")
        record = dict(info, **E4_MANIFEST[filetype])
        record.update(user_id=int(user), session_id=SESSION_ID)
        json_path = os.path.join(parent, "_".join((filetype, user_id)) + ".json")
        save_jsonfile(json_path, record)
        os.remove(csv_path)
    print("Processed bio csv files to json.")

In [211]:
def extract_info(file):
    '''
    Parses one csv file into a dict.

    The signal name is the part of the file name before the first "." and
    the first "_". Unless the name is "tags", the first cell of the first
    row is stored as integer 'timestamp' and the first cell of the second
    row as integer 'sampling_rate'. The remaining rows become 'data':
      * "ibi": a list of [float, float] pairs,
      * "acc": a list of [float, float, float] triples,
      * anything else: a flat list with the first cell of each row as float.
    Blank rows are skipped.

    Returns a dict with 'name', 'data' and, unless the name is "tags",
    'timestamp' and 'sampling_rate'.
    Raises StopIteration if a non-"tags" file has fewer than two rows.
    '''
    name = os.path.basename(file).split('.')[0].split("_")[0]
    info = {'name': name}
    # Number of leading cells kept per row; a width of 1 yields bare floats.
    width = {"ibi": 2, "acc": 3}.get(name, 1)

    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        if name != "tags":
            info['timestamp'] = int(float(next(csvreader)[0]))
            # NOTE(review): "ibi" is treated like every other signal here, so
            # its second row is consumed as the sampling rate - confirm that
            # ibi files really carry such a row, otherwise a sample is lost.
            info['sampling_rate'] = int(float(next(csvreader)[0]))

        data = []
        for row in csvreader:
            if not row:
                # A blank line is returned as an empty list; indexing it
                # would raise IndexError.
                continue
            if width == 1:
                data.append(float(row[0]))
            else:
                data.append([float(row[i]) for i in range(width)])
        info['data'] = data

    return info

In [212]:
def process_filenames(folder):
    '''
    Ensures all files have user_id as the second argument.

    Every regular file directly inside `folder` is expected to be named
    `<filetype>_<user_id>[.<ext>]`. The file type is lower-cased on disk, and
    a user id that differs from the folder name is reported. Files that do
    not follow the pattern are reported and left untouched instead of
    aborting the loop part-way through the folder.
    '''
    user = os.path.basename(folder)
    for f in glob(folder + "/*"):
        if not os.path.isfile(f):
            continue
        directory, basename = os.path.split(f)
        # splitext tolerates names with no extension or with extra dots.
        name, ext = os.path.splitext(basename)
        parts = name.split("_")
        if len(parts) != 2:
            print("Invalid", f, 'expecting a name of the form <filetype>_' + user)
            continue
        filetype, user_id = parts
        if user != user_id:
            print("Invalid", f, 'expecting', user, 'but got', user_id)
        dst = os.path.join(directory, "_".join([filetype.lower(), user_id]) + ext)
        if dst != f:
            os.rename(f, dst)
    print("Processed files for correct naming conventions.")

In [213]:
def process_images(folder):
    '''
    Moves every .jpg/.png file found directly inside `folder` into
    `<folder>/data` (created on demand) and saves the mapping returned by
    get_img_metadata for that directory as `imagemetadata_<user>.json`
    in `folder`.
    '''
    participant = os.path.basename(folder)
    image_dir = folder + "/data"
    if not os.path.exists(image_dir):
        print("Making data directory")
        os.makedirs(image_dir)

    # Collect both extensions up front, then move the files one by one.
    pending = glob(folder + "/*.jpg") + glob(folder + "/*.png")
    for picture in pending:
        os.rename(picture, os.path.join(image_dir, os.path.basename(picture)))

    metadata = get_img_metadata(image_dir)
    metadata_path = os.path.join(folder, "imagemetadata_" + participant + ".json")
    save_jsonfile(metadata_path, metadata)

In [214]:
def get_img_metadata(folder):
    '''
    Groups the .png and .jpg files in `folder` by image type, i.e. the part
    of the file name before the first underscore.

    Returns a dict {file_type: [path, ...]}; png paths are listed before
    jpg paths.
    '''
    grouped = {}
    for img_path in glob(folder + "/*.png") + glob(folder + "/*.jpg"):
        kind = os.path.basename(img_path).split("_")[0]
        grouped.setdefault(kind, []).append(img_path)
    return grouped


In [215]:
def process_notes(folder):
    '''
    Moves every .txt file found directly inside `folder` into
    `<folder>/data` (created on demand) and saves the mapping returned by
    get_notes_metadata for that directory as `notesmetadata_<user>.json`
    in `folder`.
    '''
    participant = os.path.basename(folder)
    notes_dir = folder + "/data"
    if not os.path.exists(notes_dir):
        print("Making data directory")
        os.makedirs(notes_dir)

    for text_file in glob(folder + "/*.txt"):
        os.rename(text_file, os.path.join(notes_dir, os.path.basename(text_file)))

    metadata = get_notes_metadata(notes_dir)
    metadata_path = os.path.join(folder, "notesmetadata_" + participant + ".json")
    save_jsonfile(metadata_path, metadata)

In [216]:
def get_notes_metadata(folder):
    '''
    Groups the .txt files in `folder` by note type, i.e. the part of the
    file name before the first underscore.

    Returns a dict {file_type: [path, ...]}.
    '''
    grouped = {}
    for note_path in glob(folder + "/*.txt"):
        kind = os.path.basename(note_path).split("_")[0]
        grouped.setdefault(kind, []).append(note_path)
    return grouped


In [217]:
# NUKE THE UNOPTIMIZED VIDEO FILES
def nuke_vids(folder, docx_folder=None):
    '''
    Deletes the unoptimized videos in `folder` and the .docx files in
    `docx_folder`.

    A video (.mp4/.mov) is kept only if its file name contains "opt"; the
    directory part of the path is ignored, so a parent directory whose name
    happens to contain "opt" does not protect raw videos from deletion.

    folder:      directory holding the videos.
    docx_folder: directory whose .docx files are removed. Defaults to the
                 parent directory of `folder`, i.e. the user folder when
                 `folder` is `<user_folder>/data`. It is passed in or derived
                 rather than read from a notebook global, so the function
                 also works when called on its own.
    '''
    if docx_folder is None:
        # abspath guards against an empty dirname for a bare relative name,
        # which would otherwise make the glob below start at "/".
        docx_folder = os.path.dirname(os.path.abspath(folder))

    files = glob(folder + "/*.mp4") + glob(folder + "/*.mov")

    print(files)
    for video in files:
        if "opt" not in os.path.basename(video):
            os.remove(video)

    for f in glob(docx_folder + "/*.docx"):
        os.remove(f)

# Data Processing

In [224]:
# Run the per-user processing steps on every directory directly under DATA_ROOT.
# `data` is a lazy filter object, so it can be iterated only once.
data = filter(os.path.isdir, glob(DATA_ROOT + "/*"))
for user_folder in data:
    # NOTE: `user` is assigned here but not used inside this loop.
    user = os.path.basename(user_folder)
    # process_e4 runs before process_bio: it extracts the csv files that
    # process_bio then converts to json.
    process_e4(user_folder)
    process_log(user_folder)
    process_bio(user_folder)
    # These three steps move media and notes into <user_folder>/data, so
    # process_filenames afterwards only sees the files left at the top level.
    process_videos(user_folder)
    process_images(user_folder)
    process_notes(user_folder)
    process_filenames(user_folder)
    # NOTE(review): process_maxqda is not defined in the cells visible here -
    # confirm it is defined before this cell runs on a fresh kernel.
    process_maxqda(user_folder)

E4 processed
Log processed. irb/111/log_111.json
Processed bio csv files to json.
File saved! irb/111/videometadata_111.json
File saved! irb/111/imagemetadata_111.json
File saved! irb/111/notesmetadata_111.json
Processed files for correct naming conventions.
Unzipping irb/411/e4_411_1535737565_A01B3B.zip
Renaming ACC.csv acc_411.csv
Renaming EDA.csv eda_411.csv
Renaming BVP.csv bvp_411.csv
Renaming TEMP.csv temp_411.csv
Renaming IBI.csv ibi_411.csv
Renaming HR.csv hr_411.csv
Renaming info.txt info_411.txt
Renaming tags.csv tags_411.csv
Starttime appended to log file
Log processed. irb/411/log_411_1535739063864.json
File saved! irb/411/temp_411.json
File saved! irb/411/bvp_411.json
File saved! irb/411/hr_411.json
File saved! irb/411/acc_411.json
File saved! irb/411/eda_411.json
File saved! irb/411/ibi_411.json
File saved! irb/411/tags_411.json
Processed bio csv files to json.
Making data directory
File saved! irb/411/videometadata_411.json
File saved! irb/411/imagemetadata_411.json
File

In [225]:
# Destructive step: for every user directory under DATA_ROOT, call nuke_vids
# on its "data" subfolder. Files removed by nuke_vids cannot be recovered.
data = filter(os.path.isdir, glob(DATA_ROOT + "/*"))
for user_folder in data: 
    nuke_vids(os.path.join(user_folder, 'data'))
        


['irb/111/data/interview_111_opt.mp4', 'irb/111/data/side_111_opt.mp4', 'irb/111/data/screen_111_opt.mp4']
['irb/411/data/screen_411_opt.mp4', 'irb/411/data/interview2_411_opt.mp4', 'irb/411/data/interview1_411_opt.mp4', 'irb/411/data/side_411_opt.mp4']
['irb/214/data/Interview_214_opt.mp4', 'irb/214/data/Side_214_opt.mp4', 'irb/214/data/Screen_214_opt.mp4']
['irb/112/data/side_112_opt.mp4', 'irb/112/data/interview_112_opt.mp4', 'irb/112/data/screen_112_opt.mp4']
['irb/113/data/side_113_opt.mp4', 'irb/113/data/interview_113_opt.mp4', 'irb/113/data/screen_113_opt.mp4']
