In [None]:
#                                Leap GeneActiv Deidentification
#
#   This notebook does the following: 
#     1. Finds all GeneActiv .csv files in a directory
#     2. Applies the deidentification offset (in weeks) for this subject to each timestamp
#     3. Blanks identifying header fields (dates, demographics, time zone) and resets the Subject Code
#     4. Applies the same offset to the date in the file name
#     5. Writes the result as a gzip-compressed .csv.gz file
#
#     Input: A directory name containing GeneActiv files
#     Output: GeneActiv file with deidentified timestamps 
#
#     Author: Stephen Heisig
#     Changes:
#         8/07/23 The Dawn of Time: Stephen Heisig
#         8/12/23 Add compression
#

In [None]:
#  Imports
import sys
import os
import shutil
import glob
import gzip
from datetime import datetime, timedelta
from dateutil.parser import parse
import numpy as np
import pandas as pd
from hurry.filesize import size

In [None]:
# Show every row when a DataFrame is displayed (no truncation)
pd.options.display.max_rows = None

In [None]:
# Maps each subject code to its deidentification offset
# (deidentify applies the offset as a number of weeks)
subjectOffsetDict = dict(subject1=-4, subject2=-3, subject3=-2)

In [None]:
# Test directories (machine-specific absolute paths; edit before running elsewhere)
# Input directory with CSV files
GeneActivCsvDir = '/Users/Heisig/West/LEAP/FirstGeneActivTest/CSV/'
# Output directory with deidentified CSV files
GeneActivDeidentifiedCsvDir = '/Users/Heisig/West/LEAP/FirstGeneActivTest/Deidentified_CSV/'

# Subject whose files are being processed; must be a key of subjectOffsetDict
subject = '807'
if subject not in subjectOffsetDict:
    # Fail with an actionable message instead of a bare KeyError: '807'
    raise KeyError(
        "No deidentification offset configured for subject " + repr(subject)
        + "; add it to subjectOffsetDict (known subjects: "
        + ", ".join(sorted(subjectOffsetDict)) + ")"
    )
# Offset for this subject; deidentify applies it as a number of weeks
subjectOffset = subjectOffsetDict[subject]


In [None]:
#Helpers and main function to deidentify a file
def _swizzledFileName(fileName, weekOffset):
    """Return (subjectCode, outputFileName) with the file-name date shifted by weekOffset weeks.

    fileName is expected to look like '<subjectCode>_<code>_<date>.<ext>'.
    If the date field cannot be parsed, 'NotADate' is used in its place.

    Raises:
        ValueError: if fileName has fewer than three '_'-separated fields.
    """
    nameParts = fileName.split('_')
    if len(nameParts) < 3:
        raise ValueError(
            "Expected a file name like '<subject>_<code>_<date>.csv', got " + repr(fileName))
    subjectCode, codeField = nameParts[0], nameParts[1]
    #Drop the extension (and anything else after the first '.') from the date field
    dateField = nameParts[2].split('.')[0]
    try:
        dateObj = parse(dateField) + timedelta(weeks=weekOffset)
        dateString = dateObj.strftime("%Y-%m-%d")
    except (ValueError, OverflowError):
        #Best effort: keep processing the file even when the name carries no usable date
        print('Not a date in file name')
        dateString = 'NotADate'
    return subjectCode, subjectCode + '_' + codeField + '_' + dateString + '.csv.gz'


def _sanitizedHeaderLines(GeneActivFileName, subjectCode, headerLineCount):
    """Read up to headerLineCount header lines and return them with identifying values removed.

    The 'Subject Code' line is rewritten to carry subjectCode, the value of every
    field listed in fieldsToZero is blanked, and all other lines are returned unchanged.
    """
    fieldsToZero = {'Start Time', 'Date of Birth', 'Config Time', 'Extract Time',
                    'Calibration Date', 'Sex', 'Height', 'Weight', 'Handedness Code',
                    'Time Zone'}
    headerLines = []
    #'with' guarantees the input handle is closed even if sanitizing fails part-way
    with open(GeneActivFileName, 'r') as headerFile:
        #zip stops after headerLineCount lines, or earlier at end of file
        for _, line in zip(range(headerLineCount), headerFile):
            fieldName = line.split(',')[0]
            if fieldName == 'Subject Code':
                headerLines.append('Subject Code,' + subjectCode + '\n')
            elif fieldName in fieldsToZero:
                headerLines.append(fieldName + ',\n')
            else:
                headerLines.append(line)
    return headerLines


def deidentify(GeneActivFileName, GeneActivDeidentifiedCsvDir, weekOffset=None, headerLineCount=100):
    """Write a deidentified, gzip-compressed copy of one GeneActiv CSV file.

    The first headerLineCount lines are treated as the header and sanitized; the
    remaining lines are the data section, whose first column holds timestamps in
    '%Y-%m-%d %H:%M:%S:%f' format. Each timestamp is shifted by weekOffset weeks
    and written as ISO-8601 text truncated to milliseconds. The same shift is
    applied to the date embedded in the file name.

    Args:
        GeneActivFileName: path of the input file, named '<subject>_<code>_<date>.csv'.
        GeneActivDeidentifiedCsvDir: output directory (created if missing).
        weekOffset: shift in weeks; defaults to the notebook-level subjectOffset.
        headerLineCount: number of header lines preceding the data section.

    Returns:
        The path of the '.csv.gz' file that was written.

    Raises:
        ValueError: if the file name does not have three '_'-separated fields,
            or a timestamp does not match the expected format.
    """
    if weekOffset is None:
        #Backward compatible: fall back to the offset configured for the notebook's subject
        weekOffset = subjectOffset

    #Work out the output name first so a malformed file name fails before the big read
    fileName = os.path.basename(GeneActivFileName)
    subjectCode, swizzledFileName = _swizzledFileName(fileName, weekOffset)
    #os.path.join works whether or not the directory was given with a trailing separator
    finalFileName = os.path.join(GeneActivDeidentifiedCsvDir, swizzledFileName)

    #Read the data section that follows the header
    gaDataDF = pd.read_csv(GeneActivFileName, skiprows=headerLineCount, header=None)

    #Shift every timestamp at once (vectorized rather than one Python call per row)
    firstColumn = gaDataDF.columns[0]
    timestamps = pd.to_datetime(gaDataDF[firstColumn], format='%Y-%m-%d %H:%M:%S:%f')
    shifted = timestamps + timedelta(weeks=weekOffset)
    #%f yields six digits; dropping the last three truncates to milliseconds,
    #matching isoformat(timespec='milliseconds')
    gaDataDF[firstColumn] = shifted.dt.strftime('%Y-%m-%dT%H:%M:%S.%f').str[:-3]

    #Read and sanitize the header lines
    headerLines = _sanitizedHeaderLines(GeneActivFileName, subjectCode, headerLineCount)

    if GeneActivDeidentifiedCsvDir:
        os.makedirs(GeneActivDeidentifiedCsvDir, exist_ok=True)

    #Stream header then data into the compressed file instead of building one huge string
    with gzip.open(finalFileName, 'wt') as outFile:
        outFile.writelines(headerLines)
        gaDataDF.to_csv(outFile, index=False, header=False)

    return finalFileName
 

In [None]:
# Find all the .csv files directly inside the input directory.
# (The pattern has no '**', so glob's recursive flag would have no effect and is omitted.)
# Sorted so the files are processed in the same order on every run.
geneActivFiles = sorted(glob.glob(os.path.join(GeneActivCsvDir, '*.csv')))
geneActivFiles


In [None]:
# Deidentify each discovered file in turn; every call writes one
# compressed output file into the deidentified-CSV directory.
for geneActivFileName in geneActivFiles:
    deidentify(geneActivFileName, GeneActivDeidentifiedCsvDir)