`config.py` must define the following `config` dictionary (fill in your own API tokens):
```
config = {
    'api_token_data_entry': '',
    'api_token_encounter': '',
    'api_token_konica': '',
    'api_token_participant': '',
    'api_token_abg': '',
    'api_token_devices': '',
    'api_url': 'https://redcap.ucsf.edu/api/'
}
```

Definitions

- `record_id` is the unique identifier for each patient in the REDCap database.
- `session_id` is the unique identifier for each session in the REDCap database.

And for the de-identified files:
- `patient_id` is the de-identified version of `record_id`.
- `encounter_id` is the de-identified version of `session_id`.


In [None]:
# Build-wide error log: later cells store a short error description here, keyed by
# the session id or filename that failed; the "View Logs" cells summarize it.
error_files = {}

# Imports

In [None]:
import csv
import datetime
import glob
import json
import hashlib
import io
import os
import re
import shutil
import time
import warnings
import plotly.express as px
import pdb
import getpass
user = getpass.getuser()

import requests
from config import config
import numpy as np
import pandas as pd
# import wfdb
from tqdm.notebook import tqdm, trange

In [None]:
# Switches that control optional stages of the build.
# NOTE(review): 'average_over' is not referenced in the visible part of this notebook — confirm it is still used.
build_args = {
    'convert_to_wfdb': True, # if True, converts raw waveforms to wfdb format (PPG and labview raw). keeps existing wfdb files if False.
    'average_over': 0, # number of seconds to average over when extracting data from labview data
}

In [None]:
from exclude_unclean import drop_dict_after_algo, loc_dict_after_algo

# Show which entries config.py provides (keys only, so the token values are not printed)
print(config.keys())

# NOTE(review): presumably the request parameters for a REDCap record export; `fields` is not
# used in the visible part of this notebook — confirm where it is consumed.
fields = {
    'content': 'record',
    'format': 'csv',
    'action': 'export',
    'returnFormat': 'csv',
    'arms': ''
}

# abspath() resolves the notebook filename against the current working directory and
# dirname() strips the filename again, so BASE_DIR is the absolute working directory
# at the time this cell runs.
BASE_DIR = os.path.dirname(os.path.abspath('download_redcap.ipynb'))

In [None]:
import apprise
apobj = apprise.Apprise()

# NOTE(review): `user in [None]` is only true when getpass.getuser() returned None, and that
# branch does nothing. In that case `apprise` stays bound to the imported module, so the
# apprise('...') calls in later cells would fail — confirm what this branch is meant to do.
if user in [None]:
    pass
else:
    # Replaces the imported `apprise` module name with a print-only notifier
    def apprise(message, attach_file=None):
        """Print *message*; *attach_file* is accepted but not used."""
        print(message)

In [None]:
# Function to read a dictionary from a JSON file
def read_dict_from_file(filename):
    """Load and return the JSON document stored in *filename*.

    The file is decoded as UTF-8 rather than with the platform's locale
    encoding, so the result does not depend on the machine running the build.

    Args:
        filename: Path to a JSON file.

    Returns:
        The decoded JSON value (a dict for the lookup files loaded below).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load the two filename -> metadata lookups that drive the WFDB conversion cells
labview_filenames, ppg_filenames = (
    read_dict_from_file(json_path)
    for json_path in ('labview_filenames.json', 'ppg_filenames.json')
)

# Echo both lookups so an empty or unexpected load is visible in the cell output
print(labview_filenames, ppg_filenames, sep='\n')


# Process: Create Output CSVs

## Create Patient CSV

In [None]:
# Make sure the export directory exists before any CSV is written
os.makedirs('output', exist_ok=True)

participants = pd.read_csv('participants.csv')

# Every participant in this build is assigned the same site
participants['site_id'] = 'ucsf-hypoxialab'

#np.random.seed())

# NOTE(review): the values assigned below are the literal placeholder 'Redacted'. The
# encounter cell passes `date_skew` to pd.to_timedelta(..., unit='D'), which cannot
# parse that string — confirm how the real values are generated.
#prng = np.random.RandomState()
participants['date_skew'] = 'Redacted'

# NOTE(review): exact duplicate of the assignment above — confirm whether one of the
# two was meant to set a different column.
#prng = np.random.RandomState()
participants['date_skew'] = 'Redacted'

# `random_number` is one of the three values concatenated into the patient_id hash
#prng = np.random.RandomState()
participants['random_number'] = 'Redacted'

def sha256_hash_concatenated_columns(row):
    """Return the SHA-256 hex digest that identifies one participant row.

    The digest input is the string form of ``site_id``, ``record_id`` and
    ``random_number``, joined in that order with no separator.
    """
    digest = hashlib.sha256()
    for column in ('site_id', 'record_id', 'random_number'):
        digest.update(str(row[column]).encode())
    return digest.hexdigest()

# Derive the de-identified id for every participant row
participants['patient_id'] = participants.apply(sha256_hash_concatenated_columns, axis=1)

# Only these columns are exported; every other column of `participants` stays out of the file
public_patient_columns = ['patient_id', 'site_id', 'assigned_sex', 'race', 'ethnicity']
participants_save = participants.loc[:, public_patient_columns]

### Deidentified

In [None]:
# Order the rows by the hashed id, then write the de-identified patient file
participants_save = participants_save.sort_values(by='patient_id')
participants_save.to_csv('output/patient.csv', index=False)

## Create Encounter CSV

<code>encounter</code> is PHI internal

<code>encounter_cleaned</code> is PHI internal with restricted sessions

<code>encounter_save</code> is deidentified

In [None]:
encounter = pd.read_csv('encounters.csv')
print(encounter.raw_ppg_file)

# Rename the short site codes of the source CSV to the export names:
#   re / le / forehead           -> <site>_device
#   l1..l5, r1..r5               -> finger_<code>_device
#   l1_diameter .. r5_diameter   -> finger_<code>_diameter
finger_codes = [f'{hand}{digit}' for hand in 'lr' for digit in range(1, 6)]
column_renames = {
    're': 'right_ear_device',
    'le': 'left_ear_device',
    'forehead': 'forehead_device',
    **{code: f'finger_{code}_device' for code in finger_codes},
    **{f'{code}_diameter': f'finger_{code}_diameter' for code in finger_codes},
}
encounter = encounter.rename(columns=column_renames)

encounter.columns
print(encounter.shape)

# Attach each participant's hashed id, date of birth and date skew. Both frames have a
# `patient_id` column, so the participant one arrives as `patient_id_participant`.
participant_lookup = participants.loc[:, ['subject_id', 'patient_id', 'dob', 'date_skew']]
encounter = encounter.merge(
    participant_lookup,
    left_on='patient_id',
    right_on='subject_id',
    how='left',
    suffixes=('', '_participant'),
)

# The shape is printed before and after so that rows added by the merge are noticed
print(encounter.shape)

encounter['dob'] = pd.to_datetime(encounter.dob)
encounter['date'] = pd.to_datetime(encounter.session_date)

# De-identified encounter date: the real session date shifted by `date_skew` days
encounter['encounter_date'] = encounter['date'] + pd.to_timedelta(encounter['date_skew'], unit='D')

# Age in completed years, counted from calendar birthdays. Dividing the day difference
# by 365 ignores leap days and rounds up shortly before a birthday (born 1984-06-15,
# seen 2024-06-14: 14609 days / 365 = 40.02 -> 40, although the participant is still 39).
birthday_pending = (
    (encounter['date'].dt.month < encounter['dob'].dt.month)
    | (
        (encounter['date'].dt.month == encounter['dob'].dt.month)
        & (encounter['date'].dt.day < encounter['dob'].dt.day)
    )
)
completed_years = (
    encounter['date'].dt.year - encounter['dob'].dt.year - birthday_pending.astype(int)
).astype(float)
# Stay NaN (float column, as before) wherever either date is missing
encounter['age_at_encounter'] = completed_years.where(
    encounter['date'].notna() & encounter['dob'].notna()
)

encounter['site'] = 'ucsf-hypoxialab'

# Text that is hashed into encounter_id: "<site>_<session date>_<hashed patient id>"
encounter['string_date'] = encounter.session_date.astype(str)
encounter['hash_string'] = encounter.apply(lambda row: '_'.join(str(row[col]) for col in ['site', 'string_date', 'patient_id_participant']), axis=1)

def sha256_hash_column(value):
    """Return the SHA-256 hex digest of the string *value*."""
    encoded = value.encode()
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()

# De-identified encounter id: SHA-256 of the "<site>_<session date>_<hashed patient id>" string
encounter['encounter_id'] = encounter['hash_string'].apply(sha256_hash_column)
# From here on `patient_id` holds the hashed id that the merge brought in as `patient_id_participant`
encounter['patient_id'] = encounter.patient_id_participant

# record_id -> encounter_id lookup; the WFDB cells use it to name each session's output record
encounter_dict = encounter.set_index('record_id')['encounter_id'].to_dict()

print(encounter.shape)

In [None]:
#print(encounter_dict)

### export files

In [None]:
# No sessions are filtered out here: the "cleaned" table is the full encounter table
encounter_cleaned = encounter

# Public columns, in export order: demographics and skin tone, ear/forehead devices,
# then the ten finger devices followed by the ten finger diameters (l1..l5, r1..r5)
_finger_sites = [f'finger_{hand}{digit}' for hand in 'lr' for digit in range(1, 6)]
_encounter_public_columns = [
    'patient_id',
    'encounter_id',
    'encounter_date',
    'age_at_encounter',
    'warming',
    'fitzpatrick',
    'monk_fingernail',
    'monk_dorsal',
    'monk_palmar',
    'monk_upper_arm',
    'monk_forehead',
    'right_ear_device',
    'left_ear_device',
    'forehead_device',
    *[f'{site}_device' for site in _finger_sites],
    *[f'{site}_diameter' for site in _finger_sites],
]
encounter_save = encounter_cleaned.loc[:, _encounter_public_columns]

print(encounter.shape)

# Hash <-> PHI lookup used to attach encounter ids to the ABG and spectrophotometer tables
encounter_map = encounter.loc[:, ['encounter_id', 'record_id']].rename(
    columns={'encounter_id': 'encounter_id_hash', 'record_id': 'encounter_id_phi'}
)

encounter_save = encounter_save.sort_values(by='encounter_id')
encounter_save.to_csv('output/encounter.csv', index=False)

encounter_save


### Restrict Participants File to Valid Encounters

In [None]:
# Keep only participants that appear in the exported encounter table, then overwrite the
# patient file written earlier. Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
participants_save = participants_save.loc[participants_save.patient_id.isin(encounter_save.patient_id)]
participants_save.to_csv('output/patient.csv', index=False)
participants_save

## Create ABG CSV

### Cleaning & Trailing zeroes

In [None]:
abg = pd.read_csv('abg.csv')

abg['date_calc'] = pd.to_datetime(abg.date_calc)

# NOTE(review): `participant_map` is not defined in the visible part of this notebook;
# presumably it provides patient_id_phi, patient_id_hash and date_skew — confirm.
abg = abg.merge(participant_map,
                      left_on='patient_id',
                      right_on='patient_id_phi',
                      how='left')

# Attach the hashed encounter id by matching the PHI session id
abg = abg.merge(encounter_map,
                      left_on='session',
                      right_on='encounter_id_phi',
                      how='left')

# `date` keeps the original (un-skewed) date; `date_calc` becomes the date shifted by date_skew days.
# NOTE(review): the public blood gas export selects `date`, not `date_calc` — confirm that
# the un-skewed date is what should be published.
abg['date'] = abg.date_calc
abg['date_calc'] = abg.date_calc + pd.to_timedelta(abg['date_skew'], unit='D')

# Drop the columns that are not carried forward (the PHI ids come back below under new names)
abg = abg.drop(['record_id',
                'date_skew',
                'subject',
                'time_stamp',
                'patient_id',
                'session',
                'hypoxia_lab_abg_complete',
               ], axis=1)

# Accepts exactly HH:MM:SS (two digits each)
time_format = re.compile(r'^\d{2}:\d{2}:\d{2}$')

# Replace values in the "time_calc" column with "NA" if they don't match the format
abg['time_calc'] = abg['time_calc'].astype(str).apply(lambda x: x if time_format.match(x) else 'NA')

# Hashed ids take over the public column names; the PHI ids are kept as session / subject_id
abg = abg.rename(columns={'patient_id_hash': 'patient_id',
                                'encounter_id_hash': 'encounter_id',
                                'encounter_id_phi': 'session',
                                'patient_id_phi': 'subject_id',
                                 'time_calc':'time'})


# Process: Clean Labview Data Files

In [None]:
# Helper that writes a copy of a raw data file without its leading header lines
def delete_lines(filename, lines=14):
    """Write a header-free copy of *filename* and report which layout it had.

    A file whose first line starts with 'Start' is treated as the current
    layout and loses its first 2 lines; any other file is treated as legacy
    and loses its first *lines* lines. The original file is left untouched.

    Returns:
        (trim_filename, legacy): path of the trimmed copy, and True when the
        legacy layout was detected.

    Raises:
        IndexError: If the file is empty.
    """
    try:
        with open(filename, 'r') as source:
            rows = source.readlines()
    except UnicodeDecodeError:
        # Some files contain bytes the default codec rejects; retry as ISO-8859-1
        with open(filename, 'r', encoding='ISO-8859-1') as source:
            rows = source.readlines()

    # Indexing the first row is what raises IndexError for an empty file
    legacy = not rows[0].startswith('Start')
    header_length = lines if legacy else 2
    del rows[:header_length]

    # The copy gets a '_trim' marker, placed before the extension for .csv files
    if filename.endswith('.csv'):
        trim_filename = f'{filename[:-4]}_trim.csv'
    else:
        trim_filename = f'{filename}_trim'

    with open(trim_filename, 'w') as target:
        target.writelines(rows)

    return trim_filename, legacy

In [None]:
# Per-session file metadata; the loop below reads last_modified_redcap[session]['labview_data']['id']
with open ('last_modified_redcap.json') as f:
    last_modified_redcap = json.load(f)

#turn the session id back into integers, they get turned into strings when saved as json
last_modified_redcap = {int(k): v for k, v in last_modified_redcap.items()}

from pandas.errors import ParserError
apprise('starting labview data')

# NOTE(review): nothing in the visible loop below adds rows to sample_agg_df, labview_dfs or
# labview_df_metadata, yet later cells select many more columns from sample_agg_df —
# confirm where these are populated.
sample_agg_df = pd.DataFrame()

sample_agg_df['REDACTED HB/SpO2'] = None
sample_agg_df['REDACTED HB/HR'] = None
sample_agg_df['REDACTED HB/PI'] = None

labview_dfs = []
labview_df_metadata={}

# A valid sample label is exactly 'Sample #<digits>'; compiled once instead of per value
_SAMPLE_LABEL = re.compile(r'^Sample #([0-9]+)$')


def _sample_number(value):
    """Return the digits of a 'Sample #<n>' label, or '' for any other value."""
    if pd.isna(value):
        return ''
    label = _SAMPLE_LABEL.match(str(value))
    return label.group(1) if label else ''


# Select the sessions that have a labview upload once, so the progress-bar total and the
# rows that are iterated come from the same selection
encounters_with_labview = encounter.loc[~encounter.labview_data.isnull()]

for index, row in tqdm(encounters_with_labview.iterrows(), desc='Labview Data', total=len(encounters_with_labview)):
    session_id = row['session']
    subject_id = row['subject_id']
    patient_id_hash = row['patient_id']
    encounter_id = row['encounter_id']

    # Determine the filename using the ID from last_modified_redcap if available
    if session_id in last_modified_redcap and 'labview_data' in last_modified_redcap[session_id]:
        file_id = last_modified_redcap[session_id]['labview_data']['id']
    else:
        file_id = 'unknown'
    filename = f'labview_data_{session_id}_id_{file_id}'

    # To debug a single session, add e.g. `if session_id != 167: continue` here
    # (or `if session_id not in [301, 79]: continue` for two samples)

    try:
        # now process every labview 2hz file
        new_filename, _ = delete_lines(f'labview_files/{filename}', lines=2)

        # Read the column names without loading the entire file so we can exclude Raw columns
        column_names = pd.read_table(new_filename, na_values=['---'], nrows=0).columns.tolist()
        filtered_columns = [col for col in column_names if 'Raw' not in col]

        df = pd.read_table(new_filename, na_values=['---'], usecols=filtered_columns, on_bad_lines='skip')

        # If timestamp is malformed, it will be set to NA
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df['Time Computer'] = pd.to_datetime(df['Time Computer'], errors='coerce')
            df['Date Computer'] = pd.to_datetime(df['Date Computer'], errors='coerce')

        # Combine 'Date Computer' and 'Time Computer' to create 'Timestamp'
        df['Timestamp'] = df.apply(lambda sample: pd.Timestamp.combine(sample['Date Computer'].date(), sample['Time Computer'].time()) if pd.notnull(sample['Date Computer']) and pd.notnull(sample['Time Computer']) else pd.NaT, axis=1)

        # Keep only the number of well-formed 'Sample #<n>' labels; everything else becomes ''
        df['Sample'] = df['Sample'].apply(_sample_number)

        raw_columns = [x for x in df.columns if 'Raw' in x]
        df = df.drop(columns=['Time Computer', 'Date Computer', 'Comments'] + raw_columns)

        # Remove rows that don't have an ETCO2, since these are extraneous rows
        # Some contain dates or other malformed information, so this helps to get rid of some of the dates
        df['ETCO2'] = pd.to_numeric(df['ETCO2'], errors='coerce')
        df = df.loc[df.ETCO2.notnull()]

        # Collapse runs of whitespace in column names to a single space
        df.columns = df.columns.str.replace(r'\s+', ' ', regex=True)

        # Coerce columns to formats: keep the first number in the cell, treat 0 as missing
        for col in ['REDACTED']:
            if col in df:
                df[col] = df[col].astype(str).str.extract(r'(\d+\.\d+|\d+)')
                df[col] = pd.to_numeric(df[col], errors='coerce')
                df[col] = df[col].replace(0, np.nan)

        # Drop rows where all values are NaN in the specified subset of columns
        subset_columns = [col for col in df.columns if col not in ['encounter_id', 'Timestamp', 'patient_id']]
        df = df.dropna(subset=subset_columns, how='all')

        # add encounter_id to df
        df['encounter_id'] = encounter_id

        # Removed code related to obfuscating device names in columns

        df_deid = df.rename(columns=deid_dict)

        # to_csv does not create missing directories, so make sure the per-prefix folder exists
        output_dir = f'output/waveforms/{encounter_id[:1]}'
        os.makedirs(output_dir, exist_ok=True)
        df_deid.to_csv(f'{output_dir}/{encounter_id}_2hz.csv', index=False)

    # Exception rather than BaseException: a failing file is logged and skipped, but
    # KeyboardInterrupt / SystemExit still stop the build
    except Exception as e:
        print(f'ERROR on {session_id}: {str(e)}')
        error_files[session_id] = str(e)
        continue

# Go back to ABG

### Create vitaldf and PIdf

`vital_df` is merged into the ABG table so that each blood gas sample carries the vital signs recorded with it, and `pi_df` feeds the device table so that the reference devices are included in the device SpO2 file.

In [None]:
def _columns_containing(frame, fragments):
    """Return the column names of *frame* that contain any of *fragments*, in frame order."""
    return [name for name in frame.columns if any(fragment in name for fragment in fragments)]


# Columns that identify one sample of one encounter
_sample_key_columns = ['patient_id', 'encounter_id', 'Sample', 'Timestamp']

# Vital signs per sample: the fixed gas/respiration columns plus every HR / SYS / DIA / MEAN column
vital_df = sample_agg_df.loc[
    :,
    _sample_key_columns
    + ['ETCO2', 'ETO2', 'ScalcO2', 'RR']
    + _columns_containing(sample_agg_df, ['HR', 'SYS', 'DIA', 'MEAN']),
]

# Saturation and perfusion per sample: every SpO2 / PI / Perf column
pi_df = sample_agg_df.loc[
    :,
    _sample_key_columns + _columns_containing(sample_agg_df, ['SpO2', 'PI', 'Perf']),
]


### Output public repository abg

In [None]:
# Attach the vital signs of the matching labview sample to every blood gas row.
# NOTE(review): the join pairs abg `sample` with labview `Sample`; the labview loop stores
# `Sample` as text — confirm both key columns share a dtype, otherwise nothing matches.
abg_public = abg.merge(vital_df, left_on=['patient_id', 'encounter_id', 'sample'], right_on=['patient_id', 'encounter_id', 'Sample'], how='left').drop(columns=['Sample'])
abg_public['type'] = 1
# Public column order: ids, blood gas measurements, then the labview vital signs.
# NOTE(review): `date` was copied from date_calc before the date skew was applied in the ABG
# cleaning cell — confirm that the un-skewed date is meant to be exported here.
abg_public = abg_public.loc[:, ['patient_id',
                  'encounter_id',
                  'type',
                  'date',
                  'time',
                  'sample',
                  'ph',
                  'pco2',
                  'po2',
                  'so2',
                  'cohb',
                  'methb',
                  'thb',
                  'k',
                  'na',
                  'ca',
                  'cl',
                  'glucose',
                  'lactate',
                  'p50',
                  'cbase',
                  'ETCO2',
                  'ETO2',
                  'ScalcO2',
                  'RR',
                 ]+[col for col in sample_agg_df.columns if  any(substring in col for substring in ['HR', 'SYS', 'DIA','MEAN'])]
]

# NOTE(review): `deid_dict` is not defined in the visible part of this notebook
abg_public.rename(columns=deid_dict, inplace=True)
abg_public.sort_values(by='encounter_id', inplace=True)
abg_public.to_csv('output/bloodgas.csv', index=False)

### Create devices-formatted labels for reference devices

To create the `devices.csv`, we want a df with columns `['device name','device id number','session','sample','spo2','perfusion index']`, for example.

For devices with spo2 manually recorded, this is taken care of later in the script.

However, we would also like the reference devices to be available in this table. To do this, we first need to:

- Extract the `device, sample, spo2, perfusion index` data from each session and sample, and label the devices.

In [None]:
# Columns to keep as identifier variables
id_vars = ['patient_id', 'encounter_id', 'Sample', 'Timestamp']

# Melt the SpO2 columns to long form: one row per (sample, device column)
melted_spO2_df = pd.melt(pi_df, id_vars=id_vars, value_vars=[col for col in sample_agg_df.columns if 'SpO2' in col],
                         var_name='Device', value_name='saturation')
# Keep only the part of the column name before the first '/'
melted_spO2_df['Device'] = melted_spO2_df['Device'].str.split('/').str[0]

# Same reshaping for the PI columns
melted_PI_df = pd.melt(pi_df, id_vars=id_vars, value_vars=[col for col in sample_agg_df.columns if 'PI' in col],
                       var_name='Device', value_name='pi')
melted_PI_df['Device'] = melted_PI_df['Device'].str.split('/').str[0]

# Outer-join the two long tables so each (sample, device) row carries both saturation and pi
reference_sat_pi = pd.merge(melted_spO2_df, melted_PI_df, on=id_vars + ['Device'], how='outer')

# NOTE(review): the `device` column computed here is overwritten by the dictionary lookup
# below, so this nested np.where (and its 9999 fallback) currently has no effect.
reference_sat_pi['device'] = np.where(reference_sat_pi.Device.str.contains('REDACTED'),
                               60,
                               np.where(reference_sat_pi.Device.str.contains('REDACTED'),
                                        64,
                                        np.where(reference_sat_pi.Device.str.contains('REDACTED'),
                                                 59,
                                                 np.where(reference_sat_pi.Device.str.contains('REDACTED'),
                                                 71,
                                                 9999
                                                ))))

reference_device_dict = {
    'REDACTED':'REDACTED'
}

# Raises KeyError for any Device value that is not a key of reference_device_dict
reference_sat_pi['device'] = reference_sat_pi['Device'].apply(lambda device: reference_device_dict[device])

# Long table of (encounter, probe site column, device) built from every *_device column
finger_map = encounter.filter(regex='_device$|encounter_id')
finger_map = pd.melt(finger_map, id_vars=['encounter_id'], var_name='probe_location_string', value_name='device')

probe_location_map = {
    'finger_l1_device': 1, 'finger_l2_device': 2, 'finger_l3_device': 3,
    'finger_l4_device': 4, 'finger_l5_device': 5,
    'finger_r1_device': 6, 'finger_r2_device': 7, 'finger_r3_device': 8,
    'finger_r4_device': 9, 'finger_r5_device': 10
}

# Map 'probe_location_string' to 'probe_location'; sites missing from the map (ear, forehead) become NaN
finger_map['probe_location'] = finger_map['probe_location_string'].map(probe_location_map)

# Look up where each reference device was placed during the encounter
reference_sat_pi = reference_sat_pi.merge(finger_map.loc[:, ['encounter_id', 'device', 'probe_location']], on=['encounter_id', 'device'], how='left')
reference_sat_pi['sample_number'] = reference_sat_pi.Sample
reference_sat_pi = reference_sat_pi.loc[:, ['encounter_id','Device', 'device', 'probe_location', 'sample_number', 'saturation', 'pi']]
reference_sat_pi

# Guards against the 9999 fallback of the np.where chain above
assert 9999 not in reference_sat_pi.device.values

In [None]:
reference_sat_pi

In [None]:
# Columns the encounter table is expected to provide, in the order they are checked:
# ids and skin tone first, then device / probe_id / version / diameter for each of the
# ten finger sites (l1..l5, r1..r5), then the remaining session fields.
_finger_sites = [f'finger_{hand}{digit}' for hand in 'lr' for digit in range(1, 6)]
_expected_encounter_columns = [
    'patient_id',
    'encounter_id',
    'age_at_encounter',
    'date',
    'time',
    'fitzpatrick',
    'monk_fingernail',
    'monk_dorsal',
    'monk_palmar',
    'monk_upper_arm',
    'monk_forehead',
    *[
        f'{site}_{suffix}'
        for suffix in ('device', 'probe_id', 'version', 'diameter')
        for site in _finger_sites
    ],
    'warming',
    'waveform_filename',
]

# Report every expected column that the encounter table lacks
for x in _expected_encounter_columns:
    if x in encounter.columns:
        continue
    print(f'{x} missing')

## Create Spectrophotometer CSV

In [None]:
konica = pd.read_csv('konica.csv')

konica = konica.drop(columns=['record_id', 'comment'])

# `upi` is the PHI participant id; name it like the matching key in participant_map
konica = konica.rename(columns={'upi':'patient_id_phi'})

konica['date'] = pd.to_datetime(konica.date)

# NOTE(review): `participant_map` is not defined in the visible part of this notebook;
# presumably it provides patient_id_hash and date_skew — confirm.
konica = konica.merge(participant_map,
                      on='patient_id_phi',
                      how='left')

# Attach the hashed encounter id by matching the PHI session id
konica = konica.merge(encounter_map,
                      left_on='session',
                      right_on='encounter_id_phi',
                      how='left')

# NOTE(review): konica_encounter_map is not used in the visible part of this notebook
konica_encounter_map = encounter_map.merge(konica.loc[:, ['patient_id_phi', 'session', 'date']],
                                       left_on='encounter_id_phi',
                                       right_on='session',
                                       how='left')

# De-identified measurement date: the real date shifted by date_skew days
konica['date_calc'] = konica.date + pd.to_timedelta(konica['date_skew'], unit='D')

# Hashed ids take over the public column names; renaming encounter_id_phi to `session`
# creates a second `session` column, which is removed further down
konica = konica.rename(columns={'patient_id_hash': 'patient_id',
                                'encounter_id_hash': 'encounter_id',
                                'encounter_id_phi': 'session',
                                'patient_id_phi': 'subject_id'})

# Fix typos
konica['group'] = konica['group'].str.replace('Dorsal - DIP (B)', 'Dorsal (B)', regex=False)
konica['group'] = konica['group'].str.replace('Palmar - Opposite DIP (C)', 'Palmar (C)', regex=False)
konica['group'] = konica['group'].str.replace('Inner Arn (D)', 'Inner Upper Arm (D)', regex=False)
konica['group'] = konica['group'].str.replace('Forehead (G)', 'Forehead (E)', regex=False)
# Drop rows whose group is the literal 'Group1'
konica = konica.loc[konica.group != 'Group1']

# remove duplicate session columns
konica = konica.loc[:, ~konica.columns.duplicated()]
konica_cleaned = konica

# Public columns, in export order: ids, group, date, colour indices, then km400 .. km700 in steps of 10
_konica_public_columns = [
    'patient_id',
    'encounter_id',
    'group',
    'date_calc',
    'melanin_index',
    'hb_index',
    'hb_so2_index',
    'hue',
    'value',
    'chroma',
    'lab_l',
    'lab_a',
    'lab_b',
    *[f'km{band}' for band in range(400, 701, 10)],
]

# Export only the rows that were matched to a hashed patient id
_has_patient_id = konica_cleaned.patient_id.notnull()
konica_cleaned = konica_cleaned.loc[_has_patient_id, _konica_public_columns]

# NOTE(review): this rename is applied to `konica`, not to `konica_cleaned`, so the exported
# file keeps the column name `date_calc` — confirm which frame was meant.
konica.rename(columns={'date_calc': 'date'}, inplace=True)


konica_cleaned = konica_cleaned.sort_values(by='encounter_id')
konica_cleaned.to_csv('output/spectrophotometer.csv', index=False)

## Create Devices CSV

In [None]:
devices = pd.read_csv('devices.csv')

# Only these columns are exported; the commented-out entries (manufacturer, model, brand)
# are deliberately left out of the selection
devices_save = devices.loc[:, [
    'device_number',
    # 'manufacturer',
    # 'model',
#    'brand',
    'device_type',
    'light_type'
]]

devices_save.to_csv('output/devices.csv', index=False)
devices_save.head()

In [None]:
devices.head()

## Create Device Performance CSV

# Generate WFDB Files

## Convert LabView Files to WFDB

In [None]:
# The whole conversion is skipped unless build_args['convert_to_wfdb'] is True
if build_args['convert_to_wfdb'] == True:
    import wfdb
    apprise('Converting Labview to wfdb')

    # The relative paths used below are resolved against the notebook directory
    os.chdir(BASE_DIR)

    # Wipe RECORD file (opening in 'w' mode truncates it; one line per record is appended later)
    with open("output/RECORDS", 'w') as file:
        file.write(f'')

    def extract_recording_rate(tsv_file):
        """Return the recording rate stored in a new-format TSV file, or None.

        The new format has a header row whose first three cells contain
        'Start Date', 'Start Time' and 'Recording Rate'; the rate is the first
        whitespace-separated token of the third cell of the row that follows
        (e.g. '250' for '250 Hz'). Files without such a header give None.
        """
        expected_labels = ('Start Date', 'Start Time', 'Recording Rate')
        with open(tsv_file, 'r') as handle:
            rows = csv.reader(handle, delimiter='\t')
            for cells in rows:
                if len(cells) < 3:
                    continue
                if not all(label in cell for label, cell in zip(expected_labels, cells)):
                    continue
                # The values belonging to the header sit on the next row
                values = next(rows, None)
                if values:
                    return values[2].split()[0]
        return None

    time_list = []
    frequency_dict = {}
    last_written_record = None  # most recent record that wrsamp completed for
    for filename, metadata in tqdm(labview_filenames.items()):

        # Sessions without a known encounter fall back to the record name 'Z'
        metadata['encounter_id'] = encounter_dict.get(metadata.get('session_id'), 'Z')
        record_name = metadata['encounter_id']

        try:
            recording_rate = extract_recording_rate(f'labview_files/{filename}')
            if recording_rate is not None:
                # New file format:  skip 2 header rows
                labview = pd.read_csv(f'labview_files/{filename}', sep='\t', encoding = 'unicode_escape', on_bad_lines='skip', skiprows=2)
            else:
                # Old file format
                labview = pd.read_csv(f'labview_files/{filename}', sep='\t', encoding = 'unicode_escape', on_bad_lines='skip')
        except Exception:
            # Skip this file: carrying on would reuse `labview` from the previous file and
            # write that data under this record name
            print('error reading labview file:', filename, metadata)
            error_files[filename] = 'Labview read error'
            continue

        if recording_rate is None:
            # Old file format: every column header carries the rate, e.g. 'SpO2 (250 Hz)'
            rate_matches = [re.search(r'(\d+) Hz', col) for col in labview.columns]
            if not rate_matches or not all(rate_matches):
                print(f'ERROR:  {labview.columns}')
                error_files[filename] = 'WFDB conversion error'
                continue
            frequencies = [int(found.group(1)) for found in rate_matches]
            # A single WFDB record has one sampling frequency, so mixed rates stop the build
            if len(set(frequencies)) != 1:
                raise ValueError(f'{filename}: columns report different sampling rates {frequencies}')
            frequency_dict[record_name] = frequencies[0]
        else:
            frequency_dict[record_name] = recording_rate

        labview.columns = [re.sub(r' \(\d+ Hz\)', '', col) for col in labview.columns]

        try:
            if recording_rate is None:
                sampling_rate = frequencies[0]
                # Old file format
                # Change columns with strings in them to NAs, since apparently the column headers are randomly re-inserted
                labview = labview.apply(pd.to_numeric, errors='coerce')
                labview = labview.dropna()
            else:
                # New file format: the rate comes from the file header as text ('250')
                sampling_rate = float(recording_rate)
                if sampling_rate.is_integer():
                    sampling_rate = int(sampling_rate)
                labview = labview.drop(columns=['Time'])

            wfdb.io.wrsamp(record_name,
                        fs=sampling_rate,
                        units=['%', 'mmHg', 'mmHg', 'mV'],
                        sig_name=labview.columns.to_list(),
                        p_signal=labview.to_numpy().astype(np.float32),
                        write_dir=f'output/waveforms/{record_name[:1]}'
                        )

            with open("output/RECORDS", 'a') as file:
                file.write(f'waveforms/{record_name[:1]}/{record_name}\n')
            time_list.append(len(labview)/sampling_rate)
            last_written_record = record_name
            print(f'Finished processing labview file for session {metadata.get("session_id")}, encounter {record_name}')
        except Exception as e:
            # Log what was being converted, then stop the build (as before)
            print(f'ERROR:  Processing {filename}\nData {labview.to_numpy()}\nError: {e}')
            print(isinstance(labview.to_numpy(), np.floating))
            print(type(labview.to_numpy()))
            print(labview.columns)
            error_files[filename] = 'WFDB conversion error'
            raise

    print(f'Total labview waveform duration: {sum(time_list)/3600:.2f} hours')

    # Read the last written record back as a sanity check; skipped when nothing was written
    if last_written_record is not None:
        os.chdir(os.path.join(BASE_DIR, f'output/waveforms/{last_written_record[:1]}'))
        wfdb.io.rdsamp(last_written_record)

## Convert PPG to WFDB

In [None]:
# The whole conversion is skipped unless build_args['convert_to_wfdb'] is True
if build_args['convert_to_wfdb'] == True:
    import wfdb  # imported here as well so this cell does not depend on the labview cell
    apprise('converting ppg to wfdb')
    os.chdir(BASE_DIR)
    ppg_sampling_rate = 86  # Hz, used for the record header and the duration estimate
    time_list = []
    for filename, metadata in tqdm(ppg_filenames.items()):
        # Sessions without a known encounter fall back to the record name 'Z'
        metadata['encounter_id'] = encounter_dict.get(metadata.get('session_id'), 'Z')
        record_name = metadata['encounter_id']
        try:
            # From here on `filename` is the path of the trimmed copy
            (filename, legacy) = delete_lines(f'raw_ppg_files/{filename}')
        except IndexError:
            print(f'Processing {filename} for {record_name}')
            print(f'ERROR:  file {filename} has no content')
            error_files[filename] = 'No content in file'
            continue

        print(f'Processing {filename} for {record_name}.  Legacy is {legacy}.')

        # Legacy files are comma-separated, current files are tab-separated
        try:
            ppg = pd.read_csv(filename, low_memory=False, on_bad_lines='warn', sep=',' if legacy else '\t')
        except Exception as e:
            # Skip this file: carrying on would reuse `ppg` from the previous file and
            # write that data under this record name
            print(f'ERROR:  Could not read {filename}\nError is {e}')
            error_files[filename] = 'PPG read error'
            continue

        # A missing or malformed timestamp is logged, but the record is still written (without base_time)
        try:
            if legacy:
                first_timestamp = float(ppg['timestamp'].min()) # Unix epoch, in milliseconds
                first_timestamp = datetime.datetime.fromtimestamp(first_timestamp/1000).time()
            else:
                first_timestamp = datetime.datetime.strptime(ppg['Time'].min(), '%H:%M:%S.%f').time()
        except Exception as e:
            print(f'ERROR:  Could not get timestamp from {filename}\nColumns are:  {ppg.columns}\nError is {e}')
            error_files[filename] = 'Timestamp extraction error'
            first_timestamp = None

        # The two layouts differ only in how the IR and red channels are named
        signal_columns = ['LEDC1_PD1', 'LEDC2_PD1'] if legacy else ['IR Signal', 'Red Signal']

        try:
            wfdb.io.wrsamp(f'{record_name}_ppg',
                        fs=ppg_sampling_rate,
                        units=['counts', 'counts'],
                        sig_name = ['IR', 'RED'],
                        d_signal=ppg.loc[:, signal_columns].dropna(how='all').fillna(0).astype('int').to_numpy(),
                        fmt=['32', '32'],
                        adc_gain=[1, 1],
                        baseline=[0, 0],
                        comments = ['PPG IR LED', 'PPG RED LED'],
                        base_time = first_timestamp,
                        write_dir = f'output/waveforms/{record_name[:1]}'
                        )

            with open("output/RECORDS", 'a') as file:
                file.write(f'waveforms/{record_name[:1]}/{record_name}_ppg\n')
            time_list.append(len(ppg)/ppg_sampling_rate)
        except KeyError as e:
            print(f'ERROR:  File {filename} is missing some keys.\nColumns are:  {ppg.columns}\nError is {e}')
            error_files[filename] = 'PPG conversion error'

    print(f'Total PPG waveform duration: {sum(time_list)/3600:.2f} hours')

In [None]:
os.chdir(BASE_DIR)

# View Logs

In [None]:
# Send how many items failed with each error message
error_counts = pd.Series(error_files).value_counts()
apprise(f'Error files:\n{error_counts}')

# Send, for every error message, the list of sessions/files that hit it
from collections import defaultdict
items_by_error = defaultdict(list)
for failed_item, error_message in error_files.items():
    items_by_error[error_message].append(failed_item)

# Plain dict so the notification shows {...} rather than defaultdict(...)
inverted_dict = dict(items_by_error)
apprise(str(inverted_dict))

In [None]:
# Collect the build logs in one dictionary: name -> {payload, 'description'}
# NOTE(review): `abg_log_df` and `labview_samples` are not defined in the visible part of this notebook.
log_files = {}
log_files['abg_log'] = {'log': abg_log_df, 'description': 'output log of threesamples cleaning'}
log_files['error_files'] = {'log': error_files, 'description': 'errors that occurred during the build'}
log_files['build_args'] = {'log': build_args, 'description': 'arguments used to build the data'}
log_files['labview_df_metadata'] = {'log': labview_df_metadata, 'description': 'how many samples were kept or rejected per session with this cleaning'}
# NOTE(review): this entry stores its payload under 'df' while all the others use 'log' —
# confirm that whatever reads log_files handles both keys.
log_files['labview_samples'] = {'df': labview_samples, 'description': 'simplified sao2/spo2 pairing for each sample and session'}