`config.py` must define the following variable:
```
config = {
    'api_token_data_entry': '',
    'api_token_encounter': '',
    'api_token_konica': '',
    'api_token_participant': '',
    'api_token_abg': '',
    'api_token_devices': '',
    'api_url': 'https://redcap.ucsf.edu/api/'
}
```

Definitions

- `record_id` is the unique identifier for each patient in the REDCap database.
- `session_id` is the unique identifier for each session in the REDCap database.

And for the de-identified files:
- `patient_id` is the de-identified version of `record_id`.
- `encounter_id` is the de-identified version of `session_id`.


# Imports

In [None]:
import csv
import datetime
import glob
import json
import hashlib
import io
import os
import re
import shutil
import time
import warnings
import plotly.express as px
import pdb
import getpass
import openox.hypoxialab_functions as ox
user = getpass.getuser()

import requests
from config import config
import numpy as np
import pandas as pd
# import wfdb
from tqdm.notebook import tqdm, trange

In [None]:
# Options that control this notebook run.
build_args = dict(
    # When True, waveform files are requested from REDCap unless a file with
    # the expected name (which embeds the upload's file ID) already exists.
    download_waveforms=True,
    # Number of seconds to average over when extracting data from LabVIEW data.
    average_over=0,
    # Pause in seconds used to spare the REDCap API; only relevant together
    # with download_waveforms=True.
    sleep_time=0,
)

In [None]:
# Project-local exclusion tables; neither name is referenced in the cells
# visible in this part of the notebook.
from exclude_unclean import drop_dict_after_algo, loc_dict_after_algo

# Show which config entries were loaded (key names only, not the token values).
print(config.keys())

# Base payload shared by the REDCap record-export requests; each download
# cell sets 'token' on it before posting.
fields = dict(
    content='record',
    format='csv',
    action='export',
    returnFormat='csv',
    arms='',
)

# Directory part of the notebook's absolute path, resolved against the
# working directory at the time this cell runs.
BASE_DIR = os.path.split(os.path.abspath('download_redcap.ipynb'))[0]

In [None]:
# Notification setup: `apprise` is first imported as a module and an Apprise
# object is created from it.
import apprise
apobj = apprise.Apprise()

# NOTE(review): `user` comes from getpass.getuser(), so `user in [None]` looks
# like a placeholder for a list of usernames — confirm. As written, the else
# branch is the one that runs.
if user in [None]:
    pass
else:
    # Rebinds the name `apprise` from the imported module to a plain function
    # that only prints the message; `attach_file` is accepted but ignored.
    def apprise(message, attach_file=None):
        print(message)

In [None]:
# Announce the start of the run and record the wall-clock start time.
apprise('Downloading new repository files:')
start = time.time()

# Download file logs

In [None]:
# Download the REDCap record log of the encounter project and cache it on
# disk as 'file_log.csv'.
data = {
    'token': config['api_token_encounter'],
    'content': 'log',
    'logtype': 'record',
    'user': '',
    'record': '',
    'beginTime': 'REDACTED',
    'endTime': '',
    'format': 'csv',
    'returnFormat': 'json'
}

# Use the configured endpoint, as the record-export cells do, instead of a
# second hard-coded copy of the URL.
r = requests.post(config['api_url'], data=data)

if r.status_code == 200:
    file_path = 'file_log.csv'

    # Write as UTF-8 explicitly: pandas reads CSV files as UTF-8 by default,
    # whereas open() without `encoding` uses the platform's locale encoding.
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    # Best effort: the read below then loads whatever file_log.csv is already
    # on disk (or raises FileNotFoundError if there is none).
    print('Failed to retrieve the CSV file.')

file_log = pd.read_csv('file_log.csv')

In [None]:
# Reload the cached REDCap log export (columns include timestamp, details
# and record) and reduce it to the rows that describe file uploads.
file_log = pd.read_csv('file_log.csv')

# Rows with no 'details' text cannot mention a file field, so discard them,
# then keep only the three columns used below.
file_log = file_log.dropna(subset=['details'])[['timestamp', 'details', 'record']]

# One regex per file field; the capture group is the numeric ID that follows
# the field name in the 'details' text.
patterns = {
    'raw_ppg_file': r"raw_ppg_file = '(\d+)'",
    'labview_raw': r"labview_raw = '(\d+)'",
    'lil_stevie_upload': r"lil_stevie_upload = '(\d+)'",
    'labview_data': r"labview_data = '(\d+)'"
}

# Add one column per file field holding the captured ID (NaN when the row
# does not mention that field).
for col, pattern in patterns.items():
    file_log[col] = file_log['details'].str.extract(pattern)

# Discard log rows that mention none of the file fields.
file_log = file_log.dropna(subset=patterns.keys(), how='all')

file_log.head()

The goal is now to build a dictionary from the contents of `file_log` in which each key is a `record` value from the log (the encounter project's `record_id`, i.e. the session) and each value maps the file fields uploaded for that session to the timestamp and file ID of a logged upload.

In [None]:
# Map every record (session) to the newest logged upload of each file field:
#   {record_id: {field: {'timestamp': ..., 'id': ...}}}
last_modified_redcap = {}

# Order the log newest-first explicitly rather than relying on the row order
# of the export. The sort is stable, so rows that share a timestamp keep
# their export order.
newest_first = file_log.sort_values(
    'timestamp', key=pd.to_datetime, ascending=False, kind='stable'
)

# groupby keeps each group's rows in the order of `newest_first`
for record_id, record_df in newest_first.groupby('record'):
    # Same fields as the keys of `patterns`
    for col in ['raw_ppg_file', 'labview_raw', 'lil_stevie_upload', 'labview_data']:
        # Only the rows that carry an ID for this field
        recent_record = record_df[record_df[col].notna()]
        if recent_record.empty:
            continue
        # First row is the most recent upload because of the sort above
        last_modified_redcap.setdefault(record_id, {})[col] = {
            'timestamp': recent_record['timestamp'].iloc[0],
            'id': recent_record[col].iloc[0]
        }

# Spot-check one record (58); .get() shows nothing instead of raising
# KeyError when that record is not in the log.
last_modified_redcap.get(58)

In [None]:
# Spot-check record 86; .get() shows nothing instead of raising KeyError
# when that record is not in the log.
last_modified_redcap.get(86)

In [None]:
# Persist the per-record upload index as JSON in the working directory.
with open('last_modified_redcap.json', 'w') as json_out:
    json_out.write(json.dumps(last_modified_redcap))

# Download: Create CSVs from Redcap

In [None]:
# Encounter Data: export the encounter project's records to encounters.csv
fields['token'] = config['api_token_encounter']
r = requests.post(config['api_url'], data=fields)

if r.status_code == 200:
    file_path = 'encounters.csv'

    # Write as UTF-8 explicitly: pandas reads CSV files as UTF-8 by default,
    # whereas open() without `encoding` uses the platform's locale encoding.
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    print('Failed to retrieve the CSV file.')

# Loaded from disk either way, so after a failed request this is whatever
# encounters.csv was already present.
encounter = pd.read_csv('encounters.csv')

In [None]:
# Participant Data: export the participant project's records to participants.csv
fields['token'] = config['api_token_participant']
r = requests.post(config['api_url'], data=fields)

if r.status_code == 200:
    file_path = 'participants.csv'

    # Write as UTF-8 explicitly: pandas reads CSV files as UTF-8 by default,
    # whereas open() without `encoding` uses the platform's locale encoding.
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    print('Failed to retrieve the CSV file.')

# Loaded from disk either way, so after a failed request this is whatever
# participants.csv was already present.
participants = pd.read_csv('participants.csv')

In [None]:
# Pulse Ox Data: export the data-entry project's records to pulseoximeter.csv
fields['token'] = config['api_token_data_entry']
r = requests.post(config['api_url'], data=fields)

if r.status_code == 200:
    file_path = 'pulseoximeter.csv'

    # Write as UTF-8 explicitly so the file's encoding does not depend on the
    # platform's locale (open() without `encoding` uses the locale default).
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    print('Failed to retrieve the CSV file.')

In [None]:
# Konica Data: export the Konica project's records to konica.csv
fields['token'] = config['api_token_konica']
r = requests.post(config['api_url'], data=fields)

if r.status_code == 200:
    file_path = 'konica.csv'

    # Write as UTF-8 explicitly so the file's encoding does not depend on the
    # platform's locale (open() without `encoding` uses the locale default).
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    print('Failed to retrieve the CSV file.')

In [None]:
# ABG Data: export the ABG project's records to abg.csv
fields['token'] = config['api_token_abg']
r = requests.post(config['api_url'], data=fields)

if r.status_code == 200:
    file_path = 'abg.csv'

    # Write as UTF-8 explicitly so the file's encoding does not depend on the
    # platform's locale (open() without `encoding` uses the locale default).
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    print('Failed to retrieve the CSV file.')

In [None]:
# Device Data: export the devices project's records to devices.csv
fields['token'] = config['api_token_devices']
r = requests.post(config['api_url'], data=fields)

if r.status_code == 200:
    file_path = 'devices.csv'

    # Write as UTF-8 explicitly so the file's encoding does not depend on the
    # platform's locale (open() without `encoding` uses the locale default).
    with open(file_path, 'w', newline='', encoding='utf-8') as file:
        file.write(r.text)

    print('CSV file saved successfully.')
else:
    print('Failed to retrieve the CSV file.')

# Download: Local copy of LabView Files

In [None]:

# Per-session record of what happened to each waveform file during this run,
# shaped as {record_id: {field: status}}; it is summarised near the end of
# the notebook.
waveform_log = {}

def check_and_log_download(record_id, col):
    """Mark file field `col` of session `record_id` as 'Downloaded' in waveform_log."""
    session_entry = waveform_log.setdefault(record_id, {})
    session_entry[col] = 'Downloaded'

## Download Labview Raw files

In [None]:
# Progress notification for the LabVIEW raw download stage.
apprise('Downloading Labview Raw')

In [None]:
# Maps a session's record_id to a short description of its latest download
# failure; filled by the download cells and reported after them.
error_files = {}

In [None]:
# Local directory for LabVIEW files; created if it does not exist yet.
os.makedirs('labview_files', exist_ok=True)

# filename -> {'session_id', 'subject_id'} for each LabVIEW raw file.
labview_filenames = {}

# Encounter rows that have a LabVIEW raw upload.
labview_raw_iterable = encounter.loc[encounter.labview_raw.notnull()]

def file_exists_with_id(directory, session_id, subject_id, file_id):
    """Return whether the LabVIEW raw file for this session, subject and file ID exists in `directory`."""
    candidate = os.path.join(
        directory,
        f'labview_raw_session_{session_id}_subject_{subject_id}_id_{file_id}',
    )
    return os.path.exists(candidate)

# Register, and when enabled download, the LabVIEW raw file of every
# encounter row that has one.
for index, row in tqdm(labview_raw_iterable.iterrows(), desc='Labview Raw', total=len(labview_raw_iterable)):
    session_id = row['record_id']
    subject_id = row['patient_id']

    # The ID of the logged upload goes into the filename, so a different ID
    # yields a different name and therefore a fresh download; 'unknown' is
    # used when the log has no entry for this session and field.
    if session_id in last_modified_redcap and 'labview_raw' in last_modified_redcap[session_id]:
        file_id = last_modified_redcap[session_id]['labview_raw']['id']
    else:
        file_id = 'unknown'
    filename = f'labview_raw_session_{session_id}_subject_{subject_id}_id_{file_id}'

    # Record the expected filename whether or not it gets downloaded
    labview_filenames[filename] = {
        'session_id': session_id,
        'subject_id': subject_id,
    }

    if not build_args['download_waveforms']:
        continue

    # Existence is checked inline because the helper name file_exists_with_id
    # is rebound to a PPG-specific version later in this notebook.
    if os.path.exists(os.path.join('labview_files', filename)):
        # Merge into the session's entry instead of replacing it, so statuses
        # already logged for this session are kept.
        waveform_log.setdefault(session_id, {})['labview_raw'] = 'Skipped'
        continue

    # Throttle only when a request is actually about to be sent
    time.sleep(build_args['sleep_time'])

    data = {
        'token': config['api_token_encounter'],
        'content': 'file',
        'action': 'export',
        'record': session_id,
        'field': 'labview_raw',
        'event': '',
        'returnFormat': 'csv'
    }

    # Configured endpoint, as used by the record-export cells
    r = requests.post(config['api_url'], data=data)

    if r.status_code == 200:
        # Save the raw response bytes under the ID-bearing filename
        with open(f'labview_files/{filename}', 'wb') as f:
            f.write(r.content)

        print(filename)

        # Mark the file as downloaded for the end-of-run summary
        check_and_log_download(session_id, 'labview_raw')
    else:
        print(f'FAILURE to write {filename}')
        error_files[session_id] = 'Labview Raw write error'

## Download Labview Data files

In [None]:
# Progress notification for the LabVIEW data download stage.
apprise('Downloading LabView Data')

In [None]:
def data_file_exists_with_id(directory, session_id, subject_id, file_id, filetype):
    """Return whether the expected LabVIEW file is present in `directory`.

    `filetype` selects the filename pattern:
      - '2hz': labview_data_{session_id}_id_{file_id}
      - 'ppg': labview_raw_session_{session_id}_subject_{subject_id}_id_{file_id}

    Raises:
        ValueError: if `filetype` is neither '2hz' nor 'ppg' (previously this
            surfaced as an UnboundLocalError on `filename`).
    """
    if filetype == '2hz':
        filename = f'labview_data_{session_id}_id_{file_id}'
    elif filetype == 'ppg':
        filename = f'labview_raw_session_{session_id}_subject_{subject_id}_id_{file_id}'
    else:
        raise ValueError(f"unknown filetype {filetype!r}; expected '2hz' or 'ppg'")
    return os.path.exists(os.path.join(directory, filename))


In [None]:
# Download the LabVIEW data file (the '2hz' filetype) of every encounter row
# that has one, unless a file with the expected name already exists.
labview_data_iterable = encounter.loc[~encounter.labview_data.isnull()]
for index, row in tqdm(labview_data_iterable.iterrows(), desc='Labview Data', total=len(labview_data_iterable)):
    session_id = row['record_id']
    subject_id = row['patient_id']

    # The ID of the logged upload goes into the filename; 'unknown' is used
    # when the log has no entry for this session and field.
    if session_id in last_modified_redcap and 'labview_data' in last_modified_redcap[session_id]:
        file_id = last_modified_redcap[session_id]['labview_data']['id']
    else:
        file_id = 'unknown'
    filename = f'labview_data_{session_id}_id_{file_id}'

    if not build_args['download_waveforms']:
        continue

    if data_file_exists_with_id('labview_files', session_id, subject_id, file_id, '2hz'):
        # setdefault: the session may have no entry yet (for example no
        # LabVIEW raw upload, or a failed raw download), in which case a
        # plain waveform_log[session_id].update(...) raises KeyError.
        waveform_log.setdefault(session_id, {})['labview_data'] = 'Skipped'
        continue

    # Throttle only when a request is actually about to be sent
    time.sleep(build_args['sleep_time'])

    print('Downloading:', session_id)
    print(f'Downloading {filename}')
    data = {
        'token': config['api_token_encounter'],
        'content': 'file',
        'action': 'export',
        'record': session_id,
        'field': 'labview_data',
        'event': '',
        'returnFormat': 'csv'
    }

    # Kept from the original cell; not read anywhere in the visible cells
    timenow = datetime.datetime.now()
    # Configured endpoint, as used by the record-export cells
    r = requests.post(config['api_url'], data=data)

    if r.status_code == 200:
        # `with` closes the handle even if the write fails
        with open(f'labview_files/{filename}', 'wb') as f:
            f.write(r.content)
        check_and_log_download(session_id, 'labview_data')
    else:
        print(f'FAILURE to write {filename}')
        error_files[session_id] = 'labview data write error'

## Download raw PPG files

In [None]:
# Progress notification for the raw PPG download stage.
apprise('Downloading PPG')

In [None]:
# Local directory for raw PPG files; created if it does not exist yet.
os.makedirs('raw_ppg_files', exist_ok=True)

# filename -> {'session_id', 'subject_id'} for each raw PPG file.
ppg_filenames = {}

def file_exists_with_id(directory, session_id, subject_id, file_id):
    """Return whether the raw PPG file for this session and file ID exists in `directory`.

    `subject_id` is accepted but is not part of the PPG filename.
    """
    candidate = os.path.join(directory, f'raw_ppg_session_{session_id}_id_{file_id}')
    return os.path.exists(candidate)

def download_and_save_ppg(session_id, field, filename):
    """Request file field `field` of `session_id` from REDCap and store it in raw_ppg_files/.

    On HTTP 200 the response bytes are written to `filename`, the download is
    logged under 'raw_ppg_file' and True is returned. Otherwise the response
    text is printed, the session is added to error_files and False is returned.
    """
    response = requests.post(
        'https://redcap.ucsf.edu/api/',
        data={
            'token': config['api_token_encounter'],
            'content': 'file',
            'action': 'export',
            'record': session_id,
            'field': field,
            'event': '',
            'returnFormat': 'csv'
        },
    )

    if response.status_code != 200:
        print(response.text)
        print(f'FAILURE to write {filename}')
        error_files[session_id] = 'PPG write error'
        return False

    destination = os.path.join('raw_ppg_files', filename)
    with open(destination, 'wb') as out:
        out.write(response.content)
    # Logged under 'raw_ppg_file' regardless of which field was requested
    check_and_log_download(session_id, 'raw_ppg_file')
    return True

def process_ppg_data(row, field):
    """Register, and when enabled download, the PPG file that `field` holds for one encounter row.

    Returns 'Downloaded' or 'Skipped' when downloading is enabled and the file
    was fetched or already present; returns None when downloading is disabled
    or the download failed.
    """
    session_id = row['record_id']
    subject_id = row['patient_id']

    # File ID of the logged upload for this field, or 'unknown' if none is logged
    logged_fields = last_modified_redcap.get(session_id, {})
    file_id = logged_fields[field]['id'] if field in logged_fields else 'unknown'
    filename = f'raw_ppg_session_{session_id}_id_{file_id}'

    ppg_filenames[filename] = {
        'session_id': session_id,
        'subject_id': subject_id,
    }

    if not build_args['download_waveforms']:
        return None

    time.sleep(build_args['sleep_time'])
    print(session_id, subject_id, file_id)
    print(file_exists_with_id('raw_ppg_files',session_id,subject_id,file_id))

    if file_exists_with_id('raw_ppg_files', session_id, subject_id, file_id):
        waveform_log.setdefault(session_id, {}).update({'raw_ppg_file': 'Skipped'})
        return 'Skipped'

    if download_and_save_ppg(session_id, field, filename):
        return 'Downloaded'
    return None

# Original PPG uploads: every encounter row with a value in raw_ppg_file.
ppg_raw_iterable = encounter.loc[encounter.raw_ppg_file.notnull()]
for _, row in tqdm(ppg_raw_iterable.iterrows(), desc='Raw PPG Original', total=ppg_raw_iterable.shape[0]):
    process_ppg_data(row, 'raw_ppg_file')

# Stevie PPG uploads: every encounter row with a value in lil_stevie_upload.
stevie_ppg_raw_iterable = encounter.loc[encounter.lil_stevie_upload.notnull()]
for _, row in tqdm(stevie_ppg_raw_iterable.iterrows(), desc='Raw PPG Stevie', total=stevie_ppg_raw_iterable.shape[0]):
    process_ppg_data(row, 'lil_stevie_upload')

In [None]:
# sanity check that the last session’s PPG values as stored in the wfdb file match the raw data
# may error when there are new sessions

# assert all(ppg.loc[:, ['LEDC1_PD1', 'LEDC2_PD1']].dropna(how='all').astype('int').to_numpy()[1] == wfdb.io.rdsamp('631ee6ac1403358a321fda223606008eb2063d797b8be301b216e18eff0dcf1c_ppg')[0][1])

In [None]:
# Switch the working directory to the one recorded in BASE_DIR.
os.chdir(BASE_DIR)

In [None]:
# Send one notification with the number of failed sessions per error message.
if error_files:
    error_counts = pd.Series(error_files).value_counts()
    apprise('Error files:' + '\n' + str(error_counts))

# Group the failed sessions by error message: {message: [record_id, ...]}.
from collections import defaultdict
inverted_dict = defaultdict(list)
for key, value in error_files.items():
    inverted_dict[value].append(key)

# Plain dict from here on, so a missing message raises KeyError again.
inverted_dict = dict(inverted_dict)


In [None]:
# Summarise this run's waveform downloads and notify if anything was fetched.
# Rows are sessions, columns are the file fields that were logged.
waveform_log_df = pd.DataFrame.from_dict(waveform_log).T


def _count_downloaded(column):
    """Return how many sessions have status 'Downloaded' in `column`.

    Returns 0 when the column is absent, which happens when nothing was
    logged for that field (for example with download_waveforms disabled).
    Attribute access on the frame would raise AttributeError in that case.
    """
    if column not in waveform_log_df.columns:
        return 0
    return int((waveform_log_df[column] == 'Downloaded').sum())


lab_data_download_count = _count_downloaded('labview_data')
lab_raw_download_count = _count_downloaded('labview_raw')
raw_ppg_download_count = _count_downloaded('raw_ppg_file')

# Stay silent when nothing new was downloaded
if lab_data_download_count > 0 or lab_raw_download_count > 0 or raw_ppg_download_count > 0:
    apprise(f"Download counts detected:\n"
        f"Lab Data: {lab_data_download_count}\n"
        f"Lab Raw: {lab_raw_download_count}\n"
        f"Raw PPG: {raw_ppg_download_count}")

In [None]:
# Write labview_filenames and ppg_filenames to disk
def write_dict_to_file(filename, data):
    """Write `data` to `filename` as JSON with 4-space indentation, replacing any existing file."""
    with open(filename, 'w') as out:
        out.write(json.dumps(data, indent=4))

# Persist both filename indexes as JSON in the working directory.
for target, mapping in (
    ('labview_filenames.json', labview_filenames),
    ('ppg_filenames.json', ppg_filenames),
):
    write_dict_to_file(target, mapping)