In [2]:
# Import necessary libraries
import heartpy as hp
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob, os, gzip, json
from scipy.stats import zscore
import seaborn as sns
import neurokit2 as nk

# Set the base directory containing your data
# EDIT the following line
base_dir = '/data/MoodGroup/code/Taliah/Test_Subjects_Physio_Input/bids/sourcedata'

# Use glob to find all .tsv.gz files that match the pattern in the base directory
# Adjust the pattern if your file naming convention is different
# EDIT the following line
tsv_fns = glob.glob(os.path.join(base_dir,'*','*','func', '*rest*physio*.tsv.gz'))

# Initialize lists to store data
all_srate = []
all_bpm = []
all_dat = []
all_nope = []

# EDIT the following line
# Sample rate to use if JSON file is not found
default_srate = 500  # Change this to your consistent sampling rate
#if you do not have a consistent sampling rate or JSON file with the rates, you must make a JSON file with rates

# Output directory
# EDIT the following line
output_dir = '/data/MoodGroup/code/Taliah/Test_Subjects_Physio_Output'
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it doesn't exist 

# Loop through each found file
for idx, fn in enumerate(tsv_fns):
    base_fn = os.path.basename(fn)
    print(f"Processing file: {base_fn}")

    # Try to read the corresponding JSON file to get the sampling frequency
    json_fn = fn.replace('tsv.gz', 'json')
    if os.path.exists(json_fn):
        # JSON file found, use it to get the sampling rate
        with open(json_fn, 'r') as fid:
            json_dat = json.load(fid)
        srate = json_dat['SamplingFrequency']
    else:
        # JSON file not found, use the default sampling rate
        srate = default_srate
        print(f"JSON file not found for {base_fn}. Using default sampling rate: {srate}")

    all_srate.append(srate)

    # Read the .tsv.gz file into a DataFrame
    #the file uses tabs to seperate, if that does not match your file please edit sep= to correspond to your data
    #note: our file does not have a header and has three columns ppg(hr using photoplethysmography), rsp(respiratory rate) and trig(trigger signals)
    dat = pd.read_csv(fn, compression='gzip', sep='\t', header=None, names=["ppg", "rsp", "trig"])

    try:
        # Process the PPG and RSP signals
        ppg_signals, ppg_info = nk.ppg_process(dat["ppg"], sampling_rate=srate)
        rsp_signals, rsp_info = nk.rsp_process(dat["rsp"][1:], sampling_rate=srate)

        # Extract interval-related measurements
        ppg_meas = nk.ppg_intervalrelated(ppg_signals)
        rsp_meas = nk.rsp_intervalrelated(rsp_signals)

        all_dat.append((ppg_signals, rsp_signals))

        # Parse the file name to get subject, session, and task information
        bids_bits = base_fn.split('_')

        # Handle the measurements
        tmp = [ii if isinstance(ii, np.float64) else ii[0] for idx, ii in enumerate(rsp_meas.values[0])]
        all_bpm.append(bids_bits[:-1] + [srate] + list(ppg_meas.values[0]) + tmp)

        # Print the results for the current file
        print(f"{idx}: {base_fn} - PPG Rate Mean: {ppg_meas['PPG_Rate_Mean'].values[0]}, RSP Rate Mean: {rsp_meas['RSP_Rate_Mean'].values[0]}")

    except Exception as e:
        print(f"{idx}: {base_fn} - Error: {e}")
        all_nope.append((fn, dat))

# Create a DataFrame of the collected measurements
# The columns are derived from the keys of the PPG and RSP measurement dictionaries
columns = ['sub', 'ses', 'task', 'run', 'echo', 'srate'] + list(ppg_meas.keys()) + list(rsp_meas.keys())
df = pd.DataFrame(all_bpm, columns=columns)

# Save the DataFrame to a CSV file in the specified directory
output_file = os.path.join(output_dir, 'HRV_Data_Loop_Test.csv')
df.to_csv(output_file, index=False)

print(f"Data saved to {output_file}")


Processing file: sub-RD119_ses-20210504_task-rest_run-201_echo-01_physio.tsv.gz
0: sub-RD119_ses-20210504_task-rest_run-201_echo-01_physio.tsv.gz - PPG Rate Mean: 58.33241505020791, RSP Rate Mean: 17.531278788500313
Processing file: sub-RD119_ses-20210504_task-rest_run-101_echo-01_physio.tsv.gz
1: sub-RD119_ses-20210504_task-rest_run-101_echo-01_physio.tsv.gz - PPG Rate Mean: 56.629344699769945, RSP Rate Mean: 18.077315736004618
Processing file: sub-RD119_ses-20210504_task-rest_run-001_echo-01_physio.tsv.gz
2: sub-RD119_ses-20210504_task-rest_run-001_echo-01_physio.tsv.gz - PPG Rate Mean: 56.06747038737779, RSP Rate Mean: 17.571213835240208
Processing file: sub-RD114_ses-20191015_task-rest_run-201_echo-01_physio.tsv.gz
3: sub-RD114_ses-20191015_task-rest_run-201_echo-01_physio.tsv.gz - PPG Rate Mean: 58.25534877619704, RSP Rate Mean: 14.633574556215358
Processing file: sub-RD119_ses-20210427_task-rest_run-101_echo-01_physio.tsv.gz
4: sub-RD119_ses-20210427_task-rest_run-101_echo-01_phy