In [4]:
import os
import numpy as np
import sys
import pandas as pd
from IPython.display import display
from random import choice

ROOT_DIR = '../'
sys.path.insert(0, os.path.abspath(f'/{ROOT_DIR}/heart_rhythm_analysis/'))    # make repo root importable

from heart_rhythm_analysis.get_data.CapnoBaseETL import CapnoBaseETL
from heart_rhythm_analysis.get_data.MimicETL import MimicETL
from heart_rhythm_analysis.utils.timeseries_viewer import make_window_figure, create_time_series_viewer

import dash
app = dash.Dash(__name__)

import h5py, inspect
open_files = [v for v in globals().values() 
              if isinstance(v, h5py.File)]

for f in open_files:
    print(f"Closing {f.filename}")
    f.close()

bSetUpDB = True
WINDOW_LEN = 8

# Create Datasets from Databases

In [5]:
def load_as_df(file_path, filename,
               store_signals=True,
               group_by_record=False):
    h5_path = f"{file_path}/{filename}.h5"
    print(f'Loading: {h5_path}')
    hf = h5py.File(h5_path, "r")
    rows = []
    for subj in hf.keys():
        subj_grp = hf[subj]
        # print(subj)
        window_count = 0
        for win_id in subj_grp.keys():
            window_count = window_count+1
            win_grp = subj_grp[win_id]
            # pull out everything you need
            rec_id      = win_grp.attrs['rec_id']
            label       = win_grp.attrs['label']
            raw_ppg     = win_grp["raw_ppg"][:]
            proc_ppg    = win_grp["proc_ppg"][:]
            raw_ekg     = win_grp["raw_ekg"][:]
            raw_abp     = win_grp["raw_abp"][:]
            raw_ppg_fs  = win_grp.attrs["raw_ppg_fs"]
            ekg_fs      = win_grp.attrs["ekg_fs"]
            ppg_fs      = win_grp.attrs["ppg_fs"]
            abp_fs      = win_grp.attrs["abp_fs"]
            notes       = win_grp.attrs.get("notes", "")
            # build the window row
            row = {
                "subject": subj,
                "window_id": win_id,
                "window_count": window_count,
                "rec_id": rec_id,
                "label": label,
                "raw_ppg_fs": raw_ppg_fs,
                "ppg_fs_out": ppg_fs,
                "ekg_fs_out": ekg_fs,
                "abp_fs_out": abp_fs,
                "raw_len": len(raw_ppg),
                "proc_len": len(proc_ppg),
                "duration_raw_s": len(raw_ppg) / ppg_fs,
                "duration_proc_s": len(proc_ppg) / ppg_fs,
                "notes": notes
            }
            if store_signals:
                row.update({
                    "raw_ppg": raw_ppg,
                    "proc_ppg": proc_ppg,
                    "raw_ekg": raw_ekg,
                    "raw_abp": raw_abp
                })
            rows.append(row)

    df = pd.DataFrame(rows)

    if group_by_record and store_signals:
        # define how to aggregate each column
        agg_dict = {
            "subject":     "first",
            "label":       "first",
            "raw_ppg_fs":  "first",
            "ppg_fs_out":  "first",
            "ekg_fs_out":  "first",
            "abp_fs_out":  "first",
            "notes":       lambda x: list(x),
            "raw_len":     lambda x: np.sum(x),
            "proc_len":    lambda x: np.sum(x),
            "duration_raw_s":  lambda x: np.sum(x),
            "duration_proc_s": lambda x: np.sum(x),
            # now the key: concatenate all windows into one long array
            "raw_ppg":  lambda series: np.concatenate(series.values),
            "proc_ppg": lambda series: np.concatenate(series.values),
            "raw_ekg":  lambda series: np.concatenate(series.values),
            "raw_abp":  lambda series: np.concatenate(series.values),
        }
        df = (
            df
            .groupby("rec_id", as_index=False)
            .agg(agg_dict)
        )
    return df

## Create Capno Dataset

### CapnoBase

In [None]:
def main():
    root_path = os.path.join('../data/raw/capnobase/data/mat')
    out_path = os.path.join('../data/processed/length_full/capnobase_db')
    out_filename = 'capnobase_db'
    if not os.path.exists(out_path):
        os.mkdir(out_path)

    fs_in = 100.00
    fs_out = 100.00

    config = {
    "input_dir"      : root_path,
    "output_dir"     : out_path,
    "window_size_sec": 30,
    "fs_in"          : fs_in,
    "fs_out"   : fs_out,
    "lowpass_cutoff" : (fs_out / 2),
    "fir_numtaps"    : 129,
    "zero_phase"     : True,
    "out_filename" :  out_filename
}
    if bSetUpDB:
        etl = CapnoBaseETL(config)
        h5file = etl.process_all()
        print(f"Saved windows HDF5 to {h5file}")
    df = load_as_df(out_path,out_filename)

    display(df.head(10))
    return df
if __name__ == "__main__":
    df_capnobase = main()

## MIMIC III

### MIMIC III AF DATASET

In [None]:

# def main():
#     root_path = os.path.join('../data/raw/mimic_af_nonaf/mimic_af_data.mat')
#     out_filename = 'mimic_af_db'
#     out_path = os.path.join('../data/processed/length_full/mimic_af_db')
    
#     if not os.path.exists(out_path):
#         os.mkdir(out_path)
#     config = {
#         "input_dir": root_path,
#         "output_dir":  out_path,
#         "fs_in": 125.00,
#         "fs_out": 125.00,
#         "window_size_sec": 30,
#         "scale_type": "norm",
#         "zero_phase": True,
#         "decimate_signal": False,
#         "out_filename": out_filename
#     }
#     # if bSetUpDB:
#     #     etl = MimicETL(config)
#     #     out_file = etl.process()
#     #     print("Saved AF windows to", out_file)
#     df = load_as_df(out_path,out_filename)
#     display(df.head(10))
#     return df

# if __name__ == "__main__":
#     df_mimc_af = main()
    

### MIMIC III Non-AF Dataset

In [None]:

# def main():
#     root_path = os.path.join('../data/raw/mimic_af_nonaf/mimic_non_af_data.mat')
#     out_filename = 'mimic_non_af_db'
#     out_path = os.path.join('../data/processed/length_full/mimic_non_af_db')
    
#     if not os.path.exists(out_path):
#         os.mkdir(out_path)
#     config = {
#         "input_dir": root_path,
#         "output_dir":  out_path,
#         "fs_in": 125.00,
#         "fs_out": 125.00,
#         "window_size_sec": 30,
#         "scale_type": None,
#         "decimate_signal": False,
#         "zero_phase": True,
#         "out_filename": out_filename   
#     }
#     if bSetUpDB:
#         etl = MimicETL(config)
#         out_file = etl.process()
#         print("Saved non AF windows to", out_file)
#     df = load_as_df(out_path,out_filename)
#     display(df.head(10))
#     return df
# if __name__ == "__main__":
#     df_mimc_nonaf = main()

### MIMIC III General Subset

In [7]:
def main():
    ver_num = 1
    root_path = os.path.join(f'../data/raw/mimic3_data/mimic3_data_{1501}_v{ver_num}.mat')
    out_filename = f'mimic3_db_v{ver_num}'
    out_path = os.path.join(f'../data/processed/length_full/mimic3_db_v{ver_num}')
    
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    config = {
        "input_dir": root_path,
        "output_dir":  out_path,
        "fs_in": 125.00,
        "fs_out": 20.83,
        'fs_ekg': 125,
        'fs_bp':125,
        "window_size_sec": WINDOW_LEN,
        "scale_type": "norm",
        "decimate_signal": True,
        "zero_phase": True,
        "out_filename": out_filename 
    }
    bSetUpDB = True
    if bSetUpDB:
        etl = MimicETL(config)
        out_file = etl.process()
        print("Saved General MIMIC III windows to", out_file)
    df = load_as_df(out_path,out_filename)
    # display(df.head(10))
    return df
if __name__ == "__main__":
    df_mimic3_gen = main()
    # first = df_mimic3_gen.iloc[np.random.randint(0, len(df_mimic3_gen))] 
    chosen_subj = df_mimic3_gen['subject'].unique()[0]

    surr_subject_df = df_mimic3_gen[df_mimic3_gen['subject'] == chosen_subj]

    display(surr_subject_df.head())
    initial_window = df_mimic3_gen.sample(1).iloc[0]

    specs = [
        {"key":"raw_ppg",  "fs_key":"raw_ppg_fs",  "subplot":1, "legend":"Raw PPG"},
        {"key":"proc_ppg", "fs_key":"ppg_fs_out",  "subplot":1, "legend":"Proc PPG"},
        {"key":"raw_ekg",  "fs_key":"ekg_fs_out",  "subplot":2,               },
        {"key":"raw_abp",  "fs_key":"abp_fs_out",  "subplot":3,               },
    ]

    fig_fn = lambda window: make_window_figure(window, specs)
    
    app = create_time_series_viewer(
    surr_subject_df,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: f"{row.subject}-{row.window_count}"
)
    print(f'Total # of Subjects: {len(df_mimic3_gen['subject'].unique())}')
    print(f'Total # of Windows: {df_mimic3_gen.shape[0]}')
    print("Launching Dash at http://127.0.0.1:8050 …")
    app.run(debug=True)

/Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1
Saved General MIMIC III windows to /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1/mimic3_db_v1.h5
Loading: ../data/processed/length_full/mimic3_db_v1/mimic3_db_v1.h5


Unnamed: 0,subject,window_id,window_count,rec_id,label,raw_ppg_fs,ppg_fs_out,ekg_fs_out,abp_fs_out,raw_len,proc_len,duration_raw_s,duration_proc_s,notes,raw_ppg,proc_ppg,raw_ekg,raw_abp
0,p000160,3b5910cb-6645-4b6c-8f20-bc1de305f72c,1,p000160-2174-11-06-10-12,-1,125.0,20.83,125.0,125.0,1000,167,48.007681,8.017283,,"[0.5259042033235581, 0.5268817204301075, 0.528...","[-0.77904636, 0.19585234, -0.05195117, 0.13058...","[0.6, 0.596078431372549, 0.596078431372549, 0....","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


Total # of Subjects: 1436
Total # of Windows: 1436
Launching Dash at http://127.0.0.1:8050 …


## MIMIC IV

### MIMIC IV General Data

In [None]:
np.random.seed(42)
def main():
    ver_num = 1
    substring = ''
    file_name = f'_{substring}' if len(substring) > 0 else ""
    root_path = os.path.join(f'../data/raw/mimic4_data/mimic4{file_name}_struct_v{ver_num}.mat')
    out_filename = f'mimic4{file_name}_db_v{ver_num}'
    out_path = os.path.join(f'../data/processed/length_full/{out_filename}')

    if not os.path.exists(out_path):
        os.mkdir(out_path)
    fs_in = 62.5
    fs_out = 20.83
    config = {
        "input_dir": root_path,
        "output_dir":  out_path,
        "fs_in": fs_in,
        "fs_out": fs_out,
        'fs_ekg': 62.5, 
        'fs_bp':fs_in,
        "window_size_sec": WINDOW_LEN,
        "scale_type": "norm",
        "decimate_signal": True,
        "zero_phase": True,
        "out_filename": out_filename 
    }
    bSetUpDB = True
    if bSetUpDB:
        etl = MimicETL(config)
        out_file = etl.process()
        print(f"Saved {substring} MIMIC IV  windows to", out_file)
    df = load_as_df(out_path,out_filename)
    return df

if __name__ == "__main__":

    df_mimic4_gen = main()
    first = df_mimic4_gen.iloc[np.random.randint(0, len(df_mimic4_gen))] 
    initial_window = df_mimic4_gen.sample(1).iloc[0]
    specs = [
        {"key":"raw_ppg",  "fs_key":"raw_ppg_fs",  "subplot":1, "legend":"Raw PPG"},
        {"key":"proc_ppg", "fs_key":"ppg_fs_out",  "subplot":1, "legend":"Proc PPG"},
        {"key":"raw_ekg",  "fs_key":"ekg_fs_out",  "subplot":2,               },
        {"key":"raw_abp",  "fs_key":"abp_fs_out",  "subplot":3,               },
    ]

    # 3) bind specs into a single-arg figure fn
    fig_fn = lambda window: make_window_figure(window, specs)
    
    app = create_time_series_viewer(
    df_mimic4_gen,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: f"{row.subject}-{row.window_count}"
)
    print(f'Total # of Subjects: {len(df_mimic4_gen['subject'].unique())}')
    print(f'Total # of Windows: {df_mimic4_gen.shape[0]}')
    print("Launching Dash at http://127.0.0.1:8050 …")
    
    # app.run(debug=True)
    

In [None]:
subject_agg = {
    # sampling & metadata: just take the first (they're constant per subject)
    "raw_ppg_fs": "first",
    "ppg_fs_out": "first",
    "ekg_fs_out": "first",
    "abp_fs_out": "first",
    # lengths & durations: sum across windows
    "raw_len":       "sum",
    "proc_len":      "sum",
    "duration_raw_s":  "sum",
    "duration_proc_s": "sum",
    # signals: concatenate all windows end-to-end
    "raw_ppg":  lambda s: np.concatenate(s.values),
    "proc_ppg": lambda s: np.concatenate(s.values),
    "raw_ekg":  lambda s: np.concatenate(s.values),
    "raw_abp":  lambda s: np.concatenate(s.values),
    # if you want to keep track of window_ids or rec_ids:
    "window_id": lambda s: list(s.values),
    "rec_id":    lambda s: list(s.values),
    "label":     lambda s: list(s.values),
}

# ── 2) group by subject ─
df_by_subject = (
    df_mimic4_gen
    .groupby("subject", as_index=False)
    .agg(subject_agg)
)

# ── 3) now df_by_subject has one row per subject, with each signal a long array
print(df_by_subject.shape)
print(df_by_subject.raw_ppg.iloc[0].shape)  # e.g. (sum of all its windows,) 

# bind specs & figure fn exactly as before
fig_fn = lambda row: make_window_figure(row, specs)

# subject‐level dashboard (labels are just the subject IDs)
app = create_time_series_viewer(
    df_by_subject,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: str(row.subject)
)
print("Launching Dash at http://127.0.0.1:8080 …")
app.run(debug=True,port=8080)

# Create Train/Test Files

In [3]:
from heart_rhythm_analysis.get_data.prepare_dataset import my_split_and_save
import os, h5py

ver_num = 1
ROOT_PATH = '../data/processed/length_full'
file_paths = [f"{ROOT_PATH}/mimic3_db_v{ver_num}/mimic3_db_v{ver_num}.h5", f"{ROOT_PATH}/mimic4_db_v{ver_num}/mimic4_db_v{ver_num}.h5"]
out_path = '../data/development_dataset/length_full'
train_path, test_path = my_split_and_save(file_paths, train_ratio=0.8, output_dir=out_path)

def count_windows(h5_path):
    with h5py.File(h5_path, 'r') as f:
        subjects = list(f.keys())
        n_subjects = len(subjects)
        n_windows  = sum(len(f[subj].keys()) for subj in subjects)
    return n_subjects, n_windows

# Replace these with your real paths:
paths = {
    "mimic3": "../data/processed/length_full/mimic3_db/mimic3_db.h5",
    "mimic4": "../data/processed/length_full/mimic4_db/mimic4_db.h5",
    "train":   "../data/development_dataset/length_full/train_dataset.h5",
    "test":    "../data/development_dataset/length_full/test_dataset.h5",
}

for name, p in paths.items():
    if os.path.exists(p):
        subs, wins = count_windows(p)
        print(f"{name:>6} → {subs} subjects, {wins} windows")
    else:
        print(f"{name:>6} → file not found: {p}")

mimic3 → 100 subjects, 45000 windows
mimic4 → 51 subjects, 22899 windows
 train → 151 subjects, 53770 windows
  test → 151 subjects, 13443 windows


In [None]:
np.concatenate(df_mimic4_gen['raw_ppg'].to_list())