In [None]:
import os
import numpy as np
import sys
import h5py
import pandas as pd
from IPython.display import display
from random import choice

ROOT_DIR = '../'
sys.path.insert(0, os.path.abspath(f'/{ROOT_DIR}/heart_rhythm_analysis/'))    # make repo root importable

from heart_rhythm_analysis.get_data.CapnoBaseETL import CapnoBaseETL
from heart_rhythm_analysis.get_data.MimicETL import MimicETL
from heart_rhythm_analysis.utils.timeseries_viewer import make_window_figure, create_time_series_viewer

bSetUpDB = True
WINDOW_LEN = 8

import dash
app = dash.Dash(__name__)

# Create Datasets from Databases

In [4]:
def load_as_df(file_path, filename,
               store_signals=True,
               group_by_record=False):
    h5_path = f"{file_path}/{filename}.h5"
    print(f'Loading: {h5_path}')
    hf = h5py.File(h5_path, "r")
    rows = []
    for subj in hf.keys():
        subj_grp = hf[subj]
        # print(subj)
        for win_id in subj_grp.keys():
            win_grp = subj_grp[win_id]
            # pull out everything you need
            rec_id      = win_grp.attrs['rec_id']
            label       = win_grp.attrs['label']
            raw_ppg     = win_grp["raw_ppg"][:]
            proc_ppg    = win_grp["proc_ppg"][:]
            raw_ekg     = win_grp["raw_ekg"][:]
            raw_abp     = win_grp["raw_abp"][:]
            raw_ppg_fs  = win_grp.attrs["raw_ppg_fs"]
            ekg_fs      = win_grp.attrs["ekg_fs"]
            ppg_fs      = win_grp.attrs["ppg_fs"]
            abp_fs      = win_grp.attrs["abp_fs"]
            notes       = win_grp.attrs.get("notes", "")
            # build the window row
            row = {
                "subject": subj,
                "window_id": win_id,
                "rec_id": rec_id,
                "label": label,
                "raw_ppg_fs": raw_ppg_fs,
                "ppg_fs_out": ppg_fs,
                "ekg_fs_out": ekg_fs,
                "abp_fs_out": abp_fs,
                "raw_len": len(raw_ppg),
                "proc_len": len(proc_ppg),
                "duration_raw_s": len(raw_ppg) / ppg_fs,
                "duration_proc_s": len(proc_ppg) / ppg_fs,
                "notes": notes
            }
            if store_signals:
                row.update({
                    "raw_ppg": raw_ppg,
                    "proc_ppg": proc_ppg,
                    "raw_ekg": raw_ekg,
                    "raw_abp": raw_abp
                })
            rows.append(row)

    df = pd.DataFrame(rows)

    if group_by_record and store_signals:
        # define how to aggregate each column
        agg_dict = {
            "subject":     "first",
            "label":       "first",
            "raw_ppg_fs":  "first",
            "ppg_fs_out":  "first",
            "ekg_fs_out":  "first",
            "abp_fs_out":  "first",
            "notes":       lambda x: list(x),
            "raw_len":     lambda x: np.sum(x),
            "proc_len":    lambda x: np.sum(x),
            "duration_raw_s":  lambda x: np.sum(x),
            "duration_proc_s": lambda x: np.sum(x),
            # now the key: concatenate all windows into one long array
            "raw_ppg":  lambda series: np.concatenate(series.values),
            "proc_ppg": lambda series: np.concatenate(series.values),
            "raw_ekg":  lambda series: np.concatenate(series.values),
            "raw_abp":  lambda series: np.concatenate(series.values),
        }
        df = (
            df
            .groupby("rec_id", as_index=False)
            .agg(agg_dict)
        )
    return df

## Create Capno Dataset

### CapnoBase

In [None]:
def main():
    root_path = os.path.join('../data/raw/capnobase/data/mat')
    out_path = os.path.join('../data/processed/length_full/capnobase_db')
    out_filename = 'capnobase_db'
    if not os.path.exists(out_path):
        os.mkdir(out_path)

    fs_in = 100.00
    fs_out = 100.00

    config = {
    "input_dir"      : root_path,
    "output_dir"     : out_path,
    "window_size_sec": 30,
    "fs_in"          : fs_in,
    "fs_out"   : fs_out,
    "lowpass_cutoff" : (fs_out / 2),
    "fir_numtaps"    : 129,
    "zero_phase"     : True,
    "out_filename" :  out_filename
}
    if bSetUpDB:
        etl = CapnoBaseETL(config)
        h5file = etl.process_all()
        print(f"Saved windows HDF5 to {h5file}")
    df = load_as_df(out_path,out_filename)

    display(df.head(10))
    return df
if __name__ == "__main__":
    df_capnobase = main()

## MIMIC III

### MIMIC III AF DATASET

In [None]:

def main():
    root_path = os.path.join('../data/raw/mimic_af_nonaf/mimic_af_data.mat')
    out_filename = 'mimic_af_db'
    out_path = os.path.join('../data/processed/length_full/mimic_af_db')
    
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    config = {
        "input_dir": root_path,
        "output_dir":  out_path,
        "fs_in": 125.00,
        "fs_out": 125.00,
        "window_size_sec": 30,
        "scale_type": "norm",
        "zero_phase": True,
        "decimate_signal": False,
        "out_filename": out_filename
    }
    # if bSetUpDB:
    #     etl = MimicETL(config)
    #     out_file = etl.process()
    #     print("Saved AF windows to", out_file)
    df = load_as_df(out_path,out_filename)
    display(df.head(10))
    return df

if __name__ == "__main__":
    df_mimc_af = main()
    

### MIMIC III Non-AF Dataset

In [None]:

def main():
    root_path = os.path.join('../data/raw/mimic_af_nonaf/mimic_non_af_data.mat')
    out_filename = 'mimic_non_af_db'
    out_path = os.path.join('../data/processed/length_full/mimic_non_af_db')
    
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    config = {
        "input_dir": root_path,
        "output_dir":  out_path,
        "fs_in": 125.00,
        "fs_out": 125.00,
        "window_size_sec": 30,
        "scale_type": None,
        "decimate_signal": False,
        "zero_phase": True,
        "out_filename": out_filename   
    }
    if bSetUpDB:
        etl = MimicETL(config)
        out_file = etl.process()
        print("Saved non AF windows to", out_file)
    df = load_as_df(out_path,out_filename)
    display(df.head(10))
    return df
if __name__ == "__main__":
    df_mimc_nonaf = main()

### MIMIC III General Subset

Unnamed: 0,subject,window_id,rec_id,label,raw_ppg_fs,ppg_fs_out,ekg_fs_out,abp_fs_out,raw_len,proc_len,duration_raw_s,duration_proc_s,notes,raw_ppg,proc_ppg,raw_ekg,raw_abp
0,p000160,019a126a-a9ed-4af9-9f22-53cdee45d798,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.5210166177908113, 0.520039100684262, 0.5190...","[-0.7377055, 0.26229447, -0.0060676336, 0.0949...","[0.6, 0.6058823529411764, 0.5705882352941176, ...","[85.2673306827449, 84.6806288661205, 84.093927..."
1,p000160,02020e7e-cbc6-4d19-9199-f56091e94d4a,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.5483870967741935, 0.552297165200391, 0.5571...","[-0.55547786, 0.44452214, 0.25203043, 0.372302...","[0.5647058823529412, 0.5901960784313726, 0.574...","[103.45508699810104, 101.890548820436, 99.9348..."
2,p000160,02942b90-52c7-41b1-a468-9cb017d8040b,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.6832844574780058, 0.6832844574780058, 0.683...","[-0.48934713, 0.49947518, 0.24289119, 0.373496...","[0.6509803921568628, 0.6196078431372549, 0.605...","[82.7249561440392, 84.09392704949612, 87.41857..."
3,p000160,03087d3b-80c0-43bb-a341-280e03e8c5b6,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.4897360703812317, 0.4916911045943304, 0.493...","[-0.846876, 0.15312397, -0.10480279, 0.0938112...","[0.7764705882352941, 0.7607843137254902, 0.745...","[121.05614149683278, 122.22954513008156, 123.5..."
4,p000160,044d88da-b083-46bc-817e-57ddf631d1ba,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.47996089931573804, 0.47996089931573804, 0.4...","[-0.6449328, -0.07259107, -0.21645212, -0.1340...","[0.5607843137254902, 0.5803921568627451, 0.570...","[143.15524325635153, 146.08875233947347, 144.7..."


In [None]:
def main():
    root_path = os.path.join('../data/raw/mimic3_data/mimic3_struct.mat')
    out_filename = 'mimic3_db'
    out_path = os.path.join('../data/processed/length_full/mimic3_db')
    
    if not os.path.exists(out_path):
        os.mkdir(out_path)
    config = {
        "input_dir": root_path,
        "output_dir":  out_path,
        "fs_in": 125.00,
        "fs_out": 20.83,
        'fs_ekg': 125,
        'fs_bp':125,
        "window_size_sec": WINDOW_LEN,
        "scale_type": "norm",
        "decimate_signal": True,
        "zero_phase": True,
        "out_filename": out_filename 
    }
    bSetUpDB = False
    if bSetUpDB:
        etl = MimicETL(config)
        out_file = etl.process()
        print("Saved General MIMIC III windows to", out_file)
    df = load_as_df(out_path,out_filename)
    display(df.head(10))
    return df
if __name__ == "__main__":
    df_mimic3_gen = main()
    # first = df_mimic3_gen.iloc[np.random.randint(0, len(df_mimic3_gen))] 
    chosen_subj = df_mimic3_gen['subject'].unique()[0]

    surr_subject_df = df_mimic3_gen[df_mimic3_gen['subject'] == chosen_subj]

    display(surr_subject_df.head())
    # # initialize Dash


    # # randomly pick one window at start-up
    initial_window = df_mimic3_gen.sample(1).iloc[0]
    # 2) declare once what each subplot should show
    specs = [
        {"key":"raw_ppg",  "fs_key":"raw_ppg_fs",  "subplot":1, "legend":"Raw PPG"},
        {"key":"proc_ppg", "fs_key":"ppg_fs_out",  "subplot":1, "legend":"Proc PPG"},
        {"key":"raw_ekg",  "fs_key":"ekg_fs_out",  "subplot":2,               },
        {"key":"raw_abp",  "fs_key":"abp_fs_out",  "subplot":3,               },
    ]

    # 3) bind specs into a single-arg figure fn
    fig_fn = lambda window: make_window_figure(window, specs)
    
    app = create_time_series_viewer(
    surr_subject_df,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: f"{row.subject}-{row.window_id}"
)
    print(f'Total # of Subjects: {len(df_mimic3_gen['subject'].unique())}')
    print(f'Total # of Windows: {df_mimic3_gen.shape[0]}')
    print("Launching Dash at http://127.0.0.1:8050 …")
    app.run(debug=True)

Loading: ../data/processed/length_full/mimic3_db/mimic3_db.h5


Unnamed: 0,subject,window_id,rec_id,label,raw_ppg_fs,ppg_fs_out,ekg_fs_out,abp_fs_out,raw_len,proc_len,duration_raw_s,duration_proc_s,notes,raw_ppg,proc_ppg,raw_ekg,raw_abp
0,p000160,019a126a-a9ed-4af9-9f22-53cdee45d798,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.5210166177908113, 0.520039100684262, 0.5190...","[-0.7377055, 0.26229447, -0.0060676336, 0.0949...","[0.6, 0.6058823529411764, 0.5705882352941176, ...","[85.2673306827449, 84.6806288661205, 84.093927..."
1,p000160,02020e7e-cbc6-4d19-9199-f56091e94d4a,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.5483870967741935, 0.552297165200391, 0.5571...","[-0.55547786, 0.44452214, 0.25203043, 0.372302...","[0.5647058823529412, 0.5901960784313726, 0.574...","[103.45508699810104, 101.890548820436, 99.9348..."
2,p000160,02942b90-52c7-41b1-a468-9cb017d8040b,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.6832844574780058, 0.6832844574780058, 0.683...","[-0.48934713, 0.49947518, 0.24289119, 0.373496...","[0.6509803921568628, 0.6196078431372549, 0.605...","[82.7249561440392, 84.09392704949612, 87.41857..."
3,p000160,03087d3b-80c0-43bb-a341-280e03e8c5b6,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.4897360703812317, 0.4916911045943304, 0.493...","[-0.846876, 0.15312397, -0.10480279, 0.0938112...","[0.7764705882352941, 0.7607843137254902, 0.745...","[121.05614149683278, 122.22954513008156, 123.5..."
4,p000160,044d88da-b083-46bc-817e-57ddf631d1ba,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.47996089931573804, 0.47996089931573804, 0.4...","[-0.6449328, -0.07259107, -0.21645212, -0.1340...","[0.5607843137254902, 0.5803921568627451, 0.570...","[143.15524325635153, 146.08875233947347, 144.7..."
5,p000160,047a526c-3412-43a9-bda5-7ed4fc5471d6,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.39296187683284456, 0.3782991202346041, 0.36...","[-0.28628236, -0.2334657, -0.51416355, -0.4347...","[0.6294117647058823, 0.6196078431372549, 0.619...","[91.32991612119694, 90.54764703236442, 89.7653..."
6,p000160,04b8f3b9-45a7-49c2-9d11-faa3b2d5f62f,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.4946236559139785, 0.4946236559139785, 0.496...","[-0.837118, 0.16288197, -0.052505314, 0.110326...","[0.5745098039215686, 0.5745098039215686, 0.574...","[99.54374155393842, 99.15260700952216, 98.5659..."
7,p000160,057f5269-d556-4af5-8bbf-5d4b8967b383,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.46529814271749753, 0.46236559139784944, 0.4...","[-0.66862786, -0.018857896, -0.22763738, -0.16...","[0.6254901960784314, 0.6450980392156863, 0.641...","[70.5997852671351, 70.40421799492698, 70.40421..."
8,p000160,05ab4649-7024-4e54-a247-b2ab5d669e62,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.5219941348973607, 0.5219941348973607, 0.521...","[-0.8284066, 0.17159343, -0.124973595, -0.0066...","[0.6254901960784314, 0.6294117647058823, 0.645...","[61.40812347335298, 64.1460652842668, 70.40421..."
9,p000160,05c4f226-dc09-4a98-b32b-11c84342a788,3531764,-1,125.0,20.83,125,125,1000,167,48.007681,8.017283,,"[0.7214076246334311, 0.7331378299120235, 0.739...","[-0.11536825, 0.2434088, 0.15824479, 0.2026873...","[0.5745098039215686, 0.5607843137254902, 0.570...","[85.65846522716116, 85.2673306827449, 84.68062..."


KeyError: 'p000160'

## MIMIC IV

### MIMIC IV General Data

In [None]:
np.random.seed(42)
def main():
    substring = ''
    file_name = f'_{substring}' if len(substring) > 0 else ""
    root_path = os.path.join(f'../data/raw/mimic4_data/mimic4{file_name}_struct.mat')
    out_filename = f'mimic4{file_name}_db'
    out_path = os.path.join(f'../data/processed/length_full/{out_filename}')

    if not os.path.exists(out_path):
        os.mkdir(out_path)
    fs_in = 62.5
    fs_out = 20.83
    config = {
        "input_dir": root_path,
        "output_dir":  out_path,
        "fs_in": fs_in,
        "fs_out": fs_out,
        'fs_ekg': 62.5, 
        'fs_bp':fs_in,
        "window_size_sec": WINDOW_LEN,
        "scale_type": "norm",
        "decimate_signal": True,
        "zero_phase": True,
        "out_filename": out_filename 
    }
    bSetUpDB = True
    if bSetUpDB:
        etl = MimicETL(config)
        out_file = etl.process()
        print(f"Saved {substring} MIMIC IV  windows to", out_file)
    df = load_as_df(out_path,out_filename)
    display(df.head())
    return df

if __name__ == "__main__":

    df_mimic4_gen = main()
    first = df_mimic4_gen.iloc[np.random.randint(0, len(df_mimic4_gen))] 
    initial_window = df_mimic4_gen.sample(1).iloc[0]
    specs = [
        {"key":"raw_ppg",  "fs_key":"raw_ppg_fs",  "subplot":1, "legend":"Raw PPG"},
        {"key":"proc_ppg", "fs_key":"ppg_fs_out",  "subplot":1, "legend":"Proc PPG"},
        {"key":"raw_ekg",  "fs_key":"ekg_fs_out",  "subplot":2,               },
        {"key":"raw_abp",  "fs_key":"abp_fs_out",  "subplot":3,               },
    ]

    # 3) bind specs into a single-arg figure fn
    fig_fn = lambda window: make_window_figure(window, specs)
    
    app = create_time_series_viewer(
    df_mimic4_gen,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: f"{row.subject}-{row.window_id}"
)
    print(f'Total # of Subjects: {len(df_mimic4_gen['subject'].unique())}')
    print(f'Total # of Windows: {df_mimic4_gen.shape[0]}')
    print("Launching Dash at http://127.0.0.1:8050 …")
    
    # app.run(debug=True)
    

Loading ../data/raw/mimic4_data/mimic4_struct.mat
Saved  MIMIC IV  windows to ../data/processed/length_full/mimic4_db/mimic4_db.h5
Loading: ../data/processed/length_full/mimic4_db/mimic4_db.h5
p10020306
p10126957
p10209410
p10952189
p11109975


Unnamed: 0,subject,window_id,rec_id,label,raw_ppg_fs,ppg_fs_out,ekg_fs_out,abp_fs_out,raw_len,proc_len,duration_raw_s,duration_proc_s,notes,raw_ppg,proc_ppg,raw_ekg,raw_abp
0,p10020306,002f4472-d963-4a2c-8ca5-caa327eb569f,83404654,-1,62.5,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.528564453125, 0.521240234375, 0.517578125, ...","[-0.6274134, 0.17065835, -0.13868475, -0.08036...","[-0.025, -0.03, -0.015, -0.01, 0.0, 0.015, 0.0...","[71.4375, 70.1875, 69.1875, 68.3125, 67.4375, ..."
1,p10020306,01e94413-e787-48af-be42-4bc745acee55,83404654,-1,62.5,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.716552734375, 0.716552734375, 0.70825195312...","[-0.63966334, 0.10403001, -0.16901734, -0.0824...","[0.02, 0.25, 0.48, 0.15, -0.105, -0.1, -0.09, ...","[66.5, 65.9375, 65.375, 64.875, 64.375, 63.875..."
2,p10020306,024947f6-92c1-4e7a-82dd-613c3516a54a,83404654,-1,62.5,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.289306640625, 0.2978515625, 0.300537109375,...","[-0.44204277, -0.015548229, -0.14170718, -0.11...","[0.02, 0.0, 0.0, -0.02, -0.005, -0.01, -0.005,...","[77.1875, 77.0, 76.8125, 76.125, 75.25, 74.625..."
3,p10020306,06befb7a-7ca4-482a-a59c-186a62d480a5,83404654,-1,62.5,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.3916015625, 0.38330078125, 0.379150390625, ...","[-0.60387427, -0.19629657, -0.39741305, -0.340...","[-0.115, -0.115, -0.115, -0.1, -0.095, -0.075,...","[62.625, 61.875, 61.25, 60.75, 60.1875, 59.687..."
4,p10020306,085bdb2e-56dc-4ee5-b00c-ed5df72ea401,83404654,-1,62.5,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.658203125, 0.654052734375, 0.62841796875, 0...","[-0.36530617, 0.5594827, 0.15250596, 0.2358321...","[-0.24, -0.225, -0.205, -0.17, -0.16, -0.165, ...","[62.875, 62.8125, 63.5, 66.6875, 74.25, 84.5, ..."


Total # of Subjects: 5
Total # of Windows: 1120
Launching Dash at http://127.0.0.1:8050 …


In [4]:
subject_agg = {
    # sampling & metadata: just take the first (they're constant per subject)
    "raw_ppg_fs": "first",
    "ppg_fs_out": "first",
    "ekg_fs_out": "first",
    "abp_fs_out": "first",
    # lengths & durations: sum across windows
    "raw_len":       "sum",
    "proc_len":      "sum",
    "duration_raw_s":  "sum",
    "duration_proc_s": "sum",
    # signals: concatenate all windows end-to-end
    "raw_ppg":  lambda s: np.concatenate(s.values),
    "proc_ppg": lambda s: np.concatenate(s.values),
    "raw_ekg":  lambda s: np.concatenate(s.values),
    "raw_abp":  lambda s: np.concatenate(s.values),
    # if you want to keep track of window_ids or rec_ids:
    "window_id": lambda s: list(s.values),
    "rec_id":    lambda s: list(s.values),
    "label":     lambda s: list(s.values),
}

# ── 2) group by subject ─
df_by_subject = (
    df_mimic4_gen
    .groupby("subject", as_index=False)
    .agg(subject_agg)
)

# ── 3) now df_by_subject has one row per subject, with each signal a long array
print(df_by_subject.shape)
print(df_by_subject.raw_ppg.iloc[0].shape)  # e.g. (sum of all its windows,) 

# bind specs & figure fn exactly as before
fig_fn = lambda row: make_window_figure(row, specs)

# subject‐level dashboard (labels are just the subject IDs)
app = create_time_series_viewer(
    df_by_subject,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: str(row.subject)
)
print("Launching Dash at http://127.0.0.1:8080 …")
app.run(debug=True,port=8080)

(35, 16)
(111776,)
Launching Dash at http://127.0.0.1:8080 …


# Create Train/Test Files

In [None]:
from heart_rhythm_analysis.get_data.prepare_dataset import my_split_and_save

ROOT_PATH = '../data/processed/length_full'
file_paths = [f"{ROOT_PATH}/mimic3_db/mimic3_db.h5", f"{ROOT_PATH}/mimic4_db/mimic4_db.h5"]
out_path = '../data/development_dataset/length_full'
train_path, test_path = my_split_and_save(file_paths, train_ratio=0.8, output_dir=out_path)

def count_windows(h5_path):
    with h5py.File(h5_path, 'r') as f:
        subjects = list(f.keys())
        n_subjects = len(subjects)
        n_windows  = sum(len(f[subj].keys()) for subj in subjects)
    return n_subjects, n_windows

# Replace these with your real paths:
paths = {
    "mimic3": "../data/processed/length_full/mimic3_db/mimic3_db.h5",
    "mimic4": "../data/processed/length_full/mimic4_db/mimic4_db.h5",
    "train":   "../data/development_dataset/length_full/train_dataset.h5",
    "test":    "../data/development_dataset/length_full/test_dataset.h5",
}

for name, p in paths.items():
    if os.path.exists(p):
        subs, wins = count_windows(p)
        print(f"{name:>6} → {subs} subjects, {wins} windows")
    else:
        print(f"{name:>6} → file not found: {p}")

In [None]:
np.concatenate(df_mimic4_gen['raw_ppg'].to_list())