In [1]:
import os, sys, glob

print(os.getcwd())
os.chdir('..')
print(f'Changed dir to: {os.getcwd()}')
# !export PYTHONPATH="$(pwd)/src:$PYTHONPATH"

/Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/notebooks
Changed dir to: /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis


In [2]:
import numpy as np
import pandas as pd
from IPython.display import display
from random import choice
import scipy
project_root = os.path.abspath(os.path.join(os.getcwd()))

src_dir = os.path.join(project_root, "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

from src.lib.create_training_pipelines.capnobase_etl import CapnoBaseETL
from src.lib.create_training_pipelines.mimic_etl import MimicETL
from src.lib.utils.timeseries_viewer import make_window_figure, create_time_series_viewer

import dash
app = dash.Dash(__name__)

import h5py, inspect
open_files = [v for v in globals().values() 
              if isinstance(v, h5py.File)]

for f in open_files:
    print(f"Closing {f.filename}")
    f.close()

bSetUpDB = True
WINDOW_LEN = 8

# Create Datasets from Databases

In [3]:
def load_as_df(file_path, filename,
               store_signals=True,
               group_by_record=False):
    h5_path = f"{file_path}/{filename}.h5"
    print(f'Loading: {h5_path}')
    hf = h5py.File(h5_path, "r")
    rows = []
    for subj in hf.keys():
        subj_grp = hf[subj]
        # print(subj)
        window_count = 0
        for win_id in subj_grp.keys():
            window_count = window_count+1
            win_grp = subj_grp[win_id]
            rec_id     = win_grp.attrs.get('rec_id', '')
            label      = win_grp.attrs.get('label', -1)
            raw_ppg_fs = win_grp.attrs.get("raw_ppg_fs", np.nan)
            ekg_fs     = win_grp.attrs.get("ekg_fs", np.nan)
            ppg_fs     = win_grp.attrs.get("ppg_fs", np.nan)
            abp_fs     = win_grp.attrs.get("abp_fs", np.nan)
            notes      = win_grp.attrs.get("notes", "")
            # pull out everything you need
            raw_ppg     = win_grp["raw_ppg"][:]
            proc_ppg    = win_grp["proc_ppg"][:]
            raw_ekg     = win_grp["raw_ekg"][:]
            raw_abp     = win_grp["raw_abp"][:]
            # build the window row
            row = {
                "subject": subj,
                "window_id": win_id,
                "window_count": window_count,
                "rec_id": rec_id,
                "label": label,
                "raw_ppg_fs": raw_ppg_fs,
                "ppg_fs_out": ppg_fs,
                "ekg_fs_out": ekg_fs,
                "abp_fs_out": abp_fs,
                "raw_len": len(raw_ppg),
                "proc_len": len(proc_ppg),
                "duration_raw_s": len(raw_ppg) / ppg_fs,
                "duration_proc_s": len(proc_ppg) / ppg_fs,
                "notes": notes
            }
            if store_signals:
                row.update({
                    "raw_ppg": raw_ppg,
                    "proc_ppg": proc_ppg,
                    "raw_ekg": raw_ekg,
                    "raw_abp": raw_abp
                })
            rows.append(row)

    df = pd.DataFrame(rows)

    if group_by_record and store_signals:
        # define how to aggregate each column
        agg_dict = {
            "subject":     "first",
            "label":       "first",
            "raw_ppg_fs":  "first",
            "ppg_fs_out":  "first",
            "ekg_fs_out":  "first",
            "abp_fs_out":  "first",
            "notes":       lambda x: list(x),
            "raw_len":     lambda x: np.sum(x),
            "proc_len":    lambda x: np.sum(x),
            "duration_raw_s":  lambda x: np.sum(x),
            "duration_proc_s": lambda x: np.sum(x),
            # now the key: concatenate all windows into one long array
            "raw_ppg":  lambda series: np.concatenate(series.values),
            "proc_ppg": lambda series: np.concatenate(series.values),
            "raw_ekg":  lambda series: np.concatenate(series.values),
            "raw_abp":  lambda series: np.concatenate(series.values),
        }
        df = (
            df
            .groupby("rec_id", as_index=False)
            .agg(agg_dict)
        )
    return df

## Create Capno Dataset

### CapnoBase

In [4]:
# def main():
#     root_path = os.path.join('../data/raw/capnobase/data/mat')
#     out_path = os.path.join('../data/processed/length_full/capnobase_db')
#     out_filename = 'capnobase_db'
#     if not os.path.exists(out_path):
#         os.mkdir(out_path)

#     fs_in = 100.00
#     fs_out = 100.00

#     config = {
#     "input_dir"      : root_path,
#     "output_dir"     : out_path,
#     "window_size_sec": 30,
#     "fs_in"          : fs_in,
#     "fs_out"   : fs_out,
#     "lowpass_cutoff" : (fs_out / 2),
#     "fir_numtaps"    : 129,
#     "zero_phase"     : True,
#     "out_filename" :  out_filename
# }
#     if bSetUpDB:
#         etl = CapnoBaseETL(config)
#         h5file = etl.process_all()
#         print(f"Saved windows HDF5 to {h5file}")
#     df = load_as_df(out_path,out_filename)

#     display(df.head(10))
#     return df
# if __name__ == "__main__":
#     df_capnobase = main()

## MIMIC III

### MIMIC III General Subset

In [5]:
# def matobj_to_dict(matobj):
#     """
#     Recursively convert a scipy.io.matlab.mat_struct into a nested Python dict.
#     """
#     out = {}
#     for name in matobj._fieldnames:
#         val = getattr(matobj, name)
#         # Nested struct?
#         if hasattr(val, '_fieldnames'):
#             out[name] = matobj_to_dict(val)
#         # Array of structs?
#         elif isinstance(val, np.ndarray) and val.dtype == object:
#             flat = val.flatten()
#             out[name] = [
#                 matobj_to_dict(x) if hasattr(x, '_fieldnames') else x
#                 for x in flat
#             ]
#         else:
#             out[name] = val
#     return out

# def load_and_merge_subject_sets(pattern):
#     """
#     pattern: glob pattern matching your chunked .mat files,
#              e.g. "subject_sets/mimic*_data_*.mat"
    
#     Returns one big Python list of dicts.
#     """
#     merged = []
#     all_files = sorted(glob.glob(pattern))
#     for path in all_files:
#         mat = scipy.io.loadmat(path,
#                           struct_as_record=False,
#                           squeeze_me=True)
#         data = mat['data']
#         # make sure it's a 1D array of structs
#         if not isinstance(data, np.ndarray):
#             data = np.array([data], dtype=object)
#         elif data.dtype != object or data.ndim != 1:
#             data = data.flatten()

#         for elem in data:
#             if hasattr(elem, '_fieldnames'):
#                 merged.append(matobj_to_dict(elem))
#             else:
#                 # if it wasn’t a struct, just append as-is
#                 merged.append(elem)
#     return merged,all_files

# # --- USAGE ---
# # 1) load & merge
# all_subjects,all_files = load_and_merge_subject_sets("data/raw/mimic3_data/subject_sets/mimic3_data_1500_*_v1.mat")
# print(f"Total subjects merged: {len(all_subjects)}")

# # 2) save back out to one big .mat
# out_path = 'data/raw/mimic3_data'
# filename = f"mimic3_data_{50*len(all_files)}_v1.mat"
# scipy.io.savemat(os.path.join(out_path,filename), {'data': np.array(all_subjects, dtype=object)})

# print(f"Written merged file → {os.path.join(out_path,filename)}")

In [7]:
def main():
    ver_num = 1
    set_filenames = sorted(glob.glob("data/raw/mimic3_data/subject_sets/mimic3_data_1500_*_v1.mat"))
    # root_path = os.path.join(f'data/raw/mimic3_data/{filename}')
    for file_paths in set_filenames:
        root_path = file_paths
        filename = file_paths.rpartition('/')[-1]
        out_filename = f'{filename.rpartition('.')[0]}'
        out_folder =  f'mimic3_db_v{ver_num}_sets/'
        out_path = os.path.join('data/processed/length_full/',out_folder)

        if not os.path.exists(out_path):
            os.mkdir(out_path)
        config = {
            "input_dir": root_path,
            "output_dir":  out_path,
            "fs_in": 125.00,
            "fs_out": 20.83,
            'fs_ekg': 125,
            'fs_bp':125,
            "window_size_sec": WINDOW_LEN,
            "scale_type": "norm",
            "decimate_signal": True,
            "zero_phase": True,
            "out_filename": out_filename 
        }
        bSetUpDB = True
        if bSetUpDB:
            etl = MimicETL(config)
            out_file = etl.process()
            print("Saved General MIMIC III subject windows set to: ", out_file)
    # df = load_as_df(out_path,out_filename)
    # display(df)
    # return df
if __name__ == "__main__":
    df_mimic3_gen = main()
    # first = df_mimic3_gen.iloc[np.random.randint(0, len(df_mimic3_gen))] 
#     chosen_subj = df_mimic3_gen['subject'].unique()[0]

#     surr_subject_df = df_mimic3_gen[df_mimic3_gen['subject'] == chosen_subj]

#     display(surr_subject_df.head())
#     initial_window = df_mimic3_gen.sample(1).iloc[0]

#     specs = [
#         {"key":"raw_ppg",  "fs_key":"raw_ppg_fs",  "subplot":1, "legend":"Raw PPG"},
#         {"key":"proc_ppg", "fs_key":"ppg_fs_out",  "subplot":1, "legend":"Proc PPG"},
#         {"key":"raw_ekg",  "fs_key":"ekg_fs_out",  "subplot":2,               },
#         {"key":"raw_abp",  "fs_key":"abp_fs_out",  "subplot":3,               },
#     ]

#     fig_fn = lambda window: make_window_figure(window, specs)
    
#     app = create_time_series_viewer(
#     surr_subject_df,
#     fig_fn,
#     specs,
#     index_label_fn=lambda idx, row: f"{row.subject}-{row.window_count}"
# )
#     print(f'Total # of Subjects: {len(df_mimic3_gen['subject'].unique())}')
#     print(f'Total # of Windows: {df_mimic3_gen.shape[0]}')
#     print("Launching Dash at http://127.0.0.1:8050 …")
#     app.run(debug=True)

Saved General MIMIC III subject windows set to:  /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1_sets/mimic3_data_1500_1_v1.h5
Saved General MIMIC III subject windows set to:  /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1_sets/mimic3_data_1500_2_v1.h5
Saved General MIMIC III subject windows set to:  /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1_sets/mimic3_data_1500_3_v1.h5
Saved General MIMIC III subject windows set to:  /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1_sets/mimic3_data_1500_4_v1.h5
Saved General MIMIC III subject windows set to:  /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic3_db_v1_sets/mimic3_data_1500_5_v1.h5
Saved General MIMIC III subject windows set to:  /Users/shayanriyaz/Documents/Projects/heart_rhythm_analy

In [None]:
# ver_num = 1
# out_filename = f'mimic3_db_v{ver_num}_test'
# out_path = os.path.join(f'data/processed/length_full/mimic3_db_v{ver_num}_test')    
# h5_path = f"{out_path}/{out_filename}.h5"
# print(f'Loading: {h5_path}')
# hf = h5py.File(h5_path, "r")
# subj_id = list(hf.keys())[0]
# window_id = list(hf[subj_id].keys())[0]
# print(dict(hf[subj_id][window_id].attrs))


## MIMIC IV

### MIMIC IV General Data

In [12]:
np.random.seed(42)
ver_num = 1
set_filenames = sorted(glob.glob("data/raw/mimic4_data/subject_sets/mimic4_data_50_*_v1.mat"))
def main():
    # root_path = os.path.join(f'data/raw/mimic3_data/{filename}')
    for file_paths in set_filenames:
        root_path = file_paths
        filename = file_paths.rpartition('/')[-1]
        out_filename = f'{filename.rpartition('.')[0]}'
        out_folder =  f'mimic4_db_v{ver_num}_sets/'
        out_path = os.path.join('data/processed/length_full/',out_folder)
    

        if not os.path.exists(out_path):
            os.mkdir(out_path)
        fs_in = 62.5
        fs_out = 20.83
        config = {
            "input_dir": root_path,
            "output_dir":  out_path,
            "fs_in": fs_in,
            "fs_out": fs_out,
            'fs_ekg': 62.5, 
            'fs_bp':fs_in,
            "window_size_sec": WINDOW_LEN,
            "scale_type": "norm",
            "decimate_signal": True,
            "zero_phase": True,
            "out_filename": out_filename 
        }
        bSetUpDB = True
        if bSetUpDB:
            etl = MimicETL(config)
            out_file = etl.process()
            print(f"Saved MIMIC IV  windows to", out_file)
        df = load_as_df(out_path,out_filename)
        display(df)
        return df

if __name__ == "__main__":

    df_mimic4_gen = main()
#     first = df_mimic4_gen.iloc[np.random.randint(0, len(df_mimic4_gen))] 
#     initial_window = df_mimic4_gen.sample(1).iloc[0]
#     specs = [
#         {"key":"raw_ppg",  "fs_key":"raw_ppg_fs",  "subplot":1, "legend":"Raw PPG"},
#         {"key":"proc_ppg", "fs_key":"ppg_fs_out",  "subplot":1, "legend":"Proc PPG"},
#         {"key":"raw_ekg",  "fs_key":"ekg_fs_out",  "subplot":2,               },
#         {"key":"raw_abp",  "fs_key":"abp_fs_out",  "subplot":3,               },
#     ]

#     # 3) bind specs into a single-arg figure fn
#     fig_fn = lambda window: make_window_figure(window, specs)
    
#     app = create_time_series_viewer(
#     df_mimic4_gen,
#     fig_fn,
#     specs,
#     index_label_fn=lambda idx, row: f"{row.subject}-{row.window_count}"
# )
#     print(f'Total # of Subjects: {len(df_mimic4_gen['subject'].unique())}')
#     print(f'Total # of Windows: {df_mimic4_gen.shape[0]}')
#     print("Launching Dash at http://127.0.0.1:8050 …")
    
    # app.run(debug=True)
        

Saved MIMIC IV  windows to /Users/shayanriyaz/Documents/Projects/heart_rhythm_analysis/data/processed/length_full/mimic4_db_v1_sets/mimic4_data_50_1_v1.h5
Loading: data/processed/length_full/mimic4_db_v1_sets//mimic4_data_50_1_v1.h5


Unnamed: 0,subject,window_id,window_count,rec_id,label,raw_ppg_fs,ppg_fs_out,ekg_fs_out,abp_fs_out,raw_len,proc_len,duration_raw_s,duration_proc_s,notes,raw_ppg,proc_ppg,raw_ekg,raw_abp
0,p10020306,00831e64-a201-4505-955e-49ee07dfee55,1,83404654,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,sinus arrhythmia with pvcs with borderline 1st...,"[0.4206543, 0.44995117, 0.48706055, 0.50634766...","[-0.4969925, 0.277146, 0.32385954, 0.50300753,...","[-0.045, -0.03, -0.05, -0.06, -0.065, -0.06, -...","[70.5625, 70.1875, 69.4375, 68.5, 67.3125, 66...."
1,p10020306,00b7ecd9-a0d8-448d-b11c-eff543e903ae,2,83404654,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,sinus arrhythmia with pvcs with borderline 1st...,"[0.3540039, 0.34570312, 0.34155273, 0.33325195...","[-0.48080215, -0.23129225, -0.31811625, -0.233...","[0.48, -0.03, -0.09, -0.07, -0.065, -0.07, -0....","[68.0625, 67.0, 66.0, 65.25, 64.5625, 63.8125,..."
2,p10020306,01654df3-b6a9-4cad-8989-485035784a8b,3,83404654,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,sinus arrhythmia with pvcs with borderline 1st...,"[0.4248047, 0.44262695, 0.46411133, 0.4819336,...","[-0.58107036, 0.18447798, 0.08413577, 0.168737...","[-0.115, -0.1, -0.085, -0.07, -0.06, -0.055, -...","[61.125, 60.5625, 60.125, 59.875, 59.8125, 59...."
3,p10020306,038145e7-af40-4b33-ade5-0b7ca3cf34a1,4,83404654,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,sinus arrhythmia with pvcs with borderline 1st...,"[0.64575195, 0.64990234, 0.64990234, 0.6499023...","[-0.24910767, 0.553352, 0.3165337, 0.3744953, ...","[-0.01, -0.015, -0.02, 0.01, 0.24, 0.525, 0.39...","[59.5, 59.0, 58.5, 58.0, 57.4375, 57.0, 56.562..."
4,p10020306,04c4a30f-216a-4f22-a108-43ec21707b12,5,83404654,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,sinus arrhythmia with pvcs with borderline 1st...,"[0.64990234, 0.64990234, 0.6569824, 0.6640625,...","[-0.49018386, 0.21350744, 0.04268983, 0.111809...","[-0.005, 0.0, -0.01, -0.02, -0.02, -0.02, -0.0...","[107.0, 104.8125, 101.1875, 96.1875, 90.0625, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11191,p19619764,fc24a48e-97d6-48c5-a7d9-3d786df58393,220,83958172,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.49316406, 0.48657227, 0.4831543, 0.4819336,...","[-0.5714248, 0.18446803, -0.07297936, -0.01847...","[-0.055, -0.055, -0.05, -0.02, 0.015, 0.02, 0....","[97.5625, 95.9375, 93.5625, 90.625, 87.0625, 8..."
11192,p19619764,fcc7ddb2-d124-4745-a73f-a34df07c1acf,221,83958172,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.47021484, 0.47314453, 0.47729492, 0.4809570...","[-0.78527504, 0.10444981, -0.093777, 0.0501251...","[-0.055, -0.04, 0.015, 0.01, 0.02, 0.065, 0.03...","[83.25, 81.25, 78.3125, 75.0, 72.1875, 70.4375..."
11193,p19619764,fce834dd-cdea-4daf-a861-ad2a60e66249,222,83958172,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.48046875, 0.47045898, 0.45898438, 0.4506836...","[-0.57527995, 0.05251968, -0.23170784, -0.1473...","[-0.195, -0.16, -0.145, -0.135, -0.12, -0.11, ...","[78.6875, 78.0625, 77.5, 77.0, 76.5, 76.0625, ..."
11194,p19619764,fcfbbb08-b77d-4eb3-9053-8658e060a457,223,83958172,-1,62.4725,20.83,62.5,62.5,499,167,23.955833,8.017283,,"[0.5126953, 0.5046387, 0.4963379, 0.48901367, ...","[-0.6263717, 0.20693332, -0.16796103, -0.05976...","[0.185, 0.08, 0.0, -0.045, -0.055, -0.055, -0....","[82.6875, 88.625, 94.0, 97.5625, 99.0625, 98.8..."


In [None]:
subject_agg = {
    # sampling & metadata: just take the first (they're constant per subject)
    "raw_ppg_fs": "first",
    "ppg_fs_out": "first",
    "ekg_fs_out": "first",
    "abp_fs_out": "first",
    # lengths & durations: sum across windows
    "raw_len":       "sum",
    "proc_len":      "sum",
    "duration_raw_s":  "sum",
    "duration_proc_s": "sum",
    # signals: concatenate all windows end-to-end
    "raw_ppg":  lambda s: np.concatenate(s.values),
    "proc_ppg": lambda s: np.concatenate(s.values),
    "raw_ekg":  lambda s: np.concatenate(s.values),
    "raw_abp":  lambda s: np.concatenate(s.values),
    # if you want to keep track of window_ids or rec_ids:
    "window_id": lambda s: list(s.values),
    "rec_id":    lambda s: list(s.values),
    "label":     lambda s: list(s.values),
}

# ── 2) group by subject ─
df_by_subject = (
    df_mimic4_gen
    .groupby("subject", as_index=False)
    .agg(subject_agg)
)

# ── 3) now df_by_subject has one row per subject, with each signal a long array
print(df_by_subject.shape)
print(df_by_subject.raw_ppg.iloc[0].shape)  # e.g. (sum of all its windows,) 

# bind specs & figure fn exactly as before
fig_fn = lambda row: make_window_figure(row, specs)

# subject‐level dashboard (labels are just the subject IDs)
app = create_time_series_viewer(
    df_by_subject,
    fig_fn,
    specs,
    index_label_fn=lambda idx, row: str(row.subject)
)
print("Launching Dash at http://127.0.0.1:8080 …")
app.run(debug=True,port=8080)

# Create Train/Test Files

In [None]:
from src.lib.create_training_pipelines.prepare_dataset import *
import os, h5py

ver_num = 1
ROOT_PATH = 'data/processed/length_full'

mimic3_files = sorted(glob.glob('data/processed/length_full/mimic3_db_v1_sets/*'))
mimic4_files = sorted(glob.glob('data/processed/length_full/mimic4_db_v1_sets/*'))

file_paths = mimic3_files + mimic4_files
# file_paths = [f"{ROOT_PATH}/mimic3_db_v{ver_num}/mimic3_db_v{ver_num}.h5", f"{ROOT_PATH}/mimic4_db_v{ver_num}/mimic4_db_v{ver_num}.h5"]
out_path = 'data/development_dataset/length_full'
train_path, test_path = my_split_and_save(file_paths, train_ratio=0.8, output_dir=out_path)

def count_windows(h5_path):
    with h5py.File(h5_path, 'r') as f:
        subjects = list(f.keys())
        n_subjects = len(subjects)
        n_windows  = sum(len(f[subj].keys()) for subj in subjects)
    return n_subjects, n_windows



mimic3 → file not found: ../data/processed/length_full/mimic3_db/mimic3_db.h5
mimic4 → file not found: ../data/processed/length_full/mimic4_db/mimic4_db.h5
 train → file not found: ../data/development_dataset/length_full/train_dataset.h5
  test → file not found: ../data/development_dataset/length_full/test_dataset.h5


In [26]:
# Replace these with your real paths:
paths = {
    "train":   "data/development_dataset/length_full/train_dataset.h5",
    "test":    "data/development_dataset/length_full/test_dataset.h5",
}

for name, p in paths.items():
    if os.path.exists(p):
        subs, wins = count_windows(p)
        print(f"{name:>6} → {subs} subjects, {wins} windows")
    else:
        print(f"{name:>6} → file not found: {p}")

 train → 499 subjects, 89450 windows
  test → 499 subjects, 22363 windows
