In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import multiprocessing
from scipy import signal
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

DATA_PATH = '../../DataLake/bidmc_csv'
DATA_SAVE_PATH = '/root/Workspace/Project-RRpo-2ndStudy/dataset' 
regex_bidmc = re.compile('bidmc_[0-9]+')

In [2]:
def interpolation(x, input):
    x0 = int(np.floor(x))
    y0 = input[x0]
    x1 = int(np.ceil(x))
    y1 = input[x1]
    y = (y1-y0)*(x-x0) + y0
    return y


def signal_resample(input_signal, org_fs, new_fs, method='interpolation'):
    output_signal = []
    new_x = np.arange(0, len(input_signal), org_fs/new_fs)
    
    if method == 'interpolation': 
        interp = interpolation

    for x in new_x:
        y = interp(x, input_signal)
        output_signal.append(y)

    return np.array(output_signal)


def generate_dataset(arg_pleth, arg_resp, org_fs=125, new_fs=30, shift_factor=4):
    fs = 125
    window_size = fs * 60 # 7500
    shift = int(window_size/shift_factor) # 125
    shift_n_times = int((len(arg_pleth)-window_size)/shift)+1

    window_pleth = np.array([arg_pleth[0+shift*i:window_size+shift*i] for i in range(shift_n_times)])
    window_rsmp_pleth = np.array([signal_resample(win, org_fs, new_fs) for win in window_pleth])
    window_resp = np.round(np.mean([arg_resp[0+(shift//fs)*i:(window_size//fs)+(shift//fs)*i] for i in range(shift_n_times)], axis=-1)).astype(np.int32).reshape(-1,1)
    dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
    return dataset


def preprocessing(filepath, id, numtaps, cutoff, shift_factor, org_fs, new_fs):
    pleth = pd.read_csv(f'{filepath}/{id}_Signals.csv', usecols=['Time [s]', ' PLETH'])[' PLETH'].values
    resp = pd.read_csv(f'{filepath}/{id}_Numerics.csv', usecols=['Time [s]', ' RESP'])[' RESP'].values

    taps = signal.firwin(numtaps=numtaps, cutoff=cutoff, window='hamming', pass_zero=False, fs=org_fs)
    filtered_pleth = signal.filtfilt(taps, 1.0, pleth)

    dataset = generate_dataset(filtered_pleth, resp, org_fs=org_fs, new_fs=new_fs, shift_factor=shift_factor)
    print(f'{id} --> {dataset.shape}')
    return id, dataset

In [3]:
bidmc_id = sorted(list(set([regex_bidmc.match(filename.name).group() for filename in os.scandir(DATA_PATH)])))

In [4]:
# dataset = preprocessing(filepath=DATA_PATH, id=bidmc_id[0], numtaps=2000, cutoff=[0.1,0.4], shift_factor=60, org_fs=125, new_fs=30)

In [5]:
pool = multiprocessing.Pool(processes=40)
results = pool.starmap(preprocessing, [(DATA_PATH, pid, 2000, [0.1,0.4], 60, 125, 30) for pid in bidmc_id])
pool.close()
pool.join()

  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_21 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_16 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_36 --> (421, 2)bidmc_26 --> (421, 2)



  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_08 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_04 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_19 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_28 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_14 --> (421, 2)
bidmc_09 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_18 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_34 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_07 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_15 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_27 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_22 --> (421, 2)bidmc_24 --> (421, 2)



  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_31 --> (421, 2)
bidmc_25 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_38 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_23 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_11 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_12 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_05 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_20 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_17 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_03 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_06 --> (421, 2)
bidmc_30 --> (421, 2)
bidmc_35 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_39 --> (421, 2)bidmc_01 --> (421, 2)



  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_13 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_32 --> (421, 2)

  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_29 --> (421, 2)

bidmc_40 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_37 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_10 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_02 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_33 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_50 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_43 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_53 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_48 --> (421, 2)
bidmc_47 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_45 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_49 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_52 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_42 --> (421, 2)

  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])



bidmc_41 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_46 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_51 --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])


bidmc_44 --> (421, 2)


In [9]:
np.save(f'{DATA_SAVE_PATH}/230920/bidmc-preprocessed.npy', np.array(results))

  np.save(f'{DATA_SAVE_PATH}/230920/bidmc-preprocessed.npy', np.array(results))


### 호흡에 따른 데이터 분류

In [10]:
dataset = np.load(f'{DATA_SAVE_PATH}/230920/bidmc-preprocessed.npy', allow_pickle=True)
print(dataset.shape) # (subject_id, (pleth, resp))

(53, 2)


In [20]:
fast_id = []; normal_id = []; slow_id = []
for sample in dataset:
    pid = sample[0]
    mean_resp = np.mean([resp for _, resp in sample[1]])
    
    if mean_resp < 12: slow_id.append(pid)
    elif mean_resp > 18: fast_id.append(pid)
    else: normal_id.append(pid)

print(len(fast_id), len(normal_id), len(slow_id))

26 22 5


In [21]:
fast_dataset = []; normal_dataset = []; slow_dataset = []
for sample in dataset:
    if sample[0] in fast_id:
        fast_dataset.append(sample)
    elif sample[0] in slow_id:
        slow_dataset.append(sample)
    else:
        normal_dataset.append(sample)
print(len(fast_dataset), len(normal_dataset), len(slow_dataset))

26 22 5


In [30]:
np.save(f'{DATA_SAVE_PATH}/230920/bidmc-preprocessed_fastRR.npy', np.array(fast_dataset))
np.save(f'{DATA_SAVE_PATH}/230920/bidmc-preprocessed_normalRR.npy', np.array(normal_dataset))
np.save(f'{DATA_SAVE_PATH}/230920/bidmc-preprocessed_slowRR.npy', np.array(slow_dataset))

### Test Datset

호흡 속도(빠름, 보통, 느림)에 따라 분류된 3가지 그룹에서 각각 임의로 3 ID 씩 샘플링.

총 9명의 ID에 해당하는 데이터를 Test Dataset으로 지정.

### Train & Val Dataset

이외의 나머지 44명의 ID에 해당하는 데이터는 TrainVal Dataset으로 지정.

In [None]:
# pleth = pd.read_csv(f'{DATA_PATH}/{bidmc_id[0]}_Signals.csv', usecols=['Time [s]', ' PLETH'])[' PLETH'].values
# resp = pd.read_csv(f'{DATA_PATH}/{bidmc_id[0]}_Numerics.csv', usecols=['Time [s]', ' RESP'])[' RESP'].values

# taps = signal.firwin(numtaps=2000, cutoff=[0.1,0.4], window='hamming', pass_zero=False, fs=125)
# filtered_pleth = signal.filtfilt(taps, 1.0, pleth)
# filtered_pleth.shape

# fs = 125
# window_size = fs * 60 # 7500
# shift = int(window_size/60) # 125
# shift_n_times = int((len(pleth)-window_size)/shift)+1

# window_pleth = np.array([pleth[0+shift*i:window_size+shift*i] for i in range(shift_n_times)])
# window_rsmp_pleth = np.array([signal_resample(win, 125, 30) for win in window_pleth])
# window_resp = np.round(np.mean([resp[0+(shift//fs)*i:(window_size//fs)+(shift//fs)*i] for i in range(shift_n_times)], axis=-1)).astype(np.int32).reshape(-1,1)
# window_pleth.shape, window_rsmp_pleth.shape, window_resp.shape

# dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]]) for i in range(len(window_pleth))])
# dataset.shape