In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import multiprocessing
from scipy import signal
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler

DATA_PATH = '../../DataLake/bidmc_csv'
DATA_SAVE_PATH = '/root/Workspace/Project-RRpo-2ndStudy/dataset' 
regex_bidmc = re.compile('bidmc_[0-9]+')

In [2]:
def interpolation(x, input):
    x0 = int(np.floor(x))
    y0 = input[x0]
    x1 = int(np.ceil(x))
    y1 = input[x1]
    y = (y1-y0)*(x-x0) + y0
    return y


def signal_resample(input_signal, org_fs, new_fs, method='interpolation'):
    output_signal = []
    new_x = np.arange(0, len(input_signal), org_fs/new_fs)
    
    if method == 'interpolation': 
        interp = interpolation

    for x in new_x:
        y = interp(x, input_signal)
        output_signal.append(y)

    return np.array(output_signal)


def generate_dataset(arg_pleth, arg_resp, org_fs=125, new_fs=30, shift_factor=4):
    window_size = org_fs * 60 # 7500
    shift = int(window_size/shift_factor) # 125
    shift_n_times = int((len(arg_pleth)-window_size)/shift)+1

    window_pleth = np.array([arg_pleth[0+shift*i:window_size+shift*i] for i in range(shift_n_times)])
    window_rsmp_pleth = np.array([signal_resample(win, org_fs, new_fs) for win in window_pleth])
    window_resp = np.round(np.mean([arg_resp[0+(shift//org_fs)*i:(window_size//org_fs)+(shift//org_fs)*i] for i in range(shift_n_times)], axis=-1)).astype(np.int32).reshape(-1,1)
    dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i][0]], dtype=object) for i in range(len(window_pleth))])
    return dataset


def preprocessing(filepath, id, numtaps, cutoff, shift_factor, org_fs, new_fs):
    pleth = pd.read_csv(f'{filepath}/{id}_Signals.csv', usecols=['Time [s]', ' PLETH'])[' PLETH'].values
    resp = pd.read_csv(f'{filepath}/{id}_Numerics.csv', usecols=['Time [s]', ' RESP'])[' RESP'].values

    if np.isnan(resp).sum() != 0:
        print(id)
        return id, None

    taps = signal.firwin(numtaps=numtaps, cutoff=cutoff, window='hamming', pass_zero=False, fs=org_fs)
    pleth = signal.filtfilt(taps, 1.0, pleth)

    dataset = generate_dataset(pleth, resp, org_fs=org_fs, new_fs=new_fs, shift_factor=shift_factor)
    print(f'{id} --> {dataset.shape}')
    return id, dataset

In [21]:
bidmc_id = sorted(list(set([regex_bidmc.match(filename.name).group() for filename in os.scandir(DATA_PATH)])))
test_idx = 10
np.array(bidmc_id[test_idx])

array('bidmc_11', dtype='<U8')

In [77]:
pleth = pd.read_csv(f'{DATA_PATH}/{bidmc_id[test_idx]}_Signals.csv', usecols=['Time [s]', ' PLETH'])[' PLETH'].values
resp = pd.read_csv(f'{DATA_PATH}/{bidmc_id[test_idx]}_Numerics.csv', usecols=['Time [s]', ' RESP'])[' RESP'].values

In [22]:
pool = multiprocessing.Pool(processes=40)
results = pool.starmap(preprocessing, [(DATA_PATH, pid, 2000, [0.1,0.4], 60, 125, 30) for pid in bidmc_id])
pool.close()
pool.join()

bidmc_15
bidmc_13
bidmc_19
bidmc_02 --> (421, 2)
bidmc_06 --> (421, 2)bidmc_40 --> (421, 2)

bidmc_14 --> (421, 2)
bidmc_03 --> (421, 2)
bidmc_25 --> (421, 2)
bidmc_20 --> (421, 2)
bidmc_16 --> (421, 2)
bidmc_38 --> (421, 2)
bidmc_21 --> (421, 2)
bidmc_24 --> (421, 2)
bidmc_18 --> (421, 2)
bidmc_37 --> (421, 2)
bidmc_26 --> (421, 2)
bidmc_12 --> (421, 2)
bidmc_31 --> (421, 2)
bidmc_30 --> (421, 2)
bidmc_35 --> (421, 2)
bidmc_27 --> (421, 2)
bidmc_09 --> (421, 2)
bidmc_05 --> (421, 2)
bidmc_28 --> (421, 2)
bidmc_39 --> (421, 2)
bidmc_01 --> (421, 2)
bidmc_04 --> (421, 2)
bidmc_33 --> (421, 2)
bidmc_22 --> (421, 2)bidmc_32 --> (421, 2)

bidmc_34 --> (421, 2)
bidmc_41 --> (421, 2)
bidmc_11 --> (421, 2)
bidmc_08 --> (421, 2)
bidmc_23 --> (421, 2)
bidmc_17 --> (421, 2)
bidmc_10 --> (421, 2)
bidmc_29 --> (421, 2)
bidmc_07 --> (421, 2)
bidmc_42 --> (421, 2)
bidmc_36 --> (421, 2)
bidmc_43 --> (421, 2)
bidmc_47 --> (421, 2)bidmc_49 --> (421, 2)

bidmc_45 --> (421, 2)
bidmc_50 --> (421, 2)
bidmc

In [29]:
new_results = []
for result in results:
    if result[1] is None: print(result[0])
    else: new_results.append(result)

bidmc_13
bidmc_15
bidmc_19


In [36]:
np.save(f'{DATA_SAVE_PATH}/230921/bidmc-preprocessed.npy', np.array(new_results, dtype=object))

### 호흡에 따른 데이터 분류

In [3]:
dataset = np.load(f'{DATA_SAVE_PATH}/230921/bidmc-preprocessed.npy', allow_pickle=True)
print(dataset.shape) # (subject_id, (pleth, resp))

(50, 2)


In [4]:
count = 0
for id, samples in dataset:
    count += len(samples)
count

21050

In [38]:
fast_id = []; normal_id = []; slow_id = []
for sample in dataset:
    pid = sample[0]
    mean_resp = np.mean([resp for _, resp in sample[1]])
    
    if mean_resp < 12: slow_id.append(pid)
    elif mean_resp > 18: fast_id.append(pid)
    else: normal_id.append(pid)

print(len(fast_id), len(normal_id), len(slow_id))

26 22 2


In [39]:
fast_dataset = []; normal_dataset = []; slow_dataset = []
for sample in dataset:
    if sample[0] in fast_id:
        fast_dataset.append(sample)
    elif sample[0] in slow_id:
        slow_dataset.append(sample)
    else:
        normal_dataset.append(sample)
print(len(fast_dataset), len(normal_dataset), len(slow_dataset))

26 22 2


In [40]:
np.save(f'{DATA_SAVE_PATH}/230921/bidmc-preprocessed_fastRR.npy', np.array(fast_dataset))
np.save(f'{DATA_SAVE_PATH}/230921/bidmc-preprocessed_normalRR.npy', np.array(normal_dataset))
np.save(f'{DATA_SAVE_PATH}/230921/bidmc-preprocessed_slowRR.npy', np.array(slow_dataset))

### Test Datset

호흡 속도(빠름, 보통, 느림)에 따라 분류된 3가지 그룹에서 각각 임의로 3 ID 씩 샘플링.

총 9명의 ID에 해당하는 데이터를 Test Dataset으로 지정.

### Train & Val Dataset

이외의 나머지 44명의 ID에 해당하는 데이터는 TrainVal Dataset으로 지정.

In [41]:
dataset = np.load(f'{DATA_SAVE_PATH}/230921/bidmc-preprocessed.npy', allow_pickle=True)
print(dataset.shape) # (subject_id, (pleth, resp))

(50, 2)


In [44]:
test_fast_id = random.sample(fast_id, 3)
test_normal_id = random.sample(normal_id, 3)
test_slow_id = random.sample(slow_id, 1)
test_id = test_fast_id + test_normal_id + test_slow_id
print(len(test_id))

7


In [45]:
X_dataset = []
test_dataset = []
for sample in dataset:
    if sample[0] in test_id:
        test_dataset.append(sample)
    else:
        X_dataset.append(sample)

X_dataset = np.array(X_dataset)
test_dataset = np.array(test_dataset)

In [46]:
X_dataset.shape, test_dataset.shape

((43, 2), (7, 2))

In [47]:
np.save(f'{DATA_SAVE_PATH}/230921/bidmc-test_dataset.npy', test_dataset)
np.save(f'{DATA_SAVE_PATH}/230921/bidmc-trainval_dataset.npy', X_dataset)

In [48]:
test_dataset = np.load(f'{DATA_SAVE_PATH}/230921/bidmc-test_dataset.npy', allow_pickle=True)
test_dataset.shape

(7, 2)

In [49]:
test_dataset

array([['bidmc_01',
        array([[array([2.65525228e-05, 1.18955319e-03, 2.34956343e-03, ...,
                       6.72999127e-04, 1.10741206e-03, 1.54288096e-03])    , 24],
               [array([0.02343542, 0.02353949, 0.02359297, ..., 0.007924  , 0.00779951,
                       0.00764851])                                                    ,
                24],
               [array([ 0.00971467,  0.00893033,  0.0081457 , ..., -0.00243145,
                       -0.00277405, -0.00310012])                              ,
                24],
               [array([-0.00852073, -0.00881286, -0.0090815 , ..., -0.00369558,
                       -0.00346161, -0.00321997])                              ,
                24],
               [array([-0.00845703, -0.00821034, -0.0079531 , ...,  0.0017923 ,
                        0.001801  ,  0.00179685])                              ,
                24],
               [array([ 1.69444045e-03,  2.06063804e-03,  2.42345772e-03, ...,