In [1]:
import os
import re
import random
import numpy as np
import pandas as pd
import multiprocessing
from scipy import signal
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

DATA_PATH = '../../DataLake/Capnobase/data/csv'
DATA_SAVE_PATH = '/root/Workspace/Project-RRpo-2ndStudy/dataset' 
regex_capno = re.compile('[0-9]{4}_8min')

In [2]:
def interpolation(x, input):
    x0 = int(np.floor(x))
    y0 = input[x0]
    x1 = int(np.ceil(x))
    y1 = input[x1]
    y = (y1-y0)*(x-x0) + y0
    return y


def signal_resample(input_signal, org_fs, new_fs, method='interpolation'):
    output_signal = []
    new_x = np.arange(0, len(input_signal), org_fs/new_fs)
    
    if method == 'interpolation': 
        interp = interpolation

    for x in new_x:
        y = interp(x, input_signal)
        output_signal.append(y)

    return np.asarray(output_signal)


def generate_dataset(arg_pleth, arg_resp, org_fs=125, new_fs=30, shift_factor=4):
    window_size = org_fs * 60 # 18000
    shift = int(window_size/shift_factor) # 300
    shift_n_times = int((len(arg_pleth)-window_size)/shift)+1

    window_pleth = np.array([arg_pleth[0+shift*i:window_size+shift*i] for i in range(shift_n_times)])
    window_rsmp_pleth = np.array([signal_resample(win, org_fs, new_fs) for win in window_pleth])
    window_resp = np.array([round(np.mean(arg_resp.loc[(arg_resp['resp_x']>=(0+(shift//org_fs)*i)) & (arg_resp['resp_x']<(window_size//org_fs) + (shift//org_fs)*i)]['resp_y'].values)) for i in range(shift_n_times)])
    dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])
    return dataset


def preprocessing(filepath, id, numtaps, cutoff, shift_factor, org_fs, new_fs):
    pleth = pd.read_csv(f'{filepath}/{id}_signal.csv', usecols=['pleth_y'])['pleth_y'].values
    resp_x = np.asarray(pd.read_csv(f'{filepath}/{id}_reference.csv')['rr_co2_x'][0].strip().split(' ')).reshape(-1,1)
    resp_y = np.asarray(pd.read_csv(f'{filepath}/{id}_reference.csv')['rr_co2_y'][0].strip().split(' ')).reshape(-1,1)
    resp = np.concatenate((resp_x, resp_y), axis=1)
    resp = pd.DataFrame(resp, columns=['resp_x', 'resp_y'], dtype=np.float32)

    taps = signal.firwin(numtaps=numtaps, cutoff=cutoff, window='hamming', pass_zero=False, fs=org_fs)
    filtered_pleth = signal.filtfilt(taps, 1.0, pleth)  

    dataset = generate_dataset(filtered_pleth, resp, org_fs=org_fs, new_fs=new_fs, shift_factor=shift_factor)
    print(f'{id} --> {dataset.shape}')
    return id, dataset

In [3]:
capno_id = sorted(list(set([regex_capno.match(filename.name).group() for filename in os.scandir(DATA_PATH)])))
len(capno_id)

42

### Adult Only
- 13 ID is upper 18

In [4]:
selected_capno_id = []
for id in capno_id:
    subject_age = pd.read_csv(f'{DATA_PATH}/{id}_meta.csv', usecols=['subject_age']).values
    if subject_age > 18.0:
        selected_capno_id.append(id)
len(selected_capno_id)

13

In [5]:
capno_meta = pd.concat([pd.read_csv(f'{DATA_PATH}/{id}_meta.csv') for id in selected_capno_id], axis=0)

### Process
- resp_x 는 초 단위 추정

In [6]:
pool = multiprocessing.Pool(processes=40)
results = pool.starmap(preprocessing, [(DATA_PATH, pid, 2000, [0.1,0.4], 60, 300, 30) for pid in selected_capno_id])
pool.close()
pool.join()

  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0313_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0309_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0311_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0331_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0325_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0328_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0370_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])
  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0333_8min --> (421, 2)0332_8min --> (421, 2)



  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0329_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0330_8min --> (421, 2)

  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])



0322_8min --> (421, 2)


  dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])


0312_8min --> (421, 2)


In [8]:
np.save(f'{DATA_SAVE_PATH}/230920/capno-preprocessed.npy', np.array(results))

  np.save(f'{DATA_SAVE_PATH}/230920/capno-preprocessed.npy', np.array(results))


### 호흡에 따른 데이터 분류

In [3]:
dataset = np.load(f'{DATA_SAVE_PATH}/230920/capno-preprocessed.npy', allow_pickle=True)
print(dataset.shape) # (subject_id, (pleth, resp))

(13, 2)


In [4]:
count = 0
for id, samples in dataset:
    count += len(samples)
count

5473

In [10]:
fast_id = []; normal_id = []; slow_id = []
for sample in dataset:
    pid = sample[0]
    mean_resp = np.mean([resp for _, resp in sample[1]])
    
    if mean_resp < 12: slow_id.append(pid)
    elif mean_resp > 18: fast_id.append(pid)
    else: normal_id.append(pid)

print(len(fast_id), len(normal_id), len(slow_id))

1 4 8


In [11]:
fast_dataset = []; normal_dataset = []; slow_dataset = []
for sample in dataset:
    if sample[0] in fast_id:
        fast_dataset.append(sample)
    elif sample[0] in slow_id:
        slow_dataset.append(sample)
    else:
        normal_dataset.append(sample)
print(len(fast_dataset), len(normal_dataset), len(slow_dataset))

1 4 8


In [12]:
np.save(f'{DATA_SAVE_PATH}/230920/capno-preprocessed_fastRR.npy', np.array(fast_dataset))
np.save(f'{DATA_SAVE_PATH}/230920/capno-preprocessed_normalRR.npy', np.array(normal_dataset))
np.save(f'{DATA_SAVE_PATH}/230920/capno-preprocessed_slowRR.npy', np.array(slow_dataset))

In [7]:
## ARCHIVE
# pleth = pd.read_csv(f'{DATA_PATH}/{selected_capno_id[0]}_signal.csv', usecols=['pleth_y'])['pleth_y'].values
# resp_x = np.asarray(pd.read_csv(f'{DATA_PATH}/{selected_capno_id[0]}_reference.csv')['rr_co2_x'][0].strip().split(' ')).reshape(-1,1)
# resp_y = np.asarray(pd.read_csv(f'{DATA_PATH}/{selected_capno_id[0]}_reference.csv')['rr_co2_y'][0].strip().split(' ')).reshape(-1,1)
# resp = np.concatenate((resp_x, resp_y), axis=1)
# resp = pd.DataFrame(resp, columns=['resp_x', 'resp_y'], dtype=np.float32)

# taps = signal.firwin(numtaps=2000, cutoff=[0.1,0.4], window='hamming', pass_zero=False, fs=300)
# filtered_pleth = signal.filtfilt(taps, 1.0, pleth)
# filtered_pleth.shape

# org_fs = 300
# new_fs = 30
# shift_factor = 60
# window_size = org_fs * 60 # 18000
# shift = int(window_size/shift_factor) # 300
# shift_n_times = int((len(pleth)-window_size)/shift)+1

# window_pleth = np.array([pleth[0+shift*i:window_size+shift*i] for i in range(shift_n_times)])
# window_rsmp_pleth = np.array([signal_resample(win, org_fs, new_fs) for win in window_pleth])
# window_resp = np.array([round(np.mean(resp.loc[(resp['resp_x']>=(0+(shift//org_fs)*i)) & (resp['resp_x']<(window_size//org_fs) + (shift//org_fs)*i)]['resp_y'].values)) for i in range(shift_n_times)])
# window_pleth.shape, window_rsmp_pleth.shape, window_resp.shape

# dataset = np.array([np.array([window_rsmp_pleth[i], window_resp[i]]) for i in range(len(window_rsmp_pleth))])
# dataset.shape

(144001,)