# Imports and Setup

In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, ConvLSTM2D, TimeDistributed, RepeatVector, Flatten, Conv2D, MaxPooling2D, Dropout, Reshape, LeakyReLU, Conv1D, MaxPooling1D
from tensorflow.keras import initializers
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import os
import math
import glob
import scipy
!pip install tensorflow_datasets
import tensorflow_datasets as tfds

from statistics import stdev

import pickle
import matplotlib.pyplot as plt
from scipy.signal import find_peaks
from scipy import interpolate


In [None]:
# Prints the literal 30.
# NOTE(review): looks like a leftover scratch cell — confirm whether it is still needed.
print(30)

In [None]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

In [None]:
# Shell escape: print the version of the `python3` executable found on PATH.
!python3 --version

# Data Pre-Processing

The Pre-Processed Data can be found at "Data/75.pkl"

## Dataset Used

The University of California, Irvine (UCI) created a Machine Learning Repository dataset derived from the largest publicly available database, ‘Multi-parameter Intelligent Monitoring in Intensive Care (MIMIC-II)’ (PhysioNet repository). This database contains simultaneous multi-parameter recordings of intensive care unit (ICU) patients, including physiological signals and parameters. For the purpose of this study, the simultaneous recordings of electrocardiogram (ECG), photoplethysmogram (PPG) and arterial blood pressure (ABP) of 12000 subjects, which are provided in the [UCI repository](https://archive.ics.uci.edu/ml/datasets/Cuff-Less%2BBlood%2BPressure%2BEstimation), were used.

## Cleaning the Data

First, the PPG and ECG data are pre-processed to discard recordings of insufficient duration (less than 8 minutes), resulting in an approximately 83% reduction of the dataset. Then, data segmentation is performed by taking 8-second windows with 75% overlap. Further, unreliable signals such as missing data (NaN) and very high/low BP and HR values (SBP ≥ 180, SBP ≤ 80, DBP ≥ 130, DBP ≤ 60, HR < 40,
HR > 220) are excluded from the dataset, reducing the remaining data by approximately 20%. These cleaning steps reduce the total number of subjects from 12000 to 1557. This cleaned dataset of 1557 patients is used for the purpose of this study.

This cleaning process follows the work of Panwar et al., "PP-Net: A Deep Learning Framework for PPG-Based Blood Pressure and Heart Rate Estimation".




In [None]:
# Accepted recordings (one DataFrame per CSV file); concatenated into `df` below.
li = []

# The dataset has 12 parts (PART_1 ... PART_12).
for i in range(1, 13):
    path = 'your_path/dataset/PART_' + str(i)
    clean_dir = 'your_path/clean_samples/PART_' + str(i)
    # Counts every CSV file visited in this part (accepted or not); the count
    # is used as the numeric suffix of the cleaned file name.
    number_of_patients = 0

    # Sort the listing so that the numbering above does not depend on the
    # arbitrary order in which glob returns directory entries.
    filenames = sorted(glob.glob(path + "/*.csv"))

    for filename in filenames:
        data = pd.read_csv(filename, names=['PPG', 'ABP', 'ECG'], index_col=None, header=0)
        number_of_patients += 1

        # Minimum length check: 60k samples = 8 min * 60 s * 125 Hz
        # (at least 8 minutes of recording).
        if data["ABP"].size < 60000:
            continue

        # Work on a plain array so that the peak indices returned by
        # find_peaks are used positionally.
        abp_data = data["ABP"].to_numpy()
        peaks, _ = find_peaks(abp_data, distance=40)
        valleys, _ = find_peaks(-abp_data, distance=40)

        # Without any detected extremum the means below would be NaN and the
        # range test would fail anyway; skip explicitly to avoid the warning.
        if peaks.size == 0 or valleys.size == 0:
            continue

        mean_peak = np.mean(abp_data[peaks])
        mean_valley = np.mean(abp_data[valleys])

        # Keep the recording only if the mean peak value lies in [80, 180] and
        # the mean valley value lies in [60, 130]. A NaN mean fails both
        # comparisons, so such recordings are dropped as well.
        if 80 <= mean_peak <= 180 and 60 <= mean_valley <= 130:
            li.append(data)

            # Make sure the target directory exists before writing the copy.
            os.makedirs(clean_dir, exist_ok=True)
            data.to_csv(clean_dir + '/dataset_' + str(number_of_patients) + '.csv')

if not li:
    raise ValueError("No recording passed the cleaning criteria; check the dataset path.")

df = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Total number of elements (rows * columns) in the concatenated frame.
df.size

In [None]:
# Preview the first rows of the concatenated frame.
df.head()

## Pre-Processing

In [None]:
# Global maximum of each channel. Series.max() is vectorised and skips NaN,
# whereas the builtin max() iterates element by element and returns an
# order-dependent result when NaN is present. float() keeps the scalars as
# plain Python floats. max_ABP is stored in the pickle written further down.
max_PPG = float(df['PPG'].max())
max_ABP = float(df['ABP'].max())
max_ECG = float(df['ECG'].max())

# Scale every channel by its own maximum.
df['PPG'] = df['PPG'] / max_PPG
df['ABP'] = df['ABP'] / max_ABP
df['ECG'] = df['ECG'] / max_ECG

In [None]:
# Rebuild the frame with exactly the columns PPG, ABP, ECG, in this order.
df = pd.DataFrame(df, columns = ['PPG','ABP','ECG'])

### Obtaining Peaks

In [None]:
# Index range of the ABP trace to analyse; here it spans the whole column.
lower, upper = 0, df['ABP'].size
abp = df['ABP'].to_numpy()[lower:upper]

# find_peaks returns (indices, properties); only the indices are kept.
# Neighbouring extrema must be at least 40 samples apart.
peaks = find_peaks(abp, distance=40)[0]     # local maxima (systolic)
valleys = find_peaks(-abp, distance=40)[0]  # local minima (diastolic)

# Optional visual check of the detected extrema:
# plt.plot(abp)
# plt.plot(peaks, abp[peaks], "o", color='r')
# plt.plot(valleys, abp[valleys], "o", color='b')
# plt.show()

### Interpolating Values

In [None]:
# Interpolate between consecutive peaks and sample the result at every
# integer index from the first peak up to (but excluding) the last peak.
func = interpolate.interp1d(peaks, abp[peaks])
x_vals_peaks = np.arange(peaks[0], peaks[-1])
peaks_vals = func(x_vals_peaks)

# Same procedure for the valleys; `func` is rebound to the valley interpolant.
func = interpolate.interp1d(valleys, abp[valleys])
x_vals_valleys = np.arange(valleys[0], valleys[-1])
valleys_vals = func(x_vals_valleys)

### Extending the Initial and Last Values

In [None]:
# Sample indices of the full trace (used for plotting the envelopes).
x_vals = np.arange(0, upper)

# Per-sample peak envelope: constant before the first peak, interpolated
# between the first and last peak, constant from the last peak onwards.
cont_peaks = np.zeros(upper)
cont_peaks[:peaks[0]] = peaks_vals[0]
cont_peaks[peaks[0]:peaks[-1]] = peaks_vals
# peaks_vals stops one sample before the last peak, so hold the value of the
# last detected peak itself rather than the last interpolated sample.
cont_peaks[peaks[-1]:] = abp[peaks[-1]]

# Per-sample valley envelope, built the same way.
cont_valleys = np.zeros(upper)
cont_valleys[:valleys[0]] = valleys_vals[0]
cont_valleys[valleys[0]:valleys[-1]] = valleys_vals
cont_valleys[valleys[-1]:] = abp[valleys[-1]]

In [None]:
# Overlay the detected extrema (markers) and the per-sample envelopes (lines):
# valleys in red, peaks in blue.
plt.plot(valleys, abp[valleys], 'o', x_vals, cont_valleys, color='r')
plt.plot(peaks, abp[peaks], 'o', x_vals, cont_peaks, color='b')

In [None]:
# Turn both envelopes into single-column 2-D arrays so they can be joined
# column-wise in the next cell.
cont_peaks = cont_peaks.reshape(-1,1)
cont_valleys = cont_valleys.reshape(-1,1)

In [None]:
# Inputs: every column of df except ABP, copied into a new array
# (PPG then ECG, given the column order set earlier).
train_x = np.array(df.drop(columns=["ABP"]))
# Targets: column 0 is the peak envelope, column 1 the valley envelope.
train_y = np.concatenate((cont_peaks, cont_valleys), axis=1)

### Preparing Data For Input

In [None]:
# Window geometry: 1000 raw samples per window, shrunk by `scaling_factor`
# when the signals are downsampled below; consecutive windows start a quarter
# of a window apart.
time_window = 1000
scaling_factor = 4
input_size = time_window // scaling_factor
stride = input_size // 4

from scipy.signal import butter, resample, sosfilt

# denoising: 3rd-order Butterworth filters (second-order sections, fs = 125 Hz)
sos_ppg_up = butter(3, 20, 'low', fs=125, output='sos')
sos_ppg_low = butter(3, 1.5, 'high', fs=125, output='sos')
sos_ecg_up = butter(3, 40, 'low', fs=125, output='sos')
sos_ecg_low = butter(3, 0.05, 'high', fs=125, output='sos')

# Column 0 goes through the PPG filters and column 1 through the ECG filters;
# each column is low-passed first and then high-passed, in place.
for channel, cascade in ((0, (sos_ppg_up, sos_ppg_low)), (1, (sos_ecg_up, sos_ecg_low))):
    for sos in cascade:
        train_x[:, channel] = sosfilt(sos, train_x[:, channel])

# downsampling: keep 1/scaling_factor of the samples along the first axis
train_x = resample(train_x, train_x.shape[0] // scaling_factor)
train_y = resample(train_y, train_y.shape[0] // scaling_factor)

print(train_x.shape)
print(train_y.shape)

In [None]:
# Sliding windows of `input_size` samples over train_x, with train_y passed as
# targets. Windows start `stride` (= input_size / 4) samples apart, i.e. each
# window has about 25% new samples.
dataset_x = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=train_x,
    targets=train_y,
    sequence_length=input_size,
    sequence_stride=stride,
    batch_size=1,
)

In [None]:
# Sliding windows of `input_size` samples over train_y, with train_x passed as
# targets. Windows start `stride` (= input_size / 4) samples apart, i.e. each
# window has about 25% new samples.
dataset_y = tf.keras.preprocessing.timeseries_dataset_from_array(
    data=train_y,
    targets=train_x,
    sequence_length=input_size,
    sequence_stride=stride,
    batch_size=1,
)

In [None]:
def dataset_to_numpy(ds):
    """
    Convert a tensorflow dataset of (input, target) batches to numpy arrays.

    Every batch is expected to hold a single window (batch_size=1); the
    leading batch axis is removed by reshaping each element to its trailing
    dimensions, which raises ValueError for larger batches.

    All elements are taken from `ds` itself. Nothing is seeded from
    notebook-level arrays, so the result is correct for any dataset passed
    in, not only for one built from `train_x`.

    Args:
        ds: dataset yielding (input, target) pairs, where the input has shape
            (1, window_length, channels) and the target has shape
            (1, n_targets).

    Returns:
        A tuple (x, y) of lists: x holds one (window_length, channels) array
        per window, y holds the matching (n_targets,) target array.
    """
    x = []
    y = []

    # Iterate over the dataset, one window per step.
    for inp, out in tfds.as_numpy(ds):
        x.append(inp.reshape(inp.shape[-2], inp.shape[-1]))
        y.append(out.reshape(out.shape[-1]))

    return x, y

In [None]:
# Keep only the list of windows taken from dataset_x; the targets are discarded.
x, _= dataset_to_numpy(dataset_x)

In [None]:
# Keep only the list of windows taken from dataset_y; the targets are discarded.
y, _= dataset_to_numpy(dataset_y)

In [None]:
# Stack the per-window lists into arrays; this rebinds train_x / train_y from
# the continuous signals to the windowed data.
train_x = np.array(x)
train_y = np.array(y)

In [None]:
# Path of the pickle file the pre-processed data is written to.
filename = 'Data/75.pkl'

In [None]:
# Stored as a list: windowed inputs, windowed targets, and the ABP maximum
# that was used to normalise the ABP column.
payload = [train_x, train_y, max_ABP]
with open(filename, 'wb') as f:
    pickle.dump(payload, f)

In [None]:
# Shape of the windowed inputs.
train_x.shape
# samples * 250 * 2

In [None]:
# Shape of the windowed targets.
# NOTE(review): y is built above from windows of dataset_y, each reshaped to
# (input_size, 2), so this looks like samples * 250 * 2 rather than the
# samples * 2 stated below — confirm which shape is intended.
train_y.shape
# samples * 2