## 1: Data preparation
Extraction of normal PPG segments around a normal blood pressure measurement.

#### Importing several libraries  
These libraries are commonly used for data analysis, visualization, file handling, and signal processing tasks.

In [1]:
# A library for data manipulation and analysis. 
# It provides data structures like DataFrames that allow to work with structured data efficiently.
import pandas as pd

# A library for numerical computations.
import numpy as np
from numpy.ma.core import zeros_like

# A plotting library that provides a wide range of functions for creating static, animated, and interactive visualizations.
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle

# A module that helps you find files/directories matching a specified pattern.
import glob

# A library for reading and writing Excel files in Python.
import openpyxl

# A module that provides a way to interact with the operating system. It allows to perform operations like accessing files, directories, and environment variables.
import os

# A module for manipulating dates and times.
import datetime
from datetime import datetime

# A module that provides signal processing functions for filtering, spectral analysis, interpolation, and more.
from scipy import signal
from scipy.signal import savgol_filter, argrelextrema

# A library for heart rate variability (HRV) analysis.
import heartpy as hp

# A module that provides functions for generating random numbers, selecting random elements, shuffling sequences, and more.
import random

# A class from the sklearn.model_selection module that is used for creating a cross-validation splitting strategy that takes into account grouping or clustering of data samples.
from sklearn.model_selection import GroupKFold

# A class from the sklearn.impute module that provides strategies for imputing missing values in a dataset.
from sklearn.impute import SimpleImputer

# A class from the collections module that helps you count the frequency of elements in a list or an iterable.
from collections import Counter

# Install several Python packages using the %pip install command. 
# %pip install heartpy
# %pip install matplotlib notebook
# %pip install imbalanced-learn
# %pip install --upgrade scikit-learn
# %pip install --upgrade imbalanced-learn
# %pip install scikit-learn==0.24.2
# %pip install imbalanced-learn==0.8.0

#### Upload meta-data 
The metadata obtained from Nervio contains weight, height and gender data of all patients according to their ID number. This dataset also contains a baseline time used to indicate the onset of anesthesia for each patient.

In [3]:
# Upload the meta data excel file to a data frame
directory = "C:/Users/shaha/Desktop/Final Project-PPG/Patient_details.xlsx"
patients_meta_data = pd.read_excel(directory, engine='openpyxl')

# Change the ID column name for convenience
patients_meta_data = patients_meta_data.rename(columns={'IONM': 'patient_ID'})

# Header of the propofol / muscle-paralyzer column. It is written as a raw string because
# the header contains a literal backslash; in a normal string '\ ' is an invalid escape
# sequence (a SyntaxWarning on recent Python versions).
propofol_column = r'Propofol: Diprivan,Diprofol \ Scholine'

# Fallback base time: the first time propofol/muscle paralyzer is given, converted to a
# datetime row by row. An object-dtype Series is used so datetimes are never written into
# a float (NaN) column.
fallback_time = pd.Series(np.nan, index=patients_meta_data.index, dtype=object)
has_propofol_time = patients_meta_data[propofol_column].notna()
for i in patients_meta_data.index[has_propofol_time]:
    val = patients_meta_data.at[i, propofol_column]
    fallback_time.at[i] = pd.to_datetime(val).to_pydatetime()

# Base time for the indication of the start of anesthesia: the value of Anesthesia
# Induction if it exists, otherwise the fallback above. The result is assigned back
# instead of calling fillna(inplace=True) on a column selection, which is chained
# assignment and does not update the frame under pandas copy-on-write.
patients_meta_data['baseline_time'] = patients_meta_data['Anesthesia Induction'].fillna(fallback_time)

# Fix a specific value (row 59) that contains str instead of datetime.
# The guard keeps the cell runnable if that row already holds a datetime.
date_string = patients_meta_data.at[59, 'baseline_time']
if isinstance(date_string, str):
    datetime_obj = datetime.strptime(date_string, "AM %I:%M %d/%m/%Y")
    patients_meta_data.at[59, 'baseline_time'] = datetime_obj

# Drop the unnecessary columns of the data frame
columns_to_drop = [propofol_column, 'Anesthesia Induction']
patients_meta_data = patients_meta_data.drop(columns=columns_to_drop)

# Apply SimpleImputer (mean strategy) to the Weight column, which does not have values
# for all the patients
column_to_impute = 'Weight'
impute_df = patients_meta_data[[column_to_impute]]
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_column = imp.fit_transform(impute_df)
# fit_transform returns an (n_rows, 1) array; flatten it and assign positionally
patients_meta_data[column_to_impute] = imputed_column.ravel()

#### Upload the BPs data 
The Excel files of the blood pressure labels received from Nervio include two sheets for each patient. One sheet contains the invasive blood pressure measurements taken from the anesthesia system and the other sheet contains the non-invasive blood pressure measurements taken with a measuring cuff. The invasive blood pressure data is more reliable, so it is used for labeling whenever it is available; the non-invasive data is used only when there is no invasive data.

In [4]:
def import_data_BP(patient, directory=None):
    """
    This function imports and prepares the blood pressure (BP) file of a specific patient.

    The 'Invasive BP' sheet is used when it holds more than 2 rows; otherwise the
    'NIBPS D' (non-invasive) sheet is used instead.

    Parameters:
    - patient (str): The patient identifier; the file is matched with the pattern '*<patient>.xlsx'.
    - directory (str, optional): Folder that holds the BP Excel files. Defaults to the
      project's 'PPG+BP_data' folder.

    Returns:
    - df_BP_prep (pandas DataFrame): The prepared BP data for the patient.
    - type_BP (str): The type of blood pressure (either 'Invasive' or 'Non Invasive').

    Raises:
    - FileNotFoundError: If no Excel file matches the patient in the directory.
    """

    # Directory where the BP files are located
    if directory is None:
        directory = "C:/Users/shaha/Desktop/Final Project-PPG/DATA_PPG_till 250922/PPG+BP_data/"

    # Search for the BP file of the patient; sorting makes the choice deterministic
    # if more than one file matches
    excel_file = sorted(glob.glob(os.path.join(directory, '*' + patient + '.xlsx')))
    if not excel_file:
        raise FileNotFoundError(f'No BP Excel file matching "*{patient}.xlsx" was found in {directory}')

    # Columns to import from each sheet (the sheets have no usable header row)
    usecols = ['Unnamed: 0', 'Unnamed: 1']

    # Prefer the invasive sheet when it contains data
    df_IBP = pd.read_excel(excel_file[0], sheet_name='Invasive BP', usecols=usecols, engine='openpyxl')
    if len(df_IBP) > 2:
        return Data_Preperation_BP(df_IBP), 'Invasive'

    # Otherwise fall back to the non-invasive sheet
    df_NIBP = pd.read_excel(excel_file[0], sheet_name='NIBPS D', usecols=usecols, engine='openpyxl')
    return Data_Preperation_BP(df_NIBP), 'Non Invasive'

In [5]:
def Data_Preperation_BP(df_BP):
    """
    This function performs data preparation on the blood pressure (BP) data frame.

    The first two rows are discarded, the columns are renamed to 'DataTime' and 'Value',
    'Value' ("SBP/DBP") is split into numeric 'SBP' and 'DBP' columns, 'DataTime' is
    converted to datetime and a 'MAP' column is added. Rows whose 'Value' is missing or
    is not a complete "number/number" reading are dropped. The input frame is not modified.

    Parameters:
    - df_BP (pandas DataFrame): The raw BP sheet with the columns 'Unnamed: 0' (time)
      and 'Unnamed: 1' (reading).

    Returns:
    - df_BP (pandas DataFrame): The prepared BP data frame with the columns
      'DataTime', 'Value', 'SBP', 'DBP' and 'MAP', indexed from 0.
    """

    # Delete the first 2 rows (presumably sheet header rows) and restart the index at 0.
    # Positional slicing also works for sheets shorter than 2 rows, where dropping the
    # labels 0 and 1 would raise.
    df_BP = df_BP.iloc[2:].reset_index(drop=True)

    # Rename the titles of the BP data frame
    df_BP = df_BP.rename(columns={'Unnamed: 0': 'DataTime', 'Unnamed: 1': 'Value'})

    # Separate the column 'Value' into 'SBP' and 'DBP'. The pattern always yields exactly
    # two columns and leaves NaN for anything that is not "<number>/<number>", so
    # incomplete readings (e.g. '120/' or '--/--') no longer crash the integer conversion.
    reading = df_BP['Value'].astype(str).str.extract(
        r'^\s*([+-]?\d+(?:\.\d+)?)\s*/\s*([+-]?\d+(?:\.\d+)?)\s*$')
    df_BP['SBP'] = pd.to_numeric(reading[0], errors='coerce')
    df_BP['DBP'] = pd.to_numeric(reading[1], errors='coerce')

    # Convert the column 'DataTime' to datetime values
    df_BP['DataTime'] = pd.to_datetime(df_BP['DataTime'])

    # Drop the rows without a complete reading and restart the index at 0
    df_BP = df_BP.dropna(subset=['SBP', 'DBP']).reset_index(drop=True)

    # Create a column for mean arterial pressure (MAP) using the formula: 1/3(SBP) + 2/3(DBP)
    df_BP['SBP'] = df_BP['SBP'].astype(int)
    df_BP['DBP'] = df_BP['DBP'].astype(int)
    df_BP['MAP'] = (1/3 * df_BP['SBP']) + (2/3 * df_BP['DBP'])

    # Return the prepared BP data frame
    return df_BP

#### Upload the PPG data
The files of the PPG signals received from Nervio include between one and several consecutive files for each patient; each file name includes the patient's ID number. Each file contains many columns extracted from the PPG sensor. The relevant columns are the PLETH column, which holds the PPG waveform itself; the time column, which gives the time at which each sample was taken (one sample every 10 milliseconds); and the perfusion index column, which indicates the reliability of the measurement: a measurement whose perfusion index is 1 or above is considered valid.

In [6]:
def import_data_PPG(patient, folder_path=None):
    """
    This function finds all the PPG CSV files associated with a specific patient.

    Parameters:
    - patient (str): The patient identifier or pattern; files are matched with '*<patient>.csv'.
    - folder_path (str, optional): Folder that holds the PPG files. Defaults to the
      project's data folder.

    Returns:
    - csv_files (list): The matching file paths, sorted by name so that the order of a
      patient's files is the same on every run and platform (glob itself returns matches
      in arbitrary order).
    """

    # Folder path where the files are located
    if folder_path is None:
        folder_path = "C:/Users/shaha/Desktop/Final Project-PPG/DATA_PPG_till 250922/"

    # Pattern that matches the files of the specified patient
    pattern = '*' + patient + '.csv'

    # Search the folder and return the matches in a deterministic order
    return sorted(glob.glob(os.path.join(folder_path, pattern)))

In [7]:
def FileName_and_dfPPG_and_FileNum(patient):
    """
    Load every PPG file of a patient into a DataFrame and report how many were found.

    Parameters:
    - patient (str): The patient identifier or pattern to match the files.

    Returns:
    - files (list): One pandas DataFrame per PPG file of the patient.
    - files_names (list): The file name (path without the data folder) of each DataFrame.

    Both lists are empty, and nothing is printed, when no file matches the patient.
    """

    # Data folder; stripped from every path to obtain the bare file name
    folder_path = "C:/Users/shaha/Desktop/Final Project-PPG/DATA_PPG_till 250922/"

    # Paths of all the files that belong to this patient
    paths = import_data_PPG(patient)
    if not paths:
        return [], []

    # One DataFrame per file, in the order the paths were returned
    files = [pd.read_csv(path) for path in paths]

    # Normalise Windows separators before removing the folder prefix
    files_names = [path.replace("\\", "/").replace(folder_path, "") for path in paths]

    print(f'Patient number: {patient}, has {len(files)} files of PPG.')
    return files, files_names

In [8]:
def total_time_PPG(df):
    """
    Return the total time of a PPG file, taken from its last 'TIMESTAMP_MS' value.

    Parameters:
    - df (pandas DataFrame): The PPG data frame containing the 'TIMESTAMP_MS' column
      (milliseconds).

    Returns:
    - hours (float): The last 'TIMESTAMP_MS' value expressed in hours.
    """

    # Last timestamp of the recording, in milliseconds
    final_ms = df['TIMESTAMP_MS'].iloc[-1]

    # milliseconds -> seconds -> minutes -> hours
    return final_ms / 1000 / 60 / 60

In [9]:
# A function that concatenates all the processes into one function.

def patient_all_data(patient, files, files_names, df_BP, type_BP, len_signal, padding, patients_meta_data, all_parts_num=0, rows_parts_num=0, valid_bp_parts_num=0, parts_num=0):
    """
    This function attaches the metadata and BP readings of a patient to the PPG files and
    extracts a fixed-length PPG segment around every BP reading.

    For every file: the absolute time of each PPG row is computed from the baseline time
    and 'TIMESTAMP_MS'; every BP reading between the readings closest to the first and
    the last PPG row is written on the PPG row closest to it in time; a segment of
    len_signal + 2*padding seconds is cut around each such row and kept only if it
    passes three checks (rows, blood pressure, perfusion index).
    The frames in `files` are not modified.

    Parameters:
    - patient (str): The patient identifier (must be convertible to int).
    - files (list): A list of pandas DataFrames representing the patient's PPG files.
    - files_names (list): A list of file names corresponding to each DataFrame (unused).
    - df_BP (pandas DataFrame): The prepared blood pressure (BP) data frame.
    - type_BP (str): The type of blood pressure ('Invasive' or 'Non Invasive').
    - len_signal (float): The length of the PPG signal in seconds.
    - padding (float): The padding time in seconds.
    - patients_meta_data (pandas DataFrame): Meta data of patients.
    - all_parts_num (int, optional): Running count of all segments encountered so far.
    - rows_parts_num (int, optional): Running count of segments that meet the rows condition.
    - valid_bp_parts_num (int, optional): Running count of segments that also meet the
      blood pressure condition.
    - parts_num (int, optional): Number of valid segments collected so far; new segments
      are numbered from parts_num + 1.

    Returns:
    - df_list (list): One DataFrame per file holding its valid segments (empty if none).
    - concatenated_df (pandas DataFrame): All valid segments of the patient, without the
      non-relevant device columns.
    - total_time_file (float): The total time of the patient's files.
    - all_parts_num (int): The updated count of all segments.
    - rows_parts_num (int): The updated count of segments that meet the rows condition.
    - valid_bp_parts_num (int): The updated count of segments that also meet the blood
      pressure condition.

    Note: the updated parts_num is not returned; it is available as the last
    'Part_Number' value of concatenated_df.
    """

    # Multiply the signal and padding lengths by 100 to account for 10 ms per row
    len_signal *= 100
    padding *= 100

    # Number of rows in one segment (signal + padding on both sides)
    n = int(len_signal + (padding*2))

    df_list = []
    total_time_file = 0

    # Nothing to process: return empty results instead of failing in pd.concat
    if not files:
        return df_list, pd.DataFrame(), total_time_file, all_parts_num, rows_parts_num, valid_bp_parts_num

    # Patient metadata is the same for every file, so look it up once
    meta_row = patients_meta_data.loc[patients_meta_data['patient_ID'] == int(patient)]
    start_time = meta_row['baseline_time'].values[0]
    age = meta_row['Age'].values[0]
    weight = meta_row['Weight'].values[0]
    gender = meta_row['Gender'].values[0]

    # BP type as an int instead of str (-1 for an unknown type)
    bp_type_code = {'Invasive': 1, 'Non Invasive': 0}.get(type_BP, -1)

    # MAP of the BP reading closest to the baseline time
    start_BP_ind = abs(start_time - df_BP['DataTime']).argmin()
    baseline_BP = df_BP.at[start_BP_ind, 'MAP']

    # Iterate over each file DataFrame
    for i, raw_df in enumerate(files):
        # Count the total time of the case
        total_time_file += total_time_PPG(raw_df)

        # assign() returns a new frame, so the caller's DataFrame is left untouched
        df = raw_df.assign(Age=age, Weight=weight, Gender=gender)
        df.insert(0, 'BaseLine_Time', start_time)

        # Absolute time of every row: baseline time + TIMESTAMP_MS
        # (ms * 1e6 = ns, the default unit of to_timedelta)
        df.insert(0, 'DataTime', start_time)
        df['DataTime'] = pd.to_timedelta(df['TIMESTAMP_MS']*1e6) + df['DataTime']

        # Placeholder BP columns; DataTime_BP is object-typed so timestamps can be
        # stored in it without an implicit dtype change
        df = df.assign(DataTime_BP=pd.Series(np.nan, index=df.index, dtype=object),
                       SBP=np.nan, DBP=np.nan, MAP=np.nan, BP_type=np.nan)

        # BP readings closest to the first and to the last PPG row
        # (assumes the PPG frame has a default 0..N-1 index - as read by read_csv)
        index0 = abs(df.at[0, 'DataTime'] - df_BP['DataTime']).argmin()
        index1 = abs(df.at[df.shape[0]-1, 'DataTime'] - df_BP['DataTime']).argmin()

        for row in range(index0, index1+1):
            # PPG row that is closest in time to this BP reading
            bp_time = df_BP.at[row, 'DataTime']
            BP_ind = abs(df['DataTime'] - bp_time).argmin()

            # Write the BP reading on that PPG row
            df.at[BP_ind, 'DataTime_BP'] = bp_time
            df.at[BP_ind, 'SBP'] = df_BP.at[row, 'SBP']
            df.at[BP_ind, 'DBP'] = df_BP.at[row, 'DBP']
            df.at[BP_ind, 'MAP'] = df_BP.at[row, 'MAP']
            df.at[BP_ind, 'BP_type'] = bp_type_code

        df.insert(0, 'BaseLine_MAP', baseline_BP)

        # Rows that carry a BP reading; one candidate segment is cut around each
        has_BP_df = df[~df['DataTime_BP'].isna()]
        valid_parts = []

        for index in has_BP_df.index:
            # The n rows that end `padding` rows after the BP row.
            # NOTE(review): when the BP row is within `padding` rows of the end of the
            # file this still takes the last n rows, so the BP row is not at its usual
            # offset inside the segment - confirm whether such segments should be kept.
            part_df = df[:int(index+(padding)+1)][-n:]
            all_parts_num += 1

            # Rows condition: the segment spans exactly n samples of 10 ms
            timestamps = part_df['TIMESTAMP_MS']
            cond_1 = (timestamps.iloc[-1] - timestamps.iloc[0]) == ((n*10)-10)
            if cond_1:
                rows_parts_num += 1

            # Blood pressure condition: no SBP reading of the segment is outside 40-290.
            # The previous test, (SBP > 290) & (SBP > 40) over the whole file, never
            # rejected low readings and rejected every segment of a file for a single
            # high reading anywhere in it. Rows without a reading are NaN and compare False.
            sbp = part_df['SBP']
            cond_2 = not ((sbp > 290) | (sbp < 40)).any()
            if cond_1 and cond_2:
                valid_bp_parts_num += 1

            # Perfusion index condition: every row of the segment has an index of at least 1
            cond_3 = (part_df['PERFUSION_INDEX'] >= 1).all()
            if cond_1 and cond_2 and cond_3:
                parts_num += 1
                valid_parts.append(part_df.assign(Part_Number=parts_num))

        # Concatenate once per file instead of once per segment
        merged_df = pd.concat(valid_parts) if valid_parts else pd.DataFrame()
        file_count_parts = len(valid_parts)

        df_list.append(merged_df)
        print(f'The {i+1} file of this patient includes {file_count_parts} valid segments of {n/100} seconds.')

    # Concatenate the rows of the data frames
    concatenated_df = pd.concat(df_list)

    # Drop all non-relevant columns from the concatenated data frame
    if len(concatenated_df) > 0:
        concatenated_df = concatenated_df.drop(labels=['COUNTER', 'DEVICE_ID', 'SPO2_STATUS', 'BATTERY_PCT', 'PERFUSION_INDEX'], axis=1)
    concatenated_df.reset_index(drop=True, inplace=True)

    return df_list, concatenated_df, total_time_file, all_parts_num, rows_parts_num, valid_bp_parts_num

#### Create a full data frame containing all valid PPG segments around valid blood pressure measurements

In [10]:
def create_ID_list(folder_path=None):
    """
    This function retrieves the list of patient IDs present in the 'PPG+BP_data' folder.

    The ID of a file is the part of its name after the first underscore, without the
    5-character '.xlsx' extension. The last file of the folder (by name) is excluded.

    Parameters:
    - folder_path (str, optional): Folder that holds the BP Excel files. Defaults to the
      project's 'PPG+BP_data' folder.

    Returns:
    - ID_list_unique (list): The IDs extracted from the filenames, in filename order.
    """

    # Specify the folder path
    if folder_path is None:
        folder_path = "C:/Users/shaha/Desktop/Final Project-PPG/DATA_PPG_till 250922/PPG+BP_data/"

    # os.listdir returns entries in arbitrary order, so sort the names before dropping
    # the last one; otherwise which file is excluded depends on the platform
    files = sorted(os.listdir(folder_path))[:-1]

    # Extract the ID from each filename: text after the first "_" without ".xlsx"
    ID_list_unique = [file[file.find("_") + 1:-5] for file in files]

    return ID_list_unique

In [11]:
# Processes data for multiple patients and creates a comprehensive DataFrame (full_df) that combines the processed data for all patients.
# It also tracks the total time of each patient's case in the case_times list.

def _last_part_number(frame):
    """Return the last 'Part_Number' of frame, or 0 while no valid segment was collected."""
    # full_df has no 'Part_Number' column until the first valid segment is added
    if 'Part_Number' not in frame.columns or frame.empty:
        return 0
    return frame['Part_Number'].iloc[-1]

# Calling the create_ID_list() function to obtain a list of patient IDs (patient_list).
patient_list = create_ID_list()

# Create an empty DataFrame (full_df) to store the processed data of all patients,
# and the list of the total time of each patient's case.
full_df = pd.DataFrame()
case_times = []

# Process the data for the first patient specified as '22043146'.
# The loop below starts from the second entry of patient_list, i.e. it assumes that
# this patient is the first entry of the list.
patient = '22043146'

# Retrieve the files and file names for the patient.
files, files_names = FileName_and_dfPPG_and_FileNum(patient)

# Import the patient's BP data and determine the type of BP.
df_BP, type_BP = import_data_BP(patient)

# Process the patient's data (5 seconds of signal, 1 second of padding); the counters start from 0.
df_list, df, total_time_file, all_parts_num, rows_parts_num, valid_bp_parts_num = patient_all_data(patient, files, files_names, df_BP, type_BP, 5, 1, patients_meta_data)

# Add the patient ID as a column and append the result to full_df.
df['patient_ID'] = patient
full_df = pd.concat([full_df, df])
case_times.append(total_time_file)

# Process the data for the remaining patients in the patient_list using a loop.
# The same steps are performed; the counters and the last segment number are carried over
# so that segments are numbered continuously across patients.
for patient in patient_list[1:]:
    files, files_names = FileName_and_dfPPG_and_FileNum(patient)
    df_BP, type_BP = import_data_BP(patient)
    parts_num = _last_part_number(full_df)
    df_list, concatenated_df, total_time_file, all_parts_num, rows_parts_num, valid_bp_parts_num = patient_all_data(patient, files, files_names, df_BP, type_BP, 5, 1, patients_meta_data, all_parts_num, rows_parts_num, valid_bp_parts_num, parts_num)
    concatenated_df['patient_ID'] = patient
    full_df = pd.concat([full_df, concatenated_df])

    case_times.append(total_time_file)

# Fix the last patient import data: the PPG signal of this patient is read from an
# Excel sheet instead of a CSV file.
patient = '53403719'
directory = "C:/Users/shaha/Desktop/Final Project-PPG/DATA_PPG_till 250922/"
pattern = '*' + patient
excel_file = glob.glob(directory + pattern + '.xlsx')

# Read the Excel file for the last patient
df = pd.read_excel(excel_file[0], sheet_name='20220524T160016.406+0300_534037', engine='openpyxl')
df['patient_ID'] = patient

# Prepare the files, files_names, df_BP, and type_BP for the last patient
files = [df]
file_name = excel_file[0].replace("\\", "/").replace(directory, "")
files_names = [file_name]
df_BP, type_BP = import_data_BP(patient)

# Get the parts_num from full_df
parts_num = _last_part_number(full_df)

# Process the data for the last patient
df_list, df, total_time_file, all_parts_num, rows_parts_num, valid_bp_parts_num = patient_all_data(patient, files, files_names, df_BP, type_BP, 5, 1, patients_meta_data, all_parts_num, rows_parts_num, valid_bp_parts_num, parts_num)

# Append the total_time_file to case_times
case_times.append(total_time_file)

# Concatenate the processed data for the last patient to full_df
full_df = pd.concat([full_df, df])

# Reset the indexes
full_df.reset_index(drop=True, inplace=True)

Patient number: 22043146, has 2 files of PPG.
The 1 file of this patient includes 0 valid segments of 7.0 seconds.
The 2 file of this patient includes 6 valid segments of 7.0 seconds.
Patient number: 22054597, has 1 files of PPG.
The 1 file of this patient includes 1 valid segments of 7.0 seconds.
Patient number: 22105955, has 2 files of PPG.
The 1 file of this patient includes 5 valid segments of 7.0 seconds.
The 2 file of this patient includes 1 valid segments of 7.0 seconds.
Patient number: 22389334, has 5 files of PPG.
The 1 file of this patient includes 1 valid segments of 7.0 seconds.
The 2 file of this patient includes 1 valid segments of 7.0 seconds.
The 3 file of this patient includes 0 valid segments of 7.0 seconds.
The 4 file of this patient includes 0 valid segments of 7.0 seconds.
The 5 file of this patient includes 7 valid segments of 7.0 seconds.
Patient number: 23772858, has 1 files of PPG.
The 1 file of this patient includes 76 valid segments of 7.0 seconds.
Patient nu

Patient number: 46582101, has 1 files of PPG.
The 1 file of this patient includes 35 valid segments of 7.0 seconds.
Patient number: 48049854, has 1 files of PPG.
The 1 file of this patient includes 201 valid segments of 7.0 seconds.
Patient number: 48456249, has 1 files of PPG.
The 1 file of this patient includes 126 valid segments of 7.0 seconds.
Patient number: 48555243, has 1 files of PPG.
The 1 file of this patient includes 26 valid segments of 7.0 seconds.
Patient number: 48995255, has 1 files of PPG.
The 1 file of this patient includes 97 valid segments of 7.0 seconds.
Patient number: 50966344, has 1 files of PPG.
The 1 file of this patient includes 268 valid segments of 7.0 seconds.
Patient number: 51156276, has 3 files of PPG.
The 1 file of this patient includes 0 valid segments of 7.0 seconds.
The 2 file of this patient includes 0 valid segments of 7.0 seconds.
The 3 file of this patient includes 55 valid segments of 7.0 seconds.
Patient number: 51392584, has 1 files of PPG.
T

#### Calculates and prints various statistics related to the processed data.

In [13]:
# Total number of segments before any filtering.
print(f'Total segments before any filtering: {all_parts_num}')

# Segments that meet the rows filtering condition, as a share of all segments.
rows_pct = np.round((rows_parts_num/all_parts_num)*100,2)
print(f'Rows segments: {rows_parts_num}, which are {rows_pct}%')

# Segments that meet the rows and the valid BP filtering conditions, as a share of the rows segments.
valid_bp_pct = np.round((valid_bp_parts_num/rows_parts_num)*100,2)
print(f'Valid BP segments: {valid_bp_parts_num}, which are {valid_bp_pct}%')

# Segments that also meet the perfusion index filtering condition, as a share of the valid BP segments.
# The count is read from the last 'Part_Number' of full_df.
parts_num = full_df['Part_Number'].iloc[-1]
valid_pct = np.round((parts_num/valid_bp_parts_num)*100,2)
print(f'Valid segments (after 2 filtering- rows & prefusion index): {parts_num}, which are {valid_pct}%')

# Number of valid segments that have a non-invasive (BP_type 0) and an invasive (BP_type 1) BP reference.
bp_type_column = full_df['BP_type']
non_invasive_BP = full_df.loc[bp_type_column == 0, 'Part_Number'].nunique()
invasive_BP = full_df.loc[bp_type_column == 1, 'Part_Number'].nunique()
print(f'Valid segments with invasive BP reference: {invasive_BP}, and non invasive BP reference: {non_invasive_BP}')

# Range and median of the total time of the cases.
longest_case = np.round(np.max(case_times),5)
shortest_case = np.round(np.min(case_times),5)
print(f'Maximum case time: {longest_case}, minimum case time: {shortest_case}')
median_case = np.round(np.median(case_times), 3)
print(f'Median case time: {median_case}')

Total segments before any filtering: 8710
Rows segments: 8548, which are 98.14%
Valid BP segments: 8548, which are 100.0%
Valid segments (after 2 filtering- rows & prefusion index): 6303.0, which are 73.74%
Valid segments with invasive BP reference: 5817, and non invasive BP reference: 486
Maximum case time: 18.80829, minimum case time: 0.00022
Median case time: 2.869


## 2: Split the data to train & test
The code performs a GroupKFold split on the df_split DataFrame to create train and test sets for a supervised learning task.

In [14]:
# Work on a copy so that full_df itself stays untouched
df_split = full_df.copy()

# Inputs of the GroupKFold split: the label columns form Y, everything else except the
# patient ID forms X, and the patient ID is the grouping key
label_columns = ['MAP', 'BP_type', 'DBP', 'SBP', 'DataTime_BP', 'BaseLine_MAP']
X = df_split.drop(columns=['patient_ID'] + label_columns)
Y = df_split[label_columns]
groups = df_split['patient_ID']

# GroupKFold with 5 splits; only the first split is used as the train/test partition
gfk = GroupKFold(n_splits=5)
train_idx, test_idx = next(gfk.split(X, Y, groups=groups))

# Build the train and test data frames from the row positions, each re-indexed from 0
train = df_split.iloc[train_idx].reset_index(drop=True)
test = df_split.iloc[test_idx].reset_index(drop=True)

# Independent copies of the train and test data frames
train_df = train.copy()
test_df = test.copy()

# Number of distinct parts in each data frame
parts_train = np.unique(train['Part_Number']).size
parts_test = np.unique(test['Part_Number']).size

# Print the total number of parts, train parts, and test parts
print(f'Total number of parts: {parts_train + parts_test}, {parts_train} train parts and {parts_test} test parts.')

Total number of parts: 6303, 5041 train parts and 1262 test parts.


In [15]:
def count_parts(df, kind):
    """
    Print the number of parts and the BP type counts of the given DataFrame.

    Parameters:
    - df (DataFrame): The data, with the columns 'Part_Number' and 'BP_type'.
    - kind (str): A label for the data (used as the prefix of the printed line).

    Returns:
    None

    Prints the number of distinct 'Part_Number' values, the number of rows whose
    'BP_type' is 1 (invasive BP) and the number of rows whose 'BP_type' is 0
    (non-invasive BP).
    """

    bp_type = df['BP_type']

    # Distinct segment numbers
    total_parts = np.unique(df['Part_Number']).size

    # Rows marked as invasive (1.0) and as non-invasive (0)
    invasive_BP = bp_type.eq(1.0).sum()
    no_invasive_BP = bp_type.eq(0).sum()

    summary = f'{kind} - total parts: {total_parts}, invasive BP: {invasive_BP}, non-invasive BP: {no_invasive_BP}'
    print(summary)