In [None]:
import os

import hrvanalysis as hrv
import neurokit2 as nk
import numpy as np
import pandas as pd
import xgboost
import xgboost as xgb
from scipy import ndimage
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import roc_auc_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Data preprocessing and feature extraction

# Root folder that is searched recursively for .npz archives.
folder_path = '/home/mw/input/hfindex7735/vol1/PPC'

# Sorted so the recording order is the same on every run; os.walk yields
# entries in whatever order the file system returns them.
npz_files = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(folder_path)
    for file in files
    if file.endswith('.npz')
)

# One entry per .npz file, in the same order as npz_files.
ecg = []
rsp = []
spo = []
acc = []
for file_path in npz_files:
    # np.load returns an NpzFile that keeps the archive open; the context
    # manager closes it once the four arrays have been read.
    with np.load(file_path) as data:
        ecg.append(data['ecg_list'])
        rsp.append(data['rsp_list'])
        spo.append(data['spo_value'])
        acc.append(data['acc'])


# The *_raw names refer to the same list objects as the accumulators above.
ecg_raw = ecg
rsp_raw = rsp
spo_raw = spo
acc_raw = acc

def smooth_signal(signal, size):
    """Smooth a 1-D signal with a moving average whose window shrinks at the edges.

    Args:
        signal: Indexable, sliceable sequence of numbers (list or 1-D array).
        size: Nominal window length in samples.

    Returns:
        A new float NumPy array of length ``len(signal)``.

    With ``half = (size - 1) / 2`` each output sample is:
      * index 0: the input sample itself;
      * index < half: the mean of the first ``2 * index + 1`` samples;
      * index > len(signal) - half - 1: the mean of the trailing window that
        is symmetric around the index and ends at the last sample;
      * otherwise: the mean of ``signal[int(index - half):int(index + 1 + half)]``.
    """
    n_samples = len(signal)
    half_window = np.divide((size - 1), 2)
    averaged = np.zeros(n_samples)

    for idx in range(n_samples):
        if idx == 0:
            # The first sample is copied through unchanged.
            averaged[idx] = signal[idx]
            continue

        if idx < half_window:
            # Leading edge: window grows symmetrically from the first sample.
            window = signal[:idx * 2 + 1]
        elif idx > n_samples - half_window - 1:
            # Trailing edge: window shrinks symmetrically towards the last sample.
            window = signal[idx - (n_samples - 1 - idx):n_samples]
        else:
            # Interior: int() truncates the fractional half-width of even sizes.
            window = signal[int(idx - half_window):int(idx + 1 + half_window)]

        averaged[idx] = np.mean(window)

    return averaged


def remove_outliers(signal, size):
    """Replace local outliers with the mean of their neighbours.

    For every index ``i`` that has ``size`` samples on both sides, the mean
    and standard deviation of the ``2 * size`` surrounding samples (the sample
    itself excluded) are computed. A sample more than three standard
    deviations away from that mean is replaced by the mean. Replacements are
    applied sequentially, so a corrected sample is what later windows see.

    Args:
        signal: 1-D sequence of numbers.
        size: Number of neighbours taken on each side of a sample.

    Returns:
        A new float NumPy array. The input is left untouched, so a caller can
        keep both the original and the cleaned signal.
    """
    # Work on a float copy: avoids mutating the caller's data and avoids
    # truncating the replacement mean when the input holds integers.
    cleaned = np.array(signal, dtype=float)

    if size < 1:
        # No neighbours to compare against; nothing can be flagged.
        return cleaned

    for i in range(size, len(cleaned) - size):
        neighbours = np.concatenate((cleaned[i - size:i], cleaned[i + 1:i + size + 1]))
        mean = neighbours.mean()
        std = neighbours.std()
        if cleaned[i] > mean + 3 * std or cleaned[i] < mean - 3 * std:
            cleaned[i] = mean
    return cleaned

# ECG sampling rate in Hz, shared by R-peak detection and the conversion of
# peak positions into beat-to-beat intervals.
ECG_SAMPLING_RATE = 200

# NOTE(review): ecg_raw / rsp_raw / spo_raw / acc_raw are lists with one entry
# per .npz file, while smooth_signal and remove_outliers index their argument
# as a single 1-D signal. Confirm whether this stage should run per recording.

# smooth:
ecg_smoothed = smooth_signal(ecg_raw, size=200)
rsp_smoothed = smooth_signal(rsp_raw, size=25)
spo_smoothed = smooth_signal(spo_raw, size=10)
acc_smoothed = smooth_signal(acc_raw, size=10)

#Removing outliers
ecg_remove = remove_outliers(ecg_smoothed, size=200)
rsp_remove = remove_outliers(rsp_smoothed, size=25)
spo_remove = remove_outliers(spo_smoothed, size=10)
acc_remove = remove_outliers(acc_smoothed, size=10)

_, results = nk.ecg_peaks(ecg_remove, sampling_rate=ECG_SAMPLING_RATE, method='Hamilton')
r_peaks = results['ECG_R_Peaks']

# ECG_R_Peaks holds sample indices of the detected peaks. The HRV functions
# take the intervals between successive beats in milliseconds, so convert:
# samples -> seconds -> milliseconds.
nn = (np.diff(r_peaks) / ECG_SAMPLING_RATE * 1000).tolist()

# HRV features
hrv_feature = hrv.extract_features.get_time_domain_features(nn_intervals=nn)
hrv_feature2 = hrv.extract_features.get_frequency_domain_features(nn_intervals=nn)

# Characterization of the arrhythmic load
# NOTE(review): rr_process, rsp_process, spo_process and sleep_process are not
# defined in the visible part of the notebook; they must exist before this
# cell runs on a fresh kernel.
ar_burden = rr_process(ecg_remove)

# Respiratory characteristics
rsp_info = nk.rsp_findpeaks(rsp_cleaned=rsp_remove, sampling_rate=25, method="khodadad2018")
rsp_feature = rsp_process(rsp_info)

# Oxygen Characteristics
spo_feature = spo_process(spo_remove)

# Sleep features
slp_features = sleep_process(ecg_remove, rsp_remove, spo_remove)

# Merge
# NOTE(review): the merge chain needs every operand to be a DataFrame with a
# 'key' column. The hrvanalysis feature extractors appear to return plain
# dicts; confirm where they are converted and where 'key' comes from.
Physiology_features = hrv_feature.merge(hrv_feature2, on='key', how='inner') \
    .merge(ar_burden, on='key', how='inner') \
    .merge(rsp_feature, on='key', how='inner') \
    .merge(spo_feature, on='key', how='inner') \
    .merge(slp_features, on='key', how='inner')

# clinical
clinical_features = pd.read_csv('clinical_data.csv')

all_features = pd.merge(Physiology_features, clinical_features, on='key', how='inner')

# One workbook, three sheets: clinical only, physiological only, and both.
with pd.ExcelWriter('Physiological and Clinical Characteristics of Heart valve surgery patients.xlsx') as writer:
    clinical_features.to_excel(writer, sheet_name='cln', index=False)
    Physiology_features.to_excel(writer, sheet_name='phy', index=False)
    all_features.to_excel(writer, sheet_name='both', index=False)

In [None]:
# Feature screening

excel_file_path = 'Physiological and Clinical Characteristics of Heart valve surgery patients.xlsx'


#Choose a different learner
models_dict = {
    "XGBoost": XGBClassifier(learning_rate=0.01, n_estimators=10, max_depth=1, random_state=1),
    "Logistic Regression": LogisticRegression(C=0.01, random_state=1),
    "Random Forest": RandomForestClassifier(n_estimators=10, max_depth=1, random_state=1),
    "SVM": SVC(kernel='linear', random_state=1, probability=True),
    "KNN": KNeighborsClassifier()
}

sheets = ['cln', 'phy', 'both']
models = ['XGBoost', 'Logistic Regression', 'Random Forest', 'SVM', 'KNN']

# Prefixes for the output sheet names. The tuning and training cells read
# sheets called "xgb_cln", "lr_phy", ... so the names written here must match.
sheet_prefixes = {
    "XGBoost": "xgb",
    "Logistic Regression": "lr",
    "Random Forest": "rf",
    "SVM": "svm",
    "KNN": "knn",
}

# Recursive feature elimination ranks features through coef_ or
# feature_importances_. KNeighborsClassifier exposes neither, so RFECV raises
# for it; those models keep every feature instead.
rfe_unsupported = {"KNN"}

# Read each input sheet once instead of once per (model, sheet) pair.
sheet_frames = {sheet: pd.read_excel(excel_file_path, sheet_name=sheet) for sheet in sheets}

#Stored in the corresponding model sheet
with pd.ExcelWriter('Feature screened _data.xlsx') as writer:

    for model_name in models:
        estimator = models_dict[model_name]

        for sheet in sheets:
            print(f"Processing data for model {model_name}, sheet {sheet}")

            df = sheet_frames[sheet]
            # NOTE(review): the last column of every sheet is treated as the
            # label and all other columns (including any 'key' identifier) as
            # features. Confirm this holds for the 'phy' sheet in particular.
            X = df.iloc[:, :-1].values
            y = df.iloc[:, -1].values

            if model_name in rfe_unsupported:
                print(f"{model_name} exposes no coef_ / feature_importances_; "
                      f"keeping all features on {sheet}")
                support = np.ones(X.shape[1], dtype=bool)
            else:
                rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(5), scoring='roc_auc')
                rfecv.fit(X, y)

                print(f"Optimal number of features for {model_name} on {sheet}: {rfecv.n_features_}")
                print(f"Ranking of features for {model_name} on {sheet}: {rfecv.ranking_}")
                support = rfecv.support_

            selected_features = df.columns[:-1][support]
            # assign() builds a new frame, so no write into a slice of df.
            processed_data = df[selected_features].assign(Label=y)

            output_sheet_name = f"{sheet_prefixes[model_name]}_{sheet}"
            processed_data.to_excel(writer, sheet_name=output_sheet_name, index=False)

In [None]:
# Model tuning

excel_file_path = 'Feature screened _data.xlsx'

# Define the model and corresponding sheet names
# cln for clinical data, phy for physiologic data, both for total data
sheet_mapping = {
    "XGBoost": ["xgb_cln", "xgb_phy", "xgb_both"],
    "Logistic Regression": ["lr_cln", "lr_phy", "lr_both"],
    "Random Forest": ["rf_cln", "rf_phy", "rf_both"],
    "SVM": ["svm_cln", "svm_phy", "svm_both"],
    "KNN": ["knn_cln", "knn_phy", "knn_both"]
}

# Define initial parameters for each model
models = {
    "XGBoost": xgb.XGBClassifier(learning_rate=0.01, n_estimators=10, max_depth=1, random_state=1),
    "Logistic Regression": LogisticRegression(C=0.01, random_state=1),
    "Random Forest": RandomForestClassifier(n_estimators=10, max_depth=1, random_state=1),
    "SVM": SVC(kernel='poly', C=0.01, random_state=1, probability=True),
    "KNN": KNeighborsClassifier()
}

# Define the parameter grid
# NOTE(review): 'random_state' is searched like a hyper-parameter. This
# multiplies every grid by 49 and selects a seed that happens to score well on
# these folds; consider fixing the seed instead.
param_grids = {
    "XGBoost": {
        'n_estimators': list(range(10, 101, 10)),
        'max_depth': list(range(1, 10)),
        'learning_rate': [0.01, 0.1, 0.2],
        'random_state': list(range(1, 50))
    },
    "Logistic Regression": {
        'C': [0.01, 0.1, 0.2, 0.5, 1, 10],
        'random_state': list(range(1, 50))
    },
    "Random Forest": {
        'n_estimators': [10, 40, 70, 100],
        'max_depth': [1, 2, 4, 6, 8],
        'random_state': list(range(1, 50))
    },
    "SVM": {
        'C': [0.01, 0.1, 0.2, 0.5, 1],
        'kernel': ['linear', 'poly', 'rbf'],
        'random_state': list(range(1, 50))
    },
    "KNN": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }
}

# Read and process the corresponding sheet for each model and perform parameter search, training and evaluation
for model_name, sheet_names in sheet_mapping.items():
    model = models[model_name]
    print(f"processing: {model_name}")

    for sheet in sheet_names:
        print(f" data: {sheet}")

        df = pd.read_excel(excel_file_path, sheet_name=sheet)
        # Last column is the label written by the feature-screening step.
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        # Stratify so the hold-out set keeps the class ratio; an unstratified
        # split can leave a single class in y_test, where ROC AUC is undefined.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)

        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name],
                                   scoring='roc_auc', cv=5, n_jobs=-1)

        grid_search.fit(X_train, y_train)

        print(f" Best parameters found: {grid_search.best_params_}")
        print(f" Best AUC score: {grid_search.best_score_}")

        # best_estimator_ is refitted on the full training split by GridSearchCV.
        best_model = grid_search.best_estimator_
        y_pred_proba = best_model.predict_proba(X_test)[:, 1]
        test_auc = roc_auc_score(y_test, y_pred_proba)
        print(f" Test AUC score:: {test_auc}")


In [None]:
# Model training

models = {
    "XGBoost": xgb.XGBClassifier(learning_rate=0.1, n_estimators=32, max_depth=1, random_state=30),
    "Logistic Regression": LogisticRegression(C=0.2, random_state=1),
    "Random Forest": RandomForestClassifier(n_estimators=40, max_depth=2, random_state=1),
    "SVM": SVC(kernel='poly', C=0.2, random_state=10, probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5, weights='uniform')
}

sheet_mapping = {
    "XGBoost": ["xgb_cln", "xgb_phy", "xgb_both"],
    "Logistic Regression": ["lr_cln", "lr_phy", "lr_both"],
    "Random Forest": ["rf_cln", "rf_phy", "rf_both"],
    "SVM": ["svm_cln", "svm_phy", "svm_both"],
    "KNN": ["knn_cln", "knn_phy", "knn_both"]
}

# results[model_name]     -> mean cross-validated AUC, one value per sheet
# predictions[model_name] -> out-of-fold positive-class probabilities, one array per sheet
results = {}
predictions = {model_name: [] for model_name in models.keys()}

for model_name, model in models.items():
    a_auc_per_sheet = []  # Store the AUC value for each sheet
    for sheet in sheet_mapping[model_name]:
        df = pd.read_excel('Feature screened _data.xlsx', sheet_name=sheet)
        # Plain arrays so the positional fold indices below index rows directly.
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        #5-fold cross validation
        skf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
        a_auc, a_acc, a_f1, a_pre = [], [], [], []
        # Filled by row position so entry i belongs to row i of the sheet,
        # regardless of the shuffled fold order.
        sheet_predictions = np.full(len(y), np.nan)

        for tr_idx, test_idx in skf.split(X, y):
            # Fit the scaler on the training fold only; fitting it on all rows
            # would leak test-fold statistics into training.
            scaler = StandardScaler().fit(X[tr_idx])
            X_tr = scaler.transform(X[tr_idx])
            X_te = scaler.transform(X[test_idx])

            model.fit(X_tr, y[tr_idx])
            y_predict = model.predict_proba(X_te)[:, -1]  # positive-class probability
            predict = model.predict(X_te)                 # hard class labels
            sheet_predictions[test_idx] = y_predict

            a_auc.append(roc_auc_score(y[test_idx], y_predict))
            a_acc.append(accuracy_score(y[test_idx], predict))
            a_f1.append(f1_score(y[test_idx], predict))
            a_pre.append(precision_score(y[test_idx], predict))

        mean_auc = np.mean(a_auc)
        a_auc_per_sheet.append(mean_auc)
        predictions[model_name].append(sheet_predictions)
        results[model_name] = a_auc_per_sheet
        print(f"{model_name} - {sheet}:")

        #Calculation of auc, acc, f1 and precision (mean ± sample std over the folds)
        print(round(np.mean(a_auc), 2), '±', round(np.std(a_auc, ddof=1), 2))
        print(round(np.mean(a_acc), 2), '±', round(np.std(a_acc, ddof=1), 2))
        print(round(np.mean(a_f1), 2), '±', round(np.std(a_f1, ddof=1), 2))
        print(round(np.mean(a_pre), 2), '±', round(np.std(a_pre, ddof=1), 2))