In [None]:
import pandas as pd
import xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, recall_score, precision_score, accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import neurokit2 as nk
import hrvanalysis as hrv
from scipy import ndimage
from sklearn.svm import SVC
from sklearn.datasets import make_classification
import numpy as np
from sklearn.feature_selection import RFECV

In [None]:
# Data preprocessing and feature extraction
#
# Loads the raw per-channel signals from the 'use1' sheet, median-filters each
# channel, derives HRV / arrhythmia / respiratory / SpO2 / sleep features and
# merges them with the clinical table on the shared 'key' column.
#
# NOTE(review): rr_process, rsp_process, spo_process and sleep_process are not
# defined in this part of the notebook; they must already be in scope when this
# cell runs (Restart & Run All will fail otherwise).

ECG_SAMPLING_RATE = 200  # Hz, sampling rate handed to the ECG peak detector
RSP_SAMPLING_RATE = 25   # Hz, sampling rate handed to the respiration peak detector

df = pd.read_excel(r'Physiological and Clinical Characteristics of Sleep 100.xlsx', sheet_name='use1')

# NOTE(review): every channel is wrapped in a one-element list, so the filters
# below presumably receive a (1, n_samples) array rather than a 1-D signal.
# Confirm this is what the downstream helpers expect.
raw_data = {'ecg':[df['ecg_list']], 'rsp':[df['rsp_list']], 'spo':[df['spo_value']], 'acc':[df['acc']]}
ecg_raw = raw_data['ecg']
rsp_raw = raw_data['rsp']
spo_raw = raw_data['spo']
acc_raw = raw_data['acc']


# smooth: median filter per channel (window size chosen per signal)
ecg_smoothed = ndimage.median_filter(ecg_raw, size=3)
rsp_smoothed = ndimage.median_filter(rsp_raw, size=9)
spo_smoothed = ndimage.median_filter(spo_raw, size=3)
acc_smoothed = ndimage.median_filter(acc_raw, size=11)

# R-peak detection; 'ECG_R_Peaks' holds the sample indices of the detected peaks.
_, results = nk.ecg_peaks(ecg_smoothed, sampling_rate=ECG_SAMPLING_RATE, method='Hamilton')
r_peaks = np.asarray(results['ECG_R_Peaks'])

# The HRV extractor needs successive beat-to-beat intervals in milliseconds,
# not the peak positions themselves: difference the indices and convert
# samples -> ms using the ECG sampling rate.
nn = (np.diff(r_peaks) / ECG_SAMPLING_RATE * 1000.0).tolist()

# HRV features
# NOTE(review): get_time_domain_features appears to return a plain dict; the
# merge below needs a DataFrame carrying a 'key' column. Confirm how the
# per-record key should be attached before relying on the merge.
hrv_feature = hrv.extract_features.get_time_domain_features(nn_intervals=nn)

# Characterization of the arrhythmic load
ar_burden = rr_process(ecg_smoothed)

# Respiratory characteristics
rsp_info = nk.rsp_findpeaks(rsp_cleaned=rsp_smoothed, sampling_rate=RSP_SAMPLING_RATE, method="khodadad2018")
rsp_feature = rsp_process(rsp_info)

# Oxygen Characteristics
spo_feature = spo_process(spo_smoothed)

# Sleep features (computed from the smoothed ECG, respiration and SpO2 signals)
sp_features = sleep_process(ecg_smoothed, rsp_smoothed, spo_smoothed)

# Merge all physiological feature tables on the shared record key
Physiology_features = hrv_feature.merge(ar_burden, on='key', how='inner') \
    .merge(rsp_feature, on='key', how='inner') \
    .merge(spo_feature, on='key', how='inner') \
    .merge(sp_features, on='key', how='inner')

# clinical
clinical_features = pd.read_csv('clinical_data.csv')

# Inner join: only records present in both the physiological and clinical tables are kept.
all_features = pd.merge(Physiology_features, clinical_features, on='key', how='inner')


In [None]:
# Feature screening
#
# Recursive feature elimination with cross-validation (RFECV) around a linear
# SVM, scored by ROC AUC, to rank the merged features.

# Split the merged feature table into predictors (all columns but the last)
# and label (last column), as the model-training cell does.
# np.asarray accepts both a DataFrame and an ndarray, and `all_features` itself
# is left untouched, so this cell can be re-run and later cells still see the
# original table.
# NOTE(review): if `all_features` still contains the 'key' identifier column it
# will be treated as a predictor here - confirm it is dropped upstream.
feature_matrix = np.asarray(all_features)
X = feature_matrix[:, :-1]
y = feature_matrix[:, -1]


# Defining the base model (linear kernel so RFECV can read coef_ for ranking)
estimator = SVC(kernel="linear", probability=True, random_state=42)

# Initialize RFECV, set cross-validation folds and scoring criteria to AUC
rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(5), scoring='roc_auc')

# Fitting RFECV
rfecv.fit(X, y)

# Print the number of features selected
print("Optimal number of features : %d" % rfecv.n_features_)

# Print the ranking of each feature (1 = selected)
print("Ranking of features : ")
print(rfecv.ranking_)

# Print AUC values at different number of features.
# Newer scikit-learn exposes the per-subset mean CV score through cv_results_;
# older releases only have grid_scores_, so fall back to it when needed.
print("Grid scores : ")
if hasattr(rfecv, "cv_results_"):
    print(rfecv.cv_results_["mean_test_score"])
else:
    print(rfecv.grid_scores_)

In [None]:
# Model training
#
# 5-fold stratified cross-validation of an XGBoost classifier on the
# 'xgb_rd1' sheet (last column = label). Prints the confusion matrix of each
# fold, then mean and sample standard deviation (ddof=1) of AUC, accuracy,
# F1, precision and recall across folds - first unrounded, then rounded.

df = pd.read_excel(r'Physiological and Clinical Characteristics of Sleep 100.xlsx', sheet_name='xgb_rd1')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows before the folds are split, so
# test-fold statistics influence the scaling. Move the fit inside the loop if
# this cell is reused with a scale-sensitive model.
X = scaler.fit_transform(X)
rfc = xgboost.XGBClassifier(learning_rate=0.1, n_estimators=32, max_depth=3, random_state=1)
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
a_auc, a_acc, a_f1, a_pre, a_recall = [], [], [], [], []

# The fold indices are positional. Index a NumPy copy of the labels so the
# selection does not depend on the DataFrame having a default 0..n-1 index.
lab = y.to_numpy()
for tr_idx, test_idx in skf.split(X, lab):
    X_tr, X_te = X[tr_idx], X[test_idx]
    y_tr, y_te = lab[tr_idx], lab[test_idx]

    rfc.fit(X_tr, y_tr)
    predict = rfc.predict(X_te)
    best_yH = rfc.predict_proba(X_te)

    cm = confusion_matrix(y_te, predict)
    print(cm)

    # AUC uses the probability of the last class column (the positive class
    # in the binary case); the other metrics use the hard predictions.
    a_auc.append(roc_auc_score(y_te, best_yH[:, -1]))
    a_acc.append(accuracy_score(y_te, predict))
    a_f1.append(f1_score(y_te, predict))
    a_pre.append(precision_score(y_te, predict))
    a_recall.append(recall_score(y_te, predict))

# Summary across folds, in the order: AUC, accuracy, F1, precision, recall.
fold_metrics = (a_auc, a_acc, a_f1, a_pre, a_recall)
for scores in fold_metrics:
    print(np.mean(scores), np.std(scores, ddof=1))
for scores in fold_metrics:
    print(round(np.mean(scores), 4), '±', round(np.std(scores, ddof=1), 4))

In [None]:
# Model tuning
#
# Grid search over XGBoost hyper-parameters (5-fold CV, ROC AUC) on an 80 %
# training split, followed by an AUC check on the held-out 20 %.

# Predictors are every column but the last; the label is the last column.
# np.asarray works whether `all_features` is still a DataFrame or has already
# been turned into an ndarray by an earlier cell, and the name is not
# overwritten, so the cell is safe to re-run.
# NOTE(review): if `all_features` still contains the 'key' identifier column it
# will be treated as a predictor here - confirm it is dropped upstream.
feature_matrix = np.asarray(all_features)
X = feature_matrix[:, :-1]
y = feature_matrix[:, -1]

# Divide the training set and test set; stratify so both splits keep the class
# ratio, consistent with the stratified cross-validation used elsewhere.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize the XGBoost classifier
xgb_clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid to be searched.
# 100 x 9 x 3 = 2700 combinations, each fitted 5 times: expect a long run.
param_grid = {
    'n_estimators': list(range(10, 1001, 10)),  # Number of trees
    'max_depth': list(range(1, 10, 1)),        # Maximum depth of the tree
    'learning_rate': [0.01, 0.2, 0.1],  # learning rate
}

# Creating GridSearchCV Objects
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='roc_auc', cv=5, n_jobs=-1)

# Perform a grid search
grid_search.fit(X_train, y_train)

# Output optimal parameters
print("Best parameters found: ", grid_search.best_params_)
print("Best AUC score: ", grid_search.best_score_)

# Prediction on a test set using a model with optimal parameters
best_clf = grid_search.best_estimator_
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred_proba)
print("Test AUC score: ", test_auc)