In [1]:
import csv
import os

import tqdm

import biosppy.signals.ecg as ecg
import biosppy
import neurokit2 as nk

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp

import hrvanalysis
import heartpy as hp

import sys

sys.path.append("..")
from util import load_base_data


SAMPLING_RATE = 300.0
DATAPATH = "/Users/ericschreiber/dev/ETH/AML/Project_1/aml-2023/task2/data"

In [2]:
X_train, y_train, X_test = load_base_data()

In [3]:
def compute_hrv_features(peaks):
    try:
        tdf = hrvanalysis.get_time_domain_features(peaks)
        gf = hrvanalysis.get_geometrical_features(peaks)
        fdf = hrvanalysis.get_frequency_domain_features(peaks)
        pcp = hrvanalysis.get_poincare_plot_features(peaks)
        samp = hrvanalysis.get_sampen(peaks)
    except:
        return [0] * 22

    return [
        tdf["mean_nni"],
        tdf["sdnn"],
        tdf["sdsd"],
        tdf["nni_50"],
        tdf["pnni_50"],
        tdf["nni_20"],
        tdf["pnni_20"],
        tdf["rmssd"],
        tdf["median_nni"],
        tdf["range_nni"],
        tdf["cvsd"],
        tdf["cvnni"],
        tdf["mean_hr"],
        tdf["max_hr"],
        tdf["min_hr"],
        fdf["lf"],
        fdf["hf"],
        fdf["lf_hf_ratio"],
        pcp["sd1"],
        pcp["sd2"],
        pcp["ratio_sd2_sd1"],
        samp["sampen"],
    ]

In [4]:
def compute_hp_features(ecg):
    try:
        _, measures = hp.process(ecg, SAMPLING_RATE)
    except:
        try:
            _, measures = hp.process(hp.flip_signal(ecg), SAMPLING_RATE)
        except:
            return [0] * 12

    return [
        measures["bpm"],
        measures["ibi"],
        measures["sdnn"],
        measures["sdsd"],
        measures["rmssd"],
        measures["pnn20"],
        measures["pnn50"],
        measures["hr_mad"],
        measures["sd1"],
        measures["sd2"],
        measures["s"],
        np.log10(measures["sd1/sd2"] ** 2),
    ]

In [5]:
def clean_input(ecg):
    #  filtered using a finite impulse response bandpass filter
    cleaned = nk.ecg_clean(ecg, sampling_rate=SAMPLING_RATE, method="biosppy")
    cleaned, was_inverted = nk.ecg_invert(cleaned, sampling_rate=300, show=False)
    _, info = nk.ecg_peaks(ecg_cleaned=cleaned, sampling_rate=SAMPLING_RATE)
    rpeaks = info["ECG_R_Peaks"]
    return cleaned, rpeaks

In [6]:
def make_features(ecg):
    cleaned, rpeaks = clean_input(ecg)
    features = []
    features += compute_hp_features(cleaned)
    features += compute_hrv_features(rpeaks)
    features = np.array(features).flatten()
    return features


def make_features_from_df(df):
    features = []
    for i in tqdm.tqdm(range(len(df))):
        ecg = df.iloc[i].values
        features.append(make_features(ecg))
    numpy = np.array(features)
    df = pd.DataFrame(numpy).reset_index(drop=False)
    df.drop(columns=["index"], inplace=True)
    df.index.name = "id"

    return df

In [8]:
features_X_train = make_features_from_df(X_train)

  warn(
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
A theoretically impossible result was found during the iteration
process for finding a smoothing spline with fp = s: s too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.Da

In [9]:
features_X_train

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,147.929374,405.598958,137.243406,34.685420,263.315979,1.000000,1.000000,131.666667,186.183195,53.144026,...,27.403847,594.059406,3.690945,200.235003,0.223621,895.421767,13.786725,6761.714180,490.451094,0.005089
1,270.962630,221.432749,153.799805,129.419914,225.587465,0.917431,0.770642,153.333333,159.474032,149.948851,...,32.412754,327.868852,7.076306,620.932932,1.962904,316.333846,14.402293,3512.042128,243.852980,0.099091
2,406.658740,147.543860,165.698388,117.934294,237.166709,1.000000,0.945946,70.000000,167.642892,167.257172,...,33.532124,309.278351,7.471980,906.090860,1.175528,770.794483,7.597883,3366.705881,443.111021,-0.000000
3,148.094374,405.147059,78.331109,78.038511,104.253144,0.676923,0.369231,23.333333,73.702371,82.349378,...,33.421030,508.474576,6.954103,274.532781,1.373484,199.880645,15.548627,3528.866446,226.956787,0.021693
4,97.983422,612.348485,51.037615,12.623526,17.268214,0.186047,0.023256,15.000000,12.209979,71.861050,...,32.958559,357.142857,7.116594,312.336083,0.403520,774.028480,10.833753,3541.244920,326.871487,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5112,344.899016,173.963964,114.182925,102.483775,158.985309,0.884058,0.666667,85.000000,112.398330,120.067690,...,36.420775,199.335548,11.820331,786.574809,6.872229,114.457012,44.199825,2126.774028,48.117250,0.044452
5113,370.599956,161.899642,124.066308,116.126814,207.596359,0.948052,0.870130,83.333333,146.628240,105.239084,...,44.384811,606.060606,7.367387,586.347251,1.551529,377.915750,11.091845,3429.076703,309.152951,-0.000000
5114,246.442953,243.464052,148.660630,124.371273,240.027203,0.927835,0.865979,116.666667,169.514080,122.075249,...,40.933608,508.474576,7.127584,701.632331,0.846470,828.892312,20.044174,3576.350191,178.423427,0.038466
5115,216.867470,276.666667,184.453845,128.382814,295.007921,0.976471,0.929412,170.000000,208.484095,153.508891,...,42.684035,582.524272,7.275373,629.130067,0.916589,686.382055,5.932415,3473.121484,585.448161,-0.000000


In [10]:
# How many infs
print(f"Number of infs: {np.sum(np.isinf(features_X_train), axis=0).sum()}")
# Which columns have infs
cols_with_infs = np.where(np.isinf(features_X_train))[1]
cols_with_infs_unique = np.unique(cols_with_infs)
print(f"Columns with infs: {len(cols_with_infs_unique)}")
print(
    f"biggest pos value except inf: {np.max(features_X_train[features_X_train != np.inf].max())}"
)
print(f"biggest neg value: {np.min(features_X_train).min()}")
biggest_pos = np.max(features_X_train[features_X_train != np.inf].max())
biggest_neg = np.min(features_X_train).min()
# Replace infs with biggest pos value
features_X_train[features_X_train == np.inf] = biggest_pos
# Replace -infs with biggest neg value
features_X_train[features_X_train == -np.inf] = biggest_neg

Number of infs: 222
Columns with infs: 1
biggest pos value except inf: 283374.275266772
biggest neg value: -2.958154171416197


In [11]:
X_train_save_path = os.path.join(DATAPATH, "feature_extraction/hrv_X_train.csv")
features_X_train.to_csv(X_train_save_path, index=True)

# X Test

In [13]:
test_features = make_features_from_df(X_test)

# How many infs
print(f"Number of infs: {np.sum(np.isinf(test_features), axis=0).sum()}")
# Which columns have infs
cols_with_infs = np.where(np.isinf(test_features))[1]
cols_with_infs_unique = np.unique(cols_with_infs)
print(f"Columns with infs: {len(cols_with_infs_unique)}")
print(
    f"biggest pos value except inf: {np.max(test_features[test_features != np.inf].max())}"
)
print(
    f"biggest neg value excpet inf : {np.min(test_features[test_features != -np.inf]).min()}"
)
biggest_pos = np.max(test_features[test_features != np.inf].max())
biggest_neg = np.min(test_features[test_features != -np.inf]).min()
# Replace infs with biggest pos value
test_features[test_features == np.inf] = biggest_pos
# Replace -infs with biggest neg value
test_features[test_features == -np.inf] = biggest_neg

  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

Number of infs: 117
Columns with infs: 1
biggest pos value except inf: 260290.5498992953
biggest neg value excpet inf : -2.770324381472468





In [14]:
X_test_save_path = os.path.join(DATAPATH, "feature_extraction/hrv_X_test.csv")
test_features.to_csv(X_test_save_path, index=True)