In [16]:
import sys
import tqdm

sys.path.append("..")
from util import load_base_data

import biobss
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import neurokit2
import neurokit2 as nk

# Sampling rate passed to the biobss / neurokit2 calls in this notebook
# (presumably in Hz - confirm against the dataset description).
SAMPLING_RATE = 300

In [2]:
# Load the data through the project helper from ../util. X_train / X_test are
# used as pandas objects further down (.iloc, .to_numpy), y_train via .to_csv.
X_train, y_train, X_test = load_base_data()

In [3]:
def mean_signal(signal):
    """Return the element-wise mean of several signals.

    Every signal is cut to the length of the shortest one before averaging.

    Parameters
    ----------
    signal : pandas.DataFrame, 2-D array or sequence of 1-D sequences
        One signal per row / entry. The entries may have different lengths.

    Returns
    -------
    numpy.ndarray
        Mean over all signals; its length is the length of the shortest signal.

    Raises
    ------
    ValueError
        If ``signal`` contains no signals.

    Notes
    -----
    Prints the mean and the minimum signal length as a side effect.
    NOTE(review): a plain ``np.mean`` is used, so a NaN in any signal makes the
    result NaN at that position - confirm whether NaN-padded rows should be
    averaged with ``np.nanmean`` instead.
    """
    # pandas objects are converted explicitly (iterating a DataFrame yields the
    # column labels, not the rows). Everything else is kept as a list of
    # arrays: np.array() on ragged input raises a ValueError in NumPy >= 1.24,
    # which would defeat the truncation below.
    if hasattr(signal, "to_numpy"):
        rows = list(signal.to_numpy())
    else:
        rows = [np.asarray(s) for s in signal]
    if len(rows) == 0:
        raise ValueError("mean_signal() needs at least one signal")

    lengths = [len(s) for s in rows]
    min_length = min(lengths)
    mean_length = np.mean(lengths)
    print("Mean signal length: ", mean_length, "min length: ", min_length)

    # cut the signals to the length of the shortest signal, then average
    return np.mean([s[:min_length] for s in rows], axis=0)


# Reference signal: element-wise mean over all training recordings.
mean_train_ecg = mean_signal(X_train)

Mean signal length:  17807.0 min length:  17807


In [4]:
def get_loc_peaks(ecg):
    """Locate the peaks of ``ecg`` with biobss' "pantompkins" detector.

    The signal is passed to ``biobss.ecgtools.ecg_detectpeaks`` together with
    the notebook-wide ``SAMPLING_RATE``; the detector's result (the peak
    locations) is returned unchanged.
    """
    return biobss.ecgtools.ecg_detectpeaks(ecg, SAMPLING_RATE, "pantompkins")

In [5]:
def check_difference_to_mean(ecg, mean_ecg):
    """Run biobss template matching on ``ecg`` at the peaks of ``mean_ecg``.

    The peak locations are detected on ``mean_ecg`` (via ``get_loc_peaks``) and
    handed, together with ``ecg``, to ``biobss.sqatools.template_matching``.
    The result of that call is returned as is.

    NOTE(review): the peaks come from the mean signal, not from ``ecg`` itself
    - confirm that this is the intended comparison.
    """
    mean_peak_locations = get_loc_peaks(mean_ecg)
    return biobss.sqatools.template_matching(ecg, mean_peak_locations)

In [6]:
def calculate_features_based_on_difference_to_mean(ecg, mean_ecg):
    """Summarise how well ``ecg`` matches the mean signal.

    Takes the first element of the ``check_difference_to_mean`` result (the
    correlation coefficients of the peaks compared to the mean signal) and
    returns its ``(mean, median, std)`` as a tuple.
    """
    correlations = check_difference_to_mean(ecg, mean_ecg)[0]
    return np.mean(correlations), np.median(correlations), np.std(correlations)

In [7]:
def get_fiducials(ecg, loc_peaks):
    """Delineate the ECG waves around the given R-peaks.

    Calls ``neurokit2.ecg_delineate`` with ``method="peak"`` and the
    notebook-wide ``SAMPLING_RATE`` and returns the second element of its
    result, the fiducials container.

    Keys the original notes refer to: ``ECG_P_Peaks``, ``ECG_Q_Peaks``,
    ``ECG_S_Peaks``, ``ECG_T_Peaks``, ``ECG_P_Onsets`` and ``ECG_T_Offsets``.
    """
    delineation = neurokit2.ecg_delineate(
        ecg_cleaned=ecg,
        rpeaks=loc_peaks,
        sampling_rate=SAMPLING_RATE,
        method="peak",
    )
    # first element is not needed here; only the fiducials are returned
    _unused, fiducials = delineation
    return fiducials

In [8]:
def compute_features_rpeaks(ecg):
    """Compute the biobss R-peak features of ``ecg``.

    The peaks are detected with ``get_loc_peaks``; the features are requested
    with ``average=False`` because the mean is taken outside of this function.
    """
    r_peak_locations = get_loc_peaks(ecg)
    return biobss.ecgtools.ecg_features.from_Rpeaks(
        ecg, r_peak_locations, SAMPLING_RATE, average=False
    )

In [9]:
def compute_features_from_P_Q_R_S_T(ecg):
    """Compute the biobss wave (P/Q/R/S/T) features of ``ecg``.

    R-peaks come from ``get_loc_peaks`` and the remaining fiducials from
    ``get_fiducials``. The features are requested with ``average=False``
    because the mean is taken outside of this function.
    """
    r_peak_locations = get_loc_peaks(ecg)
    wave_fiducials = get_fiducials(ecg, r_peak_locations)
    return biobss.ecgtools.ecg_features.from_waves(
        ecg, r_peak_locations, wave_fiducials, SAMPLING_RATE, average=False
    )

In [10]:
# Smoke test: run the three feature extractors on the first training recording
# (the raw row; clean_input is not applied here).
features_based_on_difference_to_mean_test = (
    calculate_features_based_on_difference_to_mean(X_train.iloc[0], mean_train_ecg)
)
features_rpeaks_test = compute_features_rpeaks(X_train.iloc[0])
features_from_P_Q_R_S_T_test = compute_features_from_P_Q_R_S_T(X_train.iloc[0])

  "a_R": lambda sig, _0, peaks_locs, beatno: sig[peaks_locs[beatno]],
  feature = sig[loc_array2[beatno]] - sig[loc_array1[beatno]]


In [11]:
def average_result(result):
    """Aggregate per-beat features into one mean and one std per feature.

    Parameters
    ----------
    result : dict
        Maps a beat index to a dict ``feature name -> value``, for example::

            {1: {"ecg_a_R": -147.0, "ecg_RR0": 0.3667, "ecg_RR1": 0.84},
             2: {"ecg_a_R": -69.0, "ecg_RR0": 0.84, "ecg_RR1": 0.7767}}

        The feature names are taken from the beat with the smallest index.

    Returns
    -------
    tuple of (list, list)
        ``(values, names)``. For every feature, in alphabetical order, ``names``
        holds ``"<feature>_mean"`` followed by ``"<feature>_std"`` and
        ``values`` the matching numbers. NaN entries are ignored; a feature
        without any valid value yields NaN for both statistics. For an empty
        ``result`` a message is printed and ``([], [])`` is returned.
    """
    if len(result) == 0:
        print("No results to average over")
        print(result)
        return [], []

    beat_ids = sorted(result)
    feature_keys = sorted(result[beat_ids[0]])

    return_values = []
    return_keys = []
    for key in feature_keys:
        valid_values = [
            result[beat][key] for beat in beat_ids if not np.isnan(result[beat][key])
        ]
        if valid_values:
            mean_value = np.mean(valid_values)
            std_value = np.std(valid_values)
        else:
            # np.mean([]) / np.std([]) would also give NaN, but only after
            # emitting RuntimeWarnings for every all-NaN feature.
            mean_value = std_value = np.nan
        return_values.extend((mean_value, std_value))
        return_keys.extend((key + "_mean", key + "_std"))
    return return_values, return_keys

In [12]:
# Inspect the aggregated (mean / std per feature) R-peak features of the first recording.
average_result(features_rpeaks_test)

([0.8164583333333333,
  0.06360106513346385,
  0.8236458333333333,
  0.028877848284243373,
  0.8232291666666667,
  0.028831218311156325,
  0.9921928659291079,
  0.07982498319270012,
  1.0002605372265776,
  0.03958271714975402,
  0.9974844677185619,
  0.034642660812381286,
  0.8211111111111111,
  0.0299607510743351,
  -22.984375,
  143.15054271940213],
 ['ecg_RR0_mean',
  'ecg_RR0_std',
  'ecg_RR1_mean',
  'ecg_RR1_std',
  'ecg_RR2_mean',
  'ecg_RR2_std',
  'ecg_RR_0_1_mean',
  'ecg_RR_0_1_std',
  'ecg_RR_2_1_mean',
  'ecg_RR_2_1_std',
  'ecg_RR_m_1_mean',
  'ecg_RR_m_1_std',
  'ecg_RRm_mean',
  'ecg_RRm_std',
  'ecg_a_R_mean',
  'ecg_a_R_std'])

In [13]:
def clean_input(ecg):
    """Clean one raw ECG signal and return it with corrected polarity.

    The signal is first filtered with ``nk.ecg_clean`` (method "biosppy",
    a finite impulse response bandpass filter) and then passed through
    ``nk.ecg_invert``. Both calls use the notebook-wide ``SAMPLING_RATE``.

    NOTE(review): the notebook output shows neurokit2's warning from its
    missing-data fill path, so the inputs appear to contain NaN (presumably
    padding) - confirm whether trimming them beforehand would be preferable.
    """
    cleaned = nk.ecg_clean(ecg, sampling_rate=SAMPLING_RATE, method="biosppy")
    # ecg_invert returns (signal, was_inverted); only the signal is needed.
    cleaned, _ = nk.ecg_invert(cleaned, sampling_rate=SAMPLING_RATE, show=False)
    return cleaned

In [14]:
def make_all_features(ecg_pandas, mean_train_ecg):
    """Extract the complete feature matrix for a set of ECG recordings.

    Every recording is cleaned with ``clean_input``; then three feature groups
    are computed and concatenated column-wise:

    1. mean / median / std from
       ``calculate_features_based_on_difference_to_mean``,
    2. mean and std of every R-peak feature,
    3. mean and std of every P/Q/R/S/T wave feature.

    The expected names (and therefore counts) of groups 2 and 3 are taken from
    the first recording. If a recording yields a different number of features
    for a group, or if the wave feature extraction raises, that group is filled
    with zeros for the recording and a message is printed.

    Parameters
    ----------
    ecg_pandas : pandas.DataFrame
        One recording per row; converted with ``to_numpy()``.
    mean_train_ecg : array-like
        Reference signal handed to the "difference to mean" features.

    Returns
    -------
    tuple
        ``(features, feature_names)``: a 2-D array with one row per recording
        and the list of column names.
    """
    ecg_array = ecg_pandas.to_numpy()
    print(ecg_array[0].shape)

    # Reference layout, computed once from the first recording (this used to be
    # recomputed three times per feature group).
    names_rpeaks = average_result(compute_features_rpeaks(ecg_array[0]))[1]
    names_P_Q_R_S_T = average_result(compute_features_from_P_Q_R_S_T(ecg_array[0]))[1]
    feature_names = (
        [
            "Mean correlation to the average signal",
            "Median correlation to the average signal",
            "Std correlation to the average signal",
        ]
        + names_rpeaks
        + names_P_Q_R_S_T
    )

    # default 0 values to insert if a feature group cannot be computed
    default_rpeaks = (np.zeros(len(names_rpeaks)), names_rpeaks)
    default_P_Q_R_S_T = (np.zeros(len(names_P_Q_R_S_T)), names_P_Q_R_S_T)

    features_based_on_difference_to_mean = []
    features_rpeaks = []
    features_from_P_Q_R_S_T = []

    # tqdm wraps the array itself (not enumerate) so the bar knows the total.
    for index, ecg in enumerate(tqdm.tqdm(ecg_array)):
        ecg = clean_input(ecg)

        # MEAN
        features_based_on_difference_to_mean.append(
            calculate_features_based_on_difference_to_mean(ecg, mean_train_ecg)
        )

        # RPEAKS
        interim_features_rpeaks = average_result(compute_features_rpeaks(ecg))
        if len(interim_features_rpeaks[1]) != len(names_rpeaks):
            print(
                f"ERROR: Number of features for rpeaks is not consistent, {index} will be 0"
            )
            interim_features_rpeaks = default_rpeaks
        features_rpeaks.append(interim_features_rpeaks[0])

        # P_Q_R_S_T
        try:
            interim_features_from_P_Q_R_S_T = average_result(
                compute_features_from_P_Q_R_S_T(ecg)
            )
        except Exception as error:
            # Best effort: keep going, but report what actually went wrong.
            # (A bare except would also swallow KeyboardInterrupt.)
            print(
                f"ERROR: Computing the features for P_Q_R_S_T failed ({error!r}), {index} will be 0"
            )
            interim_features_from_P_Q_R_S_T = default_P_Q_R_S_T
        else:
            if len(interim_features_from_P_Q_R_S_T[1]) != len(names_P_Q_R_S_T):
                print(
                    f"ERROR: Number of features for P_Q_R_S_T is not consistent, {index} will be 0"
                )
                interim_features_from_P_Q_R_S_T = default_P_Q_R_S_T
        features_from_P_Q_R_S_T.append(interim_features_from_P_Q_R_S_T[0])

    return (
        np.concatenate(
            (
                np.array(features_based_on_difference_to_mean),
                np.array(features_rpeaks),
                np.array(features_from_P_Q_R_S_T),
            ),
            axis=1,
        ),
        feature_names,
    )

In [17]:
# Dry run on the first two training recordings before processing the full set.
make_all_features(X_train.iloc[0:2], mean_train_ecg)

(17807,)


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
2it [00:00,  4.35it/s]


(array([[ 2.69555940e-01,  2.84571772e-01,  3.00993449e-01,
          8.16458333e-01,  7.06096345e-02,  8.24218750e-01,
          3.50656239e-02,  8.23229167e-01,  3.44599644e-02,
          9.92793245e-01,  9.66105792e-02,  1.00042519e+00,
          5.62783636e-02,  9.97739478e-01,  4.54764397e-02,
          8.21302083e-01,  3.06642981e-02, -1.09973392e+01,
          1.77491009e+02, -1.69268001e+02,  1.58080189e+02,
          1.15045792e+00,  2.36472241e+00,  4.20626767e+01,
          5.76132163e+02, -1.99224763e+01,  1.66228008e+02,
         -9.59841889e-01,  1.72442534e+00,  2.65068634e+00,
          7.24145042e+00, -1.87205959e+02,  1.59454024e+02,
          9.35504328e+01,  1.81368442e+02,  7.24326455e+01,
          1.87081456e+02, -1.55496295e+01,  4.22522470e+01,
          2.65206762e+02,  9.64187715e+01, -9.05750415e+01,
          1.83270275e+02, -2.41482478e+01,  1.99657919e+02,
          9.12822329e-01,  2.53105431e+01, -2.30268301e-01,
          6.88690545e-01,  2.79496863e+0

# Make the features

In [18]:
# Extract the features for the whole training set. A message is printed for
# every recording whose feature group falls back to zeros.
features_X_train, feature_names = make_all_features(X_train, mean_train_ecg)

(17807,)


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 83 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 83 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 429 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 429 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 771 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 771 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 3424 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 3424 will be 0


  warn(
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Seri

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 4163 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 4163 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

In [19]:
# Shape of the training feature matrix: (recordings, features).
features_X_train.shape
# feature_names
# droped_indices

(5117, 81)

# Check the data for inf and nan

In [20]:
# Take the infinity masks BEFORE any clipping: `x > 10000` is also True for
# +inf (and `x < -10000` for -inf), so clipping first would turn the infinities
# into +-10000 and the +-20000 markers below would never be written.
# np.isinf() matches both signs, hence one mask per sign.
pos_inf_mask = np.isposinf(features_X_train)
neg_inf_mask = np.isneginf(features_X_train)
any_inf_mask = pos_inf_mask | neg_inf_mask

# How many infs
print(f"Number of infs: {np.sum(any_inf_mask)}")
# Which columns have infs
cols_with_infs = np.where(any_inf_mask)[1]
cols_with_infs_unique = np.unique(cols_with_infs)
print(f"Columns with infs: {len(cols_with_infs_unique)}")
# Numbers bigger than 10000
print(f"Number of values bigger than 10000: {np.sum(features_X_train > 10000)}")
# Numbers smaller than -10000
print(f"Number of values smaller than -10000: {np.sum(features_X_train < -10000)}")

# Cap every finite value outside [-10000, 10000] to the nearest bound
features_X_train[(features_X_train > 10000) & ~pos_inf_mask] = 10000
features_X_train[(features_X_train < -10000) & ~neg_inf_mask] = -10000
# Cap inf to 20000
features_X_train[pos_inf_mask] = 20000
# Cap -inf to -20000
features_X_train[neg_inf_mask] = -20000
# NOTE: the array is modified in place; run this cell once per freshly computed
# matrix, otherwise the +-20000 markers get clipped to +-10000 on a re-run.

# How many nans (they are only reported here, not replaced)
print(f"Number of nans: {np.sum(np.isnan(features_X_train))}")
# Which columns have nans
cols_with_nans = np.where(np.isnan(features_X_train))[1]
cols_with_nans_unique = np.unique(cols_with_nans)
print(f"Columns with nans: {len(cols_with_nans_unique)}")

Number of infs: 0
Columns with infs: 0
Number of values bigger than 10000: 30
Number of values smaller than -10000: 3
Number of nans: 376
Columns with nans: 52


In [21]:
# y_train_droped = y_train.drop(droped_indices)
# # reset the index
# y_train_droped = y_train_droped.reset_index(drop=True)
# # name the index column id
# y_train_droped.index.name = "id"

# DONT NEED TO DROP ANYTHING BECAUSE IS 0

In [22]:
# Write the features to a csv file
df = pd.DataFrame(features_X_train, columns=feature_names)
X_train_save_path = "/Users/ericschreiber/dev/ETH/AML/Project_1/aml-2023/task2/data/feature_extraction/bioss_X_train.csv"
df.index.name = "id"
df.to_csv(X_train_save_path, index=True)
y_train_save_path = "/Users/ericschreiber/dev/ETH/AML/Project_1/aml-2023/task2/data/feature_extraction/bioss_y_train.csv"
y_train.to_csv(y_train_save_path, index=True)
# droped_indices_save_path = "/Users/ericschreiber/dev/ETH/AML/Project_1/aml-2023/task2/data/feature_extraction/bioss_droped_indices_train.csv"
# dropped_rows = pd.DataFrame(droped_indices)
# dropped_rows.to_csv(droped_indices_save_path, index=False)

In [23]:
# Extract the test-set features; the training mean ECG is reused as the reference signal.
features_X_test, feature_names_test = make_all_features(X_test, mean_train_ecg)

(17807,)


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 424 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 424 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 747 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 747 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  return _methods._mean(a, axis=axis, dtype=dtype,
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 1223 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 1223 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 2032 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 2032 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

No results to average over
{}
ERROR: Number of features for rpeaks is not consistent, 2327 will be 0
ERROR: Number of features for P_Q_R_S_T is not consistent, 2327 will be 0


  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_signal))
  warn(
  ecg_signal = pd.DataFrame.pad(pd.Series(ecg_s

In [24]:
# Keep an independent snapshot of the freshly extracted test features. A plain
# assignment only aliases the array, so the in-place capping further down would
# silently modify the "double" as well.
features_X_test_double = features_X_test.copy()

In [25]:
# # We need to add the dropped indices to the test indices. We fill those rows with zeros

# droped_indices_test.sort(
#     reverse=True
# )  # we need to fill the indices from the back otherwise the indices will be wrong
# for index in droped_indices_test:
#     features_X_test = np.insert(features_X_test, index, 0, axis=0)

# NO NEED TO DROP ANYTHING BECAUSE IS 0

In [26]:
# Take the infinity masks BEFORE any clipping: `x > 10000` is also True for
# +inf (and `x < -10000` for -inf), so clipping first would turn the infinities
# into +-10000 and the +-20000 markers below would never be written.
# np.isinf() matches both signs, hence one mask per sign.
pos_inf_mask = np.isposinf(features_X_test)
neg_inf_mask = np.isneginf(features_X_test)
any_inf_mask = pos_inf_mask | neg_inf_mask

# How many infs
print(f"Number of infs: {np.sum(any_inf_mask)}")
# Which columns have infs
cols_with_infs = np.where(any_inf_mask)[1]
cols_with_infs_unique = np.unique(cols_with_infs)
print(f"Columns with infs: {len(cols_with_infs_unique)}")
# Numbers bigger than 10000
print(f"Number of values bigger than 10000: {np.sum(features_X_test > 10000)}")
# Numbers smaller than -10000
print(f"Number of values smaller than -10000: {np.sum(features_X_test < -10000)}")

# Cap every finite value outside [-10000, 10000] to the nearest bound
features_X_test[(features_X_test > 10000) & ~pos_inf_mask] = 10000
features_X_test[(features_X_test < -10000) & ~neg_inf_mask] = -10000
# Cap inf to 20000
features_X_test[pos_inf_mask] = 20000
# Cap -inf to -20000
features_X_test[neg_inf_mask] = -20000
# NOTE: the array is modified in place; run this cell once per freshly computed
# matrix, otherwise the +-20000 markers get clipped to +-10000 on a re-run.

# How many nans (reported for parity with the training set; not replaced)
print(f"Number of nans: {np.sum(np.isnan(features_X_test))}")

Number of infs: 0
Columns with infs: 0
Number of values bigger than 10000: 16
Number of values smaller than -10000: 0


In [27]:
# Write the test features to a csv file.
# The output directory defaults to the original location; set the
# BIOSS_FEATURE_DIR environment variable to redirect it on another machine.
feature_dir = os.environ.get(
    "BIOSS_FEATURE_DIR",
    "/Users/ericschreiber/dev/ETH/AML/Project_1/aml-2023/task2/data/feature_extraction",
)
df_test = pd.DataFrame(features_X_test, columns=feature_names_test)
df_test.index.name = "id"
X_test_save_path = os.path.join(feature_dir, "bioss_X_test.csv")
df_test.to_csv(X_test_save_path, index=True)

In [28]:
# Sanity check: the feature matrices must have one row per test recording.
print(features_X_test.shape)
print(features_X_test_double.shape)
print(X_test.shape)

(3411, 81)
(3411, 81)
(3411, 17807)


In [30]:
# Number of rows that only contain 0
rows_with_only_0 = 0
for i in range(len(features_X_test)):
    if features_X_test[i].sum() == 0:
        rows_with_only_0 += 1

print(f"Number of rows that only contain 0: {rows_with_only_0}")

Number of rows that only contain 0: 0
