# In this notebook we are creating a crude training and test set for logistic regression

Import libraries

In [8]:
import matplotlib.pyplot  as plt 
import numpy as np 
import wfdb
from utils import *
import heartpy as hp
import pandas as pd 
import os
import re

In [9]:
# Number of ECG samples in one minute of signal
# (60 s at the 100 samples/s rate passed to heartpy as sample_rate=100.0).
NB_SAMPLES_PER_MINUTE = 6000

# Recordings used to build the training set. The order is kept exactly as
# listed because it determines the row order of the generated CSV.
train_files = [
    "a01", "c01", "b01",
    "a02", "c02", "b02",
    "a03", "c03", "b03",
    "a04", "c04", "b04",
    "a05", "c05",
    "a06", "c06",
    "a07", "c07",
    "a08", "c08",
    "a09", "a10", "a11", "a12",
    "a13", "a14", "a15", "a16",
]

# Recordings held out for the test set.
test_files = [
    "b05",
    "c09", "c10",
    "a17", "a18", "a19", "a20",
]

# Directory containing the wfdb records, relative to the notebook.
directory = "apnea-ecg-database-1.0.0"

This function generates a CSV with one row per 60-second segment, holding the apnea label and two features: the heart rate (bpm) and the SDNN.

In [10]:
# creates csv in current directory
def create_data_csv(source_directory, files, output_name, labels_directory="outputs"):
    """Build a ``label, bpm, sdnn`` CSV from a list of recordings.

    For every recording name in ``files``:

    1. the annotation file ``<labels_directory>/<name>.txt`` is read and each
       non-blank line becomes a binary label (1 when the field at index 3 of
       ``re.split(" +", line)`` is ``"A"``, otherwise 0);
    2. the signal ``<source_directory>/<name>`` is loaded with
       ``wfdb.rdrecord``;
    3. ``heartpy.process_segmentwise`` is run over it with 60-second,
       non-overlapping segments;
    4. each label is paired with the ``bpm`` and ``sdnn`` of the segment at
       the same position.

    Parameters
    ----------
    source_directory : str
        Directory holding the wfdb records.
    files : iterable of str
        Record names (no extension); also the stem of each annotation file.
    output_name : str
        Output path without extension; ``".csv"`` is appended.
    labels_directory : str, optional
        Directory holding the ``<name>.txt`` annotation files. Defaults to
        ``"outputs"``, the location previously hard-coded.

    Returns
    -------
    None
        The rows are written to ``<output_name>.csv`` with the row format
        ``"%d, %f, %f"`` (label, bpm, sdnn). The name of each recording is
        printed as it is processed.
    """
    samples = []
    for filename in files:
        print(filename)

        # load the outputs; blank lines (e.g. a trailing newline at the end of
        # the file) carry no annotation and would break the field indexing below
        with open(os.path.join(labels_directory, filename + ".txt"), "r") as file:
            lines = [line for line in file if line.strip()]

        # split every annotation line once; index 2 is read as a sample number
        # and index 3 as the annotation symbol
        fields_per_line = [re.split(" +", line) for line in lines]
        labels_session = [1 if fields[3] == "A" else 0 for fields in fields_per_line]

        # here we want to make sure that we take the number of samples measured in the output file:
        # sample number on the last annotation line plus one more minute of signal
        nb_samples_output = int(fields_per_line[-1][2]) + NB_SAMPLES_PER_MINUTE

        # The first label is dropped and the signal is started 3000 samples in
        # (30 s at sample_rate=100.0) further down.
        # NOTE(review): presumably this centres each 60 s segment on the
        # timestamp of the label it is paired with - confirm.
        labels_session = labels_session[1:]

        # measure the ecg using wfdb and heartpy
        record = wfdb.rdrecord(os.path.join(source_directory, filename))
        # keep only the part of the signal spanned by the annotations
        # (the slice is a no-op when the recording is shorter than that)
        session_ecg = record.p_signal[0:nb_samples_output]

        # flatten to 1-D; this raises if the record holds more than one channel
        session_ecg = session_ecg.reshape((len(session_ecg)))

        # compute the heart rate and sdnn on consecutive 60-second segments
        # with no overlap between them
        _, measures = hp.process_segmentwise(
            session_ecg[3000:],
            sample_rate=100.0,
            segment_width=60,
            segment_overlap=0,
            segment_min_size=0,
            replace_outliers=True,
        )

        # labels and segments may differ in count; keep the common prefix
        nb_samples = min(len(labels_session), len(measures["bpm"]))

        labels_session = labels_session[0:nb_samples]

        hr = measures["bpm"][0:nb_samples]
        hrv_sdnn = measures["sdnn"][0:nb_samples]
        # handle_nans comes from the star import of utils; its return value is
        # ignored, so it presumably fixes NaN entries in place - TODO confirm
        handle_nans(hr)
        handle_nans(hrv_sdnn)

        # one row per segment: label, bpm, sdnn
        session_samples = np.c_[labels_session, hr, hrv_sdnn]

        samples.extend(session_samples)
    np.savetxt(output_name + ".csv", samples, delimiter=",", fmt=("%d, %f, %f"))

In [11]:
# Write train_set.csv from the training recordings ...
create_data_csv(source_directory=directory, files=train_files, output_name='train_set')
# ... and test_set.csv from the held-out recordings.
create_data_csv(source_directory=directory, files=test_files, output_name='test_set')


a01
c01
b01
a02
c02
b02
a03
c03
b03
a04
c04
b04
a05
c05
a06
c06
a07
c07
a08
c08
a09
a10
a11
a12
a13
a14
a15
a16
b05
c09
c10
a17
a18
a19
a20


Here we compute the proportion of annotations labelled as apnea (versus non-apnea) across all recordings, training and test combined.

In [None]:
# prints the share of apnea labels; nothing is written to disk
def compute_ratio(source_directory, files, labels_directory="outputs"):
    """Print the fraction of annotations labelled as apnea over ``files``.

    Each non-blank line of ``<labels_directory>/<name>.txt`` counts as one
    annotation; it is an apnea annotation when the field at index 3 of
    ``re.split(" +", line)`` is ``"A"``.

    Parameters
    ----------
    source_directory : str
        Unused; kept so existing calls keep working.
    files : iterable of str
        Record names (no extension) whose annotation files are read.
    labels_directory : str, optional
        Directory holding the annotation files. Defaults to ``"outputs"``,
        the location previously hard-coded.

    Returns
    -------
    None
        The ratio is printed. When no annotation is found at all, ``nan`` is
        printed instead of raising ``ZeroDivisionError``.
    """
    labels = []
    for filename in files:
        # load the outputs; blank lines (e.g. a trailing newline) carry no
        # annotation and would otherwise raise IndexError on the [3] lookup
        with open(os.path.join(labels_directory, filename + ".txt"), "r") as file:
            labels.extend(
                1 if re.split(" +", line)[3] == "A" else 0
                for line in file
                if line.strip()
            )

    # labels are 0/1, so their sum is the number of apnea annotations
    ratio = sum(labels) / len(labels) if labels else float("nan")
    print("ratio of apnea : ", ratio)

# Every recording in one list: training recordings first, then test ones.
all_recordings = [*train_files, *test_files]

compute_ratio(directory, all_recordings)
