# Filter the MASS dataset

This notebook will filter the MASS dataset for training with the Portiloop.

## Filtering code

In [1]:
import numpy as np
from scipy.signal import firwin

def shift_numpy(arr, num, fill_value=np.nan):
    result = np.empty_like(arr)
    if num > 0:
        result[:num] = fill_value
        result[num:] = arr[:-num]
    elif num < 0:
        result[num:] = fill_value
        result[:num] = arr[-num:]
    else:
        result[:] = arr
    return result

class FIR:
    def __init__(self, nb_channels, coefficients, buffer=None):
        
        self.coefficients = np.expand_dims(np.array(coefficients), axis=1)
        self.taps = len(self.coefficients)
        self.nb_channels = nb_channels
        self.buffer = np.array(buffer) if buffer is not None else np.zeros((self.taps, self.nb_channels))
    
    def filter(self, x):
        self.buffer = shift_numpy(self.buffer, 1, x)
        filtered = np.sum(self.buffer * self.coefficients, axis=0)
        return filtered

    
class FilterPipeline:
    def __init__(self,
                 nb_channels,
                 sampling_rate,
                 power_line_fq=60,
                 use_custom_fir=False,
                 custom_fir_order=20,
                 custom_fir_cutoff=30,
                 alpha_avg=0.1,
                 alpha_std=0.001,
                 epsilon=0.000001,
                 filter_args=[]):
        if len(filter_args) > 0:
            use_fir, use_notch, use_std = filter_args
        else:
            use_fir=True,
            use_notch=False,
            use_std=True
        self.use_fir = use_fir
        self.use_notch = use_notch
        self.use_std = use_std
        self.nb_channels = nb_channels
        assert power_line_fq in [50, 60], f"The only supported power line frequencies are 50 Hz and 60 Hz"
        if power_line_fq == 60:
            self.notch_coeff1 = -0.12478308884588535
            self.notch_coeff2 = 0.98729186796473023
            self.notch_coeff3 = 0.99364593398236511
            self.notch_coeff4 = -0.12478308884588535
            self.notch_coeff5 = 0.99364593398236511
        else:
            self.notch_coeff1 = -0.61410695998423581
            self.notch_coeff2 =  0.98729186796473023
            self.notch_coeff3 = 0.99364593398236511
            self.notch_coeff4 = -0.61410695998423581
            self.notch_coeff5 = 0.99364593398236511
        self.dfs = [np.zeros(self.nb_channels), np.zeros(self.nb_channels)]
        
        self.moving_average = None
        self.moving_variance = np.zeros(self.nb_channels)
        self.ALPHA_AVG = alpha_avg
        self.ALPHA_STD = alpha_std
        self.EPSILON = epsilon
        
        if use_custom_fir:
            self.fir_coef = firwin(numtaps=custom_fir_order+1, cutoff=custom_fir_cutoff, fs=sampling_rate)
        else:
            self.fir_coef = [
                0.001623780150148094927192721215192250384,
                0.014988684599373741992978104065059596905,
                0.021287595318265635502275046064823982306,
                0.007349500393709578957568417933998716762,
                -0.025127515717112181709014251396183681209,
                -0.052210507359822452833064687638398027048,
                -0.039273839505489904766477593511808663607,
                0.033021568427940004020193498490698402748,
                0.147606943281569008563636202779889572412,
                0.254000252034505602516389899392379447818,
                0.297330876398883392486283128164359368384,
                0.254000252034505602516389899392379447818,
                0.147606943281569008563636202779889572412,
                0.033021568427940004020193498490698402748,
                -0.039273839505489904766477593511808663607,
                -0.052210507359822452833064687638398027048,
                -0.025127515717112181709014251396183681209,
                0.007349500393709578957568417933998716762,
                0.021287595318265635502275046064823982306,
                0.014988684599373741992978104065059596905,
                0.001623780150148094927192721215192250384]
        self.fir = FIR(self.nb_channels, self.fir_coef)
        
    def filter(self, value):
        """
        value: a numpy array of shape (data series, channels)
        """
        for i, x in enumerate(value):  # loop over the data series
            # FIR:
            if self.use_fir:
                x = self.fir.filter(x)
            # notch:
            if self.use_notch:
                denAccum = (x - self.notch_coeff1 * self.dfs[0]) - self.notch_coeff2 * self.dfs[1]
                x = (self.notch_coeff3 * denAccum + self.notch_coeff4 * self.dfs[0]) + self.notch_coeff5 * self.dfs[1]
                self.dfs[1] = self.dfs[0]
                self.dfs[0] = denAccum
            # standardization:
            if self.use_std:
                if self.moving_average is not None:
                    delta = x - self.moving_average
                    self.moving_average = self.moving_average + self.ALPHA_AVG * delta
                    self.moving_variance = (1 - self.ALPHA_STD) * (self.moving_variance + self.ALPHA_STD * delta**2)
                    moving_std = np.sqrt(self.moving_variance)
                    x = (x - self.moving_average) / (moving_std + self.EPSILON)
                else:
                    self.moving_average = x
            value[i] = x
        return value

## Loading the data
Here, we'll load all the EDF files from the right place.

In [3]:
import pyedflib

In [4]:
unfiltered_mass = "/project/portiloop_transformer/transformiloop/dataset/MASS_preds/"
dest_mass = "/project/portiloop-training/portiloop_software/dataset/MASS"

In [22]:
import os
import pyedflib
from pyedflib.highlevel import write_edf_quick


filtering = FilterPipeline(nb_channels=1, sampling_rate=250)

for file in os.listdir(unfiltered_mass):

    if file in os.listdir(dest_mass):
        print(f"{file} already filtered")
        continue
    
    # Check if file extension is edf
    if not file.endswith(".edf"):
        print(f"{file} is not an edf file")
        continue

    # Reading
    print(f"Reading {file}...")
    with pyedflib.EdfReader(os.path.join(unfiltered_mass, file)) as f:
        data = f.readSignal(0)

    # Filtering
    print(f"Filtering {file}...")
    data_filtered = [filtering.filter([point])[0].item() for point in data]
    print(f"Filtered {file}")

    out_file = os.path.join(dest_mass, file)
    fe_out = 250
    signals = np.array([data_filtered])
    print(f"Writing new edf file {out_file}...")
    write_edf_quick(out_file, signals, fe_out)


Reading 01-01-0035.edf...
Filtering 01-01-0035.edf...
Filtered 01-01-0035.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-01-0035.edf...




Reading 01-03-0004.edf...
Filtering 01-03-0004.edf...
Filtered 01-03-0004.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0004.edf...




Reading 01-01-0004.edf...
Filtering 01-01-0004.edf...
Filtered 01-01-0004.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-01-0004.edf...




Reading 01-03-0055.edf...
Filtering 01-03-0055.edf...
Filtered 01-03-0055.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0055.edf...




Reading 01-03-0015.edf...
Filtering 01-03-0015.edf...
Filtered 01-03-0015.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0015.edf...




Reading 01-03-0026.edf...
Filtering 01-03-0026.edf...
Filtered 01-03-0026.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0026.edf...




Reading 01-03-0033.edf...
Filtering 01-03-0033.edf...
Filtered 01-03-0033.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0033.edf...




Reading 01-03-0030.edf...
Filtering 01-03-0030.edf...
Filtered 01-03-0030.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0030.edf...




Reading 01-01-0006.edf...
Filtering 01-01-0006.edf...
Filtered 01-01-0006.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-01-0006.edf...




Reading 01-02-0002.edf...
Filtering 01-02-0002.edf...
Filtered 01-02-0002.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-02-0002.edf...




Reading 01-03-0045.edf...
Filtering 01-03-0045.edf...
Filtered 01-03-0045.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0045.edf...




Reading 01-03-0035.edf...
Filtering 01-03-0035.edf...
Filtered 01-03-0035.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0035.edf...




Reading 01-01-0014.edf...
Filtering 01-01-0014.edf...
Filtered 01-01-0014.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-01-0014.edf...




Reading 01-03-0011.edf...
Filtering 01-03-0011.edf...
Filtered 01-03-0011.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0011.edf...




Reading 01-03-0006.edf...
Filtering 01-03-0006.edf...
Filtered 01-03-0006.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0006.edf...




Reading 01-03-0051.edf...
Filtering 01-03-0051.edf...
Filtered 01-03-0051.edf
Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-03-0051.edf...




Reading patient_info.csv...


OSError: /project/portiloop_transformer/transformiloop/dataset/MASS_preds/patient_info.csv: the file is not EDF(+) or BDF(+) compliant (it contains format errors)

In [15]:

# Write to a new EDF file


Writing new edf file /project/portiloop-training/portiloop_software/dataset/MASS/01-01-0001.edf




True