# Stress Classifier
## Overview
* Data Extraction: Downloads and sorts through database
* Signal processing: 
    + Pre-processing - filtering and signal cleaning
    + Feature Extraction - PQRST peak extraction
    + Feature Addition - Adding new features

## Settings and Globals
Modify settings to select database, model etc. and tune model

In [None]:

# SETTINGS
# User-editable configuration; these module-level names are read by the
# DataExtraction and SignalProcessing classes.

# Select Database: compared against 'Spider' below and inside DataExtraction.
database = "Spider" # database = "BrainPatch"

# GLOBALS
# Values derived from the selected database.
# NOTE(review): sampling_rate is only assigned for 'Spider'; with any other
# database value it stays undefined and code reading it would raise NameError.
if database == 'Spider':
    sampling_rate = 100  # forwarded to nk.ecg_process; presumably in Hz — confirm against the dataset documentation


In [None]:
# Import necessary modules
import pandas as pd
import numpy as np
import neurokit2 as nk
import os
import subprocess
import Utilities
import importlib
importlib.reload(Utilities)

In [None]:
# Define DataExtraction class
class DataExtraction():
    """Download the selected ECG database and build labelled per-participant dataframes.

    Behaviour is driven by the module-level ``database`` setting; only the
    ``'Spider'`` database is handled by this class.
    """

    # Participant folders of the Spider database that are loaded by sort_data.
    # Exclude VP70 because of noise
    _SPIDER_PARTICIPANTS = [
        'VP02', 'VP03', 'VP05', 'VP06', 'VP08', 'VP09', 'VP11', 'VP12', 'VP14', 'VP15', 'VP17', 'VP18', 'VP20', 'VP23', 'VP24', 'VP26', 'VP27',
        'VP29', 'VP30', 'VP32', 'VP33', 'VP35', 'VP36', 'VP38', 'VP39', 'VP41', 'VP42', 'VP44', 'VP45', 'VP47', 'VP48', 'VP50', 'VP51', 'VP53',
        'VP54', 'VP56', 'VP57', 'VP59', 'VP61', 'VP62', 'VP63', 'VP64', 'VP65', 'VP66', 'VP68', 'VP69', 'VP71', 'VP72', 'VP73', 'VP74',
        'VP75', 'VP76', 'VP77', 'VP78', 'VP79', 'VP80',
    ]

    # Number of Triggers.txt rows (clip on/off times) used to label samples.
    _SPIDER_TRIGGER_ROWS = 17

    def __init__(self) -> None:
        pass

    # Extract data and store to file named Data
    def download_data(self):
        """Download the selected database with ``wget`` unless its directory already exists.

        Best effort: if ``wget`` cannot be started, an error is printed and the
        freshly created directory is removed so that a later call retries the
        download. A non-zero ``wget`` exit status only produces a warning.
        Does nothing when ``database`` is not ``'Spider'``.
        """
        # Local import: only needed to clean up after a failed download.
        import shutil

        if database != 'Spider':
            return

        directory = 'Data/Spider'
        url = 'https://physionet.org/files/ecg-spider-clip/1.0.0/'
        if os.path.isdir(directory):
            print("Using pre-downloaded database")
            return

        print("Downloading database...this may take a while")
        os.makedirs(directory)
        # Pass the command as an argument list: a single command string without
        # shell=True is treated as the program name on POSIX and can never run.
        cmd = ["wget", "-r", "-N", "-c", "-np", "-P", directory, url]
        print(" ".join(cmd))
        try:
            result = subprocess.run(cmd)
        except OSError:
            # wget is missing or not executable.
            print("Error: Unable to download database")
            # rmtree (not rmdir) so that a partially filled directory is removed too.
            shutil.rmtree(directory, ignore_errors=True)
            return
        if result.returncode != 0:
            print(f"Warning: wget exited with status {result.returncode}; the download may be incomplete")

    # sorts data into a single dataframe for each participant into a collective dataframe list
    def sort_data(self):
        """Return a list with one labelled dataframe per participant.

        A previously stored result is returned when ``Utilities.load_dataframe``
        yields one; otherwise the list is built from the downloaded files,
        stored with ``Utilities.save_dataframe`` and returned.

        Returns:
            list of pandas DataFrames with the columns ``ECG``, ``time`` and
            ``Stressed`` (empty when ``database`` is not ``'Spider'``).
        """
        print("Sorting data...")

        # try loading existing df if available
        # NOTE(review): relies on load_dataframe returning a falsy value when
        # nothing is stored — confirm in Utilities.
        file_path = 'Data/StoredDataFrames/sorted_data.pkl'
        ECG_df = Utilities.load_dataframe(file_path)
        if ECG_df:
            return ECG_df

        # otherwise create dataframe list from scratch
        ECG_df = []

        if database == 'Spider':
            database_directory = 'Data/Spider/physionet.org/files/ecg-spider-clip/1.0.0/'
            last_index = len(self._SPIDER_PARTICIPANTS) - 1
            for index, sub in enumerate(self._SPIDER_PARTICIPANTS):
                ECG_df.append(self._load_spider_participant(f'{database_directory}{sub}/'))
                Utilities.progress_bar(index, last_index)

        # save dataframe list for next time and return
        Utilities.save_dataframe(ECG_df, file_path)
        return ECG_df

    # Loads and labels the recording of a single Spider participant
    def _load_spider_participant(self, participant_directory):
        """Read one participant's ECG and trigger files and label each sample.

        Args:
            participant_directory: path of the participant folder, ending in '/'.

        Returns:
            DataFrame with ``ECG``, ``time`` (shifted so the first sample is at 0)
            and a boolean ``Stressed`` column that is True for samples whose time
            lies inside any of the clip on/off intervals.
        """
        ECG_participant_df = pd.read_csv(
            f'{participant_directory}BitalinoECG.txt', sep='\t', names=['ECG', 'time', 'NA'], engine='python'
        )
        ECG_participant_df = ECG_participant_df.drop(columns=['NA'])

        # the first time stamp is the reference that all other times are shifted by
        start_time = ECG_participant_df['time'].iloc[0]
        ECG_participant_df['time'] = ECG_participant_df['time'] - start_time

        triggers_df = pd.read_csv(
            f'{participant_directory}Triggers.txt', sep='\t', names=['clip', 'on', 'off'], engine='python'
        )
        triggers_df['on'] = triggers_df['on'] - start_time
        triggers_df['off'] = triggers_df['off'] - start_time

        # A sample is labelled stressed when its time stamp falls (inclusively)
        # inside any of the clip intervals.
        time = ECG_participant_df['time']
        stressed = pd.Series(False, index=ECG_participant_df.index)
        for i in range(self._SPIDER_TRIGGER_ROWS):
            stressed |= (time >= triggers_df.on[i]) & (time <= triggers_df.off[i])
        ECG_participant_df["Stressed"] = stressed

        return ECG_participant_df

In [None]:
# Define SignalProcessing Class

# Using Neurokit2 - using method 'neurokit' (5th order Butterworth filter) but can be changed to other cleaning method
# Documentation can be found here: https://neuropsychology.github.io/NeuroKit/functions/ecg.html                      
class SignalProcessing():
    """Run NeuroKit2 ECG processing over every participant dataframe.

    Args:
        ECG_df: iterable of per-participant dataframes, each with an ``ECG`` column.
    """

    def __init__(self, ECG_df):
        self.ECG_df = ECG_df

    def clean(self):
        """Process each participant's ECG signal and replace ``self.ECG_df`` with the result.

        A previously stored result is used when ``Utilities.load_dataframe``
        yields one; otherwise every ``ECG`` column is passed to
        ``nk.ecg_process`` (method ``'neurokit'``, module-level
        ``sampling_rate``) and the outputs are stored with
        ``Utilities.save_dataframe``.

        Returns:
            The processed list now held in ``self.ECG_df``. Each freshly computed
            entry is whatever ``nk.ecg_process`` returns (per the NeuroKit2
            documentation, a ``(signals, info)`` pair).
        """
        print('Cleaning data...')
        # try loading existing result if available
        # NOTE(review): relies on load_dataframe returning a falsy value when
        # nothing is stored — confirm in Utilities.
        file_path = 'Data/StoredDataFrames/cleaned_data.pkl'
        cached = Utilities.load_dataframe(file_path)
        if cached:
            self.ECG_df = cached
            return self.ECG_df

        # otherwise, process from scratch, overwriting ECG_df with the cleaned version
        processed = []
        last_index = len(self.ECG_df) - 1
        for index, ECG_participant_df in enumerate(self.ECG_df):
            processed.append(
                nk.ecg_process(ECG_participant_df['ECG'], sampling_rate=sampling_rate, method='neurokit')
            )
            Utilities.progress_bar(index, last_index)
        self.ECG_df = processed

        # save result for next time
        Utilities.save_dataframe(self.ECG_df, file_path)
        # Return the result on this path too, so both branches behave alike.
        return self.ECG_df

## Data Extraction and Pre-processing
* Downloads data, normalizes the time axis and collects the data into a list of per-participant dataframes - `ECG_df`.
* Cleans data using NeuroKit2's `ecg_process` (method `neurokit`).

This will take a while if you haven't run it before.

In [None]:
# Download and sort data to create dataframe
# download_data() fetches the selected database when its directory is missing;
# sort_data() returns the list of per-participant dataframes (loaded from the
# stored pickle when one is available).
de = DataExtraction()
de.download_data()
ECG_df = de.sort_data()

# Pass dataframe to be signal processed
# clean() replaces sp.ECG_df with the processed output; its return value is
# not used here.
sp = SignalProcessing(ECG_df)
sp.clean()