# Data Preparation

### Purpose
Purpose of this notebook is to get the Mindware data (signals & events) ready for further processing.

### Approach
1. Load the signal data and the event data per subject
2. If there are > 1 signal / event data, load both
3. In the event data, assign event onset and event offset for each event. Every event minimally has an onset
4. Join the event data with the signal data and concatenate in case of > 1 event/signal data
### Input / Output

- Input `~/data/raw/signals` & `~/data/raw/events`
- Output `~/data/interim/signals` & `~/data/interim/events`


### Imports

In [6]:
# fmt: off
from pathlib import Path
from typing import Union, List, Dict
import sys
import pandas as pd 
import importlib
sys.path.append(str(Path().cwd().parent/"src"))
import ecg_utils.data_utils as data_utils
import ecg_utils.parameters as parameters
import ecg_utils.common as common
import numpy as np
importlib.reload(data_utils)
importlib.reload(parameters)
importlib.reload(common)
# fmt:on


<module 'ecg_utils.common' from '/Users/lukasspiess/Library/CloudStorage/OneDrive-SpiessSolution/Neurophysiological profiles/General/Mindware data analysis/src/ecg_utils/common.py'>

## Parameters

In [None]:
WORKING_DIR = Path().cwd()
ROOT_DIR = WORKING_DIR.parent

DATA_DIR = ROOT_DIR / 'data'

RAW_SIGNAL_DATA_DIR = DATA_DIR / 'raw' / 'signals'
RAW_EVENT_DATA_DIR = DATA_DIR / 'raw' / 'events'

INTERIM_SIGNAL_DATA_DIR = DATA_DIR / 'interim' / 'signals'
INTERIM_EVENT_DATA_DIR = DATA_DIR / 'interim' / 'events'

INTERIM_SIGNAL_DATA_DIR.mkdir(exist_ok=True, parents=True)
INTERIM_EVENT_DATA_DIR.mkdir(exist_ok=True, parents=True)

# Load and prepare raw data

For some participants, recording has stopped and was continued again. This resulted in having two (or more) data files per participant. This is true for both the raw signal data and for the event data. 

In [8]:
sampling_frequency = parameters.base_params['general'].get("sampling_frequency")

In [9]:
"""
Get all unique subject ids from the list of files
"""
all_subject_ids = set()
data_filepaths = list(RAW_SIGNAL_DATA_DIR.glob('*.txt'))

for flp in data_filepaths:
    filename = flp.stem
    if not (filename.lower().startswith("asamb")) and filename[:4].isnumeric():
        sub_id_str = filename.split("_")[0]
        all_subject_ids.add(sub_id_str)


In [10]:
"""
Read data and event files, join them, and export as single file
"""
for subject_id in all_subject_ids:
    print(f"Processing subject {subject_id}")
    # if subject_id != "2084":
    #     continue
    # retrieve filepaths of signal and event data and assure they match in number
    data_filepaths = list(RAW_SIGNAL_DATA_DIR.glob(f"{subject_id}*"))
    event_filepaths = list(RAW_EVENT_DATA_DIR.glob(f"ASAMB_{subject_id}*"))
    if len(data_filepaths) not in (1,2):
        raise ValueError(f"Expected 1 or 2 data files for subject {subject_id}. Got {len(data_filepaths)}")
    if len(data_filepaths) != len(event_filepaths):
        event_filepaths = list(RAW_EVENT_DATA_DIR.glob(f"asamb_{subject_id}*"))
    if len(data_filepaths) != len(event_filepaths):
        raise ValueError(f"Did not find the same number of data files and event files for subject {subject_id}. Got {len(data_filepaths)} and {len(event_filepaths)}")
    
    # Read the signal and event data
    data_df_list = [pd.read_csv(RAW_SIGNAL_DATA_DIR / flp, delimiter="\t", skiprows=1).assign(source_file = str(flp.parts[-1])) for flp in np.sort(data_filepaths)]
    event_df_list = [pd.read_csv(RAW_EVENT_DATA_DIR / flp, delimiter="\t").assign(source_file = str(flp.parts[-1])) for flp in np.sort(event_filepaths)]
    
    
    # Preprocess the event data
    try:
        event_df_list = [data_utils.preprocess_event_data(df) for df in event_df_list]
    except Exception as e:
        print(f"The following error occurred when processing subject {subject_id}: {e}")
    
    # Left-Join the signal data with the events data
    merged_df_list = []
    for signal_df, event_df in zip(data_df_list, event_df_list):
        event_df["Time"] = event_df["Time"].apply(common.comma_str_2_float)
        signal_df['Time (s)'] = signal_df['Time (s)'].apply(lambda x: common.comma_str_2_float(x) if isinstance(x, str) else x)
        merged_df_list.append(
            signal_df.merge(
                event_df,
                how = 'left',
                left_on = ['Time (s)'],
                right_on = ['Time'],
                suffixes=('_signal', '_event')
            ).drop(columns = ["Time"])
        )
    
    # concatenate and save
    signal_df = pd.concat(merged_df_list)
    signal_df = (
        signal_df
        .assign(subject_id = subject_id, row_index = range(len(signal_df)))
        .rename(columns = {"Name": "event_name", "Time (s)": "time_seconds_original_file"})
        .set_index("row_index", drop = True)
        )
    signal_df.index.name = "row_index"
    signal_df.to_csv(INTERIM_SIGNAL_DATA_DIR/f"{subject_id}_signal_events.csv", index = False)
    signal_df[~pd.isnull(signal_df["event_name"])].to_excel(INTERIM_EVENT_DATA_DIR/f"{subject_id}_events.xlsx", index=False)
    

Processing subject 2114
Processing subject 2108
Processing subject 2103
Processing subject 2115
Processing subject 2105
Processing subject 2112
Processing subject 2113
Processing subject 2110
Processing subject 2104
Processing subject 2100
Processing subject 2109
Processing subject 2102
Processing subject 2111
Processing subject 2106
Processing subject 2107
