# EEG Dataset Processing Pipeline

Scope:
- Process a raw EEG dataset 
- Dataset containing the results of an EEG study on multiple subjects
- Dataset that has been downloaded from OpenNeuro, is structured as per the BIDS standard, and is stored in EEGLab '.set' format

The Pipeline Stages (For each subject in an EEG study dataset):
- EEG Dataset Load - Get the raw source EEG signal data
- EEG Preprocessing - Apply filtering etc. to the raw EEG time series data
- Power Spectra (PSD) Calculate - Calculate the power spectra, for all channels recorded
- Spectral Parameterisation - Determine the best fitting Aperiodic and Periodic components
- Features Set - Collate & save the study, subject and EEG data into a features set, Pandas Dataframe


## To Review

Projects:
- Perplexity project: https://www.perplexity.ai/search/i-am-building-a-pipeline-in-py-sVmw70rHQXa0VKkIT_p5tg?0=r
- Python Handbook: https://github.com/ZitongLu1996/Python-EEG-Handbook/tree/master
- Pickle: https://www.perplexity.ai/search/in-a-jupyter-notebook-i-have-c-0LbAAH9ITFGfcPYaWlrt6Q

## Dependencies

General dependencies:
- python = 3.11.13
- numpy = 2.0.2
- scipy = 1.15.3
- pandas = 2.2.3
- matplotlib = 3.10.3

ML dependencies:
- scikit-learn = 1.6.1

EEG specific dependencies:
- mne = 1.9.0
- specparam = 2.0.0rc3

## Python-MNE

Used for Import:
- MNE-Python: https://mne.tools/stable/index.html
- The Brain Imaging Data Structure (BIDS): https://bids.neuroimaging.io

Used for Power Spectrum Calculation:
- MNE vs NeuroDSP: https://www.perplexity.ai/search/using-python-which-package-is-zOoiPqUvTnKbO.QfgmPsJQ

Formats:
- Assumes OpenNeuro, BIDS compliant datasets manually downloaded into the defined folders structure
- Assumes EEGLab '.set' format


## Spectral Parameterisation

Spectral Parameterisation:
- The Aperiodic Methods project - Documentation: https://aperiodicmethods.github.io/docs/index.html and Repo: https://github.com/AperiodicMethods/AperiodicMethods
- And cite: https://www.biorxiv.org/content/10.1101/2024.09.15.613114v1

Documentation:
- SpecParam: https://specparam-tools.github.io and https://github.com/fooof-tools
- FOOOF: https://fooof-tools.github.io/fooof/ and https://github.com/fooof-tools/fooof

FOOOF vs SpecParam:
- FOOOF: More stable and more widely used, but deprecated
- SpecParam: Still a release candidate, but with some improved model/fit selection: https://pmc.ncbi.nlm.nih.gov/articles/PMC11326208/
- Summary: https://www.perplexity.ai/search/using-python-which-package-is-M7kzhERoTLuCrIKbXxN9sQ


# Imports & Utilities

In [1]:
# Not available through a Conda install/environment - a pip install may be required
# %pip install specparam


In [2]:
# General imports
import os
import gc
from datetime import datetime
from pprint import pprint

import math
import numpy as np
import pandas as pd

# Plots
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# MNE-Python
import mne

# SpecParam
from specparam import SpectralGroupModel
from specparam.plts.spectra import plot_spectra

# Check the version of SpecParam
from specparam import __version__ as specparam_version
print('Current SpecParam version:', specparam_version)

Current SpecParam version: 2.0.0rc3


In [3]:
# A utility function to establish relative paths for a given folder
def get_folder_path(folder_name):
    """
    Resolve a folder relative to the parent of the current working directory.

    Parameters
    ----------
    folder_name : str
        Folder path, relative to the parent directory of os.getcwd().

    Returns
    -------
    folder_path : str
        The joined path, returned only when it is an existing directory.

    Raises
    ------
    FileNotFoundError
        If the joined path is not an existing directory.
    """
    base_dir = os.path.dirname(os.getcwd())
    candidate = os.path.join(base_dir, folder_name)
    if os.path.isdir(candidate):
        return candidate
    raise FileNotFoundError(f'Directory not found: {candidate}')

# Utility function to check for the existence of a file in a given directory
def get_file_path(folder, file_name):
    """
    Build the path to a file in a folder and confirm that the file exists.

    Parameters
    ----------
    folder : str
        Directory expected to contain the file.
    file_name : str
        Name of the file inside folder.

    Returns
    -------
    file_path : str
        folder joined with file_name, returned only when it is an existing file.

    Raises
    ------
    FileNotFoundError
        If the joined path is not an existing regular file.
    """
    candidate = os.path.join(folder, file_name)
    if os.path.isfile(candidate):
        return candidate
    raise FileNotFoundError(f'File not found: {candidate}')


# Classes & Functions

## EEG_Study Class

In [4]:
# Class defining the parameters for an EEG study

class EEG_Study:
    """
    Definition and parameters for an EEG study.

    Attributes
    ----------
    study_run_id : str
        Dataset name suffixed with the date of the run (YYYYMMDD).
    dataset_path : str
        Path of the dataset folder inside the datasets source folder.
    subjects_df : pandas.DataFrame
        Subjects in the study, with columns subject_id, age, gender and pd.
    """

    # Data Folders (resolved with get_folder_path)
    __eeg_datasets_source_folder = 'Data/EEG_Datasets_Source_exgithub'
    __eeg_study_features_folder = 'Data/EEG_Datasets_Processed'

    # BIDS structure, file name for source of subjects data
    __subjects_source_file = 'participants.tsv'

    def __init__(self, dataset_name):
        """
        Initialise EEG_Study instance.

        Parameters
        ----------
        dataset_name : str
            Name of a dataset folder (starting with 'ds') in the datasets
            source folder.

        Raises
        ------
        FileNotFoundError
            If a data folder or the dataset's subjects file does not exist.
        ValueError
            If dataset_name is not one of the available dataset folders.
        IOError
            If the subjects file cannot be read.
        """

        # Input validation - Valid Folder paths (get_folder_path raises if missing)
        eeg_datasets_folder_path = get_folder_path(self.__eeg_datasets_source_folder)
        # The features folder is only validated here; its path is not stored
        get_folder_path(self.__eeg_study_features_folder)

        # Input validation - Valid Dataset
        # Sorted so the list shown in the error message is in a stable order
        datasets_list = sorted(
            d for d in os.listdir(eeg_datasets_folder_path)
            if d.startswith('ds') and os.path.isdir(os.path.join(eeg_datasets_folder_path, d))
        )
        if dataset_name not in datasets_list:
            raise ValueError(f"Dataset '{dataset_name}' not found in list of available datasets: {datasets_list}")

        # Membership in datasets_list already guarantees this is an existing directory
        dataset_path = os.path.join(eeg_datasets_folder_path, dataset_name)
        subjects_file = os.path.join(dataset_path, self.__subjects_source_file)
        if not os.path.isfile(subjects_file):
            raise FileNotFoundError(f'File not found: {subjects_file}')

        # Create ID / Name for the study run
        current_date = datetime.now().strftime('%Y%m%d')
        study_run_id = f'{dataset_name}_{current_date}'

        # Private Attributes
        # TODO: Any private attributes?

        # Public Attributes
        # self.dataset_name = dataset_name
        self.study_run_id = study_run_id
        self.dataset_path = dataset_path
        self.subjects_df = self._create_subjects_df(subjects_file)

    # Private functions

    def _create_subjects_df(self, subjects_csv):
        """
        Read the dataset's tab-separated subjects file and keep selected columns.

        Parameters
        ----------
        subjects_csv : str
            Path to the tab-separated subjects file.

        Returns
        -------
        subjects_df : pandas.DataFrame
            Copy of the participant_id, AGE, GENDER and TYPE columns, renamed
            to subject_id, age, gender and pd respectively.

        Raises
        ------
        IOError
            If the file cannot be read; the underlying error is chained.
        KeyError
            Raised by pandas if any of the expected source columns is missing.
        """
        try:
            source_df = pd.read_csv(subjects_csv, sep='\t')
        except Exception as e:
            # Chain explicitly so the original read error is kept as the cause
            raise IOError(f"Failed to read subjects file '{subjects_csv}': {e}") from e

        subjects_df = source_df[['participant_id', 'AGE', 'GENDER', 'TYPE']].copy()
        subjects_df.columns = ['subject_id', 'age', 'gender', 'pd']

        return subjects_df
    
    # Public functions
    



## Get Raw EEG data

In [9]:
# Function to get the raw EEG signal data
def get_EEG_raw(study, subject_id, task='Rest', session=''):
    """
    Get raw EEG data for one subject from an EEGLab '.set' file.

    The file is looked up using the BIDS layout:
    <dataset_path>/<subject_id>/[<session>/]eeg/<subject_id>[_<session>]_task-<task>_eeg.set

    Parameters
    ----------
    study : Study class
        Only its dataset_path attribute is read.
    subject_id : str
        Subject folder name and file name prefix, e.g. 'sub-001'.
    task : str, optional
        Task label used in the file name. Defaults to 'Rest'.
    session : str, optional
        Session folder name and file name entity, e.g. 'ses-01'.
        Defaults to '' for datasets without a session level.

    Returns
    -------
    EEG_raw : raw
        Result of mne.io.read_raw_eeglab, loaded with preload=True.

    Raises
    ------
    FileNotFoundError
        Raised by get_file_path if the '.set' file does not exist.

    Notes
    -----
    Calls mne.set_log_level, so the MNE log level stays changed after the
    call: 'DEBUG' when a truthy module-level VERBOSE is defined, otherwise
    'WARNING'.
    """
    datatype = 'eeg'

    # EEGLab .set file name - an empty session adds no folder level and no entity
    eeg_folder = os.path.join(study.dataset_path, subject_id, session, datatype)
    name_entities = [subject_id]
    if session:
        name_entities.append(session)
    name_entities.extend([f'task-{task}', datatype])
    eeg_lab_file_path = get_file_path(eeg_folder, '_'.join(name_entities) + '.set')

    # VERBOSE is an optional notebook-level flag; treat it as False when undefined
    mne.set_log_level('DEBUG' if globals().get('VERBOSE') else 'WARNING')

    # Get the raw EEG data
    eeg_dataset_raw = mne.io.read_raw_eeglab(eeg_lab_file_path, preload=True)

    return eeg_dataset_raw

# Execute Full EEG Pipeline - Single Dataset

In [14]:
# Define the execution parameters for a pipeline run over a single dataset
#

# Set progress messages, testing
# VERBOSE is looked up in globals() by get_EEG_raw to choose the MNE log level
VERBOSE = True
# Number of subjects to process before the per-subject loop breaks
max_subjects_run = 1  # use np.inf to process every subject
# NOTE(review): test_channel is not referenced in the cells shown here - confirm where it is used
test_channel = 'P5'

# Set Execution Parameters



# Define the Study - validates the dataset folder and loads its subjects table
study_details = EEG_Study('ds004584-1.0.0')
# Debug helper: dump the public, non-callable attributes of the study
# print(study_details)
# attrs = {attr: getattr(study_details, attr) for attr in dir(study_details) if not attr.startswith('_') and not callable(getattr(study_details, attr))}
# pprint(attrs)




In [15]:
# Run the pipeline for each subject in the study
#

# Iterate the subjects table row by row; position counts the rows processed so far
for position, (idx, subject) in enumerate(study_details.subjects_df.iterrows()):
    # Break for testing - compare the row count, not the index label, so the
    # limit still works if subjects_df does not have a default 0..n-1 index
    if position >= max_subjects_run:
        break

    subject_id = subject['subject_id']
    print('\n-----------------------------------------------------------------------------------------------')
    print(f"Processing subject: {subject_id}")

    # Get the raw EEG data
    temp = get_EEG_raw(study_details, subject_id)
    


-----------------------------------------------------------------------------------------------
Processing subject: sub-001
Reading /Users/stuartgow/GitHub/EEG_ML_Pipeline/Data/EEG_Datasets_Source_exgithub/ds004584-1.0.0/sub-001/eeg/sub-001_task-Rest_eeg.fdt
Reading 0 ... 140829  =      0.000 ...   281.658 secs...
Cropping annotations 1970-01-01 00:00:00+00:00 - 1970-01-01 00:04:41.660000+00:00
  [0] Keeping  (1970-01-01 00:00:00+00:00 - 1970-01-01 00:00:00+00:00 -> 0.0 - 0.0)
Cropping complete (kept 1)


  eeg_dataset_raw = mne.io.read_raw_eeglab(eeg_lab_file_path, preload=True)
