# EEG Pipeline - WIP

WIP:
- Perplexity project: https://www.perplexity.ai/search/i-am-building-a-pipeline-in-py-sVmw70rHQXa0VKkIT_p5tg?0=r
- Python Handbook: https://github.com/ZitongLu1996/Python-EEG-Handbook/tree/master




## Pickle saving, loading

https://www.perplexity.ai/search/in-a-jupyter-notebook-i-have-c-0LbAAH9ITFGfcPYaWlrt6Q

In [None]:
# def load_dataframe_safely(file_path):
#     """Safely load DataFrame with validation."""
#     try:
#         if not Path(file_path).exists():
#             raise FileNotFoundError(f"File not found: {file_path}")
#         df = pd.read_pickle(file_path)
#         print(f"DataFrame loaded: shape {df.shape}")
#         return df
#     except Exception as e:
#         print(f"Error loading DataFrame: {e}")
#         return None

# # Save with compression
# df.to_pickle('data/large_dataset.pkl.gz', compression='gzip')

# # Load compressed file
# df = pd.read_pickle('data/large_dataset.pkl.gz')

# EEG Pipeline Functions

Functions for the main stages in the pipeline:
- Pipeline Parameters
- EEG Data Load & Import
- Power Spectrum (PSD) Calculate
- Spectral Parameterisation

## General Packages & Utilities

In [None]:
# General imports
import os
import gc
from datetime import datetime
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# A simple utility function to establish paths for data files
def get_folder_path(data_directory, file_name=''):
    """
    Build the path of a data folder that sits under the parent of the working directory.
    Parameters:
        data_directory - str, folder path relative to the parent of the current working directory
        file_name - str, optional extra path component appended after data_directory
    Returns:
        str - the joined path (it ends with a path separator when file_name is empty)
    Raises:
        Exception - if the joined path is not an existing directory
    """
    parent_of_cwd = os.path.dirname(os.getcwd())
    candidate = os.path.join(parent_of_cwd, data_directory, file_name)

    if os.path.isdir(candidate):
        return candidate
    raise Exception(f'Directory not found: {candidate}')

# Utility function to check for the existence of a file in a given directory
def get_file_path(directory, file_name):
    """
    Join a directory and a file name and confirm the file exists.
    Parameters:
        directory - str, folder expected to contain the file
        file_name - str, name of the file
    Returns:
        str - the joined file path
    Raises:
        Exception - if the joined path is not an existing file
    """
    candidate = os.path.join(directory, file_name)
    if os.path.isfile(candidate):
        return candidate
    raise Exception(f'File not found: {candidate}')

# Save dataframe using Pickle
def save_dataframe(study_name, features_df, save_folder=None):
    """
    Save a features DataFrame as a dated pickle file, without overwriting an existing file.

    The file is named 'features_<study_name>_<YYYYMMDD>.pkl'. Failures are reported
    with a printed message rather than raised, so a failed save does not stop the caller.
    Parameters:
        study_name - str, used in the file name
        features_df - DataFrame to save
        save_folder - str, optional target folder; defaults to the notebook-level
                      eeg_study_features_path
    Returns:
        str - path of the saved file, or None if nothing was saved
    """
    if save_folder is None:
        save_folder = eeg_study_features_path

    current_date = datetime.now().strftime('%Y%m%d')
    save_file_name = f'features_{study_name}_{current_date}.pkl'
    save_file_path = os.path.join(save_folder, save_file_name)

    # Never overwrite an earlier result saved on the same day
    if os.path.exists(save_file_path):
        print("Error saving DataFrame: Warning: File already exists")
        return None

    try:
        features_df.to_pickle(save_file_path)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller continue
        print(f"Error saving DataFrame: {e}")
        return None

    print(f"DataFrame saved successfully to {save_file_path}")
    return save_file_path

## Pipeline Parameters


In [None]:
# Establish the parameters for the overall pipeline

# Testing & Logging
# Passed as the verbose flag to each pipeline stage
pipeline_verbose = False

# EEG Datasets & Data Load
# Folder names are resolved with get_folder_path, ie relative to the parent of the working directory
eeg_datasets_source_folder = 'Full_Pipeline_Data/EEG_Datasets_Source_Raw'
eeg_study_features_folder = 'Full_Pipeline_Data/EEG_Study_Features'

# EEG Preprocessing
# TODO: Define

# Power Spectrum Calculation
# Passed to compute_psd as method, fmin and fmax; np.inf leaves the upper frequency unbounded
psd_method = 'welch'
psd_frequency_min = 1
psd_frequency_max = np.inf

# Spectral Parameterisation
# spm_max_peak_width is the upper bound of the FOOOFGroup peak_width_limits
spm_max_peak_width = 12
# Passed to FOOOFGroup as min_peak_height and max_n_peaks
spm_min_peak_height = 0.05
spm_max_n_peaks = 10
# Frequency range passed to the FOOOFGroup fit
spm_fit_freq_range = [1, 75]
# Passed to FOOOFGroup as aperiodic_mode
spm_aperiodic_mode = 'fixed'
# get_specparams raises an exception if a spectra group has more channels than this
spm_max_channels = 72
# TODO: Subject 68 had 66 channels but max should be 63? Check during preprocessing, before FOOOFING



In [None]:
# Establish Data Folders
eeg_datasets_path = get_folder_path(eeg_datasets_source_folder)
eeg_study_features_path = get_folder_path(eeg_study_features_folder)

# Get a list of datasets in the EEG datasets source folder: sub-folders whose name starts with 'ds'
# Sorted because os.listdir returns entries in arbitrary order, which would make the run order vary
datasets_list = sorted(
    entry for entry in os.listdir(eeg_datasets_path)
    if entry.startswith('ds') and os.path.isdir(os.path.join(eeg_datasets_path, entry))
)

print('\n---------------------------------')
print('EEG Pipeline Parameters - Data')
print(f'EEG Source Datasets Folder: {eeg_datasets_path}')
print('Datasets found:', datasets_list)
print(f'EEG Study Features Folder: {eeg_study_features_path}')

# The folder names are only needed to resolve the paths above
# NOTE: because they are deleted here, the parameters cell must be run again before re-running this cell
del eeg_datasets_source_folder, eeg_study_features_folder

## EEG Data Load & Import

Tools Used for Import:
- MNE-Python: https://mne.tools/stable/index.html
- The Brain Imaging Data Structure (BIDS): https://bids.neuroimaging.io
- See the NeuroDSP example that uses MNE: https://neurodsp-tools.github.io/neurodsp/auto_examples/plot_mne_example.html#sphx-glr-auto-examples-plot-mne-example-py

MNE vs NeuroDSP:
- Use MNE
- https://www.perplexity.ai/search/i-am-building-a-pipeline-in-py-sVmw70rHQXa0VKkIT_p5tg?0=d#1

Formats:
- Assumes OpenNeuro, BIDS-compliant datasets that have been manually downloaded into the defined folder structure


In [None]:
# MNE-Python
import mne

In [None]:
# Function to get the raw EEG signal data
def get_EEG_raw(dataset_folder, dataset_name, subject_id, verbose=False):
    """
    Get the raw resting-state EEG data of one subject from an EEGLAB .set file.

    The file is looked up at:
        <dataset_folder>/<dataset_name>/<subject_id>/eeg/<subject_id>_task-Rest_eeg.set
    Parameters:
        dataset_folder - str, root folder holding the datasets
        dataset_name - str, dataset sub-folder name
        subject_id - str, subject folder name, also used as the file name prefix
        verbose - bool, if True print the recording details and plot an example channel and the PSD
    Returns:
        EEG - raw, as read by mne.io.read_raw_eeglab with preload=True
    Raises:
        Exception - from get_file_path if the .set file does not exist
    """

    print('\n---------------------------------')
    print('EEG Raw Dataset Import')

    # BIDS File Structure
    session = ''    # No session level; an empty component is skipped by os.path.join
    task = 'Rest'
    datatype = 'eeg'

    # EEGLab .set file name
    temp_path = os.path.join(dataset_folder, dataset_name, subject_id, session, datatype)
    temp_file_name = subject_id + '_task-' + task + '_' + datatype + '.set'
    eeg_lab_file_path = get_file_path(temp_path, temp_file_name)

    # Get the raw EEG data & Inspect it
    eeg_dataset_raw = mne.io.read_raw_eeglab(eeg_lab_file_path, preload=True)

    if verbose:
        print(f"Description: {eeg_dataset_raw.info['description']} on {eeg_dataset_raw.info['meas_date']}")
        print(eeg_dataset_raw)
        print(eeg_dataset_raw.info)

        # Plot Time Series - MNE window plot
        # eeg_dataset_raw.plot(duration=10, n_channels=16, clipping=None)
        # TODO: Add an inline plot of all the channels time series ... NeuroDSP if not MNE

        # Settings for exploring an example channel of data: the first 10 seconds
        fs = eeg_dataset_raw.info['sfreq']
        ch_names = eeg_dataset_raw.ch_names
        # Prefer P5, but fall back to the first channel for recordings without it
        ch_label = 'P5' if 'P5' in ch_names else ch_names[0]
        t_start = 0
        t_stop = int(t_start + (10 * fs))

        # Extract an example channel
        sig, times = eeg_dataset_raw.get_data(mne.pick_channels(ch_names, [ch_label]),
                                              start=t_start, stop=t_stop,
                                              return_times=True)
        # MNE returns EEG data in volts; convert so the values match the uV axis label
        sig = np.squeeze(sig) * 1e6

        plt.figure(figsize=(12, 4))
        plt.plot(times, sig, color='blue')
        plt.title(f"EEG Time Series - Channel {ch_label}")
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude (uV)")
        plt.tight_layout()
        plt.show()

        # PSD plots: average over channels, then each channel. Computed once and reused
        example_psd = eeg_dataset_raw.compute_psd()
        example_psd.plot(average=True)
        example_psd.plot()
        plt.show()

    return eeg_dataset_raw

In [None]:
# Function to get a subject details from the study file
def get_subject_data(dataset_folder, dataset_name, subjects_source_file):
    """
    Read the subjects details table of a dataset.
    Parameters:
        dataset_folder - str, root folder holding the datasets
        dataset_name - str, dataset sub-folder name
        subjects_source_file - str, name of the tab-separated subjects file in the dataset folder
    Returns:
        DataFrame - the contents of the subjects file
    Raises:
        Exception - if the subjects file does not exist
    """
    subjects_file = os.path.join(dataset_folder, dataset_name, subjects_source_file)

    if os.path.isfile(subjects_file):
        return pd.read_csv(subjects_file, sep='\t')
    raise Exception(f'File not found: {subjects_file}')


## EEG Preprocessing

- TO DO
- Re-referencing
- Filtering
- ICA


In [None]:
# Function to preprocess raw EEG data
def perform_EEG_preprocessing(EEG_raw, verbose=False):
    """
    Preprocessing of the EEG raw data. Placeholder: no processing is applied yet.
    Parameters:
        EEG_raw - the raw EEG data
        verbose - bool, currently not used
    Returns:
        EEG - clean, currently the same object that was passed in
    """
    banner = (
        '\n---------------------------------',
        'Preprocess of raw EEG',
        f'For EEG: {EEG_raw}',
    )
    for banner_line in banner:
        print(banner_line)

    # TODO: Add filtering
    # TODO: Add ICA
    # TODO: Subject 68 had 66 channels but max should be 63? Check during preprocessing, before FOOOFING

    return EEG_raw

## Power Spectrum (PSD) Calculate

Tools Used for PSD:
- MNE-Python: https://mne.tools/stable/index.html

In [None]:
# Function to Calculate the Power Spectrum - MNE
def get_spectra(eeg_raw, verbose=False):
    """
    Calculate the Power Spectra for all channels

    The method and frequency limits come from the pipeline parameters
    psd_method, psd_frequency_min and psd_frequency_max.
    Parameters:
        eeg_raw - raw EEG data, an object providing compute_psd
        verbose - bool, if True print the spectra info and plot the spectra
    Returns:
        Spectra - frequencies and powers for each channel, as returned by compute_psd
    """
    print('\n---------------------------------')
    print("Power Spectrum Computation")
    print(f"Using {psd_method} method for frequencies {psd_frequency_min} to {psd_frequency_max} Hz")

    spectra = eeg_raw.compute_psd(method=psd_method, fmin=psd_frequency_min, fmax=psd_frequency_max)

    if verbose:
        print(spectra.info)

        x_freqs = spectra.freqs
        x_powers = spectra.get_data(return_freqs=False)

        # Label the rows with the channels of the spectra, not of the raw data, as the PSD
        # may cover only a subset of the raw channels. Use row numbers if the counts differ
        ch_names = list(spectra.ch_names)
        if len(ch_names) != x_powers.shape[0]:
            ch_names = [str(row) for row in range(x_powers.shape[0])]

        # Log-log and log-lin plots for all channels
        for scale_name, draw in (('Log-Log', plt.loglog), ('Log-Lin', plt.semilogy)):
            plt.figure(figsize=(12, 6))
            for idx, (ch, ch_powers) in enumerate(zip(ch_names, x_powers)):
                draw(x_freqs, ch_powers, alpha=0.5, label=ch if idx < 10 else None)  # label only first 10 for clarity
            plt.title(f"Power Spectrum ({scale_name}) - All Channels ({psd_method} method)")
            plt.xlabel("Frequency (Hz)")
            plt.ylabel("Power ($V^2/Hz$)")
            plt.tight_layout()
            plt.show()

        # A selected channel: prefer P5, but fall back to the first channel if it is absent
        ch_label = 'P5' if 'P5' in ch_names else ch_names[0]
        selected_powers = x_powers[ch_names.index(ch_label)]
        fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
        fig.suptitle(f'Power Spectrum - Channel {ch_label} ({psd_method} method)', fontsize=20)
        ax0.set_title('Log/Log')
        ax1.set_title('Log/Lin')
        plot_spectra(x_freqs, selected_powers, log_freqs=True, log_powers=True, ax=ax0, colors='blue')
        plot_spectra(x_freqs, selected_powers, log_powers=True, ax=ax1, colors='blue')

        plt.show()

    return spectra


## Spectral Parameterisation

Spectral Parameterisation:
- SpecParam / FOOOF
- The Aperiodic Methods project - Documentation: https://aperiodicmethods.github.io/docs/index.html and Repo: https://github.com/AperiodicMethods/AperiodicMethods
- And cite: https://www.biorxiv.org/content/10.1101/2024.09.15.613114v1

Examples
- Explanation of FOOOFing and calculation: https://fooof-tools.github.io/fooof/auto_tutorials/plot_01-ModelDescription.html#sphx-glr-auto-tutorials-plot-01-modeldescription-py


In [None]:
# FOOOF
from fooof import FOOOF
from fooof import FOOOFGroup
from fooof.plts.spectra import plot_spectra

# Check the version of the module
from fooof import __version__ as fooof_version
print('Current fooof version:', fooof_version)

In [None]:
# Function to perform Spectral Parameterisation and return the Aperiodic and Periodic components
def get_specparams(spectra, verbose=False):
    """
    Execute Spectral Parameterisation for a group of spectra, one per channel

    The FOOOFGroup settings come from the pipeline parameters spm_max_peak_width,
    spm_min_peak_height, spm_max_n_peaks, spm_fit_freq_range and spm_aperiodic_mode.
    Parameters:
        spectra - power spectra providing freqs and get_data, eg as returned by get_spectra
        verbose - bool, if True print and plot the group results and one example channel fit
    Returns:
        Specparam features - dict of aperiodic and periodic components, with keys
            ch<N>_aperiodic_offset, ch<N>_aperiodic_exponent and, for each of the
            spm_max_n_peaks peak slots, ch<N>_pk<M>_periodic_CF / _PW / _BW
            (NaN where a channel has fewer peaks)
    Raises:
        Exception - if there are more channels than spm_max_channels
    """

    print('\n---------------------------------')
    print("Group Spectral Parameterisation")

    # For group of spectra
    freqs = spectra.freqs
    powers = spectra.get_data(return_freqs=False)
    n_channels = powers.shape[0]
    n_frequencies = powers.shape[1]
    freq_resolution = freqs[1] - freqs[0]
    print('For Spectra Group:')
    print(f'- {n_channels} channels & {n_frequencies} frequencies')
    print(f'- Frequency Range: {freqs.min():.0f} to {freqs.max():.0f} Hz')
    print(f'- Frequency resolution {freq_resolution:.2f} Hz')

    # Lower peak width bound is twice the frequency resolution, rounded up to a whole Hz
    peak_width_limits = [int(math.ceil(2 * freq_resolution)), spm_max_peak_width]
    print('Extracting Periodic & Aperiodic Components with Parameters:')
    print(f'- Peak width limits: [{peak_width_limits[0]:.0f}, {peak_width_limits[1]:.0f}]')
    print(f'- Min peak height: {spm_min_peak_height:.2f}')
    print(f'- Max peaks: {spm_max_n_peaks}')
    print(f'- Range: {spm_fit_freq_range}')
    print(f'- Aperiodic Mode: {spm_aperiodic_mode}')

    if n_channels > spm_max_channels:
        raise Exception(f'WARNING: Found: {n_channels} channels, but only {spm_max_channels} max are saved')

    # Initialize FOOOF model
    fg = FOOOFGroup(peak_width_limits=peak_width_limits,
                    max_n_peaks=spm_max_n_peaks, min_peak_height=spm_min_peak_height,
                    aperiodic_mode=spm_aperiodic_mode)

    # Fit the group of n spectra
    fg.fit(freqs, powers, spm_fit_freq_range, progress='tqdm.notebook')

    # Get the component parameters and flatten them into the features dict
    aperiodics = fg.get_params('aperiodic_params')
    periodics = fg.get_params('peak_params')
    specparam_features = _build_specparam_features(aperiodics, periodics, spm_max_n_peaks)

    # Examine results if verbose
    if verbose:
        # Overall group results
        fg.print_results()
        fg.plot()
        plt.show()

        # Visualise the fit for a sample single channel within the group
        # Use the second channel, or the only channel if there is just one
        channel_indx = min(1, n_channels - 1)
        # Use the channel name when the spectra names line up with the fitted rows, else the index
        ch_names = getattr(spectra, 'ch_names', None)
        if ch_names is not None and len(ch_names) == n_channels:
            ch_label = ch_names[channel_indx]
        else:
            ch_label = str(channel_indx)

        fm = fg.get_fooof(ind=channel_indx, regenerate=True)
        fm.print_results()

        fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(12, 8))
        fig.suptitle(f'Spec Param for Channel: {ch_label}', fontsize=20)
        ax0.set_title('Components - Log/Lin')
        ax1.set_title('Components - Log/Log')
        fm.plot(plot_peaks='shade', peak_kwargs={'color' : 'green'}, ax=ax0)
        fm.plot(plot_peaks='shade', peak_kwargs={'color' : 'green'}, plt_log=True, ax=ax1)
        plt.show()

    return specparam_features


def _build_specparam_features(aperiodics, periodics, max_n_peaks):
    """
    Flatten the per-channel aperiodic and peak parameters into one features dict.
    Parameters:
        aperiodics - 2D array, one row per channel, offset first and exponent last
        periodics - 2D array, one row per peak: CF, PW, BW, channel index
        max_n_peaks - int, number of peak slots written per channel
    Returns:
        dict of features, NaN filled for peak slots a channel does not use
    """
    features = {}
    for channel in range(aperiodics.shape[0]):
        features[f'ch{channel}_aperiodic_offset'] = aperiodics[channel, 0]
        # The exponent is the last column: column 1 for 'fixed' mode, but column 2
        # for 'knee' mode, where column 1 holds the knee
        features[f'ch{channel}_aperiodic_exponent'] = aperiodics[channel, -1]

        # The last column of the peak params identifies the channel each peak belongs to
        peaks_ch = periodics[periodics[:, 3] == channel]
        peaks_found = peaks_ch.shape[0]
        if peaks_found > max_n_peaks:
            raise Exception(f'WARNING: Peaks found: {peaks_found} but only {max_n_peaks} saved')

        # Always write every peak slot so that all subjects share the same columns
        for peak in range(max_n_peaks):
            if peak < peaks_found:
                peak_values = peaks_ch[peak, :3]
            else:
                peak_values = (float('nan'),) * 3
            for suffix, value in zip(('CF', 'PW', 'BW'), peak_values):
                features[f'ch{channel}_pk{peak}_periodic_{suffix}'] = value

    return features


# EEG Pipeline Execute

- For a defined study with subject details and raw EEG signal data
- Produce a features dataframe with Spectral Parameterisation results, ie Aperiodic and Periodic components

In [None]:
# End to end pipeline for all datasets

print('\n---------------------------------')
print('EEG Pipeline')

# Rows of the end dataframe - all features for each subject, with subject details and EEG periodic, aperiodic parameters
# Collected in a list and converted once per dataset, rather than growing a dataframe row by row
subject_rows = []
study_features_df = pd.DataFrame()

# Iterate through each dataset
for next_dataset in datasets_list:
    print(f'For dataset: {next_dataset}')

    # Get subjects details
    subjects_df = get_subject_data(eeg_datasets_path, next_dataset, 'participants.tsv')

    # Iterate through each subject in the dataset
    for _, subject_row in subjects_df.iterrows():
        subject_id = subject_row['participant_id']

        print('\n---------------------------------')
        print('EEG Pipeline Start')
        print(f'For Subject: {subject_id}')

        # Get the subject features
        subject_features = {'subject_id': subject_id,
                            'age': subject_row['AGE'], 'gender': subject_row['GENDER'], 'pd': subject_row['TYPE']}

        # Get the raw EEG data
        temp_EEG_raw = get_EEG_raw(eeg_datasets_path, next_dataset, subject_id,
                                   verbose=pipeline_verbose)

        # EEG Preprocessing
        temp_EEG_clean = perform_EEG_preprocessing(temp_EEG_raw, verbose=pipeline_verbose)

        # Power spectrum
        temp_spectra = get_spectra(temp_EEG_clean, verbose=pipeline_verbose)

        # Spectral Param
        specparam_features = get_specparams(temp_spectra, verbose=pipeline_verbose)

        # Add subject and component features to the rows of the features dataframe
        subject_features.update(specparam_features)
        subject_rows.append(subject_features)

    # Save the features dataframe
    # NOTE(review): rows accumulate across datasets, so the file saved for a later dataset
    # also holds the subjects of the earlier datasets - confirm this is intended
    study_features_df = pd.DataFrame(subject_rows)
    save_dataframe(next_dataset, study_features_df)
    # TODO: Catch an exception and abort the entire run

# Remove the temporary loop variables; some never exist if there were no datasets or no subjects
for temp_name in ('next_dataset', 'subjects_df', 'subject_row', 'subject_id', 'subject_features',
                  'specparam_features', 'temp_EEG_raw', 'temp_EEG_clean', 'temp_spectra', 'subject_rows'):
    globals().pop(temp_name, None)
del temp_name
