# 0. Study Setup

- 1 Dataset Download [Jump To](#run-1-eeg-dataset-download)
- 2 Study Setup [Jump To](#run-2-study-setup)



## Ref Links

- The Brain Imaging Data Structure (BIDS): https://bids.neuroimaging.io
- MNE-Python: https://mne.tools/stable/index.html

## Dependencies

General dependencies:
- python = 3.11.13
- numpy = 2.0.2
- scipy = 1.15.3
- pandas = 2.2.3
- matplotlib = 3.10.3

EEG specific dependencies:
- mne = 1.9.0

# Imports & Functions

## Imports

In [None]:
# General imports
import os
import sys
import gc
import warnings

from datetime import datetime
from pprint import pprint
import time
import pickle
import random
from collections import Counter

import subprocess
import shutil

# Custom Functions
sys.path.append(os.path.abspath('../Notebooks/Utilities')) 
import cust_utilities as utils

# Maths, Pandas etc
import math
import numpy as np
import pandas as pd
import scipy as sci

# MNE-Python
import mne


## Subjects

In [None]:
# Create a subjects df using the given source participants.tsv

def create_subjects_df(subjects_file, study_name):
    """
    Build the study subjects table from a tab-separated participants file.

    Parameters
    ----------
    subjects_file : str or file-like
        Source participants.tsv; passed unchanged to pandas.read_csv.
    study_name : str
        Value written to every row of the 'study_name' column.

    Returns
    -------
    subjects_df : pandas.DataFrame
        Columns 'study_name', 'subject_id', 'pd', 'age' and 'gender'.
        'pd' is 1 for 'PD' and 0 for 'Control' / 'CTL' (labels are matched
        ignoring case and surrounding whitespace), or is passed through
        unchanged when the source column already holds only 0 / 1.

    Raises
    ------
    IOError
        If the file cannot be read.
    KeyError
        If none of the supported column layouts is present.
    ValueError
        If the 'pd' column holds a value that is neither 0 / 1 nor a
        recognised label.
    """
    # Read the dataset tsv file to get the selected subjects data
    try:
        source_df = pd.read_csv(subjects_file, sep='\t')
    except Exception as e:
        raise IOError(f"Failed to read subjects file '{subjects_file}': {e}") from e

    # Cater for different format participants.tsv files:
    # (source column names, standardised column names)
    column_layouts = [
        (['participant_id', 'GROUP', 'AGE', 'GENDER'], ['subject_id', 'pd', 'age', 'gender']),
        (['participant_id', 'Group', 'age', 'sex'], ['subject_id', 'pd', 'age', 'gender']),
        (['participant_id', 'age', 'gender'], ['subject_id', 'age', 'gender']),
    ]
    for cols, new_cols in column_layouts:
        try:
            subjects_df = source_df[cols].copy()
        except KeyError:
            # This layout does not match the file, try the next one
            continue
        subjects_df.columns = new_cols
        break
    else:
        raise KeyError(
            "Required columns not found in subjects file. "
            f"Found: {source_df.columns.tolist()}"
        )

    # Add the study_name (slightly redundant)
    subjects_df.insert(0, 'study_name', study_name)

    # Derive 'pd' column if not present, based on subject_id containing 'pd'
    if 'pd' not in subjects_df.columns:
        subjects_df['pd'] = subjects_df['subject_id'].apply(lambda x: 1 if 'pd' in str(x).lower() else 0)
        # NOTE(review): this places 'pd' before 'subject_id', whereas the other
        # layouts yield 'subject_id' first - kept as-is, confirm if intended.
        cols = list(subjects_df.columns)
        cols.insert(1, cols.pop(cols.index('pd')))
        subjects_df = subjects_df[cols]

    # Change 'PD', 'Control', 'CTL' labels to 1 / 0
    if not subjects_df['pd'].isin([0, 1]).all():
        label_to_flag = {'pd': 1, 'control': 0, 'ctl': 0}
        # Normalise so that e.g. 'control' or ' PD ' are accepted as well
        labels = subjects_df['pd'].astype(str).str.strip().str.casefold()
        unknown = ~labels.isin(list(label_to_flag))
        if unknown.any():
            invalid_vals = subjects_df.loc[unknown, 'pd'].unique()
            raise ValueError(f"Invalid values in 'pd' column: {invalid_vals}")
        subjects_df['pd'] = labels.map(label_to_flag).astype(int)

    return subjects_df

## EEG Data

In [None]:
# Function to get the raw EEG signal data
#

def _read_eeglab_set(folder_path, file_name, subject):
    """Load one EEGLAB .set file, report load warnings and drop boundary annotations."""
    # utils.get_file_path is a project helper; presumably it resolves and
    # validates the path - confirm against cust_utilities.
    eeg_lab_file_path = utils.get_file_path(folder_path, file_name)
    try:
        # Record warnings instead of emitting them so they can be tagged with the subject
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            eeg_dataset_raw = mne.io.read_raw_eeglab(eeg_lab_file_path, preload=True, verbose=True)

        for warn in caught:
            print(f"Warning loading EEG data for subject {subject}: {warn.message}")

        # Any load warning triggers removal of the 'boundary' annotations (done once)
        if caught and eeg_dataset_raw.annotations is not None:
            print('Removing Annotations')
            boundary_idx = [
                i for i, desc in enumerate(eeg_dataset_raw.annotations.description)
                if 'boundary' in desc.lower()
            ]
            if boundary_idx:
                eeg_dataset_raw.annotations.delete(boundary_idx)
    except Exception as e:
        raise ValueError(f"Failed to load EEG data for subject {subject}: {e}") from e

    return eeg_dataset_raw


def get_EEG_raw(dataset_path, subject_id, task):
    """
    Get raw EEG data for one subject from a BIDS-style dataset folder.

    The subject folder may hold an 'eeg' folder directly, or one or more
    'ses-*' session folders that each hold an 'eeg' folder. Files are
    expected in EEGLAB .set format, named
    '<subject>[_<session>]_task-<task>_eeg.set'.

    Parameters
    ----------
    dataset_path : str
        Root folder of the dataset.
    subject_id : str
        Subject folder name, also used as the file name prefix.
    task : str
        Task label used in the file name, eg 'Rest'.

    Returns
    -------
    sessions_eeg_raw : list
        Raw objects returned by mne.io.read_raw_eeglab, one per session in
        sorted session order, or a single item when there are no session
        folders.

    Raises
    ------
    FileNotFoundError
        If the subject folder holds neither an 'eeg' nor a 'ses-*' folder.
    ValueError
        If an EEG file cannot be loaded.
    """

    datatype = 'eeg'

    # Cater for none or multiple sessions, eg 'ses-01', 'ses-02'
    subject_path = os.path.join(dataset_path, subject_id)
    subfolders = [f for f in os.listdir(subject_path) if os.path.isdir(os.path.join(subject_path, f))]
    if datatype in subfolders:
        session_dirs = []
    else:
        session_dirs = sorted(f for f in subfolders if f.startswith('ses-'))
        if not session_dirs:
            raise FileNotFoundError(f"No 'eeg' or session folder found in {subject_path}")

    # No session level: a single file directly under <subject>/eeg
    if not session_dirs:
        file_name = f'{subject_id}_task-{task}_{datatype}.set'
        return [_read_eeglab_set(os.path.join(subject_path, datatype), file_name, subject_id)]

    # One file per session under <subject>/<session>/eeg
    sessions_eeg_raw = []
    for next_session in session_dirs:
        file_name = f'{subject_id}_{next_session}_task-{task}_{datatype}.set'
        session_path = os.path.join(subject_path, next_session, datatype)
        sessions_eeg_raw.append(_read_eeglab_set(session_path, file_name, subject_id))

    return sessions_eeg_raw

In [None]:
# Function to save EEG data for a given subject
#

def save_EEG_for_subject(eeg_data, folder_path, subject_name, tag, epoched=False, format='fif', verbose=False):
    """
    Save the EEG data of one subject to the given folder.

    The file is named '<subject_name>_<tag>_epo.<format>' for epoched data
    and '<subject_name>_<tag>_raw.<format>' otherwise. Existing files are
    never overwritten (overwrite=False is passed to the writer).

    Parameters
    ----------
    eeg_data : object
        Data to save; must provide .save() for 'fif', or be accepted by
        mne.export.export_raw / mne.export.export_epochs for 'set'.
    folder_path : str
        Target folder.
    subject_name : str
    tag : str
        Free label inserted in the file name, eg 'source'.
    epoched : bool
        True when eeg_data holds epochs rather than continuous data.
    format : str
        'fif' or 'set' (EEGLAB).
    verbose : bool
        Passed through to the MNE writer.

    Raises
    ------
    ValueError
        If format is not 'fif' or 'set'.
    """
    # Reject unknown formats before building any path
    if format not in ('fif', 'set'):
        raise ValueError(f"Unsupported format: {format}")

    suffix = 'epo' if epoched else 'raw'
    subject_results_file = os.path.join(folder_path, f'{subject_name}_{tag}_{suffix}.{format}')

    if format == 'fif':
        eeg_data.save(subject_results_file, overwrite=False, verbose=verbose)
    elif epoched:
        # Epoched data needs the epochs exporter, export_raw only handles continuous data
        mne.export.export_epochs(subject_results_file, eeg_data, fmt='eeglab', overwrite=False, verbose=verbose)
    else:
        mne.export.export_raw(subject_results_file, eeg_data, fmt='eeglab', overwrite=False, verbose=verbose)


# Run 1: EEG Dataset Download


In [None]:
# Download the dataset by running its shell script from inside the EEG source folder

#---- Run Parameters --------------------------------
eeg_folder = 'EEG_Datasets_Source_exgithub'
dataset_name = 'ds004580-1.0.0'
script = 'ds004580-1.0.0.sh'
#----------------------------------------------------

# Resolve the source folder, the script in it, and the dataset folder
data_folder_path = utils.get_folder_path(eeg_folder)
script_file_path = utils.get_file_path(data_folder_path, script)
results_folder_path = utils.extend_folder_path(data_folder_path, dataset_name, exists_ok=False)

# The script is invoked by name with the source folder as working directory;
# its output is captured and only shown once it has finished.
# shutil.copy(script_file_path, results_folder_path)
download_cmd = ['bash', script]
try:
    result = subprocess.run(download_cmd, cwd=data_folder_path, check=True, capture_output=True, text=True)
except subprocess.CalledProcessError as e:
    # A non-zero exit status is reported, not re-raised
    print(f"Script failed with error:\n{e.stderr}")
else:
    print(result.stdout)
    # os.remove(os.path.join(results_folder_path, script))


# Run 2: Study Setup

In [None]:
# Execute the study setup - run just once when starting on a particular study
#

#---- Run Parameters --------------------------------
# Study Specific Details - exactly one preset is left uncommented
# NOTE(review): the IOWA_Rest preset below has dataset_ref ds004584 but a
# source_url for ds003490 - confirm the url before enabling it.
# study_name = 'IOWA_Rest'
# study_task = 'Rest'
# dataset_ref = 'ds004584-1.0.0'
# source_url = 'https://doi.org/10.18112/openneuro.ds003490.v1.1.0'
# source_EEG_format = {'format': 'EEGLab', 
#                      'types': ['EEGLab', 'BDF']}

study_name = 'UNM_Oddball'
study_task = 'Rest'
dataset_ref = 'ds003490-1.1.0'
source_url = 'https://doi.org/10.18112/openneuro.ds003490.v1.1.0'
source_EEG_format = {
    'format': 'EEGLab',
    'types': ['EEGLab', 'BDF'],
}

# study_name = 'UCSD_Rest'
# study_task = 'Rest'
# dataset_ref = 'ds002778-1.0.2'
# source_url = 'https://doi.org/10.18112/openneuro.ds002778.v1.0.2'
# source_EEG_format = {'format': 'BDF', 
#                      'types': ['EEGLab', 'BDF']}

# study_name = 'IOWA_Simon'
# study_task = 'Simon'
# dataset_ref = 'ds004580-1.0.0'
# source_url = 'https://doi.org/10.18112/openneuro.ds004580.v1.0.0'
# source_EEG_format = {'format': 'BDF', 
#                      'types': ['EEGLab', 'BDF']}
#----------------------------------------------------

# EEG Datasets Source Structure - BIDS
eeg_datasets_source_folder = 'EEG_Datasets_Source_exgithub'
subjects_list = 'participants.tsv'

# Names of the standard sub-folders created inside the study folder
EEG_Source_Data = '0_EEG_Source_Data'
EEG_Processing_Results = '1_EEG_Processing_Results'
ML_Training_Results = '2_ML_Training_Results'

# The requested dataset must be one of the 'ds*' folders in the source folder
eeg_datasets_folder_path = utils.get_folder_path(eeg_datasets_source_folder)
datasets_list = [
    d for d in os.listdir(eeg_datasets_folder_path)
    if d.startswith('ds') and os.path.isdir(os.path.join(eeg_datasets_folder_path, d))
]
if dataset_ref not in datasets_list:
    raise ValueError(f"Dataset '{dataset_ref}' not found in list of available datasets: {datasets_list}")
dataset_path = utils.get_folder_path(f'{eeg_datasets_source_folder}/{dataset_ref}')

# Create the study folder first, then its three standard sub-folders
study_root = f'Study_{study_name}'
study_folder_path = utils.make_folder_path(study_root, exists_ok=False)
eeg_source_data_path = utils.make_folder_path(f'{study_root}/{EEG_Source_Data}', exists_ok=False)
eeg_processing_results_path = utils.make_folder_path(f'{study_root}/{EEG_Processing_Results}', exists_ok=False)
ml_training_results_path = utils.make_folder_path(f'{study_root}/{ML_Training_Results}', exists_ok=False)

# Collect the key study info; only the 'format' entry of source_EEG_format is kept
study_info = pd.Series({
    'study_name': study_name,
    'dataset_ref': dataset_ref,
    'source_url': source_url,
    'source_EEG_format': source_EEG_format['format'],
    'dataset_path': dataset_path,
    'eeg_source_data_path': eeg_source_data_path,
    'eeg_processing_results_path': eeg_processing_results_path,
    'ml_training_results_path': ml_training_results_path,
})

# Build the subjects table from the dataset's participants file
subjects_file = utils.get_file_path(dataset_path, subjects_list)
study_subjects_df = create_subjects_df(subjects_file, study_name)

# Pickle the study info and subjects table into the study folder;
# a failure is reported and the run carries on
try:
    study_info.to_pickle(study_folder_path + '/study_inf.pkl', compression='zip')
    study_subjects_df.to_pickle(study_folder_path + '/study_subjects_df.pkl', compression='zip')
except Exception as e:
    print(f"Dataframe pickle save failed: {e}")

# Load the source EEG data of every subject and save it in the study folder
for subject_id in study_subjects_df['subject_id']:

    print(f'--- Subject: {subject_id} -----------------------------------------')

    EEG_raw_files = get_EEG_raw(dataset_path, subject_id, task=study_task)
    # TODO: How to handle multiple session EEG raw files?
    # For now only the first returned session is saved
    EEG_raw = EEG_raw_files[0]

    save_EEG_for_subject(EEG_raw, eeg_source_data_path, subject_id, tag='source')
    print(EEG_raw)
    print(EEG_raw.info)
