# 0. Study & EEG Dataset Acquisition

Scope:
- Download dataset from OpenNeuro
- Establish a simple Study dataframe of reference information
- Establish a flexible folder structure to retain multiple processing runs
- Establish a dataframe of subjects in the study



# Imports & Utilities

In [1]:
# General imports
import os
import sys
import gc
from datetime import datetime
from pprint import pprint
import time
import pickle

import subprocess
import shutil

# Custom Functions
sys.path.append(os.path.abspath('../Notebooks/Utilities')) 
import cust_utilities as utils

# Maths, Pandas etc
import math
import numpy as np
import pandas as pd
import scipy as sci


# Classes & Functions

In [2]:
# Create a subjects df from the dataset's source participants.tsv file

def create_subjects_df(subjects_file):
    """Build a normalised subjects DataFrame from a participants.tsv file.

    The tab-separated file is read with pandas and reduced to a common set of
    columns (``subject_id``, ``pd``, ``age``, ``gender``), whichever of the
    supported source column layouts it uses.

    Args:
        subjects_file: Path (or file-like object) of the tab-separated
            participants file.

    Returns:
        A DataFrame with columns ``subject_id``, ``pd``, ``age``, ``gender``.
        When the group labels are textual, or are derived from the subject id,
        ``pd`` holds 1 for a Parkinson's subject and 0 for a control.

    Raises:
        IOError: If the file cannot be read or parsed.
        KeyError: If none of the supported column layouts is present.
        ValueError: If the group column holds labels that are not recognised.
    """
    # Read the dataset's tab-separated participants file
    try:
        raw_df = pd.read_csv(subjects_file, sep='\t')
    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback
        raise IOError(f"Failed to read subjects file '{subjects_file}': {e}") from e

    # Cater for the different participants.tsv layouts: (source columns, normalised names)
    column_layouts = [
        (['participant_id', 'GROUP', 'AGE', 'GENDER'], ['subject_id', 'pd', 'age', 'gender']),
        (['participant_id', 'Group', 'age', 'sex'], ['subject_id', 'pd', 'age', 'gender']),
        (['participant_id', 'age', 'gender'], ['subject_id', 'age', 'gender']),
    ]
    for cols, new_cols in column_layouts:
        # First layout whose columns are all present wins
        if set(cols).issubset(raw_df.columns):
            subjects_df = raw_df[cols].copy()
            subjects_df.columns = new_cols
            break
    else:
        raise KeyError(
            "Required columns not found in subjects file. "
            f"Found: {raw_df.columns.tolist()}"
        )

    # Derive 'pd' if the file has no group column: a subject id containing 'pd'
    # (case-insensitive) marks a Parkinson's subject. Insert it as the second column.
    if 'pd' not in subjects_df.columns:
        pd_flags = [int('pd' in str(sid).lower()) for sid in subjects_df['subject_id']]
        subjects_df.insert(1, 'pd', pd_flags)

    # Change 'PD' / 'Control' / 'CTL' labels to 1 / 0 (columns already 0/1 are left as-is)
    if not subjects_df['pd'].isin([0, 1]).all():
        label_codes = {'PD': 1, 'CONTROL': 0, 'CTL': 0}
        # Compare on a trimmed, upper-cased copy so 'ctl', 'control', 'PD ' etc. are accepted
        normalised = subjects_df['pd'].map(lambda value: str(value).strip().upper())
        codes = normalised.map(label_codes)
        unknown = codes.isna()
        if unknown.any():
            invalid_vals = subjects_df.loc[unknown, 'pd'].unique()
            raise ValueError(f"Invalid values in 'pd' column: {invalid_vals}")
        subjects_df['pd'] = codes.astype(int)

    return subjects_df

# EEG Dataset Download


In [None]:
# Download the dataset by running its shell script from inside the EEG source folder

eeg_folder = 'EEG_Datasets_Source_exgithub'
dataset_name = 'ds002778-1.0.2'
script = 'ds002778-1.0.2.sh'

# Resolve the folders via the project helpers
# NOTE(review): presumably get_file_path checks the script is present and
# extend_folder_path creates the per-dataset folder - confirm in cust_utilities
data_folder_path = utils.get_folder_path(eeg_folder)
script_file_path = utils.get_file_path(data_folder_path, script)
results_folder_path = utils.extend_folder_path(data_folder_path, dataset_name, exists_ok=False)

# The script is invoked by name with the source folder as working directory;
# output is captured as text so it can be echoed once the run finishes
# shutil.copy(script_file_path, results_folder_path)
download_cmd = ['bash', script]
try:
    result = subprocess.run(
        download_cmd,
        cwd=data_folder_path,
        check=True,
        capture_output=True,
        text=True,
    )
except subprocess.CalledProcessError as e:
    # check=True raises on a non-zero exit status; report the script's stderr
    print(f"Script failed with error:\n{e.stderr}")
else:
    print(result.stdout)
    # os.remove(os.path.join(results_folder_path, script))


# Study Setup

In [3]:
# Execute the Study setup - run just once when starting a particular study
#
# Checks the source dataset is available, builds the subjects dataframe from
# its participants.tsv, creates the study folders and pickles the study
# reference information alongside the subjects dataframe.

# Study Specific Details (exactly one study configuration should be active)
study_name = 'IOWA_Rest'
dataset_ref = 'ds004584-1.0.0'
# DOI matches dataset_ref (previously pointed at ds003490, the UNM_Oddball dataset)
source_url = 'https://doi.org/10.18112/openneuro.ds004584.v1.0.0'
source_EEG_format = {'format': 'EEGLab',
                     'types': ['EEGLab', 'BDF']}

# study_name = 'UNM_Oddball'
# dataset_ref = 'ds003490-1.1.0'
# source_url = 'https://doi.org/10.18112/openneuro.ds003490.v1.1.0'
# source_EEG_format = {'format': 'EEGLab',
#                      'types': ['EEGLab', 'BDF']}

# study_name = 'UCSD_Rest_New'
# dataset_ref = 'ds002778-1.0.2'
# source_url = 'https://doi.org/10.18112/openneuro.ds002778.v1.0.2'
# source_EEG_format = {'format': 'BDF',
#                      'types': ['EEGLab', 'BDF']}

# ----------------------------------------
# Folder Names
eeg_datasets_source_folder = 'EEG_Datasets_Source_exgithub'
EEG_Processing_Results = '1_EEG_Processing_Results'
ML_Training_Results = '2_ML_Training_Results'

# EEG Datasets Source Structure - BIDS
subjects_list = 'participants.tsv'

# Check for EEG source datasets: only sub-folders named 'ds...' count as datasets
eeg_datasets_folder_path = utils.get_folder_path(eeg_datasets_source_folder)
datasets_list = [
    d for d in os.listdir(eeg_datasets_folder_path)
    if d.startswith('ds') and os.path.isdir(os.path.join(eeg_datasets_folder_path, d))
]
if dataset_ref not in datasets_list:
    raise ValueError(f"Dataset '{dataset_ref}' not found in list of available datasets: {datasets_list}")
dataset_path = utils.get_folder_path(eeg_datasets_source_folder + '/' + dataset_ref)

# Get the subjects data from the subjects file
subjects_file = utils.get_file_path(dataset_path, subjects_list)
study_subjects_df = create_subjects_df(subjects_file)

# Establish standard folder paths
# NOTE(review): exists_ok=False presumably makes these calls fail if the study
# folders already exist, which fits the "run just once" intent - confirm
study_folder_path = utils.make_folder_path('Study_' + study_name, exists_ok=False)
eeg_processing_results_path = utils.make_folder_path('Study_' + study_name + '/' + EEG_Processing_Results, exists_ok=False)
ml_training_results_path = utils.make_folder_path('Study_' + study_name + '/' + ML_Training_Results, exists_ok=False)

# Study reference information, held as a Series keyed by field name
study_info = pd.Series({
    'study_name': study_name,
    'dataset_ref': dataset_ref,
    'source_url': source_url,
    'source_EEG_format': source_EEG_format['format'],
    'dataset_path': dataset_path,
    'eeg_processing_results_path': eeg_processing_results_path,
    'ml_training_results_path': ml_training_results_path,
})

# Save the study info and the subjects dataframe into the study folder
# NOTE(review): the file name is 'study_inf.pkl' (not 'study_info'); left
# unchanged in case other notebooks load it by this name - confirm
try:
    study_info.to_pickle(os.path.join(study_folder_path, 'study_inf.pkl'), compression='zip')
    study_subjects_df.to_pickle(os.path.join(study_folder_path, 'study_subjects_df.pkl'), compression='zip')
except Exception as e:
    # Best-effort save: report the failure without aborting the cell
    print(f"Dataframe pickle save failed: {e}")

