In [None]:
# Mount Google Drive at /content/drive; the project package added to sys.path
# in the import cell (/content/drive/MyDrive/BmiResearch) lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import codecs, json
from tqdm import tqdm
import typing as t
import numpy as np
!pip install mne
import mne
import pandas as pd
import scipy.signal as sps
from scipy import stats

import sys
sys.path.append('/content/drive/MyDrive/BmiResearch')

from constants.constants import ORIGINAL_DATASET_PATH, PREPROCESSED_DATASET_PATH
from constants.constants import EEG_CHANNELS
from utils.debugger import logger
# from pipeline_structure.preprocessing.preprocessed_dataset import to_preprocessed_dataset
from utils.processer import train_test_split_time_eeg, sliding_window_iter, create_output_for_chunks



In [None]:
# Sanity check: number of channel names listed in EEG_CHANNELS.
len(EEG_CHANNELS)

33

In [None]:
# Filter band edges handed to raw.filter(low_filter, high_filter) during preprocessing.
low_filter = 1
high_filter = 20
# Sampling rate used to turn event onsets into sample indices (onset * frequency);
# also forwarded to train_test_split_time_eeg.
frequency = 500
# Forwarded to train_test_split_time_eeg — presumably the duration (in minutes) of the
# held-out test segment; TODO confirm in utils.processer.
minutes_for_test = 2
# Window parameters forwarded to sliding_window_iter(data, size, overlap) — presumably
# expressed in samples; TODO confirm in utils.processer.
size = 100
overlap = 80

In [None]:
def preprocess_eeg_data(low_filter: float, high_filter: float, frequency: float, subject_eeg_folder: str,
                        subject: str, eeg_file: str, events_file: str, preprocessed_path: str):
    """Filter one subject's EEG recording, label every sample with an event and save it.

    The recording is read with ``mne.io.read_raw_eeglab``, filtered with
    ``raw.filter(low_filter, high_filter)`` and reduced to the channels that are
    listed in ``EEG_CHANNELS``. Every sample is labelled with the ``value`` of the
    most recent event from the events file (samples that precede the first event
    get an empty label). Only samples labelled ``RightTO``, ``RightHS``,
    ``LeftTO`` or ``LeftHS`` are written to ``<preprocessed_path>/eeg_data.csv``.

    Args:
        low_filter: Lower band edge passed to ``raw.filter``.
        high_filter: Upper band edge passed to ``raw.filter``.
        frequency: Sampling rate used to convert event onsets to sample indices
            (``onset * frequency``). It is assumed to match the recording's
            sampling rate; this is not checked here.
        subject_eeg_folder: Folder that contains ``eeg_file`` and ``events_file``.
        subject: Subject identifier, used for logging only.
        eeg_file: File name of the EEGLAB recording inside ``subject_eeg_folder``.
        events_file: File name of the tab-separated events table; it must provide
            the columns ``onset`` and ``value``.
        preprocessed_path: Existing folder that receives ``eeg_data.csv``.

    Returns:
        None. Any exception is logged through ``logger.error`` and swallowed so a
        caller iterating over subjects can carry on with the next one.
    """
    try:
        logger.info(f"Reading EEG data from {subject}")
        # os.path.join works whether or not the folder carries a trailing separator.
        raw = mne.io.read_raw_eeglab(os.path.join(subject_eeg_folder, eeg_file), preload=True)

        raw.filter(low_filter, high_filter)

        eeg_data = raw.get_data()
        eeg_channels = [name for name in raw.ch_names if name in EEG_CHANNELS]

        events_dataframe = pd.read_csv(os.path.join(subject_eeg_folder, events_file), sep='\t')

        # Round before the integer cast: a plain astype(int) truncates, so an onset
        # whose product lands a hair below an integer (floating-point error) would
        # be assigned to the previous sample.
        events_dataframe['onset_sample'] = (events_dataframe['onset'] * frequency).round().astype(int)

        # Sample index -> event value; when two events share a sample the later row wins.
        task_dict = events_dataframe.set_index('onset_sample')['value'].to_dict()

        # Carry the most recent event label forward over all samples.
        tasks_list = []
        task = ''
        for sample_idx in range(eeg_data.shape[1]):
            task = task_dict.get(sample_idx, task)
            tasks_list.append(task)

        eeg_dataframe = pd.DataFrame(eeg_data.T, columns=raw.ch_names)[eeg_channels].copy()
        eeg_dataframe['task'] = tasks_list

        filtered_eeg_dataframe = eeg_dataframe[eeg_dataframe['task'].isin(['RightTO', 'RightHS', 'LeftTO', 'LeftHS'])]
        print(filtered_eeg_dataframe.columns)

        filtered_eeg_dataframe.to_csv(os.path.join(preprocessed_path, 'eeg_data.csv'), index=False)
        logger.info(f'Saved to {preprocessed_path}')

    except Exception as e:
        logger.error(f'Unexpected error: {e}')

In [None]:
def to_preprocessed_dataset(low_filter, high_filter, frequency, skip_first=6):
    """Run ``preprocess_eeg_data`` for the subjects found in ``ORIGINAL_DATASET_PATH``.

    The entries of ``ORIGINAL_DATASET_PATH`` are sorted by name and the first
    ``skip_first`` of them are ignored. For every remaining entry the EEG
    recording (file name containing ``eeg.set``) and the events table (file name
    containing ``events.tsv``) are looked up in ``<entry>/eeg/`` and handed to
    ``preprocess_eeg_data``. Results go to
    ``PREPROCESSED_DATASET_PATH/low-<low>-high-<high>-frequency-<frequency>/<entry>``.

    Args:
        low_filter: Lower band edge forwarded to ``preprocess_eeg_data``.
        high_filter: Upper band edge forwarded to ``preprocess_eeg_data``.
        frequency: Sampling rate forwarded to ``preprocess_eeg_data``.
        skip_first: Number of leading entries (after sorting) to skip. The
            default of 6 keeps the previously hard-coded behaviour.

    Returns:
        None. Problems with a single subject are logged and that subject is
        skipped; a missing dataset root is logged as well.
    """
    required_files = ['eeg.set', 'events.tsv']

    try:
        logger.info(f"Reading BIDS dataset from {ORIGINAL_DATASET_PATH}")
        subjects_list = sorted(os.listdir(ORIGINAL_DATASET_PATH))
        print(subjects_list)

        selected_subjects = subjects_list[skip_first:]

        for subject in tqdm(selected_subjects, total=len(selected_subjects)):
            subject_eeg_folder = f"{ORIGINAL_DATASET_PATH}/{subject}/eeg/"

            # A missing eeg/ folder concerns this subject only; do not abort the run.
            if not os.path.isdir(subject_eeg_folder):
                logger.error(f'EEG folder {subject_eeg_folder} not found, skipping {subject}.')
                continue

            subject_eeg_files = os.listdir(subject_eeg_folder)

            # next(..., None) avoids a StopIteration that would end the whole loop.
            selected_files = {ftype: next((file_name for file_name in subject_eeg_files if ftype in file_name), None)
                              for ftype in required_files}
            missing_files = [ftype for ftype, file_name in selected_files.items() if file_name is None]
            if missing_files:
                logger.error(f'Missing {missing_files} in {subject_eeg_folder}, skipping {subject}.')
                continue

            preprocessed_path = (f'{PREPROCESSED_DATASET_PATH}/low-{low_filter}-high-{high_filter}-frequency'
                                 f'-{frequency}/{subject}')

            # exist_ok=True keeps the function re-runnable; the output folder is
            # created only once the inputs are known to exist.
            os.makedirs(preprocessed_path, exist_ok=True)

            preprocess_eeg_data(low_filter, high_filter, frequency, subject_eeg_folder, subject,
                                selected_files['eeg.set'], selected_files['events.tsv'],
                                preprocessed_path)

    except FileNotFoundError:
        logger.error(f'Dataset path {ORIGINAL_DATASET_PATH} not found.')

    except Exception as e:
        logger.error(f'Unexpected error: {e}')

In [None]:
# Show which result folders already exist under the preprocessed dataset root.
print(os.listdir(PREPROCESSED_DATASET_PATH))

['low-1-high-20-frequency-500', 'baseline_chunked_data', 'SLF', 'ASR', 'ASR_ICA', 'NOSP_ICA', 'SLF_ICA', 'SLF_CSP', 'ASR_CSP', 'NOSP_CSP', 'SLF_STFT', 'ASR_STFT', 'NOSP_STFT']


In [None]:
# Run the per-subject preprocessing with the filter/sampling settings defined above.
to_preprocessed_dataset(low_filter, high_filter, frequency)

In [None]:
def prep_chanks(data_chanks_list_train):
    """Stack the EEG channel columns of every chunk into a single array.

    Each chunk is indexed with ``EEG_CHANNELS`` and transposed, so one chunk
    contributes a channels-by-rows slice. Progress is shown with tqdm.

    Args:
        data_chanks_list_train: Iterable of chunks that support column selection
            with ``EEG_CHANNELS`` (e.g. pandas DataFrames).

    Returns:
        ``np.ndarray`` built from the list of transposed chunks.
    """
    print('[prep_chanks]')
    transposed_chunks = [
        window[EEG_CHANNELS].copy().T
        for window in tqdm(data_chanks_list_train)
    ]
    return np.array(transposed_chunks)

In [None]:
def _dump_json(payload, path):
    """Write ``payload`` to ``path`` as UTF-8 JSON and close the file afterwards."""
    # The context manager guarantees the handle is flushed and closed even when
    # json.dump raises part-way through.
    with codecs.open(path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle,
                  separators=(',', ':'),
                  sort_keys=True,
                  indent=4)


preprocessed_path = (f'{PREPROCESSED_DATASET_PATH}/low-{low_filter}-high-{high_filter}-frequency'
                     f'-{frequency}')

# Build the preprocessed dataset first when it does not exist yet.
if not os.path.exists(preprocessed_path):
    to_preprocessed_dataset(low_filter, high_filter, frequency)

# Task label -> integer class id used for the per-sample training targets.
class_names_dict = {'RightTO': 0, 'RightHS': 1, 'LeftTO': 2, 'LeftHS': 3}

for subject in sorted(os.listdir(preprocessed_path)):
    try:
        print(subject)
        subject_path = f'{preprocessed_path}/{subject}/eeg_data.csv'
        preprocessed_eeg_train, preprocessed_eeg_test = train_test_split_time_eeg(subject_path, minutes_for_test, frequency)
        output_path = (f'{PREPROCESSED_DATASET_PATH}/baseline_chunked_data_2/{subject}')
        # exist_ok=True lets the cell be re-run; existing JSON files are overwritten.
        os.makedirs(output_path, exist_ok=True)

        # --- test split: sliding windows and their targets ---
        print('chunks_test')
        chunks_test = list(sliding_window_iter(preprocessed_eeg_test, size, overlap))
        print(f'Length of chunks for test = {len(chunks_test)} data windows')
        print('y_test')
        y_test = create_output_for_chunks(chunks_test)
        chunks_test = prep_chanks(chunks_test)

        chunks_test = chunks_test.tolist()
        _dump_json(chunks_test, f'{output_path}/X_test_chunks.json')

        print('y_test write')
        y_test = y_test.tolist()
        _dump_json(y_test, f'{output_path}/y_test_chunks.json')

        # --- train split: un-windowed samples and per-sample class ids ---
        print('fit_X')
        fit_X = preprocessed_eeg_train[EEG_CHANNELS].to_numpy()
        fit_X = fit_X.tolist()
        _dump_json(fit_X, f'{output_path}/X_fit.json')

        print('fit_y')
        fit_y = np.array([class_names_dict[el] for el in preprocessed_eeg_train['task'].tolist()])
        fit_y = fit_y.tolist()
        _dump_json(fit_y, f'{output_path}/y_fit.json')

        # --- train split: sliding windows and their targets ---
        print('chunks_train')
        chunks_train = list(sliding_window_iter(preprocessed_eeg_train, size, overlap))
        print(f'Length of chunks for train = {len(chunks_train)} data windows')
        print('y_train')
        y_train = create_output_for_chunks(chunks_train)
        chunks_train = prep_chanks(chunks_train)
        chunks_train = chunks_train.tolist()
        _dump_json(chunks_train, f'{output_path}/X_train_chunks.json')

        print('y_train write')
        y_train = y_train.tolist()
        _dump_json(y_train, f'{output_path}/y_train_chunks.json')

    except Exception as e:
        # Best effort: report the failure and continue with the next subject.
        print(f'Unexpected error: {e}')
