In [None]:
import os
import mne
import numpy as np
import pandas as pd
from scipy.signal import resample
# import h5py
# import mat73
import warnings
warnings.filterwarnings("ignore")

: 

In [2]:
SAMPLE_RATE = 128  # fs
SAMPLE_LEN = 128   # T

In [3]:
# root dir
root = '/home/stud/timlin/bhome/DiffusionEEG/data/caueeg_bids'
# participants file path
participants_path = os.path.join(root, 'participants.tsv')
participants = pd.read_csv(participants_path, sep='\t')
participants

Unnamed: 0,participant_id,age,group,ad_syndrome,ad_syndrome_3
0,sub-00001,78,mci_ad,mci,mci
1,sub-00002,56,hc,smc,hc (+smc)
2,sub-00003,93,,,
3,sub-00004,78,ad,dementia,dementia
4,sub-00005,75,vd,mci,mci
...,...,...,...,...,...
1374,sub-01384,57,ad,dementia,dementia
1375,sub-01385,77,ad,dementia,dementia
1376,sub-01386,80,ad,dementia,dementia
1377,sub-01387,83,mci_ad,mci,mci


## Labels

In [4]:
labels = np.empty(shape=(participants.shape[0],2), dtype='int32')
labels.shape

(1379, 2)

In [7]:
for i, participant in enumerate(participants.values):
    # Check if the value is NaN
    if pd.isna(participant[4]):
        labels[i, 0] = -1  # or any label you choose for missing values
    elif 'hc (+smc)' in participant[4]:
        labels[i, 0] = 0
    elif 'mci' in participant[4]:
        labels[i, 0] = 1
    elif 'dementia' in participant[4]:
        labels[i, 0] = 2
    else:
        labels[i, 0] = -1
    labels[i, 1] = subject_id
    subject_id += 1


In [8]:
label_path = 'Processed/CAUEEG/Label'
if not os.path.exists(label_path):
    os.makedirs(label_path)
np.save(label_path + '/label.npy', labels)

In [9]:
np.load('Processed/CAUEEG/Label/label.npy')

array([[   1,    3],
       [   0,    4],
       [  -1,    5],
       ...,
       [   2, 1379],
       [   1, 1380],
       [   1, 1381]], dtype=int32)

## Features

In [10]:
# resample the time series data from original_fs to target_fs
def resample_time_series(data, original_fs, target_fs):
    T, C = data.shape
    new_length = int(T * target_fs / original_fs)

    resampled_data = np.zeros((new_length, C))
    for i in range(C):
        resampled_data[:, i] = resample(data[:, i], new_length)

    return resampled_data

# split the EEG data into segments of length segment_length, dropping the last segment if it is shorter than segment_length
def split_eeg_segments(data, segment_length=128, half_overlap=False):
    T, C = data.shape  

    if half_overlap:
        step = segment_length // 2
    else:
        step = segment_length

    num_segments = (T - segment_length) // step + 1
    segments = np.zeros((num_segments, segment_length, C))

    for i in range(num_segments):
        start_idx = i * step
        end_idx = start_idx + segment_length
        segments[i] = data[start_idx:end_idx]

    return segments

In [8]:
feature_path = 'Processed/AD-Auditory/Feature'
if not os.path.exists(feature_path):
    os.makedirs(feature_path)

# Test 
# deal with the matlab 7.3 file format
sub_id = 1
for sub in os.listdir(root):
    if 'sub-' in sub:
        sub_path = os.path.join(root, sub, 'eeg/')
        print(sub_path)
        for file in os.listdir(sub_path):
            if '.set' in file:
                set_file_path = os.path.join(sub_path, file)
                print("Read .set file to see sub info", set_file_path)
                with h5py.File(set_file_path, 'r') as f:
                    n_channels = f['nbchan'][()]
                    n_points = f['pnts'][()]
                    n_trials = f['trials'][()]
                    srate = f['srate'][()]
                    chanlocs = f['chanlocs']['labels'][:]
                    print("Number of channels:", n_channels)
                    print("Number of points per trial:", n_points)
                    print("Number of trials:", n_trials)
                    print("Sampling rate:", srate)
                    # print("Channel labels:", chanlocs)
            if '.fdt' in file:
                print("Read .fdt file to load raw EEG data")
                fdt_file_path = os.path.join(sub_path, file)
                data = np.fromfile(fdt_file_path, dtype=np.float32).reshape(-1,19)
                # 250Hz, 19 monopolar channels, no 'T3' and 'T4', only 'T7' and 'T8', which are same as 'T3' and 'T4'
                print("Raw EEG data shape:", data.shape)   
                data = resample_time_series(data, 250, SAMPLE_RATE)
                feature_array = split_eeg_segments(data, SAMPLE_LEN, half_overlap=True)
                print("Downsampling and segmented data shape ", feature_array.shape)
                np.save(feature_path + '/feature_{:02d}.npy'.format(sub_id), feature_array)
                print("\n")           
        sub_id += 1
        print("--------------------------------")

AD-Auditory/sub-01\eeg/
Read .fdt file to load raw EEG data
Raw EEG data shape: (87500, 19)
Downsampling and segmented data shape  (699, 128, 19)


Read .set file to see sub info AD-Auditory/sub-01\eeg/sub-01_task-40HzAuditoryEntrainment_eeg.set
Number of channels: [[19.]]
Number of points per trial: [[87500.]]
Number of trials: [[1.]]
Sampling rate: [[250.]]
--------------------------------
AD-Auditory/sub-02\eeg/
Read .fdt file to load raw EEG data
Raw EEG data shape: (87500, 19)
Downsampling and segmented data shape  (699, 128, 19)


Read .set file to see sub info AD-Auditory/sub-02\eeg/sub-02_task-40HzAuditoryEntrainment_eeg.set
Number of channels: [[19.]]
Number of points per trial: [[87500.]]
Number of trials: [[1.]]
Sampling rate: [[250.]]
--------------------------------
AD-Auditory/sub-03\eeg/
Read .fdt file to load raw EEG data
Raw EEG data shape: (87500, 19)
Downsampling and segmented data shape  (699, 128, 19)


Read .set file to see sub info AD-Auditory/sub-03\eeg/sub-03_t

In [None]:
# Test the saved npy file
# example

path = 'Processed/CAUEEG/Feature/'

for file in os.listdir(path):
    sub_path = os.path.join(path, file)
    print(np.load(sub_path).shape)

: 