OK.
We need our data in S3 to be organised into train/val/test subdirectories.

We will have to create a static dataset (rather than creating artefacts dynamically, as before). For a real-world problem, it would be advisable to generate a larger dataset than the one created here.

In [1]:
import os
import numpy as np
import scipy.io
from sklearn.model_selection import train_test_split

# Config
seq_len = 512  # expected samples per epoch (the loaded epochs are 512 long)
num_samples = 3000  # total synthetic samples to generate
snr_db_range = (-5, 5)  # per-sample SNR (dB) is drawn uniformly from this range
split = [0.8, 0.1, 0.1]  # train / val / test fractions, in that order
seed = 42  # fixed so the static dataset can be regenerated identically

# Seed NumPy's global RNG. The np.random.* draws in this notebook use it, and
# scikit-learn's train_test_split also falls back to it when no random_state
# is passed, so one seed makes the whole generation + split reproducible.
np.random.seed(seed)

# Root folder for the prepared dataset; one subfolder per split is created
# underneath it when the splits are saved.
output_dir = "eeg_prepared_dataset"
os.makedirs(output_dir, exist_ok=True)

# Load the EEG and EOG epoch matrices from the downloaded .mat files
eeg_mat = scipy.io.loadmat("eeg_data_from_s3/EEG_all_epochs.mat")["EEG_all_epochs"]
eog_mat = scipy.io.loadmat("eeg_data_from_s3/EOG_all_epochs.mat")["EOG_all_epochs"]

print(f"EEG: {eeg_mat.shape}, EOG: {eog_mat.shape}")

# Global standard deviation of each matrix. Printed so the values are on
# record in case the normalisation has to be reversed later.
eeg_std = np.std(eeg_mat)
eog_std = np.std(eog_mat)
print(f"std of EEG: {eeg_std}")
print(f"std of EOG: {eog_std}")

# Normalise each matrix by its own global standard deviation
eeg_mat = eeg_mat / eeg_std
eog_mat = eog_mat / eog_std

# Build clean/noisy pairs
clean_samples = []
noisy_samples = []

for _ in range(num_samples):
    # Pair one randomly chosen EEG epoch with one randomly chosen EOG epoch.
    eeg = eeg_mat[np.random.randint(eeg_mat.shape[0])]
    eog = eog_mat[np.random.randint(eog_mat.shape[0])]

    # Random crop is disabled because whole epochs are used as-is. Kept for
    # reference in case longer sequences are used:
    #   start_idx = np.random.randint(0, min(len(eeg), len(eog)) - seq_len)
    #   clean = eeg[start_idx : start_idx + seq_len]
    #   noise = eog[start_idx : start_idx + seq_len]
    clean = eeg
    noise = eog

    # Mix with SNR: draw a target SNR (dB), derive the noise power that yields
    # it relative to the clean signal's power, and rescale the EOG epoch to
    # that power before adding it to the clean EEG.
    snr_db = np.random.uniform(*snr_db_range)
    signal_power = np.mean(clean ** 2)
    noise_power = signal_power / (10 ** (snr_db / 10))
    scale = np.sqrt(noise_power / np.mean(noise ** 2))
    noisy = clean + scale * noise

    # Add a trailing axis so each stored sample is 2-D: (epoch_length, 1).
    clean_samples.append(clean[:, np.newaxis])
    noisy_samples.append(noisy[:, np.newaxis])

# Stack the per-sample arrays into (num_samples, epoch_length, 1).
clean_samples = np.array(clean_samples)
noisy_samples = np.array(noisy_samples)

# A zero-power EOG epoch makes the scale division produce inf/NaN, which would
# otherwise be written silently into the dataset. Fail loudly instead.
if not (np.isfinite(noisy_samples).all() and np.isfinite(clean_samples).all()):
    raise ValueError(
        "Non-finite values in the generated samples; check the input epochs "
        "for NaN/inf values or zero-power EOG epochs."
    )

# Split into train/val/test in two stages. Noisy arrays are the inputs (X),
# clean arrays are the targets (y).
# Stage 1: keep split[0] of the samples for training, hold out the rest.
holdout_fraction = 1 - split[0]
train_X, temp_X, train_y, temp_y = train_test_split(
    noisy_samples,
    clean_samples,
    test_size=holdout_fraction,
)

# Stage 2: divide the held-out samples between val and test according to the
# relative sizes of split[1] (val) and split[2] (test).
test_share_of_holdout = split[2] / (split[1] + split[2])
val_X, test_X, val_y, test_y = train_test_split(
    temp_X,
    temp_y,
    test_size=test_share_of_holdout,
)

# Save as .npz files
def save_split(X, y, folder, base_dir=None):
    """Write one dataset split to disk as one ``.npz`` file per sample.

    Each pair ``(X[i], y[i])`` is saved to
    ``<base_dir>/<folder>/sample_<i>.npz`` (index zero-padded to at least four
    digits) with two arrays inside: ``noisy`` (from ``X``) and ``clean``
    (from ``y``).

    Parameters
    ----------
    X : iterable of array-like
        Noisy samples.
    y : iterable of array-like
        Clean samples, paired with ``X`` by position. Pairing uses ``zip``,
        so iteration stops at the shorter of the two.
    folder : str
        Name of the split subfolder (e.g. ``"train"``).
    base_dir : str, optional
        Root directory to write under. Defaults to the module-level
        ``output_dir`` when omitted.

    Notes
    -----
    The target folder is created if missing. Files left over from an earlier
    run are not removed; only files whose names are written again are
    replaced.
    """
    if base_dir is None:
        # Preserve the original behaviour: write under the notebook's output_dir.
        base_dir = output_dir
    path = os.path.join(base_dir, folder)
    os.makedirs(path, exist_ok=True)
    for i, (noisy, clean) in enumerate(zip(X, y)):
        np.savez(os.path.join(path, f"sample_{i:04d}.npz"), noisy=noisy, clean=clean)

# Write each split into its own subfolder, in train -> val -> test order.
for split_name, split_inputs, split_targets in (
    ("train", train_X, train_y),
    ("val", val_X, val_y),
    ("test", test_X, test_y),
):
    save_split(split_inputs, split_targets, split_name)

print("Done preparing local dataset.")


EEG: (4514, 512), EOG: (3400, 512)
std of EEG: 228.55191045728395
std of EOG: 154.4631755259437
Done preparing local dataset.


In [2]:
from sagemaker.s3 import S3Uploader

# Upload the locally prepared dataset folder to the S3 prefix below and
# report the URI returned by the uploader.
local_dataset_dir = "eeg_prepared_dataset"
s3_destination = "s3://eeg-denoise/prepared"

s3_uri = S3Uploader.upload(local_dataset_dir, s3_destination)
print("Uploaded to:", s3_uri)




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Uploaded to: s3://eeg-denoise/prepared
