In [12]:
import os
import pickle
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import mne

In [13]:
# Root folder of the sleep dataset.
# NOTE(review): this is a machine-specific absolute path; anyone else running the
# notebook has to edit it first.
DATA_FOLDER ='/home/deepak/Documents/Deepak/Students/Susmit_23CS60R75/Sleep_Data/'
# The sub-folder paths below are built by plain string concatenation, so
# DATA_FOLDER must keep its trailing "/" and every derived path ends in "/" too.
SOURCE_DATA_FOLDER = DATA_FOLDER  + "SleepSource/"
LEADERBOARD_TARGET_DATA_FOLDER = DATA_FOLDER + "LeaderboardSleep/sleep_target/"
LEADERBOARD_TEST_DATA_FOLDER = DATA_FOLDER + "LeaderboardSleep/testing/"
# The FINAL_* folders are defined here but not referenced by the cells visible below.
FINAL_TARGET_DATA_FOLDER = DATA_FOLDER + "finalSleep/sleep_target/"
FINAL_TEST_DATA_FOLDER = DATA_FOLDER + "finalSleep/testing/"

In [14]:
def load_data(data_folder, load_labels=True):
    """Load every ``*X.npy`` recording (and optionally its labels) from a folder.

    File names are parsed as follows: the second ``_``-separated token, minus its
    ``.npy`` extension, is split on ``"r"``; the part before ``"r"`` (first
    character dropped) is the subject id and the part after it (last character
    dropped) is the repetition id, e.g. ``<prefix>_s3r1X.npy`` -> subject 3,
    repetition 1. A file named ``headerInfo.npy`` is loaded and printed only;
    every other file is ignored.

    Parameters
    ----------
    data_folder : str
        Folder containing the ``.npy`` files. A trailing path separator is
        optional because paths are built with ``os.path.join``.
    load_labels : bool, default True
        When True, the label file (same name with the trailing ``X.npy``
        replaced by ``y.npy``) is loaded next to each recording; otherwise the
        stored label is ``None``.

    Returns
    -------
    data_map : dict
        ``data_map[subject][repetition] == {"eeg": array, "label": array or None}``.
    subject_list : numpy.ndarray
        Sorted unique subject ids that were found.
    """
    data_suffix = "X.npy"
    label_suffix = "y.npy"

    fn_list = sorted(os.listdir(data_folder))
    print(f"Loading data from folder: {data_folder} ({len(fn_list)} files) - Load labels {load_labels}")

    data_map = {}
    subject_list = []
    sample_counter = 0

    for fn in tqdm(fn_list):
        if fn == "headerInfo.npy":
            # SECURITY NOTE: allow_pickle=True unpickles arbitrary objects; only
            # point this loader at trusted dataset folders.
            meta = np.load(os.path.join(data_folder, fn), allow_pickle=True)
            print(meta)
            continue
        if not fn.endswith(data_suffix):
            continue

        code = fn.split("_")[1][:-4]
        eeg = np.load(os.path.join(data_folder, fn), allow_pickle=True)

        if load_labels:
            # Swap only the trailing "X.npy" for "y.npy"; a blanket
            # str.replace("X", "y") would also corrupt any other "X" in the name.
            label_fn = fn[:-len(data_suffix)] + label_suffix
            label = np.load(os.path.join(data_folder, label_fn), allow_pickle=True)
        else:
            label = None

        s_part, r_part = code.split("r")
        subject = int(s_part[1:])      # drop the leading marker character
        repetition = int(r_part[:-1])  # drop the trailing "X"

        subject_list.append(subject)
        data_map.setdefault(subject, {})[repetition] = {"eeg": eeg, "label": label}
        sample_counter += len(eeg)

    subject_list = np.unique(subject_list)
    print(f"Loaded total {sample_counter} samples for subjects: {subject_list}")
    return data_map, subject_list

In [15]:
def prepare_window_data(data, subject_list=None):
    """Flatten a nested subject -> repetition mapping into two flat lists.

    Parameters
    ----------
    data : dict
        ``data[subject][repetition]`` is a dict with keys ``"eeg"`` and ``"label"``.
    subject_list : iterable, optional
        Subjects to include, in this order. Defaults to every key of ``data``.

    Returns
    -------
    (list, list)
        All ``"eeg"`` entries concatenated window by window, and all labels
        concatenated the same way. Recordings whose label is ``None``
        contribute no labels.
    """
    subjects = data.keys() if subject_list is None else subject_list

    windows, labels = [], []
    for subject in tqdm(subjects):
        for recording in data[subject].values():
            recording_eeg = recording["eeg"]
            recording_label = recording["label"]

            windows.extend(recording_eeg)
            if recording_label is not None:
                labels.extend(recording_label)

    return windows, labels

In [16]:
def print_stats(desc, data):
    """Print the mean, std, min and max of ``data`` on one line, prefixed by ``desc``."""
    mean_value = np.mean(data)
    std_value = np.std(data)
    lowest = np.min(data)
    highest = np.max(data)
    print(f"{desc} mean: {mean_value}, std: {std_value}, min: {lowest}, max: {highest}")

def normalize(data, mean_value, std_value, desc=""):
    """Standardize ``data`` with the given mean/std and report the result.

    The input is converted to a NumPy array, shifted by ``mean_value`` and
    divided by ``std_value``; summary statistics of the result are printed via
    ``print_stats`` under the label ``desc``. The standardized array is
    returned as a list of its first-axis entries.
    """
    standardized = (np.array(data) - mean_value) / std_value
    print_stats(desc, standardized)
    return list(standardized)

def filter_freq(data, f_min, f_max, FS):
    """Filter ``data`` with MNE's IIR filter.

    ``data`` is first converted to a float64 array; ``FS`` is the sampling
    frequency and ``f_min`` / ``f_max`` are forwarded as the lower / upper
    cut-off arguments of ``mne.filter.filter_data``.
    """
    signal = np.array(data, dtype=np.float64)
    return mne.filter.filter_data(signal, FS, f_min, f_max, method="iir", verbose=False)

def downsample(data, FS, FS_new):
    """Resample ``data`` from rate ``FS`` to ``FS_new`` using ``mne.filter.resample``.

    The down-sampling factor handed to MNE is the ratio ``FS / FS_new``.
    """
    factor = FS / FS_new
    return mne.filter.resample(data, down=factor)

In [17]:
import os
import torch
import random
import numpy as np

def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int
        Seed applied to ``random``, ``numpy.random`` and ``torch`` (CPU and
        every CUDA device).

    Notes
    -----
    ``PYTHONHASHSEED`` is exported for child processes; the hash seed of the
    interpreter that is already running is fixed at start-up and is not
    changed by this assignment.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN autotuner: if it was switched on earlier, it may pick
    # different kernels from run to run and defeat the seeding above.
    torch.backends.cudnn.benchmark = False

In [18]:
def get_phase_1_data():
    """Load the source, leaderboard-target and leaderboard-test windows.

    The source windows are standardized with their own global mean and std;
    the leaderboard target and test windows are standardized with those same
    source statistics via ``normalize``.

    Returns
    -------
    tuple
        ``(source_data, source_labels, target_data, target_labels,
        test_data, test_data)``. The normalized leaderboard test windows
        occupy both of the last two positions (the same list object); the test
        set is loaded without labels.
    """

    def _load_windows(folder, load_labels=True):
        # Read one folder and flatten it into (windows, labels) lists.
        recordings, subjects = load_data(folder, load_labels=load_labels)
        return prepare_window_data(recordings, subjects)

    # Source domain: its statistics define the normalization for every set.
    source_windows, source_labels = _load_windows(SOURCE_DATA_FOLDER)
    source_array = np.array(source_windows)
    del source_windows  # only the stacked array is needed from here on
    source_mean, source_std = np.mean(source_array), np.std(source_array)
    source_array = (source_array - source_mean) / source_std
    # Mean/std are the raw statistics; min/max are taken after normalization.
    print(f"Source mean: {source_mean}, std: {source_std}, min: {np.min(source_array)}, max: {np.max(source_array)}")
    source_data = list(source_array)

    # Labelled leaderboard target set, scaled with the source statistics.
    target_windows, lb_target_labels = _load_windows(LEADERBOARD_TARGET_DATA_FOLDER)
    lb_target_data = normalize(target_windows, source_mean, source_std, "Leadeboard target")

    # Unlabelled leaderboard test set, scaled with the source statistics.
    test_windows, _ = _load_windows(LEADERBOARD_TEST_DATA_FOLDER, load_labels=False)
    lb_test_data = normalize(test_windows, source_mean, source_std, "Leadeboard test")

    # NOTE(review): the test data is returned twice on purpose of the current
    # caller, which unpacks the last slot as `mixup_data` — confirm intent.
    return source_data, source_labels, lb_target_data, lb_target_labels, lb_test_data, lb_test_data


In [19]:
# Load and normalize the phase-1 arrays. get_phase_1_data() returns the normalized
# leaderboard test windows in both of its last two positions, so `test_data` and
# `mixup_data` are bound to the same list object.
source_data, source_labels, target_data, target_labels, test_data, mixup_data = get_phase_1_data()

Loading data from folder: /home/deepak/Documents/Deepak/Students/Susmit_23CS60R75/Sleep_Data/SleepSource/ (158 files) - Load labels True


  0%|                                                   | 0/158 [00:00<?, ?it/s]

<Info | 8 non-empty values
 bads: []
 ch_names: Fpz-Cz, Pz-Oz
 chs: 2 EEG
 custom_ref_applied: False
 highpass: 0.5 Hz
 lowpass: 100.0 Hz
 meas_date: 1991-09-26 15:00:00 UTC
 nchan: 2
 projs: []
 sfreq: 100.0 Hz
 subject_info: 2 items (dict)
>


100%|█████████████████████████████████████████| 158/158 [00:38<00:00,  4.08it/s]


Loaded total 90545 samples for subjects: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38]


100%|█████████████████████████████████████████| 39/39 [00:00<00:00, 1965.65it/s]


Source mean: -2.175762164676705e-07, std: 1.6836217270029333e-05, min: -14.7172265485441, max: 12.426637935405019
Loading data from folder: /home/deepak/Documents/Deepak/Students/Susmit_23CS60R75/Sleep_Data/LeaderboardSleep/sleep_target/ (25 files) - Load labels True


  0%|                                                    | 0/25 [00:00<?, ?it/s]

<Info | 8 non-empty values
 bads: []
 ch_names: Fpz-Cz, Pz-Oz
 chs: 2 EEG
 custom_ref_applied: False
 highpass: 0.5 Hz
 lowpass: 100.0 Hz
 meas_date: 1990-03-13 15:09:00 UTC
 nchan: 2
 projs: []
 sfreq: 100.0 Hz
 subject_info: 2 items (dict)
>


100%|███████████████████████████████████████████| 25/25 [00:06<00:00,  3.91it/s]


Loaded total 15442 samples for subjects: [0 1 2 3 4 5]


100%|███████████████████████████████████████████| 6/6 [00:00<00:00, 2661.08it/s]


Leadeboard target mean: 0.038902552623313796, std: 1.0560441229752748, min: -11.688042547053058, max: 12.010867582259191
Loading data from folder: /home/deepak/Documents/Deepak/Students/Susmit_23CS60R75/Sleep_Data/LeaderboardSleep/testing/ (25 files) - Load labels False


100%|███████████████████████████████████████████| 25/25 [00:10<00:00,  2.39it/s]


Loaded total 25748 samples for subjects: [ 6  7  8  9 10 11 12 13 14 15 16 17]


100%|█████████████████████████████████████████| 12/12 [00:00<00:00, 3713.14it/s]


Leadeboard test mean: 0.014119353426048422, std: 0.8645048345254686, min: -11.985021370728651, max: 11.832680288053835


In [20]:
# Group the normalized target windows by class label:
# supervised_mixup_data[c] holds every target window whose label equals c.
target_window_array = np.array(target_data)
target_label_array = np.asarray(target_labels)  # explicit array so `== c` is element-wise

supervised_mixup_data = {}
for c in np.unique(target_label_array):
    class_mask = target_label_array == c
    supervised_mixup_data[c] = target_window_array[class_mask]
    print(c, np.shape(supervised_mixup_data[c]))

# Drop the temporary helpers so they do not linger in the kernel namespace.
del target_window_array, target_label_array
if len(supervised_mixup_data) > 0:
    del class_mask

0 (6010, 2, 3000)
1 (1672, 2, 3000)
2 (5035, 2, 3000)
3 (704, 2, 3000)
4 (414, 2, 3000)
5 (1607, 2, 3000)


In [21]:
import numpy as np
import os

def split_train_val_test(train_data, train_labels, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
    """Split data and labels into stratified train / validation / test parts.

    Two chained ``train_test_split`` calls are used: the first carves off
    ``train_size`` of the samples for training, the second divides the
    remainder between validation and test in the ratio
    ``val_size : test_size``. Both splits are stratified by label and use the
    same ``random_state``.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data, train_labels, val_labels, test_labels)``.
    """
    from sklearn.model_selection import train_test_split

    # Stage 1: training portion versus everything held out.
    fit_data, held_data, fit_labels, held_labels = train_test_split(
        train_data,
        train_labels,
        train_size=train_size,
        random_state=random_state,
        stratify=train_labels,
    )

    # Stage 2: share of the held-out part that becomes the validation set;
    # the complement of that share goes to the test set.
    temp_val_size = val_size / (val_size + test_size)
    val_data, test_data, val_labels, test_labels = train_test_split(
        held_data,
        held_labels,
        test_size=1 - temp_val_size,
        random_state=random_state,
        stratify=held_labels,
    )

    return fit_data, val_data, test_data, fit_labels, val_labels, test_labels

def create_data_directories(base_dir='data', subdirs=('train', 'validate', 'test')):
    """Create ``base_dir/<subdir>`` for every entry of ``subdirs``.

    Parameters
    ----------
    base_dir : str, default 'data'
        Parent directory; created if it does not exist yet.
    subdirs : iterable of str, default ('train', 'validate', 'test')
        Names of the sub-directories to create. The default is an immutable
        tuple so it cannot be altered between calls.

    Directories that already exist are left untouched (``exist_ok=True``); the
    confirmation line is printed for every sub-directory either way.
    """
    for subdir in subdirs:
        path = os.path.join(base_dir, subdir)
        os.makedirs(path, exist_ok=True)
        print(f"Directory created: {path}")

def save_splits(base_dir, splits):
    """
    Write each split's data and labels as ``.npy`` files under ``base_dir``.

    Parameters:
    - base_dir: Directory that already contains the 'train', 'validate' and
      'test' sub-directories.
    - splits: A 6-tuple ordered as
      (train_data, val_data, test_data, train_labels, val_labels, test_labels).

    For every split, ``data.npy`` and ``labels.npy`` are saved inside the
    matching sub-directory and a confirmation line is printed.
    """
    train_x, val_x, test_x, train_y, val_y, test_y = splits
    ordered_splits = (
        ('train', train_x, train_y),
        ('validate', val_x, val_y),
        ('test', test_x, test_y),
    )

    for split_name, data, labels in ordered_splits:
        split_dir = os.path.join(base_dir, split_name)
        data_path = os.path.join(split_dir, 'data.npy')
        labels_path = os.path.join(split_dir, 'labels.npy')
        np.save(data_path, data)
        np.save(labels_path, labels)
        print(f"Saved {split_name} data to {data_path} and labels to {labels_path}")


# Split the source windows 80/10/10 (the function defaults), stratified by label,
# then write each part to data/<train|validate|test>/{data,labels}.npy.
# `source_data` was already standardized inside get_phase_1_data(), so the saved
# arrays are normalized values.
# NOTE(review): the split operates on individual windows, not on subjects, so
# windows from one subject can presumably end up in more than one split — confirm
# this is intended before using the held-out parts for evaluation.
splits = split_train_val_test(source_data, source_labels)
create_data_directories()
save_splits('data', splits)


Directory created: data/train
Directory created: data/validate
Directory created: data/test
Saved train data to data/train/data.npy and labels to data/train/labels.npy
Saved validate data to data/validate/data.npy and labels to data/validate/labels.npy
Saved test data to data/test/data.npy and labels to data/test/labels.npy


In [22]:
# import numpy as np

# # Load train data and labels
# source_data = np.load('data/train/data.npy')
# source_labels = np.load('data/train/labels.npy')

# # Load validate data and labels
# target_data = np.load('data/validate/data.npy')
# target_labels = np.load('data/validate/labels.npy')

# # Load test data and labels
# test_data = np.load('data/test/data.npy')
# test_labels = np.load('data/test/labels.npy')

# # Print the shape of each
# print("Train data shape:", source_data.shape)
# print("Train labels shape:", source_labels.shape)
# print("Validate data shape:", target_data.shape)
# print("Validate labels shape:", target_labels.shape)
# print("Test data shape:", test_data.shape)
# print("Test labels shape:", test_labels.shape)

# # Calculate and print total data size (total number of samples)
# total_data_size = source_labels.shape[0] + target_data.shape[0] + test_data.shape[0]
# print("Total data size (number of samples):", total_data_size)
