In [1]:
# Data splitting into train, val and test

In [1]:
import os
import shutil
import sys
sys.path.append("..")

In [2]:
import json
import pandas as pd
import numpy as np
from data_utils.dataloader import Dataloader
from data_utils.splitting import DataSplitter

In [3]:
# Root folder of the raw training data. The path is relative, so it is
# resolved against the notebook's working directory (one level up, then
# data/TrainingData).
data_base_path = os.path.join("..", "data", "TrainingData")
# Project loader rooted at that folder; used below to load one
# subject/session pair at a time.
dataloader = Dataloader(data_base_path)

In [4]:
# Candidate subjects ("001".."008") and sessions ("01".."08"). Both are
# zero-padded strings; their combination forms the "<subject>_<session>" uid.
subject_ids = [f"{number:03d}" for number in range(1, 9)]
session_numbers = [f"{number:02d}" for number in range(1, 9)]

In [5]:
# Read the original data: this is not strictly necessary, but is done to
# make the code future-proof, in case we need stratified splits.
all_ys = []
missing_uids = []  # subject/session combinations that have no files on disk
for subject_id in subject_ids:
    for session_number in session_numbers:
        uid = f"{subject_id}_{session_number}"
        # Keep the try body down to the load call so that only a missing
        # recording is tolerated; any other failure should still surface.
        try:
            # Only y is collected for the split; x is not used in this notebook.
            x, y = dataloader.load_and_join_data(subject_id, session_number)
        except FileNotFoundError:
            # Missing combinations are tolerated, but record them instead of
            # dropping them silently so that gaps in the data stay visible.
            missing_uids.append(uid)
            continue
        # Tag every row with the recording it came from (the scalar is
        # broadcast over all rows).
        y["uid"] = uid
        all_ys.append(y)

print(f"Loaded {len(all_ys)} sessions, skipped {len(missing_uids)} without data.")

In [6]:
# Stack the per-session frames row-wise and renumber the rows from zero, so
# the combined table does not carry the per-session indices.
labels = pd.concat(all_ys, ignore_index=True)

In [7]:
# Project helper that produces the train/val/test assignment.
# NOTE(review): the splitting strategy and any random seed are defined in
# data_utils.splitting and are not visible here — confirm that the split is
# reproducible across runs.
splitter = DataSplitter()

In [8]:
# Derive the split from the combined labels. The result maps each split name
# ("train", "val", "test") to a list of "<subject>_<session>" uids.
splits = splitter.split_ids(labels)

In [9]:
# Display the resulting assignment for a quick visual check.
splits

{'train': ['005_02',
  '001_06',
  '003_02',
  '001_05',
  '002_02',
  '003_01',
  '003_03',
  '005_01',
  '001_07',
  '002_05',
  '004_02',
  '002_03',
  '001_02',
  '002_04',
  '001_03',
  '004_01'],
 'val': ['001_08', '002_01', '001_01', '001_04'],
 'test': ['005_03',
  '006_01',
  '006_02',
  '006_03',
  '007_01',
  '007_02',
  '007_03',
  '007_04',
  '008_01']}

In [10]:
# Save to file for documentation
metadata_dir = "metadata"
# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(metadata_dir, exist_ok=True)
with open(os.path.join(metadata_dir, "split_ids.json"), "w", encoding="utf-8") as f:
    json.dump(splits, f, indent=2)