# OpenNeuro Data Loader
A data loader for open neuro MRI datasets https://openneuro.org/

Getting usable data from open neuro was more difficult than it should be. I aim to create a 3 part system to expedite this process.

The architecture is as follows:
1. Given a dataset ID (ds#######) download the dataset to a specified folder and extract it using datalad
1. A 'patient' class to hold data relevant to model training as well as data related to the patient
1. A dataset class that has various dataset-related methods (preprocessing, train-val-test splits or stratified k-fold cross validation, ect)

## Todos
1. Using datalad and git, download dataset
1. Figure out memory measuring tool
1. Load batch of n scans based on available memory
1. Create generator of m batches of n scans which load on demand

## Install Packages

In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nipy
  Downloading nipy-0.6.1-cp39-cp39-win_amd64.whl (3.0 MB)
     ---------------------------------------- 3.0/3.0 MB 8.4 MB/s eta 0:00:00
Collecting nibabel
  Downloading nibabel-5.3.3-py3-none-any.whl (3.3 MB)
     ---------------------------------------- 3.3/3.3 MB 69.9 MB/s eta 0:00:00
Collecting nilearn
  Downloading nilearn-0.12.1-py3-none-any.whl (12.7 MB)
     --------------------------------------- 12.7/12.7 MB 50.4 MB/s eta 0:00:00
Collecting scipy
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
     --------------------------------------- 46.2/46.2 MB 65.6 MB/s eta 0:00:00
Collecting matplotlib
  Downloading matplotlib-3.9.4-cp39-cp39-win_amd64.whl (7.8 MB)
     ---------------------------------------- 7.8/7.8 MB 50.1 MB/s eta 0:00:00
Collecting transforms3d
  Downloading transforms3d-0.4.2-py3-none-any.whl (1.4 MB)
     ---------------------------------------- 1.4/1.4 MB 91.1 

You should consider upgrading via the 'D:\Side_Projects\MRI_Project\env_mri\Scripts\python.exe -m pip install --upgrade pip' command.


In [39]:
import time
import nibabel as nib
import numpy as np
import os
import json
import random
import SimpleITK as sitk
import psutil
# from datalad.api import get, drop
import datalad.api as dl
import shutil
from datalad.api import install
import subprocess


In [4]:
class patient:
    '''
    Struct for holding patient information and scan data
    '''
    def __init__(self,path):
        self.info = {} #data-metadata pairs using pre-extension name
        self.folder_path = path
        self.date_loaded = time.time()
        self.parse_and_assign_filenames(self.folder_path)
        self.loaded_into_memory = False
        
    def __str__(self):
        return f'{len(self.info.keys())} scans from {self.folder_path}'
        
    def parse_and_assign_filenames(self,path):
        patient_scans=[]
        for root,dirs,files in os.walk(path):
            # compressed_files = [file for file in files if file.split('.')[-2] == 'nii' and file.split('.')[-1] == 'gz']
            compressed_files = [file for file in files if file.split('.')[-1] == 'gz']
            for file in compressed_files:
                self.info[file.split('.')[0]] = {
                    'scan':os.path.join(root,file),
                    'metadata':os.path.join(root,file.split('.')[0]+'.json') if os.path.exists(os.path.join(root,file.split('.')[0]+'.json')) else None,
                }

    
    def load(self):
        #return 4D set of values [(H,W,Scans(Depth),N),metadata]
        def load_json(path):
            with open(path) as f:
                out = json.load(f)
            return out
        def load_scan(path):
            #use datalad to fetch unavailable data
            for k,v in self.info.items():
                dl.get(v['scan'])
            self.loaded_into_memory = True
            
            # replace with datalad
            img = nib.load(path)
            data = np.asarray(img.dataobj)
            return sitk.GetImageFromArray(data)
        return {
            'data':[load_scan(v['scan']) for k,v in self.info.items()], 
            'metadata':[[k,load_json(v['metadata'])] for k,v in self.info.items()]
            }
    def unload(self):
        #use datalad to unload scan
        self.loaded_into_memory = False
        dl.drop(self.folder_path,recursive=True)
        
class patient_dataset:
    '''
    Responsible for organizing and grouping scans + metadata per patient
    Passes path to patient class 
    Also responsible for image preprocessing methods
    '''
    def __init__(self,path,standard_size=(256,256,200)):
        #where path is the path to the dataset (should end in ds007045 or similar)
        dl.api.install(
            path=path,
            source=f"https://github.com/OpenNeuroDatasets/{path}.git"
        )
        
        self.run(["git-annex", "init"],dataset_path=dataset_path)
        self.run(["git", "annex", "enableremote", "s3-PUBLIC"],dataset_path=dataset_path)
        
        self.path = path
        self.standard_size = standard_size
        self.patients = []
        for folder in os.listdir(self.path):
            if self._is_folder(folder) == False:
                continue
            p = patient(os.path.join(self.path,folder))
            if len(p.info) != 0: #filter non-patient folders
                self.patients.append(p)
        print('length patients', len(self.patients))
        self.length = len(self.patients)
        self.loaded_idxs = []#if slow, replace with a deque
    def run(self,cmd, check=True,dataset_path=''):
            print(f"$ {' '.join(cmd)}")
            return subprocess.run(' '.join(cmd),cwd=dataset_path, check=False, capture_output=True)
    def _is_folder(self,folder):
        is_folder = True
        if 'sub' not in folder.split('-'): #temp fix for picking up non-patient folders
            is_folder = False
        if os.path.isdir(os.path.join(self.path,folder)) == False:
            is_folder = False
        return is_folder
    
    def __iter__(self):
        """
        Stream samples one-by-one without holding everything in memory.
        """
        for file_id in range(self.length):
            yield self.get(file_id)
    
    def __getitem__(self, file_id):
        if isinstance(file_id, slice):
            start, stop, step = file_id.indices(self.length)
            return [self.get(i) for i in range(start, stop, step)]
        elif isinstance(file_id, list):
            return [self.get(i) for i in file_id]
        elif isinstance(file_id, int):
            if file_id < 0 or file_id >= self.length:
                raise IndexError("patient index out of range")
            return self.get(file_id)
        else:
            raise TypeError("Indices must be integers, slices, or a list")
    
    def get(self,file_id):
        #check available memory
        available_ram,total_ram,percent_ram_used = self.get_ram_info()
        available_disk,total_disk,percent_disk_used = self.get_disk_info('D:\\')#hardcoded disk
        
        #This will fail with OOM if the file size is more than 10% of RAM or disk space
        while percent_ram_used > 0.9 or percent_disk_used > 0.9 : #this is imperfect and should check how large the incoming data is. 
            self.drop_an_item()
            available_ram,total_ram,percent_ram_used = self.get_ram_info()
            available_disk,total_disk,percent_disk_used = self.get_disk_info('D:\\')#hardcoded disk
        self.loaded_idxs.append(file_id)
        return self.patients[file_id].load()
    def drop_an_item(self):
        self.patients[self.loaded_idxs[0]].unload()
        self.loaded_idxs.pop(0)#if slow, replace with a deque
    def get_ram_info(self):
        vm = psutil.virtual_memory()
        total_ram = vm.total      # bytes
        available_ram = vm.available  # bytes
        return available_ram, total_ram, available_ram/total_ram

    def get_disk_info(path="/"):
        usage = shutil.disk_usage(path)
        total = usage.total      # bytes
        available = usage.free   # bytes
        return available, total, available/total


# example usage
if __name__ == "__main__":
    total, available = get_disk_info("/")  # or "C:\\" on Windows
    print(f"Total disk: {total / (1024**3):.2f} GB")
    print(f"Available disk: {available / (1024**3):.2f} GB")
    
    def sample(self):
        #get one random patient obj and call get method
        random_idx = random.randint(0,self.length)
        return self.get(random_idx)
    def resample_to_shape(
        self,
        images, #list of sitk images
        out_size,
        interpolator=sitk.sitkLinear
    ):
        resampled_images = []
        for img in images:
            original_size = img.GetSize()
            original_spacing = [1.0,1.0,1.0] #change to grabbing this from metadata
            # original_spacing = self. #change to grabbing this from metadata
        
            new_spacing = [
                (original_size[i] * original_spacing[i]) / out_size[i]
                for i in range(3)
            ]
            
            resampler = sitk.ResampleImageFilter()
            
            resampler.SetSize(out_size)
            resampler.SetOutputSpacing(new_spacing)
            resampler.SetInterpolator(interpolator)
            resampler.SetOutputDirection(img.GetDirection())
            resampler.SetOutputOrigin(img.GetOrigin())
            resampled_images.append(resampler.Execute(img))
        return resampled_images
    
    def preprocess(self,idx,count):
        #standardize size
        scan_sets = self.patients[idx:idx+count]
        patient_scan_sets = [p['data'] for p in scan_sets]
        resized_patient_scans = [self.resample_to_shape(patient_scans,self.standard_size) for patient_scans in patient_scan_sets]
    
    def generate_folds(self,k=10):
        #Create an array from 0 to self.length, shuffle, and make k-1 even cuts 
        assignments = [i for i in range(self.length)]
        random.shuffle(assignments)
        fold_size = self.length//k #last fold will have extra items from excluded by rounding
        self.folds = {}
        for foldnum in range(k-2):
            self.folds[foldnum] = assignments[fold_size*foldnum:fold_size*(foldnum+1)]
        self.folds[k-1] = assignments[fold_size*(foldnum+1):]

    def get_fold(self,fold_num):
        assert len(self.folds.keys()) > 0
        return self.__getitem__(self.folds[fold_num])#what if this ALSO returned a generator??
# dataset = patient_dataset('ds007045')
# dataset = patient_dataset('ds007156')
dataset = patient_dataset('ds002424')


length patients 0


In [5]:
start = time.time()
dataset.generate_folds(15)
fold = dataset.get_fold(2)
end = time.time()
(end-start)/60, "Minutes for ",len(fold)," Scans"

(0.0, 'Minutes for ', 0, ' Scans')

In [52]:
len(fold)

6

In [34]:
dataset.get(10)['data']


[<SimpleITK.SimpleITK.Image; proxy of <Swig Object of type 'itk::simple::Image *' at 0x000001E2A601B990> >,
 <SimpleITK.SimpleITK.Image; proxy of <Swig Object of type 'itk::simple::Image *' at 0x000001E2A601B300> >,
 <SimpleITK.SimpleITK.Image; proxy of <Swig Object of type 'itk::simple::Image *' at 0x000001E2A601BF60> >,
 <SimpleITK.SimpleITK.Image; proxy of <Swig Object of type 'itk::simple::Image *' at 0x000001E2A601B960> >]

In [30]:
start = time.time()
print(len([d['data'] for d in dataset[0:100]]))
end = time.time()
(end-start)/60,'Minutes for ',dataset.length,' images' #2.7min for scans and metadata

100


(2.7974769433339435, 'Minutes for ', 337, ' images')

In [11]:
from datalad.api import status
dataset_number = 'ds007156'
res = status(path=dataset_number)
print(res)

untracked: ds007156\ds007156 (directory)
[{'type': 'directory', 'state': 'untracked', 'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156\\ds007156', 'parentds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156', 'status': 'ok', 'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader', 'action': 'status'}, {'type': 'file', 'gitshasum': '692fd97d0338c46b41eedbe5a8e6f97d28aa5104', 'bytesize': 12, 'prev_gitshasum': '692fd97d0338c46b41eedbe5a8e6f97d28aa5104', 'state': 'clean', 'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156\\.bidsignore', 'parentds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156', 'status': 'ok', 'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader', 'action': 'status'}, {'type': 'file', 'gitshasum': '1b49f5da4dd88e419b375775157037b9b8f7e0a9', 'bytesize': 63, 'prev_gitshasum': '1b49f5da4dd88e419b375775157037b9b8f7e0a9', 'state': 'clean', 'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeur

In [28]:
import datalad.api
from datalad.api import Dataset
import time
d = Dataset('ds007156')


In [29]:
%%time
d.get('sub-S4',recursive=True, reckless="availability",jobs="auto")


[INFO] Ensuring presence of Dataset("D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007156") to get D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007156\sub-S4 


KeyboardInterrupt: 

In [13]:
datalad.api.drop('ds007156/sub-S4/',recursive=True)


[{'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156\\sub-S4',
  'type': 'directory',
  'status': 'notneeded',
  'message': ('nothing to drop from %s',
   'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156\\sub-S4'),
  'action': 'drop',
  'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader'}]

In [None]:
# ds002424,ds007045,ds007156

In [16]:
from datalad.api import install
dataset_path = 'ds007045'

install(
    path=dataset_path,
    source="https://github.com/OpenNeuroDatasets/ds007156.git"
)


[INFO] Attempting a clone into D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007045 
[INFO] Attempting to clone from https://github.com/OpenNeuroDatasets/ds007156.git to D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007045 
[INFO] Start enumerating objects 
[INFO] Start counting objects 
[INFO] Start compressing objects 
[INFO] Start receiving objects 
[INFO] Start resolving deltas 
[INFO] Completed clone attempts for Dataset("D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007045") 
[INFO] Detected a filesystem without fifo support. 
[INFO] Disabling ssh connection caching. 
[INFO] Detected a crippled filesystem. 
[INFO] Entering an adjusted branch where files are unlocked as this filesystem does not support locked files. 
[INFO] Switched to branch 'adjusted/main(unlocked)' 
[INFO] Remote origin not usable by git-annex; setting annex-ignore 
[INFO] https://github.com/OpenNeuroDatasets/ds007156.git/config download failed: Not Found 
[INFO] access to 1 dataset sibling s3-BA

install(ok): D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007045 (dataset)


Dataset('D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045')

In [26]:
import datalad.api as dl
from datalad.api import Dataset
d = Dataset(dataset_path)


In [27]:
# d.repo.get_remotes()
d.repo.enable_remote("s3-PUBLIC")


In [31]:
#wilo 2/6/2026: The issue is i need to enable the s3-PUBLIC remote prior to any get calls. 
    #it doesnt look like the API works for this. Use subprocess and be sure cwd is the path to the 
    #installed dataset
import subprocess
def run(cmd, check=True,dataset_path=''):
    print(f"$ {' '.join(cmd)}")
    return subprocess.run(' '.join(cmd),cwd=dataset_path, check=False, capture_output=True)
run(["git-annex", "init"],dataset_path=dataset_path)
run(["git", "annex", "enableremote", "s3-PUBLIC"],dataset_path=dataset_path)


$ git-annex init
$ git annex enableremote s3-PUBLIC


CompletedProcess(args='git annex enableremote s3-PUBLIC', returncode=0, stdout=b'enableremote s3-PUBLIC ok\n(recording state in git...)\n', stderr=b'')

In [32]:
d.repo.get_remotes()#,d.repo.get_file_annexinfo('sub-S05/anat/sub-S05_T1w.nii.gz')

['origin', 's3-PUBLIC']

In [33]:
d.get?

[1;31mSignature:[0m
[0md[0m[1;33m.[0m[0mget[0m[1;33m([0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0msource[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mdataset[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mrecursive[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mrecursion_limit[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mget_data[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mdescription[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mreckless[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mjobs[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Get any dataset content (files/directories/subdatasets).

This command only operates on dataset content. To obtain a new independent
dataset from some source use the `clone` command.

By default this command operates recursively within a dataset,

In [35]:
# %%time
d.get(
    # path='sub-S4/anat/sub-S4_T1w.nii.gz',
    # path = 'sub-BO7/anat/sub-BO7_T2w.nii.gz',
    path = 'sub-BO7',
    recursive=True,
    reckless="availability",
    # jobs="auto",
)

[INFO] Ensuring presence of Dataset("D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007045") to get D:\Side_Projects\MRI_Project\OpenNeuroDataLoader\ds007045\sub-BO7 


get(ok): sub-BO7\anat\sub-BO7_FLAIR.nii.gz (file) [from s3-PUBLIC...]
get(ok): sub-BO7\anat\sub-BO7_run-01_T1w.nii.gz (file) [from s3-PUBLIC...]
get(ok): sub-BO7\anat\sub-BO7_run-02_T1w.nii.gz (file) [from s3-PUBLIC...]
get(ok): sub-BO7 (directory)
action summary:
  get (ok: 4)


[{'type': 'file',
  'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045',
  'status': 'ok',
  'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045\\sub-BO7\\anat\\sub-BO7_FLAIR.nii.gz',
  'action': 'get',
  'annexkey': 'SHA256E-s3901422--376318fca395404af3b5b6881169307d727d99a5917a578b9c44c10b2215d39d.nii.gz',
  'message': 'from s3-PUBLIC...'},
 {'type': 'file',
  'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045',
  'status': 'ok',
  'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045\\sub-BO7\\anat\\sub-BO7_run-01_T1w.nii.gz',
  'action': 'get',
  'annexkey': 'SHA256E-s1513039--5dbe306439406d14c5f9562b7cc70f511185d5be051e90e76e51f12e9b9ff277.nii.gz',
  'message': 'from s3-PUBLIC...'},
 {'type': 'file',
  'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045',
  'status': 'ok',
  'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007045\\sub-BO7\\anat\\sub-BO7_run-02_T1w.nii.g

In [48]:
siblings_list = dl.siblings(dataset=dataset_path, return_type='list')
siblings_list

.: here(+) [git]
.: s3-PUBLIC(+) [git]
.: origin(-) [https://github.com/OpenNeuroDatasets/ds007156.git (git)]


[{'action': 'query-sibling',
  'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156',
  'type': 'sibling',
  'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156',
  'name': 'here',
  'annex-uuid': 'ef3acd4b-902f-4e41-ad32-d00f37452e2b',
  'annex-bare': 'false',
  'annex-version': '10',
  'available_local_disk_space': 262328991488,
  'annex-description': 'ryan@DESKTOP-N64ELUB:D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156',
  'status': 'ok'},
 {'action': 'query-sibling',
  'path': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156',
  'type': 'sibling',
  'refds': 'D:\\Side_Projects\\MRI_Project\\OpenNeuroDataLoader\\ds007156',
  'name': 's3-PUBLIC',
  'annex-s3': 'true',
  'annex-uuid': 'a3a1f800-e0aa-4ccf-a727-a3ed59559401',
  'annex-ignore': 'false',
  'skipfetchall': 'true',
  'annex-description': '[s3-PUBLIC]',
  'annex-autoenable': 'true',
  'annex-bucket': 'openneuro.org',
  'annex-datacenter': 'US',
  'annex-encryp

In [10]:
import requests

url = "https://s3.amazonaws.com/openneuro/ds007156/sub-S05/anat/sub-S05_T1w.nii.gz"
r = requests.head(url)
print(r.status_code)


404
