<a href="https://colab.research.google.com/github/kaoriito/ohbm-hackthon2020/blob/master/HBM_brainhack2020_bids2Niftytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Converting BIDS data to niftytorch format

In [677]:
import os, sys
import glob
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

In [678]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Set input (BIDS) and output directories, select the variable to perform classification

In [679]:
bids_dir='/content/drive/My Drive/brainhack2020/MedData/'
output_dir='/content/drive/My Drive/brainhack2020/OutputData/'

In [680]:
# make sure this variable matches with the variable name in participants.tsv
variable_to_classify='sex'
# for each of the below set sizes, make sure you have have enough data so each class is represented
# e.g., the minimum sample size you need is 20 subjects to set a 10% size for a binary classifier
val_set_size=0.2
test_set_size=0.1

### Get all subject directories and list of subjects

In [681]:
filepaths=os.path.join(bids_dir,'sub-*')
subj_dirs=sorted(glob.glob(filepaths))

# get list of subjects
subjList=[]
for subj in subj_dirs:
  subjID=os.path.basename(subj)
  subjList.append(subjID)

In [682]:
# check if the variable_to_classify is in the participants.tsv file
if not variable_to_classify in list(participant_metadata.columns):
  sys.exit("ERROR: please make sure your variable is a column in your participants.tsv file")

### Make new folders for each subject



In [683]:
try:
  os.mkdir(output_dir)
except OSError:
  print ("Creation of the directory %s failed" % output_dir)
else:
  print ("Successfully created the directory %s " % output_dir)


Creation of the directory /content/drive/My Drive/brainhack2020/OutputData/ failed


In [685]:
for subj in subj_dirs:
  subjname=os.path.basename(subj)
  print(subjname)
  newdirpath=os.path.join(output_dir,subjname)
  try: 
    os.mkdir(newdirpath)
  except OSError:
    print ("Creation of the directory %s failed" % newdirpath)
  else:
    print ("Successfully created the directory %s " % newdirpath)

sub-mgh01
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh01 
sub-mgh02
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh02 
sub-mgh03
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh03 
sub-mgh04
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh04 
sub-mgh05
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh05 
sub-mgh06
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh06 
sub-mgh07
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh07 
sub-mgh08
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh08 
sub-mgh09
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh09 
sub-mgh10
Successfully created the directory /content/d

### Copy nifti files from bids dir to output directory

In [686]:
def copyImageFiles(dirContainingModalityDirs,subjOutputDir, subjID):
# provide (1) path to the parent directory of the modality directories 
# (either the subject or session level), and (2) the subject output directory

  workingdir=dirContainingModalityDirs
  modalpaths=os.path.join(workingdir,'*')
  modalities=sorted(glob.glob(modalpaths))

  for modality in modalities:
    print(modality)
    imgfilepaths=os.path.join(modality,'*.nii*')
    imgfiles=sorted(glob.glob(imgfilepaths))
    if not imgfiles:
      sys.exit("could not find nifti files for", subjID)
    for img in imgfiles:
      try:
        shutil.copy(img,subjOutputDir)
      except OSError:
        print("unable to copy nifti files")
      else:
        print("copied files successfully")

In [687]:
for subj in subj_dirs: 
  sespath=os.path.join(subj,'ses-*')
  ses_dirs=sorted(glob.glob(sespath))
  subjID=os.path.basename(subj)
 
  subjOutputDir=os.path.join(output_dir,subjID)
  
  # some BIDS directories may have session folders.
  if ses_dirs:
    for session in ses_dirs:
      print(session)
      workingdir=session
      copyImageFiles(workingdir,subjOutputDir, subjID)

  if not ses_dirs:
    print("ses_dirs do not exist")
    workingdir=subj
    copyImageFiles(workingdir,subjOutputDir, subjID)

/content/drive/My Drive/brainhack2020/MedData/sub-mgh01/ses-1
/content/drive/My Drive/brainhack2020/MedData/sub-mgh01/ses-1/anat
copied files successfully
/content/drive/My Drive/brainhack2020/MedData/sub-mgh02/ses-1
/content/drive/My Drive/brainhack2020/MedData/sub-mgh02/ses-1/anat
copied files successfully
/content/drive/My Drive/brainhack2020/MedData/sub-mgh03/ses-1
/content/drive/My Drive/brainhack2020/MedData/sub-mgh03/ses-1/anat
copied files successfully
/content/drive/My Drive/brainhack2020/MedData/sub-mgh04/ses-1
/content/drive/My Drive/brainhack2020/MedData/sub-mgh04/ses-1/anat
copied files successfully
/content/drive/My Drive/brainhack2020/MedData/sub-mgh05/ses-1
/content/drive/My Drive/brainhack2020/MedData/sub-mgh05/ses-1/anat
copied files successfully
/content/drive/My Drive/brainhack2020/MedData/sub-mgh06/ses-1
/content/drive/My Drive/brainhack2020/MedData/sub-mgh06/ses-1/anat
copied files successfully
/content/drive/My Drive/brainhack2020/MedData/sub-mgh07/ses-1
/content

# Assign subjects to test/train datasets

### Check that a participants.tsv file exists within the BIDS directory

In [688]:
participantsTsvPath=os.path.join(bids_dir,'participants.tsv')
participantsTsvExists=os.path.exists(participantsTsvPath)

if not participantsTsvExists:
  sys.exit("ERROR: participants.tsv file missing. Do not continue without this file")

else:
  print("participants.tsv file found")

participants.tsv file found


### Read in the participants.tsv file as a dataframe

In [689]:
participant_metadata=pd.read_csv(participantsTsvPath, sep='\t')
participant_metadata=participant_metadata.sort_values('participant_id')

### Get subset of participants.tsv that we actually have data for

In [690]:
try:
  subsetDf=participant_metadata[participant_metadata["participant_id"].isin(subjList)]
except OSError:
  sys.exit("ERROR: check that your participants are listed in the participants.tsv file")

In [691]:
if not (pd.Series(subjList).isin(subsetDf["participant_id"]).all()):
  sys.exit("ERROR: there are participants missing in your participants.tsv file")

In [692]:
if subsetDf[variable_to_classify].isnull().values.any():
  sys.exit("You have missing values in your selected variable for classification.")

### First Split test_set_size% of total data to test set

In [693]:
subjListKey={v: k for k, v in enumerate(subjList)}

In [694]:
y=subsetDf[variable_to_classify].to_numpy()
num_samples=len(y)

In [695]:
X=np.zeros(num_samples)

In [696]:
sss=StratifiedShuffleSplit(n_splits=2,test_size=test_set_size)
indices1,indices2=sss.split(X,y)

In [697]:
test_indices=indices1[1]

In [698]:
# remove test set from rest of data for re-splitting and save subj ids
test_subj=[]
for subj in subjListKey:
  if subjListKey[subj] in test_indices:
    indexNames=subsetDf[subsetDf['participant_id']==subj].index
    subsetDf=subsetDf.drop(indexNames)
    test_subj.append(subj)
    try: 
      subjList.remove(subj)
    except:
      'subject not in list anymore'

subjListKey={v: k for k, v in enumerate(subjList)}
subjListKey

{'sub-mgh01': 0,
 'sub-mgh02': 1,
 'sub-mgh03': 2,
 'sub-mgh04': 3,
 'sub-mgh05': 4,
 'sub-mgh06': 5,
 'sub-mgh07': 6,
 'sub-mgh08': 7,
 'sub-mgh09': 8,
 'sub-mgh10': 9,
 'sub-mgh11': 10,
 'sub-mgh12': 11,
 'sub-mgh13': 12,
 'sub-mgh14': 13,
 'sub-mgh16': 14,
 'sub-mgh17': 15,
 'sub-mgh19': 16,
 'sub-mgh20': 17}

### Split the rest with Stratified Shuffle Split


In [699]:
# recalculate val_set_size percentage based on remaining participants
new_val_setsize=(val_set_size*num_samples)/((val_set_size*num_samples)+((1-(val_set_size+test_set_size))*num_samples))
new_val_setsize=round(new_val_setsize,2)

In [700]:
y=subsetDf[variable_to_classify].to_numpy()
num_samples=len(y)
X=np.zeros(num_samples)

In [701]:
if num_samples*new_val_setsize>=2:
  sss=StratifiedShuffleSplit(n_splits=2,test_size=new_val_setsize)
  print('validation set size:', new_val_setsize)
else:
  sss=StratifiedShuffleSplit(n_splits=2,test_size=0.5)
  print('validation set size: 0.5')

indices1,indices2=sss.split(X,y)

validation set size: 0.22


In [702]:
train_indices=indices1[0]
validation_indices=indices1[1]

### Move subjects into respective train/test directories

In [703]:
train_dir=os.path.join(output_dir,'train')
val_dir=os.path.join(output_dir,'val')
test_dir=os.path.join(output_dir,'test')

try:
  os.mkdir(train_dir)
except OSError:
  print ("Creation of the directory %s failed" % train_dir)
else:
  print ("Successfully created the directory %s " % train_dir)

try:
  os.mkdir(test_dir)
except OSError:
  print ("Creation of the directory %s failed" % test_dir)
else:
  print ("Successfully created the directory %s " % test_dir)

try:
  os.mkdir(val_dir)
except OSError:
  print ("Creation of the directory %s failed" % val_dir)
else:
  print ("Successfully created the directory %s " % val_dir)

Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/train 
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/test 
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/val 


In [704]:
for subj in test_subj:
  print(subj, "is in test set")
  subjOrigDir=os.path.join(output_dir,subj)
  destination=test_dir  
  try:
    dest = shutil.move(subjOrigDir, destination) 
  except OSError:
    print("destination may already exist")

sub-mgh15 is in test set
sub-mgh18 is in test set


In [705]:
for subj in subjListKey:
  if subjListKey[subj] in train_indices:
    print(subj, "is in training set")
    subjOrigDir=os.path.join(output_dir,subj)
    destination=train_dir
    try:
      dest = shutil.move(subjOrigDir, destination) 
    except OSError:
      print("destination may already exist")

  elif subjListKey[subj] in validation_indices:
    print(subj, "is in validation set")
    subjOrigDir=os.path.join(output_dir,subj)
    destination=val_dir
    try:
     dest = shutil.move(subjOrigDir, destination) 
    except OSError:
      print("destination may already exist")

sub-mgh01 is in training set
sub-mgh02 is in training set
sub-mgh03 is in training set
sub-mgh04 is in training set
sub-mgh05 is in validation set
sub-mgh06 is in training set
sub-mgh07 is in training set
sub-mgh08 is in training set
sub-mgh09 is in training set
sub-mgh10 is in training set
sub-mgh11 is in validation set
sub-mgh12 is in training set
sub-mgh13 is in validation set
sub-mgh14 is in training set
sub-mgh16 is in training set
sub-mgh17 is in training set
sub-mgh19 is in validation set
sub-mgh20 is in training set
