<a href="https://colab.research.google.com/github/kaoriito/ohbm-hackthon2020/blob/master/HBM_brainhack2020_bids2Niftytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Converting BIDS data to niftytorch format

In [341]:
import os, sys
import glob
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

In [342]:
# Mount Google Drive inside the Colab runtime so the paths under
# /content/drive/ used by the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Set the input (BIDS) and output directories, and select the variable to classify

In [343]:
# Root of the input BIDS dataset: the sub-* folders and participants.tsv are looked up here.
bids_dir='/content/drive/My Drive/brainhack2020/SmallData/'
# Root of the converted dataset: per-subject folders and the train/val/test folders are created here.
output_dir='/content/drive/My Drive/brainhack2020/OutputData/'

In [374]:
# make sure this variable matches with the variable name in participants.tsv
# (it is used as a column label and as the stratification target for the splits)
variable_to_classify='sex'

### Get all subject directories and list of subjects

In [373]:
# Every entry of the BIDS root whose name starts with "sub-" is treated as a subject.
filepaths=os.path.join(bids_dir,'sub-*')
subj_dirs=sorted(glob.glob(filepaths))

# get list of subjects
# (the subject ID is the last path component, in the same sorted order as subj_dirs)
subjList=[os.path.basename(subj_path) for subj_path in subj_dirs]

In [346]:
# check if the variable_to_classify is in the participants.tsv file
# The header is read straight from participants.tsv (nrows=0 loads column names
# only) so this check works on a fresh kernel; `participant_metadata` is not
# loaded until a later cell.
participants_tsv_for_check=os.path.join(bids_dir,'participants.tsv')
if not os.path.exists(participants_tsv_for_check):
  print("ERROR: participants.tsv file missing. Do not continue without this file")
elif variable_to_classify not in pd.read_csv(participants_tsv_for_check, sep='\t', nrows=0).columns:
  print("ERROR: please make sure your variable is a column in your participants.tsv file")

### Make new folders for each subject



In [347]:
# Create the output directory, including any missing parent folders.
# An already existing directory is reported as such instead of as a failure,
# so this cell can be re-run safely.
if os.path.isdir(output_dir):
  print ("Directory %s already exists" % output_dir)
else:
  try:
    os.makedirs(output_dir)
  except OSError:
    print ("Creation of the directory %s failed" % output_dir)
  else:
    print ("Successfully created the directory %s " % output_dir)


Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/ 


In [348]:
# Create one output folder per subject, named after the subject's BIDS folder.
for subj in subj_dirs:
  subjname=os.path.basename(subj)
  print(subjname)
  newdirpath=os.path.join(output_dir,subjname)

  # A folder left over from a previous run is not an error: report it and move on.
  if os.path.isdir(newdirpath):
    print ("Directory %s already exists" % newdirpath)
    continue

  try:
    # makedirs also creates output_dir itself if it is missing
    os.makedirs(newdirpath)
  except OSError:
    print ("Creation of the directory %s failed" % newdirpath)
  else:
    print ("Successfully created the directory %s " % newdirpath)

sub-mgh01
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh01 
sub-mgh02
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh02 
sub-mgh03
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh03 
sub-mgh04
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh04 
sub-mgh05
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh05 
sub-mgh06
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/sub-mgh06 


### Copy nifti files from bids dir to output directory

In [349]:
def copyImageFiles(dirContainingModalityDirs,subjOutputDir):
  """Copy the NIfTI images found one level below a directory into subjOutputDir.

  Parameters:
    dirContainingModalityDirs: path to the parent directory of the modality
      directories (either the subject or the session level).
    subjOutputDir: the subject output directory that receives the copies.

  Every entry of dirContainingModalityDirs is searched for files matching
  '*.nii*'. One status line is printed per file; a copy that raises OSError
  is reported and skipped. Nothing is returned.
  """
  modality_pattern=os.path.join(dirContainingModalityDirs,'*')

  for modality_path in sorted(glob.glob(modality_pattern)):
    nifti_pattern=os.path.join(modality_path,'*.nii*')

    for nifti_path in sorted(glob.glob(nifti_pattern)):
      try:
        shutil.copy(nifti_path,subjOutputDir)
      except OSError:
        print("unable to copy nifti files")
        continue
      print("copied files successfully")

In [350]:
# Copy the images of every subject into that subject's output folder.
for subj in subj_dirs:
  subjID=os.path.basename(subj)
  subjOutputDir=os.path.join(output_dir,subjID)

  # some BIDS directories may have session folders.
  sespath=os.path.join(subj,'ses-*')
  ses_dirs=sorted(glob.glob(sespath))

  # With sessions, the modality folders are searched inside each ses-* folder;
  # without sessions, they are searched directly inside the subject folder.
  for workingdir in (ses_dirs if ses_dirs else [subj]):
    copyImageFiles(workingdir,subjOutputDir)

copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully
copied files successfully


# Assign subjects to test/train datasets

### Check that a participants.tsv file exists within the BIDS directory

In [354]:
# participants.tsv is expected directly inside the BIDS root.
participantsTsvPath=os.path.join(bids_dir,'participants.tsv')
participantsTsvExists=os.path.exists(participantsTsvPath)

# Report whether the file is there; the following cells read it.
if participantsTsvExists:
  print("participants.tsv file found")
else:
  print("ERROR: participants.tsv file missing. Do not continue without this file")

participants.tsv file found


### Read in the participants.tsv file as a dataframe

In [375]:
# Load the tab-separated participants table and order its rows by participant ID.
participant_metadata=(
  pd.read_csv(participantsTsvPath, sep='\t')
    .sort_values('participant_id')
)

### Get subset of participants.tsv that we actually have data for

In [356]:
# Keep only the participants.tsv rows of subjects that have a data folder.
subsetDf=participant_metadata[participant_metadata["participant_id"].isin(subjList)]

# Boolean indexing never raises OSError, so a subject without a row in
# participants.tsv used to pass silently. Check for it explicitly: later cells
# pair the rows of subsetDf with positions in subjList, which only line up when
# every subject has a row.
missingSubj=sorted(set(subjList)-set(subsetDf["participant_id"]))
if missingSubj:
  print("ERROR: check that your participants are listed in the participants.tsv file")
  print("Subjects without a row in participants.tsv:", missingSubj)

In [357]:
# Warn when any retained subject has no value for the classification variable.
has_missing_labels=subsetDf[variable_to_classify].isna().any()
if has_missing_labels:
  print("You have missing values in your selected variable for classification.")

### First, split off 20% of the total data as the validation set

In [376]:
# Map each subject ID to its position in subjList, so the integer indices
# produced by the splitter can be translated back to subject IDs.
subjListKey=dict(zip(subjList,range(len(subjList))))

In [359]:
# Labels used for stratification: one entry per row of subsetDf.
y=subsetDf[variable_to_classify].to_numpy()
num_samples=y.shape[0]

In [360]:
# Placeholder feature array: it only supplies the number of samples to the splitter.
X=np.zeros((num_samples,))

In [361]:
# first, do 80/20 split (the test set will be the validation set)
sss=StratifiedShuffleSplit(n_splits=2,test_size=0.20)
indices1,indices2=sss.split(X,y)

In [362]:
# indices1 is the first split; its second element (the "test" part) is used as the validation set.
remaining_indices,validation_indices=indices1
print(validation_indices)

[5 2]


In [363]:
# remove validation set from rest of data for re-splitting and save subj ids
val_subj=[]
for subj in subjListKey:
  if subjListKey[subj] in validation_indices:
    indexNames=subsetDf[subsetDf['participant_id']==subj].index
    subsetDf=subsetDf.drop(indexNames)
    val_subj.append(subj)
    try: 
      subjList.remove(subj)
    except:
      'subject not in list anymore'

subjListKey={v: k for k, v in enumerate(subjList)}
subjListKey

{'sub-mgh01': 0, 'sub-mgh02': 1, 'sub-mgh04': 2, 'sub-mgh05': 3}

### Split the rest with Stratified Shuffle Split


In [364]:
# Recompute the splitter inputs from the current subsetDf.
y=subsetDf[variable_to_classify].to_numpy()
num_samples=y.shape[0]
# Placeholder feature array: it only supplies the number of samples to the splitter.
X=np.zeros((num_samples,))

In [366]:
# Split the remaining subjects into train and test.
# With fewer than 8 samples an even 50/50 split is used, otherwise 75/25.
# random_state is fixed so that re-running the notebook reproduces the same
# train/test assignment.
if num_samples>=8:
  sss=StratifiedShuffleSplit(n_splits=2,test_size=0.25,random_state=42)
else:
  sss=StratifiedShuffleSplit(n_splits=2,test_size=0.5,random_state=42)

# n_splits=2 yields two splits; only the first (indices1) is used afterwards.
indices1,indices2=sss.split(X,y)

In [367]:
# Unpack the first split into its train part and its test part.
train_indices,test_indices=indices1

### Move subjects into their respective train/val/test directories

In [368]:
# Destination folders for the three subsets, all directly under output_dir.
train_dir=os.path.join(output_dir,'train')
val_dir=os.path.join(output_dir,'val')
test_dir=os.path.join(output_dir,'test')

# Create the three folders with one loop instead of three copy-pasted blocks.
# A folder that already exists (e.g. when the notebook is re-run) is reported
# as such rather than as a failure.
for split_dir in (train_dir,test_dir,val_dir):
  if os.path.isdir(split_dir):
    print ("Directory %s already exists" % split_dir)
    continue

  try:
    os.makedirs(split_dir)
  except OSError:
    print ("Creation of the directory %s failed" % split_dir)
  else:
    print ("Successfully created the directory %s " % split_dir)

Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/train 
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/test 
Successfully created the directory /content/drive/My Drive/brainhack2020/OutputData/val 


In [369]:
# Relocate the output folder of every validation subject into val_dir.
for subj in val_subj:
  subjOrigDir=os.path.join(output_dir,subj)
  destination=val_dir
  print(subj, "is in validation set")

  try:
    dest=shutil.move(subjOrigDir,destination)
  except OSError:
    # shutil.move failed; the notebook reports it and carries on with the next subject
    print("destination may already exist")

sub-mgh03
sub-mgh06


In [371]:
# Relocate the output folder of every remaining subject into train_dir or
# test_dir, depending on which index array contains the subject's position.
for subj, position in subjListKey.items():
  if position in train_indices:
    message,destination="is in training set",train_dir
  elif position in test_indices:
    message,destination="is in testing set",test_dir
  else:
    # position belongs to neither subset: leave the folder where it is
    continue

  print(subj, message)
  subjOrigDir=os.path.join(output_dir,subj)
  try:
    dest=shutil.move(subjOrigDir,destination)
  except OSError:
    # shutil.move failed; the notebook reports it and carries on with the next subject
    print("destination may already exist")

sub-mgh01 is in training set
destination may already exist
sub-mgh02 is in testing set
destination may already exist
sub-mgh04 is in testing set
destination may already exist
sub-mgh05 is in training set
destination may already exist
