Install relevant packages and modules

In [1]:
!apt-get install -y dcm2niix # '!' runs commands in the console
!apt-get install -y parallel

# dependencies for freesurfer
!apt-get install -y wget
!apt-get install -y grep
!apt-get install -y tcsh
!apt-get install -y bc

!pip install nibabel
!pip install pydicom

import glob
import os
import subprocess

import numpy as np
import pandas as pd

import nibabel as nib
import pydicom

# install freesurfer (might take a few minutes)

# if you are having issues downloading freesurfer, it is likely due to the version selected here
!wget -O freesurfer.tar.gz https://freesurfer.net/pub/dist/freesurfer/7.4.1/freesurfer-linux-ubuntu22_amd64-7.4.1.tar.gz
!tar -xzf freesurfer.tar.gz

# set the relevant freesurfer directories
os.environ['FREESURFER_HOME'] = '/content/freesurfer'
os.environ['SUBJECTS_DIR'] = '/content/freesurfer_output'
os.environ['PATH'] += ':/content/freesurfer/bin'

!source /content/freesurfer/SetUpFreeSurfer.sh

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libyaml-cpp0.7 pigz
The following NEW packages will be installed:
  dcm2niix libyaml-cpp0.7 pigz
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 417 kB of archives.
After this operation, 1,393 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 pigz amd64 2.6-1 [63.6 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libyaml-cpp0.7 amd64 0.7.0+dfsg-8build1 [97.7 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 dcm2niix amd64 1.0.20211006-1build1 [256 kB]
Fetched 417 kB in 1s (722 kB/s)
Selecting previously unselected package pigz.
(Reading database ... 123622 files and directories currently installed.)
Preparing to unpack .../archives/pigz_2.6-1_amd64.deb ...
Unpacking pigz (2.6-1) ...
Selecting previously unselected package libyaml-cpp

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

Add the FreeSurfer License path. This is free to get but due to their policy I cannot include it here. It is available at https://surfer.nmr.mgh.harvard.edu/registration.html

In [None]:
from google.colab import files
license = files.upload()

# change this to what you named the license
your_file_name = 'freesurfer_license.txt'
license_path = f'/content/{your_file_name}'

# stop right away with a clear message if the license is not where we are about to point
# freesurfer, instead of finding out when a later freesurfer command refuses to run
if not os.path.isfile(license_path):
  raise FileNotFoundError(
    f'No license file found at {license_path}; '
    'set your_file_name to the name of the file you uploaded'
  )

# set the environment to use the license
os.environ['FS_LICENSE'] = license_path

Saving freesurfer_license.txt to freesurfer_license.txt


Get anonymized (T1 weighted) dicom files for analysis, and create temporary folders to store these and other files.

In [2]:
# clone the repository to get the raw dicoms
!git clone https://github.com/datalad/example-dicom-structural

# create folders for the NIfTI conversions and corresponding recons
!mkdir /content/nii_files/
!mkdir /content/freesurfer_output/

'''
Note: "T1 weighting" refers to a form of MRI image processing performed by technicians.
Online resources will nearly always provide processed MRI images,
though they may not be T1 weighted
''';

Cloning into 'example-dicom-structural'...
remote: Enumerating objects: 393, done.[K
remote: Total 393 (delta 0), reused 0 (delta 0), pack-reused 393 (from 1)[K
Receiving objects: 100% (393/393), 15.45 MiB | 18.61 MiB/s, done.
Resolving deltas: 100% (223/223), done.


Convert the DICOMs to NIfTIs

In [None]:
# prepare paths
INPUT_PATH = '/content/example-dicom-structural/dicoms/'  # DICOM folder inside the repository cloned above
OUTPUT_PATH = '/content/nii_files/'  # folder created above for the converted files

# grep to ignore some warnings regarding the manufacturer (since we're using sample dicoms)
# IPython replaces {OUTPUT_PATH} and {INPUT_PATH} with the Python values before the shell runs;
# -o tells dcm2niix where to write, and `grep -v` drops every output line containing the given text
!dcm2niix -o '{OUTPUT_PATH}' '{INPUT_PATH}' | grep -v "Unknown manufacturer"

Chris Rorden's dcm2niiX version v1.0.20211006  (JP2:OpenJPEG) GCC11.2.0 x86-64 (64-bit Linux)
Found 384 DICOM file(s)
Convert 384 DICOM as /content/nii_files/dicoms_anat-T1w_20130717141500_401 (274x384x384x1)
Conversion required 0.412129 seconds (0.389445 for core code).


Reconstruct (recon) the NIfTI files


GNU Parallel allows you to recon multiple subjects simultaneously by running n jobs at once, typically one per CPU core (set with --jobs n). It is not needed here because there is only one subject, but it is good to be aware of.

Recons take a long time (4+ hours per NIfTI file) and are quite large (~100 MB). For this reason, I have included the finished recon within this repository, with only the essential files kept.

In [None]:
# prepare paths
INPUT_PATH = '/content/freesurfer_output'
nii_paths = glob.glob('/content/nii_files/*.nii') # list of paths
ALL_NII = ' '.join(nii_paths)  # format that parallel wants

# use parallel to execute recon-all on each NIfTI file
!parallel --jobs 1 recon-all -i {} -s {/.} -all ::: /content/nii_files/*.nii

Load in the sample recon

In [3]:
# download the tutorial repository, which contains the finished sample recon
!git clone https://github.com/SamAndTheSun/sMRI_BrainAge_Tutorial.git
# folder of the sample recon inside the cloned repository; the analysis cells below read from it
recon_path = '/content/sMRI_BrainAge_Tutorial/sample_recon'

Cloning into 'sMRI_BrainAge_Tutorial'...
remote: Enumerating objects: 254, done.[K
remote: Counting objects: 100% (103/103), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 254 (delta 50), reused 0 (delta 0), pack-reused 151 (from 1)[K
Receiving objects: 100% (254/254), 15.13 MiB | 7.65 MiB/s, done.
Resolving deltas: 100% (98/98), done.


--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

Deep Learning Analysis Example 1: CNN using brain.mgz


The brain.mgz file represents the combination of each individual "slice" of the brain stitched together to form a single cohesive volume. We can use this file to construct a CNN, trained on 3D images, to predict brain age

In [None]:
# load in the brain.mgz files
# recon_path is itself the subject folder (the surface files are read from {recon_path}/surf/),
# so the volume is expected directly under {recon_path}/mri/ and not one folder deeper.
# sorted so that the order is reproducible and lines up with the other sorted file lists
brain_files = sorted(glob.glob(f'{recon_path}/mri/brain.mgz'))

Convert the brain files to the same space, so that the model is fed consistent data

In [None]:
# get the converted files

for subj_brain in brain_files:

  # convert the brain to
  mri_vol2vol -i {} -s {/.} -all

  # get the brain data
  mgz_file = nib.load(subj_brain)
  brain_data = mgz_file.get_fdata()

  # convert the brain data to native space using freesurfer
  mri_vol2vol -i {} -s {/.} -all


--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

Deep Learning Analysis Example 2: graphSAGE using the cortex files

The .pial files (one for each hemisphere) represent the geometric vertices and faces (i.e. the connections between vertices) for each individual's cortex. FreeSurfer also provides other files, such as the cortical thickness (.thickness) and the white-grey matter intensity ratio (.w-g.pct.mgh), which provide information on each of these vertices.

We can use these files in conjunction to construct a graph of each individual's brain, with vertices serving as nodes, faces serving as edges, and attributes serving as features.

In [26]:
# glob returns paths in no guaranteed order, so every list is sorted as soon as it is built;
# alphabetical order keeps the lists aligned with each other (and puts lh before rh)

# subject folders (only one here)
subj_files = sorted(glob.glob(f'{recon_path}'))

# pial surfaces; *h matches both hemispheres
pial_files = sorted(glob.glob(f'{recon_path}/surf/*h.pial'))

# cortical thickness files
thickness_files = sorted(glob.glob(f'{recon_path}/surf/*h.thickness'))

# white-grey matter intensity ratio files
wg_ratio_files = sorted(glob.glob(f'{recon_path}/surf/*h.w-g.pct.mgh'))

# make a dictionary for the training data (we will make it structured as: subjects -> nodes -> features)
training_data = {}

# make a dictionary for the edge indices (subjects -> array of shape (2, n_edges))
edge_indices = {}

# loop through the subjects to construct the desired dictionaries
for i, subj in enumerate(subj_files): # since we only have one subject, this only runs once

  # get the last part of the path
  subj_id = subj.split('/')[-1]

  # every subject contributes two consecutive entries to each sorted file list (lh first, then rh),
  # so subject i owns positions 2*i and 2*i+1
  lh, rh = 2 * i, 2 * i + 1

  # get the vertex and face data for the subject
  lh_vertices, lh_faces = nib.freesurfer.read_geometry(pial_files[lh])
  rh_vertices, rh_faces = nib.freesurfer.read_geometry(pial_files[rh])

  # vertices are x/y/z coordinates (not indices), so the hemispheres are stacked unchanged
  vertices = np.vstack((lh_vertices, rh_vertices))

  # do the same for every other file type
  lh_thickness = nib.freesurfer.io.read_morph_data(thickness_files[lh]) # be mindful of which nib reading variant to use
  rh_thickness = nib.freesurfer.io.read_morph_data(thickness_files[rh])
  thickness = np.hstack((lh_thickness, rh_thickness)) # h-stack because these are single-dimension arrays

  lh_ratio = nib.load(wg_ratio_files[lh]).get_fdata()
  rh_ratio = nib.load(wg_ratio_files[rh]).get_fdata()
  ratio = np.vstack((lh_ratio, rh_ratio))
  ratio = ratio.squeeze() # this has dimensions (n_nodes, 1, 1) otherwise

  # one row per node: x, y, z, thickness, ratio
  # (column_stack raises if the three sources disagree on the number of nodes)
  node_features = np.column_stack((vertices, thickness, ratio))
  training_data[subj_id] = node_features.tolist() # keep the subjects -> nodes -> features list structure

  # faces store vertex indices counted from 0 within their own hemisphere, so the rh indices are
  # shifted by the number of lh vertices to point at the right rows of the stacked vertex array.
  # astype also gives a plain native-endian integer array regardless of how the file stored it
  faces = np.vstack((lh_faces, rh_faces + lh_vertices.shape[0])).astype(np.int64)

  # each triangular face contributes its three sides as edges
  face_sides = np.vstack((faces[:, [0, 1]], faces[:, [0, 2]], faces[:, [1, 2]]))

  # these edges are undirected and thus should include both directions;
  # np.unique then drops the duplicates created by neighbouring faces sharing a side
  both_directions = np.vstack((face_sides, face_sides[:, ::-1]))
  edge_index = np.unique(both_directions, axis=0).T
  edge_indices[subj_id] = edge_index

# measure each level of the nested structure, using the first subject as the example
first_subject_nodes = next(iter(training_data.values()))

num_subjects = len(training_data)
num_nodes = len(first_subject_nodes)
num_values = len(next(iter(first_subject_nodes))) # spatial position counts as 3 features: x, y, and z
num_edges = edge_indices[subj_id].shape[1]

# report the sizes, one per line
for label, size in (
  ("Number of subjects:", num_subjects),
  ("Number of nodes (in subject 1):", num_nodes),
  ("Number of features:", num_values),
  ("Number of num_edges (in subject 1):", num_edges),
):
  print(label, size)

Number of subjects: 1
Number of nodes (in subject 1): 320845
Number of features: 5
Number of num_edges (in subject 1): 1925046


The next step is to get the data regarding participant ages, or whatever it is that we want to predict. Many datasets will include "demographic.csv" or "metadata.csv" files, but sometimes they won't. In these cases, we need to extract the metadata from the original DICOM files.

We can reasonably expect that every DICOM for a given subject will have the same demographic information within its metadata, so we only need to look at any random DICOM file for each subject. Let's take a look at the metadata.

In [5]:
# each sub-folder of the cloned repository is treated as one subject (only one in this case)
dicom_paths = glob.glob('/content/example-dicom-structural/*/')

for path in dicom_paths:

  # any single DICOM of the subject will do, so take the first one glob returns
  subj_dicoms = glob.glob(f'{path}/*')
  target_dicom = subj_dicoms[0]

  # read the file so that its header elements can be inspected
  metadata = pydicom.dcmread(target_dicom)

  # show every element of the metadata, one per line
  for elem in metadata.iterall():
    print(elem)

(0008,0008) Image Type                          CS: ['DERIVED', 'SECONDARY']
(0008,0016) SOP Class UID                       UI: MR Image Storage
(0008,0018) SOP Instance UID                    UI: 1.2.826.0.1.3680043.2.1143.7980170295326065434086375780975261994
(0008,0020) Study Date                          DA: '20130717'
(0008,0021) Series Date                         DA: '20130717'
(0008,0022) Acquisition Date                    DA: '20130717'
(0008,0023) Content Date                        DA: '20130717'
(0008,0030) Study Time                          TM: '141500'
(0008,0031) Series Time                         TM: '142035.93000'
(0008,0032) Acquisition Time                    TM: '132518'
(0008,0033) Content Time                        TM: '142035.93'
(0008,0050) Accession Number                    SH: ''
(0008,0060) Modality                            CS: 'MR'
(0008,0070) Manufacturer                        LO: 'BIOLAB'
(0008,0080) Institution Name                    LO: ''
(000

We can see that patient age is present. Our next step is to construct a loop that assembles the patient ages to line up correctly with the feature data

In [6]:
def parse_dicom_age(age):
  """Return a patient age in whole years.

  Accepts a bare number such as '55' (or 55) as well as the year form '055Y' / '55Y'.
  Raises ValueError for anything else (e.g. ages given in months, weeks or days),
  because silently treating those as years would corrupt the labels.
  """
  text = str(age).strip()
  digits = text[:-1] if text[-1:] in ('Y', 'y') else text
  if not digits.isdigit():
    raise ValueError(f'unsupported age value: {age!r} (expected e.g. "55" or "055Y")')
  return int(digits)


# select all subject folders (only one in this case)
# SORT the dicom paths. This is essential to making sure that the data is aligned.
# By sorting across all usages of glob we can make sure our results are consistent.
dicom_paths = sorted(glob.glob('/content/example-dicom-structural/*/'))

# create a list for all of the subjects
ages = []

for path in dicom_paths:

  # select an arbitrary DICOM file within the subject folder
  # (sorted first, so the same file is picked on every run)
  subj_dicoms = sorted(glob.glob(f'{path}/*'))
  target_dicom = subj_dicoms[0] # 0 is arbitrary

  # get the metadata
  metadata = pydicom.dcmread(target_dicom)

  # get the age (DICOM tag 0010,1010) and add it to the list for all subjects
  age = metadata[(0x0010, 0x1010)].value

  # typically, age will be in the format str('55Y'),
  # but it varies by dataset. Here, it is simply str('55'); parse_dicom_age handles both
  ages.append(parse_dicom_age(age))

print(ages)

[42]


Now let's get the relevant dependencies for the next steps (formatting the data, then building a model using GraphSAGE)

In [23]:
!pip install torch-geometric

import torch
from torch_geometric.data import Data, Batch



Format the Data and Batch objects in accordance with PyTorch Geometric's specifications. We do this by creating a unique Data object for each subject and adding this to a list, with this list then being used to create a Batch object for training

In [27]:
# one Data object per subject, collected in the order the subjects were added
data_list = []

for subject in training_data:
    features = training_data[subject]

    # node features of this subject as a float tensor
    x = torch.tensor(features, dtype=torch.float)

    # connectivity of this subject as a long (integer) tensor
    edge_index = torch.tensor(edge_indices[subject], dtype=torch.long)

    # bundle both tensors into the subject's graph and keep it
    data = Data(x=x, edge_index=edge_index)
    data_list.append(data)

# merge the per-subject graphs into a single Batch object; the bare name displays its summary
batch = Batch.from_data_list(data_list)
batch

DataBatch(x=[320845, 5], edge_index=[2, 1925046], batch=[320845], ptr=[2])