In [1]:
import getpass
import pprint
import os

import pandas
import qmenta.client
from tqdm import tqdm

# 1. Sign up and log in to QMENTA

Go to https://platform.qmenta.com/#/register, create an account to use the QMENTA platform, and then log in with your newly created account to access the platform.

#### PROMOTIONAL CODE: **HACKATONSALUD2018**

![QMENTA Platform Registration form](assets/qmenta_platform_registration.png)

Check this **Getting started** article in case you need help with registration and login

https://support.qmenta.com/hc/en-us

# 2. Set up the QMENTA Client

Fill in the following cells with your username and password

In [2]:
# Placeholder value: replace 'user' with your own QMENTA platform username.
username = 'user'

In [3]:
# Ask for the password interactively so it is never written into the notebook.
password = getpass.getpass('Password:')

Instantiate a QMENTA Client account, and get a Project instance to interact with the Hackaton project in the platform 

In [4]:
# Build the QMENTA client Account from the username and password defined in the previous cells.
acc = qmenta.client.Account(username=username, password=password)

In [5]:
# Look up the challenge project by its name; every later platform call goes through `project`.
project = acc.get_project('CNIC-QMENTA 1000 Brains Challenge')

# 3. List all the subjects in the project and their metadata

Get all the subjects and sort them by patient_secret_name

In [6]:
# Fetch the metadata of every subject and order it by SubjectID in one step.
subjects_metadata = sorted(
    project.get_subjects_metadata(),  # returns a list of dictionaries
    key=lambda subject: subject['patient_secret_name']
)

We should have 1100 subjects

In [7]:
# Sanity check: 1100 subjects are expected (the 1000 train + 100 test subjects checked below).
assert len(subjects_metadata) == 1100

Let's have a look at the data structure that represents each subject

In [8]:
# Show the first subject in patient_secret_name order as an example record.
subjects_metadata[0]

{u'_id': 144335.0,
 u'age_at_scan': None,
 u'container_id': 194999,
 u'data_location': u'eu',
 u'date_at_scan': {u'$date': 1480032000000},
 u'md_age': 31.0,
 u'md_gender': None,
 u'md_handedness': None,
 u'md_set': u'train',
 u'owner': u'Albert Puente Encinas',
 u'patient_secret_name': u'28329',
 u'qa_comments': u'',
 u'qa_status': u'',
 u'ssid': u'1',
 u'tags': [],
 u'user_id': u'apuente'}

The relevant fields are:
- **patient_secret_name**: unique identifier for this subject in this project. We also refer to this field as **SubjectID** 
- **md_set**: identifies if the subject belongs to the 'train' or the 'test' set
- **md_age**: the age of the subject

Let's separate the subjects into the train and test sets

In [9]:
# Training subjects are the ones whose 'md_set' field equals 'train'.
train_subjects = list(filter(lambda subject: subject['md_set'] == 'train', subjects_metadata))
# Sanity check: the train set is expected to contain 1000 subjects.
assert len(train_subjects) == 1000

In [10]:
# Test subjects are the ones whose 'md_set' field equals 'test'.
test_subjects = list(filter(lambda subject: subject['md_set'] == 'test', subjects_metadata))
# Sanity check: the test set is expected to contain 100 subjects.
assert len(test_subjects) == 100

We will only use the subjects in 'train' to build a CSV which maps the SubjectID to the Age.

In [11]:
# Map each SubjectID to a one-entry record holding the subject's age.
train_csv_info_dict = dict(
    (subject['patient_secret_name'], {'Age': subject['md_age']})
    for subject in train_subjects
)

In [12]:
# orient='index' turns each dictionary key (the SubjectID) into a row label, leaving 'Age' as the only column.
train_csv_dataframe = pandas.DataFrame.from_dict(train_csv_info_dict, orient='index')

In [13]:
# Preview only the first rows instead of dumping the whole table (one row per train subject).
train_csv_dataframe.head(10)

Unnamed: 0,Age
28329,31.0
28330,35.0
28338,30.0
28339,65.0
28341,35.0
28342,32.0
28347,86.0
28348,23.0
28349,64.0
28352,30.0


Let's create a local folder to store all the data from the Hackaton 

In [14]:
# Keep all challenge data in a folder under the current user's home directory ('~' is expanded here).
hackaton_dir = os.path.expanduser('~/qmenta_cnic_1000_brains_challenge')
print(hackaton_dir)

/home/santi/qmenta_cnic_1000_brains_challenge


In [15]:
# Create the folder (and any missing parents) only when it is not there yet,
# so that re-running this cell does not fail on an existing folder.
if not os.path.isdir(hackaton_dir):
    os.makedirs(hackaton_dir)

Now we will store the pandas DataFrame as a CSV in this folder, so that we can use it later to train a Machine Learning model.

In [16]:
# Writes <hackaton_dir>/train.csv; the DataFrame index (the SubjectID) becomes the first, unnamed CSV column.
train_csv_dataframe.to_csv(os.path.join(hackaton_dir, 'train.csv'))

# 4. Fetch the analysis results for all subjects

We will again list all the completed analyses in the platform and sort them by patient secret name

In [17]:
# Keep only the analyses whose state is 'completed' and order them by SubjectID.
analysis = sorted(
    (item for item in project.list_analysis() if item['state'] == 'completed'),
    key=lambda item: item['patient_secret_name']
)

Again we should have 1100 analyses

In [18]:
# Sanity check: 1100 completed analyses are expected, the same as the number of subjects.
assert len(analysis) == 1100

Let's see the data structure that represents each analysis

In [19]:
# Show the first completed analysis in patient_secret_name order as an example record.
analysis[0]

{u'_id': 74397,
 u'config': {u'time_project_end': {u'$date': 1525476383015},
  u'time_project_start': {u'$date': 1525458856529}},
 u'description': u'',
 u'in_container_id': 194999,
 u'name': u'ANTs Morphology 2.1 (v.4.6)',
 u'out_container_id': 197028,
 u'owner': u'Albert Puente Encinas',
 u'patient_secret_name': u'28329',
 u'progress': [-1, u'Completed', {u'$date': 1525476382889}],
 u'projectset_id': 1658,
 u'qa_comments': u'',
 u'qa_status': u'',
 u'script_name': u'qmenta_ants_morphology_2',
 u'settings': {u'acpc_alignment': u'1',
  u'age_months': 368.88,
  u'alignment_tool': u'ants',
  u'atlas_template': u'DKT40',
  u'do_thickness': u'1',
  u'input': {u'container_id': 194999,
   u'date': {u'$date': 1480032000000},
   u'filters': {u'c_T1': {u'files': [{u'_id': 2542921.0,
       u'modality': u'T1',
       u'name': u'anat.nii.gz',
       u'tags': []}],
     u'has_to_choose': 0,
     u'passed': True,
     u'range': [1, 1]}},
   u'in_out': u'in',
   u'passed': True,
   u'ssid': u'1',
   

As mentioned before, the analysis run for this database quantifies volumetry and morphometry of the brain given a structural MR image, concretely a T1-weighted MR image.
The specific tool that we used is built upon [ANTs](https://github.com/ANTsX/ANTs). You can learn about the specifics of this tool in this support article [here](https://support.qmenta.com/hc/en-us/articles/115000760611-ANTs-Morphology-2-1-0-).

The important field in this case is the **out_container_id**, because this identifies the data container that stores the result files. We will need to download one or more of these result files to use them as the predictor variables for our ML model.

Let's see the typical set of files produced by an ANTs analysis

In [20]:
# Take the output container of the first analysis as the example to inspect.
out_container_id_example = analysis[0]['out_container_id']
# List the metadata of every file stored in that container.
results_files_example = project.list_container_files_metadata(out_container_id_example)
# pprint keeps the nested list of dictionaries readable.
pprint.pprint(results_files_example)

[{u'metadata': {u'format': u'nifti', u'info': {}, u'modality': u'T1'},
  u'name': u'T1_original.nii.gz',
  u'size': 6567956,
  u'tags': []},
 {u'metadata': {u'format': u'nifti', u'info': {}, u'modality': u'T1'},
  u'name': u'T1_acpc.nii.gz',
  u'size': 27649938,
  u'tags': [u'head']},
 {u'metadata': {u'format': u'nifti', u'info': {}, u'modality': u'T1'},
  u'name': u'T1strip.nii.gz',
  u'size': 5647925,
  u'tags': [u'strip']},
 {u'metadata': {u'format': None, u'info': {}},
  u'name': u'CSF.nii.gz',
  u'size': 209818,
  u'tags': []},
 {u'metadata': {u'format': None, u'info': {}},
  u'name': u'WM.nii.gz',
  u'size': 186476,
  u'tags': []},
 {u'metadata': {u'format': None, u'info': {}},
  u'name': u'GM.nii.gz',
  u'size': 320731,
  u'tags': []},
 {u'metadata': {u'format': None, u'info': {}},
  u'name': u'T1strip_bin.nii.gz',
  u'size': 114762,
  u'tags': [u'mask']},
 {u'metadata': {u'format': None, u'info': {}},
  u'name': u'tissueSegmentation.nii.gz',
  u'size': 293473,
  u'tags': [u'tis

As you can see ANTs generates a lot of intermediate results, however we are only interested in a few things:
- T1strip.nii.gz (modality=T1, tags=strip): skull-stripped brain, in case you want to train a Machine or Deep Learning algorithm on the raw structural data.
- thickness.nii.gz (tags=thickness): thickness map of the brain, in which each voxel belonging to the cortex has a value that indicates the thickness in mm.
- tissueSegmentation.nii.gz (tags=tissue_segmentation): segmentation map of the brain into its main tissues, namely Gray Matter, White Matter, CerebroSpinal Fluid (CSF), Deep Brain, Brain-Stem and Cerebellum.
- labels.nii.gz (tags=labels): Brain parcellation of the cortex and other important structures.
- volumetric.csv: CSV with volume, average thickness and standard deviation of thickness information for each tissue and region in the brain.

In our example we will only use the information provided by **volumetric.csv**, however you are free to use any of the result files available in there, and even the original T1 image.

Let's download the volumetric.csv for our example analysis and inspect it using pandas

In [21]:
# Save the example analysis' volumetric.csv to a fixed scratch path (/tmp, so POSIX-only).
# The value returned by download_file is displayed as this cell's output.
project.download_file(container_id=out_container_id_example, file_name='volumetric.csv', local_filename='/tmp/volumetric.csv', overwrite=True)

True

In [22]:
# Load the example file that the previous cell saved to /tmp/volumetric.csv.
volumetric_csv_example = pandas.read_csv('/tmp/volumetric.csv')

In [23]:
# Display the table; per the output below it has one row per tissue/region (label, group, volume, ...).
volumetric_csv_example

Unnamed: 0,x,y,z,t,value,mass,volume,count,label,group
0,89.976523,108.760726,88.164410,0,1,335782,3.357820e+05,335782,CSF,CSF
1,88.975179,107.440289,81.721023,0,2,571448,5.714480e+05,571448,Gray matter,Gray matter
2,89.011984,108.351186,85.822648,0,3,453088,4.530880e+05,453088,White matter,White matter
3,89.754156,119.133126,69.046722,0,4,39639,3.963900e+04,39639,Deep brain,Deep brain
4,88.972211,94.929482,39.321145,0,5,19144,1.914400e+04,19144,Brain-Stem,Brain-Stem
5,89.485400,66.568722,38.616717,0,6,159934,1.599340e+05,159934,Cerebellum,Cerebellum
6,0.000000,0.000000,0.000000,0,-1,0,1.579035e+06,0,ICV,ICV
7,0.000000,0.000000,0.000000,0,-1,0,7.873499e+01,0,BPF,BPF
8,100.538313,108.214466,75.613902,0,10,6747,6.747000e+03,6747,thalamusproper_L,Subcortical-Left
9,104.428624,136.369541,76.726606,0,11,2725,2.725000e+03,2725,caudate_L,Subcortical-Left


Now we can download all the **volumetric.csv** for all subjects in order to use this information to train an ML algorithm to predict the age.

First we will create a folder to store all the volumetric data in our computer from the train and test sets

In [24]:
# Folder inside hackaton_dir that will receive the volumetric CSV files of the train subjects.
train_volumetric_data_dir = os.path.join(hackaton_dir, 'train')
print(train_volumetric_data_dir)

/home/santi/qmenta_cnic_1000_brains_challenge/train


In [25]:
# Create the train folder only when it is missing, so re-running this cell is harmless.
if not os.path.isdir(train_volumetric_data_dir):
    os.makedirs(train_volumetric_data_dir)

In [26]:
# Folder inside hackaton_dir that will receive the volumetric CSV files of the test subjects.
test_volumetric_data_dir = os.path.join(hackaton_dir, 'test')
print(test_volumetric_data_dir)

/home/santi/qmenta_cnic_1000_brains_challenge/test


In [27]:
# Create the test folder only when it is missing, so re-running this cell is harmless.
if not os.path.isdir(test_volumetric_data_dir):
    os.makedirs(test_volumetric_data_dir)

We distinguish between analyses from the train and test sets

In [28]:
# Sets of SubjectIDs, used for membership tests when splitting the analyses.
train_subjects_set = {subject['patient_secret_name'] for subject in train_subjects}
test_subjects_set = {subject['patient_secret_name'] for subject in test_subjects}

In [29]:
# Split the analyses in a single pass, according to the set their subject belongs to.
train_analysis = []
test_analysis = []
for analysis_item in analysis:
    subject_id = analysis_item['patient_secret_name']
    if subject_id in train_subjects_set:
        train_analysis.append(analysis_item)
    # Checked independently of the train membership, exactly like two separate filters.
    if subject_id in test_subjects_set:
        test_analysis.append(analysis_item)

For each analysis we download the volumetric data

In [30]:
# SubjectIDs whose volumetric.csv could not be downloaded. download_file returned
# True for the successful example download earlier in this notebook, so a falsy
# result is treated as a failure instead of being silently ignored.
train_failed_downloads = []
for analysis_instance in tqdm(train_analysis):
    analysis_container_id = analysis_instance['out_container_id']
    patient_name = analysis_instance['patient_secret_name']
    # One file per subject: <train dir>/<SubjectID>_volumetric.csv
    volumetric_filepath = os.path.join(train_volumetric_data_dir, '{}_volumetric.csv'.format(patient_name))
    downloaded = project.download_file(container_id=analysis_container_id, file_name='volumetric.csv', local_filename=volumetric_filepath, overwrite=True)
    if not downloaded:
        train_failed_downloads.append(patient_name)

# Checked only after every download was attempted, so one bad subject does not stop the rest.
assert not train_failed_downloads, 'volumetric.csv download failed for train subjects: {}'.format(train_failed_downloads)

100%|██████████| 1000/1000 [08:58<00:00,  1.86it/s]


In [31]:
# SubjectIDs whose volumetric.csv could not be downloaded. download_file returned
# True for the successful example download earlier in this notebook, so a falsy
# result is treated as a failure instead of being silently ignored.
test_failed_downloads = []
for analysis_instance in tqdm(test_analysis):
    analysis_container_id = analysis_instance['out_container_id']
    patient_name = analysis_instance['patient_secret_name']
    # One file per subject: <test dir>/<SubjectID>_volumetric.csv
    volumetric_filepath = os.path.join(test_volumetric_data_dir, '{}_volumetric.csv'.format(patient_name))
    downloaded = project.download_file(container_id=analysis_container_id, file_name='volumetric.csv', local_filename=volumetric_filepath, overwrite=True)
    if not downloaded:
        test_failed_downloads.append(patient_name)

# Checked only after every download was attempted, so one bad subject does not stop the rest.
assert not test_failed_downloads, 'volumetric.csv download failed for test subjects: {}'.format(test_failed_downloads)

100%|██████████| 100/100 [00:48<00:00,  2.07it/s]
