# Querying the MRIQC Web API

This notebook shows how the web API can be used to analyze the image quality metrics (IQMs) that have been extracted with MRIQC.

This notebook is a derivative work of https://gist.github.com/chrisfilo/eccdb8b98f8e74d24a3395a49fbadf03 and https://github.com/poldracklab/mriqc/blob/master/notebooks/MRIQC%20Web%20API.ipynb

In [174]:
import pandas as pd
from json import load
import urllib.request, json 
try:
    from pandas import json_normalize  # top-level location, pandas >= 1.0
except ImportError:
    from pandas.io.json import json_normalize  # location used by older pandas
import seaborn as sns
import pylab as plt
import multiprocessing as mp
import numpy as np
from sklearn.manifold import TSNE
from pathlib import Path
%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth',500)
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from urllib.error import HTTPError
from time import sleep

## Preparation

Let's define a function that will query the appropriate endpoint and a helper function to plot some distributions (at the bottom of this notebook).

In [2]:
def get_iqms(modality, versions=None, software='mriqc'):
    """
    Grab all IQMs for the given modality and the list of versions.

    NOTE: a second ``get_iqms`` is defined further down in this cell and
    replaces this one once the cell has run.

    Parameters
    ----------
    modality : str
        API endpoint name, e.g. 'T1w', 'T2w' or 'bold'.
    versions : list of str, optional
        ``provenance.version`` values to download. ``None`` (the default) or
        the entry ``'*'`` means "do not filter on version".
    software : str or None
        ``provenance.software`` value to filter on; ``None`` disables the
        filter.

    Returns
    -------
    pandas.DataFrame
        One row per record, all requested versions concatenated in order.
        Empty when ``versions`` is an empty list.

    Raises
    ------
    urllib.error.HTTPError
        If any page request fails (no retry is attempted here).
    """
    url_root = 'https://mriqc.nimh.nih.gov/api/v1/{modality}?{query}'
    dfs = []

    if versions is None:
        versions = ['*']

    for version in versions:
        # The filter depends only on the version, so build it once per version
        # rather than once per page.
        query = []
        if software is not None:
            query.append('"provenance.software":"%s"' % software)
        if version != '*':
            query.append('"provenance.version":"%s"' % version)
        where = ','.join(query)

        # Pagination must restart for every version; carrying the counter over
        # from the previous version would silently skip its first pages.
        page = 1
        while True:
            page_url = url_root.format(
                modality=modality,
                query='where={%s}&page=%d' % (where, page)
            )
            with urllib.request.urlopen(page_url) as url:
                data = json.loads(url.read().decode())
            dfs.append(json_normalize(data['_items']))

            # The API advertises a 'next' link while more pages are available.
            if 'next' not in data['_links']:
                break
            page += 1

    # pd.concat raises ValueError on an empty list (e.g. versions=[]).
    if not dfs:
        return pd.DataFrame()

    # Compose a pandas dataframe
    return pd.concat(dfs, ignore_index=True)

def get_iqms(modality, versions=None, software='mriqc', start_page=1,
             max_retries=None, retry_delay=10):
    """
    Grab all IQMs for the given modality and the list of versions.

    Pages are requested one at a time (asking for up to 50 records per page)
    until the API response no longer carries a 'next' link. Failed requests
    are retried; partial results are periodically written to
    ``mriqc_dl_tmp_{modality}.csv`` in the working directory so that a long
    download can be recovered.

    Parameters
    ----------
    modality : str
        API endpoint name, e.g. 'T1w', 'T2w' or 'bold'.
    versions : list of str, optional
        ``provenance.version`` values to download. ``None`` (the default) or
        the entry ``'*'`` means "do not filter on version".
    software : str or None
        ``provenance.software`` value to filter on; ``None`` disables the
        filter.
    start_page : int
        First page requested for every version (lets a download be resumed).
    max_retries : int or None
        Maximum number of consecutive failed retries for a single page before
        the HTTPError is re-raised. ``None`` (the default) retries forever,
        which matches the historical behaviour of this function.
    retry_delay : float
        Seconds to wait after a failed request before trying again.

    Returns
    -------
    pandas.DataFrame
        One row per record, all requested versions concatenated in order.
        Empty when nothing was downloaded (e.g. ``versions=[]``).

    Raises
    ------
    urllib.error.HTTPError
        Only when ``max_retries`` is set and a page keeps failing.
    """
    url_root = 'https://mriqc.nimh.nih.gov/api/v1/{modality}?{query}'
    checkpoint_path = f'mriqc_dl_tmp_{modality}.csv'
    dfs = []

    def save_checkpoint():
        # pd.concat raises ValueError on an empty list, so a failure on the
        # very first page must not try to write a checkpoint.
        if dfs:
            pd.concat(dfs, ignore_index=True).to_csv(checkpoint_path)

    if versions is None:
        versions = ['*']

    for version in versions:
        # The filter depends only on the version, so build it once per version.
        query = []
        if software is not None:
            query.append('"provenance.software":"%s"' % software)
        if version != '*':
            query.append('"provenance.version":"%s"' % version)
        where = ','.join(query)

        # Pagination must restart for every version; carrying the counter over
        # from the previous version would silently skip its first pages.
        page = start_page
        failures = 0  # consecutive failures for the current page
        while True:
            page_url = url_root.format(
                modality=modality,
                query='where={%s}&page=%d&max_results=50' % (where, page)
            )
            try:
                with urllib.request.urlopen(page_url) as url:
                    data = json.loads(url.read().decode())
            except HTTPError:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    # Keep what we have before giving up.
                    save_checkpoint()
                    raise
                if failures % 2 == 0:
                    # A retry failed as well: checkpoint and report, then keep
                    # trying the same page.
                    save_checkpoint()
                    print(f'failed at {page}')
                sleep(retry_delay)
                continue

            failures = 0
            dfs.append(json_normalize(data['_items']))

            # The API advertises a 'next' link while more pages are available.
            if 'next' not in data['_links']:
                break
            page += 1

            # Progress report / checkpoint; only reached after a successful
            # page so they fire once per threshold.
            if len(dfs) % 1000 == 0:
                print("Finished reading %d"%(page-1),flush=True)
            if len(dfs) % 10000 == 0:
                save_checkpoint()

    # pd.concat raises ValueError on an empty list (e.g. versions=[]).
    if not dfs:
        return pd.DataFrame()

    # Compose a pandas dataframe
    return pd.concat(dfs, ignore_index=True)

def plot_measure(data, xlabel=None, label=None, ax=None, min=None, max=None):
    """
    Distribution plot of a given measure.

    Parameters
    ----------
    data : array-like
        Values of the measure to plot.
    xlabel : str, optional
        Label for the x axis; left untouched when ``None``.
    label : str, optional
        Legend label forwarded to ``sns.distplot``.
    ax : matplotlib axes, optional
        Axes to draw on. When ``None`` the axes chosen by ``sns.distplot``
        are used.
    min, max : float, optional
        Limits of the x axis. Default to the 0.5th and 99.5th percentiles of
        ``data`` so that extreme outliers do not stretch the plot.
    """
    # Use the axes returned by distplot: with the default ax=None the original
    # code went on to call methods on None and raised AttributeError.
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept
    # here so the notebook still runs on the seaborn version it was written for.
    ax = sns.distplot(data, ax=ax, label=label)

    if xlabel is not None:
        ax.set_xlabel(xlabel)

    # `min` / `max` shadow the builtins inside this function; the names are
    # part of the public signature, so they are kept.
    if min is None:
        min = np.percentile(data, 0.5)

    if max is None:
        max = np.percentile(data, 99.5)
    ax.set_xlim((min, max))

## Fetch IQMs

Fetch IQMs for the three modalities currently supported by MRIQC: T1-weighted images, T2-weighted images, and BOLD fMRI. This is definitely the slow way to do it: with a single worker it takes about a week each for the T1 and BOLD datasets. It is much faster to download the data from Chris Gorgolewski's Kaggle page: https://www.kaggle.com/chrisfilo/mriqc

In [None]:
# T1: download the T1w IQMs and save them to all_t1s.csv.
# software=None turns off the provenance.software filter in get_iqms.
# NOTE(review): start_page=10001 skips pages 1-10000, so the CSV written here
# only holds the later pages - presumably this resumes an earlier partial
# download; confirm before relying on all_t1s.csv being complete.
df_t1w = get_iqms('T1w', software=None, start_page=10001)
df_t1w.to_csv('all_t1s.csv')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




failed at 10400
failed at 10461
failed at 10524
failed at 10586
failed at 10648
failed at 10708
failed at 10766
failed at 10826
failed at 10886
failed at 10947
Finished reading 11000
failed at 11010
failed at 11073
failed at 11134
failed at 11191
failed at 11248
failed at 11305
failed at 11361
failed at 11421
failed at 11478
failed at 11536
failed at 11597
failed at 11653
failed at 11709
failed at 11768
failed at 11822
failed at 11879
failed at 11937
failed at 11991
Finished reading 12000
failed at 12043
failed at 12095
failed at 12152
failed at 12206
failed at 12260
failed at 12315
failed at 12367
failed at 12418
failed at 12472
failed at 12523
failed at 12573
failed at 12625
failed at 12674
failed at 12726
failed at 12780
failed at 12829
failed at 12879
failed at 12928
failed at 12978
Finished reading 13000
failed at 13026
failed at 13077
failed at 13127
failed at 13177
failed at 13225
failed at 13270
failed at 13318
failed at 13364
failed at 13414
failed at 13462
failed at 13511
fai

failed at 16578
failed at 16584
failed at 16591
failed at 16591
failed at 16597
failed at 16604
failed at 16604
failed at 16609
failed at 16615
failed at 16615
failed at 16621
failed at 16628
failed at 16628
failed at 16635
failed at 16643
failed at 16643
failed at 16648
failed at 16656
failed at 16656
failed at 16662
failed at 16669
failed at 16675
failed at 16675
failed at 16681
failed at 16688
failed at 16688
failed at 16694
failed at 16701
failed at 16701
failed at 16706
failed at 16712
failed at 16712
failed at 16718
failed at 16725
failed at 16725
failed at 16730
failed at 16738
failed at 16738
failed at 16742
failed at 16749
failed at 16749
failed at 16755
failed at 16762
failed at 16762
failed at 16768
failed at 16776
failed at 16783
failed at 16783
failed at 16789
failed at 16798
failed at 16805
failed at 16805
failed at 16812
failed at 16819
failed at 16825
failed at 16825
failed at 16831
failed at 16838
failed at 16838
failed at 16843
failed at 16850
failed at 16850
failed a

failed at 19005
failed at 19010
failed at 19010
failed at 19015
failed at 19021
failed at 19027
failed at 19032
failed at 19032
failed at 19037
failed at 19042
failed at 19048
failed at 19054
failed at 19060
failed at 19066
failed at 19066
failed at 19071
failed at 19077
failed at 19083
failed at 19089
failed at 19095
failed at 19101
failed at 19106
failed at 19112
failed at 19118
failed at 19124
failed at 19129
failed at 19135
failed at 19140
failed at 19146
failed at 19152
failed at 19157
failed at 19157
failed at 19161
failed at 19166
failed at 19172
failed at 19178
failed at 19184
failed at 19189
failed at 19189
failed at 19192
failed at 19199
failed at 19205
failed at 19211
failed at 19216
failed at 19216
failed at 19221
failed at 19226
failed at 19232
failed at 19238
failed at 19244
failed at 19250
failed at 19255
failed at 19255
failed at 19258
failed at 19264
failed at 19270
failed at 19275
failed at 19281
failed at 19286
failed at 19286
failed at 19289
failed at 19295
failed a

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




failed at 20003
failed at 20008
failed at 20014
failed at 20020
failed at 20025
failed at 20030
failed at 20036
failed at 20040
failed at 20045
failed at 20051
failed at 20057
failed at 20062
failed at 20067


In [None]:
# T2: download the T2w IQMs and save them to all_t2s.csv.
# software=None turns off the provenance.software filter in get_iqms.
# NOTE(review): start_page=10001 is the same value used in the T1 cell and
# skips pages 1-10000 - confirm this offset is intended for T2w as well.
df_t2w = get_iqms('T2w', software=None, start_page=10001)
df_t2w.to_csv('all_t2s.csv')

In [None]:
# BOLD: download the bold IQMs with the get_iqms defaults and save them to
# all_bolds.csv.
# NOTE(review): unlike the T1/T2 cells this keeps the default software='mriqc'
# filter and starts at page 1 - confirm the difference is intended.
df_bold = get_iqms('bold')
df_bold.to_csv('all_bolds.csv')
