# Transform Phenotypic Data into a Common Format
This script accesses the extracted data in their original format and applies a mapping to harmonize the data into a common tabular format.

In [34]:
import os
import json
import glob

import numpy as np
import pandas as pd

In [10]:
# Raw data is pulled from repositories checked out under the working directory.
curdir = os.path.abspath(os.path.curdir)
repos = ['fcp-indi']
extract_paths = []
for repo in repos:
    extract_paths.append(os.path.join(curdir, repo))
# The transformed projects are written here; create the folder on first run.
xfm_dir = os.path.join(curdir, 'metadata')
if not os.path.exists(xfm_dir):
    os.mkdir(xfm_dir)
# Every entry of a repository whose name contains no '.' is treated as a
# project directory.
extract_dirs = [
    os.path.join(extract_path, project)
    for extract_path in extract_paths
    for project in os.listdir(extract_path)
    if '.' not in project
]

In [116]:
def apply_mapping(mapping, df_list):
    """
    Merge raw dataframes and convert them into the common format.

    The dataframes in ``df_list`` are stacked row-wise (they should have a
    common structure) and re-indexed from 0. Every source column named in
    ``mapping`` is then converted according to its entry and stored under
    the entry's "element" name. When several source columns map to the same
    element, they are merged row by row, keeping the first non-null value.

    Supported "type" values
    =======================
    "str"    : each value is replaced by entry[str(value)]; values without
               such a key become None.
    "float"  : values whose string form is a key of the entry become NaN
               (e.g. a -999 sentinel); other values are kept.
    "static" : the source column is ignored and every row receives
               entry["value"].
    other    : values are treated as IDs and rewritten as integer strings
               (nulls are kept); if any value cannot be converted to int
               the column is kept unchanged.

    Example Mapping Structure
    =========================
    {
        "DX_GROUP":
      {
        "element": "diagnosis",
        "type": "str",
        "1": "autism",
        "2": "control"
      }
    }

    Parameters
    ==========
    mapping : dict of source column name -> entry dict (see above).
        Entries without an "element" name are skipped.
    df_list : list of pandas.DataFrame to merge and convert.

    Returns
    =======
    pandas.DataFrame with one column per distinct element name, ordered
    alphabetically, and one row per input row. A source column that is
    absent from the data yields an all-NaN column.

    Raises
    ======
    ValueError (from pandas.concat) when ``df_list`` is empty.
    """
    csv = pd.concat(df_list)
    csv.reset_index(drop=True, inplace=True)
    # Converted columns keyed by their common-format name.
    merged = dict()
    for col, elem in mapping.items():
        name = elem.get('element')
        if name is None:
            # Without a target name the column has no place in the output.
            continue
        kind = elem.get('type')
        # Use get in case the col is missing/static.
        series = csv.get(col)
        if kind == 'static':
            # Used to create a column of all the same value.
            val = [elem.get('value')] * csv.shape[0]
            result = pd.Series(val, index=csv.index)
        elif series is None:
            # Source column absent from the data: keep the output schema
            # stable by emitting an empty (all-NaN) column.
            result = pd.Series(np.nan, index=csv.index)
        elif kind == 'str':
            # Categories are mapped to common values.
            result = series.apply(lambda x: elem.get(str(x)))
        elif kind == 'float':
            # Values parsed as numbers are checked for any mappings
            # (e.g., -999 == NaN).
            result = series.apply(lambda x: np.nan if str(x) in elem else x)
        else:
            try:
                # Handle IDs being read as float.
                result = series.apply(lambda x: str(int(x)) if pd.notnull(x) else x)
            except ValueError:
                # Non-numeric identifiers are passed through untouched.
                result = series
        if name in merged:
            # Duplicate element name: fill the gaps of the earlier column
            # with values from this one.
            merged[name] = merged[name].combine_first(result)
        else:
            merged[name] = result
    # Assemble once, after the loop, so an empty mapping still returns a
    # (column-less) frame instead of failing.
    return pd.DataFrame(merged, index=csv.index, columns=sorted(merged))

In [121]:
# Transform every project directory that ships a mapping.json data dictionary
# and write the result to <xfm_dir>/<project>_phenotype.csv.
for extract_dir in extract_dirs:
    df_list = list()
    mapping = dict()
    # basename copes with whatever separator os.path.join produced.
    project_name = os.path.basename(extract_dir)
    # All files must use the same data dictionary in a given directory.
    extract_files = [os.path.join(extract_dir, i) for i in os.listdir(extract_dir) if i not in ['mapping.json']]
    mapping_file = os.path.join(extract_dir, 'mapping.json')
    if not os.path.exists(mapping_file):
        # Nothing to do for directories without a data dictionary.
        continue
    with open(mapping_file, 'rb') as fi:
        mapping.update(json.load(fi))
    # Field separator by file extension; other extensions yield sep=None.
    ext_type = dict(csv=',', tsv='\t')
    # Grab dtype for parsing csv. items() works on Python 2 and 3 alike.
    dtype = {k: v.get('type') for k, v in mapping.items() if v.get('type') in ['str', 'int', 'float']}
    # Static entries are registered as 'str' too.
    dtype.update({k: 'str' for k, v in mapping.items() if v.get('type') in ['static']})
    # Dealing with NaNs.
    na_values = ['NoPhenotypicData', '#']
    for extract_file in extract_files:
        # Process each file.
        ext = extract_file.split('.')[-1]
        sep_type = ext_type.get(ext)
        df = pd.read_csv(extract_file, sep=sep_type, dtype=dtype, na_values=na_values)
        df_list.append(df)
    if not df_list:
        # A mapping without any data file has nothing to transform.
        continue
    xfm = apply_mapping(mapping, df_list)
    pheno_file = ''.join([project_name, '_', 'phenotype.csv'])
    pheno_path = os.path.join(xfm_dir, pheno_file)
    xfm.to_csv(pheno_path, index=False)

In [118]:
# Read every transformed CSV back in and stack them into one frame that is
# indexed by (project, session_id, participant_id); the key columns are kept
# as regular columns as well.
meta_files = glob.glob(xfm_dir + '/*.csv')
meta_list = [pd.read_csv(meta_file) for meta_file in meta_files]
index_cols = ['project', 'session_id', 'participant_id']
meta_df = pd.concat(meta_list)
meta_df = meta_df.set_index(index_cols, drop=False)
meta_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,diagnosis,dsm_iv_tr,full iq,handedness,participant_id,performance iq,project,session_id,sex,site_id,species,verbal iq
project,session_id,participant_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
abide_initiative,1,50002,16.77,autism,autism,103.0,ambidextrous,50002,89.0,abide_initiative,1,Male,PITT,homo-sapiens,116.0
abide_initiative,1,50003,24.45,autism,autism,124.0,right handed,50003,115.0,abide_initiative,1,Male,PITT,homo-sapiens,128.0
abide_initiative,1,50004,19.09,autism,autism,113.0,right handed,50004,117.0,abide_initiative,1,Male,PITT,homo-sapiens,108.0
abide_initiative,1,50005,13.73,autism,autism,119.0,right handed,50005,118.0,abide_initiative,1,Female,PITT,homo-sapiens,117.0
abide_initiative,1,50006,13.37,autism,autism,109.0,left handed,50006,119.0,abide_initiative,1,Male,PITT,homo-sapiens,99.0


In [90]:
# Read the table of MRI URLs and index it by
# (project, session_id, participant_id), keeping those columns in the frame.
mri_path = os.path.abspath('clean-csv/all-session.csv')
mri = pd.read_csv(mri_path)
mri = mri.set_index(keys=['project', 'session_id', 'participant_id'], drop=False)
mri.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t1_url,participant_id,project,session_id,session_count
project,session_id,participant_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
acpi,1,28031,https://s3.amazonaws.com/fcp-indi/data/Project...,28031,acpi,1,1
acpi,1,28032,https://s3.amazonaws.com/fcp-indi/data/Project...,28032,acpi,1,1
acpi,1,28033,https://s3.amazonaws.com/fcp-indi/data/Project...,28033,acpi,1,1
acpi,1,28034,https://s3.amazonaws.com/fcp-indi/data/Project...,28034,acpi,1,1
acpi,1,28035,https://s3.amazonaws.com/fcp-indi/data/Project...,28035,acpi,1,1


In [91]:
# Stack the phenotype table and the MRI URL table row-wise into one frame.
r = pd.concat(objs=[meta_df, mri])

In [101]:
# Column -> dtype lookup for the columns the mapping declares as 'str' or
# 'float'. items() is used because iteritems() exists only on Python 2.
foo = {k: v.get('type') for k, v in mapping.items() if v.get('type') in ['str', 'float']}

In [102]:
# Re-read one raw phenotype file using the dtypes collected in `foo`.
pd.read_csv(
    'fcp-indi/abide_initiative/Phenotypic_V1_0b_preprocessed1.csv',
    dtype=foo
)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,0,1,50002,1,50002,PITT,no_filename,1,1,16.77,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,...,,OK,,OK,,OK,,OK,,1
2,2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,...,,OK,,OK,,OK,,OK,,1
3,3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1
5,5,6,50007,6,50007,PITT,Pitt_0050007,1,1,17.78,...,,OK,,maybe,ic-cerebellum_temporal_lob,OK,,OK,,1
6,6,7,50008,7,50008,PITT,Pitt_0050008,1,1,32.45,...,,OK,,OK,,OK,,OK,,1
7,7,8,50009,8,50009,PITT,Pitt_0050009,1,1,33.86,...,,OK,,fail,ic-parietal-cerebellum,OK,,OK,,1
8,8,9,50010,9,50010,PITT,Pitt_0050010,1,1,35.20,...,,OK,,maybe,ic-cerebellum_temporal_lob,OK,,OK,,1
9,9,10,50011,10,50011,PITT,Pitt_0050011,1,1,16.93,...,,OK,,maybe,ic-cerebellum_temporal_lob,OK,,OK,,0
