# Transform Phenotypic Data into a Common Format
This script accesses the extracted data in their original format and applies a mapping to harmonize the data into a common tabular format.

In [64]:
import os
import json

import numpy as np
import pandas as pd

# Show the working directory, since the relative paths used below are
# resolved against it.
print(os.path.abspath(os.curdir))

/Users/nicholsn/Repos/metasearch/crawler/transform


In [65]:
# Root directory holding the extracted files, one entry per project.
extract_path = os.path.abspath(os.path.join(os.pardir, 'extract'))
# Root directory where the transformed projects will be written.
xfm_path = os.path.abspath(os.curdir)
# Every entry whose name contains no '.' is treated as a project.
project_names = [
    entry for entry in os.listdir(extract_path) if '.' not in entry
]
# Per-project input and output directories, in the same order as
# project_names.
extract_dirs = [os.path.join(extract_path, name) for name in project_names]
xfm_dirs = [os.path.join(xfm_path, name) for name in project_names]

In [66]:
def _find_code(elem, value):
    """
    Returns the key of ``elem`` that encodes ``value``, or None when no
    key matches.

    The literal ``str(value)`` is tried first. Integral floats are then
    retried in integer form, because pandas reads an integer column as
    float as soon as it contains a missing value, so a raw ``1.0`` must
    still match the mapping key ``"1"``.
    """
    key = str(value)
    if key in elem:
        return key
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        key = str(int(value))
        if key in elem:
            return key
    return None


def apply_mapping(mapping, df_list):
    """
    Reads a dictionary mapping and a list of dataframes, then merges the
    dataframes and converts the column names and values into a common
    format. Note: the dataframes should have a common structure.

    Each key of ``mapping`` is a source column name. Its value is a dict
    whose "element" entry is the output column name and whose optional
    "type" entry selects the conversion:

    * "category": each value is replaced by the label stored under its
      code; values without a matching code become None.
    * "number": values whose code appears as a key of the dict (e.g. a
      missing-value sentinel) become NaN; all other values are kept.
    * anything else: non-null values are converted to integer strings
      (ids are often read as float); if any value cannot be converted
      the column is copied unchanged.

    Returns a new dataframe containing only the mapped columns. Raises
    KeyError if a mapped column is absent from the merged data, and
    ValueError (from pandas.concat) if ``df_list`` is empty.

    Example Mapping Structure
    =========================
    {
        "DX_GROUP":
      {
        "element": "diagnosis",
        "type": "category",
        "1": "autism",
        "2": "control"
      }
    }
    """
    df = pd.DataFrame()
    csv = pd.concat(df_list)
    # dict.items() works on both Python 2 and 3; iteritems() is 2-only.
    for col, elem in mapping.items():
        series = csv[col]
        kind = elem.get('type')
        if kind == 'category':
            # elem.get(None) is None, so unmatched codes stay None.
            result = series.apply(lambda x: elem.get(_find_code(elem, x)))
        elif kind == 'number':
            # np.nan is the spelling that survives NumPy 2.0 (np.NaN was
            # removed).
            result = series.apply(
                lambda x: np.nan if _find_code(elem, x) is not None else x)
        else:
            try:
                # Handle ids being read as float.
                result = series.apply(
                    lambda x: str(int(x)) if pd.notnull(x) else x)
            except ValueError:
                # Non-numeric ids: keep the column exactly as read.
                result = series
        df[elem.get('element')] = result
    return df

In [69]:
# Read a mapping.json file from each sub directory to process and write
# the harmonized table to <project>/<project>_phenotype.csv.
for project_name in project_names:
    df_list = list()
    mapping = dict()
    extract_dir = os.path.join(extract_path, project_name)
    xfm_dir = os.path.join(xfm_path, project_name)
    # All files must use the same data dictionary in a given directory.
    # os.listdir returns entries in arbitrary order and also lists hidden
    # files (e.g. .DS_Store) and sub directories, which must not be parsed
    # as CSV; keep regular, non-hidden files only and sort them so the row
    # order of the output is reproducible.
    extract_files = [
        os.path.join(extract_dir, i)
        for i in sorted(os.listdir(extract_dir))
        if not i.startswith('.')
        and os.path.isfile(os.path.join(extract_dir, i))
    ]
    mapping_file = os.path.join(xfm_dir, 'mapping.json')
    # Projects without a mapping are skipped.
    if not os.path.exists(mapping_file):
        continue
    with open(mapping_file, 'rb') as fi:
        mapping.update(json.load(fi))
    for extract_file in extract_files:
        df = pd.read_csv(extract_file)
        df_list.append(df)
    xfm = apply_mapping(mapping, df_list)
    pheno_file = ''.join([project_name, '_', 'phenotype.csv'])
    pheno_path = os.path.join(xfm_dir, pheno_file)
    # Add a column for the specific project.
    xfm['project'] = project_name
    xfm.to_csv(pheno_path, index=False)

In [70]:
xfm

Unnamed: 0,verbal iq,sex,age,handedness,site_id,diagnosis,full iq,participant_id,performance iq,project
0,133.0,Male,16.92,right handed,brown university,,120.0,26001,104.0,adhd200
1,106.0,Male,15.68,right handed,brown university,,107.0,26002,106.0,adhd200
2,119.0,Female,14.99,right handed,brown university,,125.0,26004,123.0,adhd200
3,116.0,Female,15.16,right handed,brown university,,126.0,26005,131.0,adhd200
4,113.0,Male,16.91,left handed,brown university,,97.0,26009,81.0,adhd200
5,101.0,Female,16.21,right handed,brown university,,102.0,26014,102.0,adhd200
6,127.0,Female,15.20,right handed,brown university,,113.0,26015,98.0,adhd200
7,120.0,Male,16.07,right handed,brown university,,109.0,26016,96.0,adhd200
8,95.0,Female,14.56,right handed,brown university,,89.0,26017,87.0,adhd200
9,105.0,Male,17.83,right handed,brown university,,109.0,26022,111.0,adhd200
