# Extract Phenotypic Data From AWS S3
This notebook downloads phenotypic data files from AWS S3 for a number of projects organized under the International Neuroimaging Data-sharing Initiative (INDI), then takes a first look at their contents.

In [1]:
import os
import boto
# Open an anonymous S3 connection (anon=True, so no AWS credentials are passed).
conn = boto.connect_s3(anon=True)
# Handle to the 'fcp-indi-new' bucket; the download cell fetches keys from it.
fcp = conn.get_bucket('fcp-indi-new')

# Show the working directory; the download cell writes files relative to it.
print(os.path.abspath(os.path.curdir))

/Users/nicholsn/Repos/metasearch/crawler/extract


In [2]:
# Curated mapping of project name -> bucket paths of that project's
# phenotypic data files on AWS. Most files are csv; the INDI entries are tsv.
# A [None] value is a placeholder for a project with no file listed.
projects = {
    'ABIDE_Initiative': [
        'data/Projects/ABIDE_Initiative/Phenotypic_V1_0b_preprocessed1.csv',
    ],
    'ACPI': [
        'data/Projects/ACPI/PhenotypicData/acpi_aggregated_phenotypic_data.csv',
    ],
    'ADHD200': [
        'data/Projects/ADHD200/RawData/Brown_TestRelease_phenotypic.csv',
        'data/Projects/ADHD200/RawData/KKI_phenotypic.csv',
        'data/Projects/ADHD200/RawData/NYU_phenotypic.csv',
        'data/Projects/ADHD200/RawData/OHSU_TestRelease_phenotypic.csv',
        'data/Projects/ADHD200/RawData/OHSU_phenotypic.csv',
        'data/Projects/ADHD200/RawData/Peking_1_TestRelease_phenotypic.csv',
        'data/Projects/ADHD200/RawData/Peking_1_phenotypic.csv',
        'data/Projects/ADHD200/RawData/Pittsburgh_phenotypic.csv',
    ],
    # no phenotypic data
    'CC_ME': [None],
    'CORR': [
        'data/Projects/CORR/RawData/CoRR_AggregatedPhenotypicData.csv',
    ],
    'CPAC_Regression_Test': [None],
    'HBNSS': [
        'data/Projects/HBNSS/PhenotypicData/cmi_hbnssi_pheno_data.csv',
    ],
    # phenotypic data is tsv
    'INDI': [
        'data/Projects/INDI/HypnosisBarrios/RawData/participants.tsv',
        'data/Projects/INDI/SLIM/swu_slim_phenodata_time1.tsv',
        'data/Projects/INDI/SLIM/swu_slim_phenodata_time2.tsv',
        'data/Projects/INDI/SLIM/swu_slim_phenodata_time3.tsv',
    ],
    'RocklandSample': [
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r4_phenotypic_v1.csv',
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r6_phenotypic_v1.csv',
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r7_phenotypic_v1.csv',
        'data/Projects/RocklandSample/PhenotypicData/nki-rs_lite_r8_phenotypic_v1.csv',
    ],
}

In [3]:
# Download every curated phenotypic file into a per-project directory.
# Relies on the `projects` mapping and the `fcp` bucket handle defined above.
# items() is used instead of iteritems() so the loop runs on Python 2 and 3.
for project, paths in projects.items():
    # Create the project directory under the exact name used for the download
    # path below; a differently-cased directory name only works on
    # case-insensitive filesystems and fails on a second run elsewhere.
    if not os.path.isdir(project):
        os.mkdir(project)
    for path in paths:
        # Placeholder entries (None) mark projects with no file to fetch;
        # skip them before making any request to S3.
        if path is None:
            continue
        key = fcp.get_key(path)
        # Skip paths for which the bucket returns no key object.
        if not key:
            continue
        # Save the object under its base file name inside the project directory.
        fname = key.name.split('/')[-1]
        fpath = os.path.join(os.path.abspath(project), fname)
        key.get_contents_to_filename(fpath)
        print("Files available at: {}".format(fpath))

Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r4_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r6_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r7_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/RocklandSample/nki-rs_lite_r8_phenotypic_v1.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/Brown_TestRelease_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/KKI_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/NYU_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/OHSU_TestRelease_phenotypic.csv
Files available at: /Users/nicholsn/Repos/metasearch/crawler/extract/ADHD200/OHSU_phenotypic.csv
Files available at: /Use

In [4]:
# Explore File Contents
import pandas as pd

# Load one file from each of two projects to compare their column layouts.
# Paths are resolved against the working directory (the same base the download
# loop uses via os.path.abspath(<project>)) instead of being pinned to one
# user's home directory, so the cell runs wherever the files were downloaded.
adhd = pd.read_csv(os.path.join(os.path.abspath('ADHD200'),
                                'NYU_phenotypic.csv'))
abide = pd.read_csv(os.path.join(os.path.abspath('ABIDE_Initiative'),
                                 'Phenotypic_V1_0b_preprocessed1.csv'))

# Print the column names of each table side by side.
print("ADHD columns: {}".format(adhd.columns.values))
print("ABIDE columns: {}".format(abide.columns.values))

ADHD columns: ['ScanDir ID' 'Site' 'Gender' 'Age' 'Handedness' 'DX' 'Secondary Dx '
 'ADHD Measure' 'ADHD Index' 'Inattentive' 'Hyper/Impulsive' 'IQ Measure'
 'Verbal IQ' 'Performance IQ' 'Full2 IQ' 'Full4 IQ' 'Med Status'
 'QC_Rest_1' 'QC_Rest_2' 'QC_Rest_3' 'QC_Rest_4' 'QC_Anatomical_1'
 'QC_Anatomical_2']
ABIDE columns: ['Unnamed: 0' 'Unnamed: 0.1' 'SUB_ID' 'X' 'subject' 'SITE_ID' 'FILE_ID'
 'DX_GROUP' 'DSM_IV_TR' 'AGE_AT_SCAN' 'SEX' 'HANDEDNESS_CATEGORY'
 'HANDEDNESS_SCORES' 'FIQ' 'VIQ' 'PIQ' 'FIQ_TEST_TYPE' 'VIQ_TEST_TYPE'
 'PIQ_TEST_TYPE' 'ADI_R_SOCIAL_TOTAL_A' 'ADI_R_VERBAL_TOTAL_BV'
 'ADI_RRB_TOTAL_C' 'ADI_R_ONSET_TOTAL_D' 'ADI_R_RSRCH_RELIABLE'
 'ADOS_MODULE' 'ADOS_TOTAL' 'ADOS_COMM' 'ADOS_SOCIAL' 'ADOS_STEREO_BEHAV'
 'ADOS_RSRCH_RELIABLE' 'ADOS_GOTHAM_SOCAFFECT' 'ADOS_GOTHAM_RRB'
 'ADOS_GOTHAM_TOTAL' 'ADOS_GOTHAM_SEVERITY' 'SRS_VERSION' 'SRS_RAW_TOTAL'
 'SRS_AWARENESS' 'SRS_COGNITION' 'SRS_COMMUNICATION' 'SRS_MOTIVATION'
 'SRS_MANNERISMS' 'SCQ_TOTAL' 'AQ_TOTAL' 'COMORBIDITY' 

In [5]:
# Compare the distinct codes each dataset uses for the same data element
# (participant sex/gender): 'Gender' in the ADHD table, 'SEX' in the ABIDE table.
adhd_gender_codes = adhd['Gender'].unique()
print("ADHD Gender Representation: {}".format(adhd_gender_codes))
abide_sex_codes = abide['SEX'].unique()
print("ABIDE Sex Representation: {}".format(abide_sex_codes))

ADHD Gender Representation: [  1.   0.  nan]
ABIDE Sex Representation: [1 2]


In [6]:
%%HTML
<iframe src="http://fcon_1000.projects.nitrc.org/indi/abide/ABIDE_LEGEND_V1.02.pdf" width="1000" height="700"></iframe>