# See how many cases we find for each covariate
The purpose of this notebook is to count, for each phenotype file, how many subjects have a recorded (non-missing) value for each covariate of interest.

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
% matplotlib notebook

In [3]:
# Locations of the four phenotype CSV files
pheno_path_0, pheno_path_1, pheno_path_2, pheno_path_3 = (
    '/data1/abide/Pheno/existing_pheno.csv',
    '/data1/abide/Pheno/unconstrained_3box_189_sample.csv',
    '/data1/abide/Pheno/unconstrained_2box_308_sample.csv',
    '/data1/abide/Pheno/nyu_pheno.csv',
)
# Read each CSV into its own DataFrame, in the same order as the paths
pheno_0, pheno_1, pheno_2, pheno_3 = [
    pd.read_csv(csv_path)
    for csv_path in (pheno_path_0, pheno_path_1, pheno_path_2, pheno_path_3)
]

In [4]:
# Show every column name available in the first phenotype table
pheno_0.columns.tolist()

['SITE_ID',
 'SUB_ID',
 'DX_GROUP',
 'DSM_IV_TR',
 'AGE_AT_SCAN',
 'SEX',
 'HANDEDNESS_CATEGORY',
 'HANDEDNESS_SCORES',
 'FIQ',
 'VIQ',
 'PIQ',
 'FIQ_TEST_TYPE',
 'VIQ_TEST_TYPE',
 'PIQ_TEST_TYPE',
 'ADI_R_SOCIAL_TOTAL_A',
 'ADI_R_VERBAL_TOTAL_BV',
 'ADI_RRB_TOTAL_C',
 'ADI_R_ONSET_TOTAL_D',
 'ADI_R_RSRCH_RELIABLE',
 'ADOS_MODULE',
 'ADOS_TOTAL',
 'ADOS_COMM',
 'ADOS_SOCIAL',
 'ADOS_STEREO_BEHAV',
 'ADOS_RSRCH_RELIABLE',
 'ADOS_GOTHAM_SOCAFFECT',
 'ADOS_GOTHAM_RRB',
 'ADOS_GOTHAM_TOTAL',
 'ADOS_GOTHAM_SEVERITY',
 'SRS_VERSION',
 'SRS_RAW_TOTAL',
 'SRS_AWARENESS',
 'SRS_COGNITION',
 'SRS_COMMUNICATION',
 'SRS_MOTIVATION',
 'SRS_MANNERISMS',
 'SCQ_TOTAL',
 'AQ_TOTAL',
 'COMORBIDITY',
 'CURRENT_MED_STATUS',
 'MEDICATION_NAME',
 'OFF_STIMULANTS_AT_SCAN',
 'VINELAND_RECEPTIVE_V_SCALED',
 'VINELAND_EXPRESSIVE_V_SCALED',
 'VINELAND_WRITTEN_V_SCALED',
 'VINELAND_COMMUNICATION_STANDARD',
 'VINELAND_PERSONAL_V_SCALED',
 'VINELAND_DOMESTIC_V_SCALED',
 'VINELAND_COMMUNITY_V_SCALED',
 'VINELAND_DAI

Here is a list of some covariates that might be interesting:
- FIQ
- VIQ
- PIQ
- ADI_R_SOCIAL_TOTAL_A
- ADI_R_VERBAL_TOTAL_BV
- ADI_RRB_TOTAL_C
- BMI

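Let's find out how many cases we get for each. Note that the covariates actually counted below differ slightly from this list: BMI is not counted, while VINELAND_ABC_STANDARD, ADOS_TOTAL, SRS_RAW_TOTAL and DX_GROUP are counted in addition.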

In [5]:
# Swap the -9999 sentinel for NaN in every phenotype table
pheno_0, pheno_1, pheno_2, pheno_3 = [
    table.replace(-9999, np.nan)
    for table in (pheno_0, pheno_1, pheno_2, pheno_3)
]

In [6]:
# Covariates whose non-missing entries are counted in the cells below
cov_list = [
    'FIQ',
    'VIQ',
    'PIQ',
    'ADI_R_SOCIAL_TOTAL_A',
    'ADI_R_VERBAL_TOTAL_BV',
    'ADI_RRB_TOTAL_C',
    'VINELAND_ABC_STANDARD',
    'ADOS_TOTAL',
    'SRS_RAW_TOTAL',
    'DX_GROUP',
]

In [7]:
# Report the table size, then the number of non-missing entries per covariate
n_subjects = len(pheno_0)
print('For pheno 0 with {} total subjects:\n'.format(n_subjects))
for cov in cov_list:
    n_cases = pheno_0[cov].count()  # count() skips NaN entries
    print('{1} for {0}'.format(cov, n_cases))

For pheno 0 with 823 total subjects:

756 for FIQ
628 for VIQ
647 for PIQ
257 for ADI_R_SOCIAL_TOTAL_A
258 for ADI_R_VERBAL_TOTAL_BV
257 for ADI_RRB_TOTAL_C
112 for VINELAND_ABC_STANDARD
382 for ADOS_TOTAL
284 for SRS_RAW_TOTAL
823 for DX_GROUP


In [8]:
# Report the table size, then the number of non-missing entries per covariate
n_subjects = len(pheno_1)
print('For pheno 1 with {} total subjects:\n'.format(n_subjects))
for cov in cov_list:
    n_cases = pheno_1[cov].count()  # count() skips NaN entries
    print('{1} for {0}'.format(cov, n_cases))

For pheno 1 with 189 total subjects:

171 for FIQ
168 for VIQ
168 for PIQ
67 for ADI_R_SOCIAL_TOTAL_A
68 for ADI_R_VERBAL_TOTAL_BV
68 for ADI_RRB_TOTAL_C
53 for VINELAND_ABC_STANDARD
86 for ADOS_TOTAL
95 for SRS_RAW_TOTAL
189 for DX_GROUP


In [9]:
# Report the table size, then the number of non-missing entries per covariate
n_subjects = len(pheno_2)
print('For pheno 2 with {} total subjects:\n'.format(n_subjects))
for cov in cov_list:
    n_cases = pheno_2[cov].count()  # count() skips NaN entries
    print('{1} for {0}'.format(cov, n_cases))

For pheno 2 with 308 total subjects:

276 for FIQ
276 for VIQ
277 for PIQ
103 for ADI_R_SOCIAL_TOTAL_A
104 for ADI_R_VERBAL_TOTAL_BV
104 for ADI_RRB_TOTAL_C
53 for VINELAND_ABC_STANDARD
136 for ADOS_TOTAL
132 for SRS_RAW_TOTAL
308 for DX_GROUP
