In [1]:
import pandas as pd
import numpy as np
import os
from itertools import combinations
# configure pandas
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

### Load excel file with table and derivative variables of interest

In [2]:
# Input csv or excel filename
fname = 'Rabeeh REQUESTED data ABCD_updated.xlsx'
#######################################
fpath = '/home/cglab/data_pull/abcd/Rabeeh/'
if fname.endswith('xlsx'):
    tvars = pd.read_excel(fpath + fname)
elif fname.endswith('csv'):
    tvars = pd.read_csv(fpath + fname)
else:
    print('unexpected filetype')
tvars.tail()

Unnamed: 0,Table,Variable,NEWNAME,Description
153,su_y_peerdevia,peer_deviance_5_l,peer_dev_per5,How many of your friends currently: use marijuana
154,su_y_peerdevia,peer_deviance_6_l,peer_dev_per6,How many of your friends currently: use inhala...
155,su_y_peerdevia,peer_deviance_7_l,peer_dev_per7,How many of your friends currently: use other ...
156,su_y_peerdevia,peer_deviance_8_l,peer_dev_per8,How many of your friends currently: sell or gi...
157,su_y_peerdevia,peer_deviance_9_l,peer_dev_per9,How many of your friends currently: use other ...


In [3]:
dpath = '/home/cglab/projects/abcd/data/abcd5.1-rser/'

### Functions

In [4]:
# recursively search directory and subdirectories for csv file
def findFile(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)
            
def getDerivatives(df, table_file, table_key, how_merge='outer'):
    dpath = '/home/cglab/projects/abcd/data/abcd-data-release-5.1/'
    # find filepath, which is full path and name of file
    filepath = findFile(table_file, path=dpath)
    dat = pd.read_csv(filepath, low_memory=False)
    # get table name, which is the string before the period
    table = table_file.split('.')[0]
    # get column names sub and event which will need for merging dataframes
    # it's the same for each df so overwriting is fine
    se_nms = dat.columns[:2].values.tolist()
    # derivative variables desired
    derivative_cols = table_key[table_key['Table']==table]['Variable'].values.tolist()
    derivative_cols += se_nms
    # strip leading and ending spaces
    derivative_cols = [c.strip() for c in derivative_cols]
    # # troubleshooting
    # print(derivative_cols)
    # print(df.columns)
    # merge with overall with INNER join bc we dont want to exclude participants who have task mri data but not resting or vice versa
    print('Prior to merge df size is {0} and other df shape is {1}\n merging...'.format(df.shape, dat[derivative_cols].shape))
    if 'src_subject_id' not in df.columns:
        # for first table assign it to df
        df = dat[derivative_cols].copy()
    else:
        # all others are merged
        df = df.merge(dat[derivative_cols], how=how_merge, on=['src_subject_id', 'eventname'])
    print('Any duplicated columns? {}'.format(df.columns.duplicated().any()))
    print('New df size is {}\n'.format(df.shape))
    return df


def dropDuplicateCols(df):
    # check every combination of columns
    # if ALL values are EQUAL, get the 2 columns
    dup_pairs = [(i, j) for i,j in combinations(df, 2) if df[i].equals(df[j])]
    cols_to_remove = []
    # print the columns that match and get 1st one for removal
    for i, j in dup_pairs:
        # check if alll rows in matching column are NaN or 0. if so, don't delete, because this function is
        # for identifying columns with identical values. dropping fully missing variables should be done separetly for clarity
        if ~df[i].isnull().all() and rs.loc[rs[i].notnull(), i].sum() > 0:
            print('{0} is identical to {1}'.format(i.upper(), j.upper()))
            cols_to_remove.append(i)    
    # drop duplicate columns
    df.drop(columns=cols_to_remove, inplace=True)
    # return df WITHOUT duplicate columns
    return df

#check if there are any MRI variables requested
# if so, return the appropriate Quality control Variables
def anyMRIvars(df):
    # create empty list to store QC vars needed
    qc_vars = []

    # check for RESTING variables
    # if any columns are found that startwith 'rsfmri_', append RESTING QC vars & the csv file they're found in,..
    # ... creating a list of tuples which will be combined and converted to dataframe below
    # this relies on the implicit boolean nature of lists, if empty their FALSE
    if [col for col in df.columns if col.startswith('rsfmri_')]:
        qc_vars += [('mri_y_qc_incl', 'imgincl_rsfmri_include'), ('mri_y_qc_motion', 'rsfmri_meanmotion'), ('mri_y_adm_info', 'mri_info_deviceserialnumber')]
    
    # check for NBACK task variables
    # if any columns are found that startwith 'tfmri_nback', append NBACK QC vars & the csv file they're found in
    if [col for col in df.columns if col.startswith('tfmri_nback_') or col.startswith('tfabwdp')  or col.startswith('tnbasem')]:
        qc_vars += [('mri_y_qc_incl', 'imgincl_nback_include'), ('mri_y_qc_motion', 'tfmri_nback_all_meanmotion'), ('mri_y_adm_info', 'mri_info_deviceserialnumber')]
    
    # check for MID task variables
    # if any columns are found that startwith 'tfmri_ma', append MID QC vars & the csv file they're found in
    if [col for col in df.columns if col.startswith('tfmri_ma_') or col.startswith('midabwdp')]:
        qc_vars += [('mri_y_qc_incl', 'imgincl_mid_include'), ('mri_y_qc_motion', 'tfmri_mid_all_meanmotion'), ('mri_y_adm_info', 'mri_info_deviceserialnumber')]
    
    # if qc_vars is NOT empty
    if qc_vars:
        # remove duplicates from qc_vars
        # calling 'set' function on qc_vars returns only unique items
        # then calling 'list' function converts it back to a list
        qc_vars = list(set(qc_vars))
        # set up table_key dataframe for pulling QC vars
        # includes all QC vars selected above with cooresponding table name
        qc_key = pd.DataFrame(qc_vars, columns=['Table', 'Variable'])
        # get list of unique QC tables from qc_key tuple ex. ('mri_y_qc_incl', 'imgincl_rsfmri_include')
        qc_tables = list(set([table for table, var in qc_vars]))
        # append QC variables to dataframe
        for table in qc_tables:
            print('\nPulling from table: {0}.csv\n'.format(table))
            df = getDerivatives(df, table + '.csv', qc_key, how_merge='left')
        # output dataframe with QC variables merged
        return df
    else:
        # else return the unchanged dataframe
        return df
        

### Get tables that will be pulled
* append '.csv' for data loading later
* exclude temporal variance tables because interpretation of them unclear

In [5]:
[t for t in tvars['Table'].unique()]

['abcd_y_lt',
 'abcd_p_demo',
 'abcd_y_lf',
 'mri_y_rsfmr_cor_gp_gp',
 'mri_y_rsfmr_cor_gp_aseg',
 'su_y_alc_exp',
 'su_y_can_exp',
 'ce_y_pnh',
 'ce_y_rpi',
 'mh_p_fhx',
 'ce_y_nsc',
 'su_p_crpf',
 'su_y_crpf',
 'su_y_peerdevia']

In [6]:
data_files = [t + '.csv' for t in tvars['Table'].unique()]
data_files

['abcd_y_lt.csv',
 'abcd_p_demo.csv',
 'abcd_y_lf.csv',
 'mri_y_rsfmr_cor_gp_gp.csv',
 'mri_y_rsfmr_cor_gp_aseg.csv',
 'su_y_alc_exp.csv',
 'su_y_can_exp.csv',
 'ce_y_pnh.csv',
 'ce_y_rpi.csv',
 'mh_p_fhx.csv',
 'ce_y_nsc.csv',
 'su_p_crpf.csv',
 'su_y_crpf.csv',
 'su_y_peerdevia.csv']

## Make sure that first csv that is pulled from has multiple 'eventnames', so that they're included in future merges

### Load, Select, & Merge data, 
* for every file in 'data_files'
* and every variable listed in 'tvars'
* append to rs

In [7]:
# set empty dataframe  
# to be filled with merges
rs = pd.DataFrame()

for file in data_files:
    rs = getDerivatives(rs, file, tvars)

Prior to merge df size is (0, 0) and other df shape is (90312, 5)
 merging...
Any duplicated columns? False
New df size is (90312, 5)

Prior to merge df size is (90312, 5) and other df shape is (48807, 10)
 merging...
Any duplicated columns? False
New df size is (90312, 13)

Prior to merge df size is (90312, 13) and other df shape is (8151, 5)
 merging...
Any duplicated columns? False
New df size is (90312, 16)

Prior to merge df size is (90312, 16) and other df shape is (22130, 5)
 merging...
Any duplicated columns? False
New df size is (90312, 19)

Prior to merge df size is (90312, 19) and other df shape is (22130, 10)
 merging...
Any duplicated columns? False
New df size is (90312, 27)

Prior to merge df size is (90312, 27) and other df shape is (35579, 13)
 merging...
Any duplicated columns? False
New df size is (90312, 38)

Prior to merge df size is (90312, 38) and other df shape is (32537, 12)
 merging...
Any duplicated columns? False
New df size is (90312, 48)

Prior to merge df

In [8]:
idx = ['site_id_l', 'interview_age', 'rel_family_id', 'src_subject_id', 'eventname', 'demo_sex_v2', 'acs_raked_propensity_score', 'race_ethnicity', 'demo_prnt_marital_v2', 'demo_prnt_ed_v2', 'demo_prtnr_ed_v2', 'demo_comb_income_v2', 'demo_comb_income_v2_l', 'latent_factor_ss_general_ses', 'latent_factor_ss_social', 'latent_factor_ss_perinatal', 'aeq_positive_expectancies_ss', 'aeq_negative_expectancies_ss', 'aeq_negative_expectancies_nt', 'aeq_positive_expectancies_nt', 'aeq_section_q01',
       'aeq_section_q02', 'aeq_section_q03', 'aeq_section_q04', 'aeq_section_q05', 'aeq_section_q06', 'aeq_section_q07']

In [9]:
rs.loc[:, rs.columns.duplicated()==1]

0
1
2
3
4
...
90307
90308
90309
90310
90311


### Drop columns with duplicate NAMES

In [10]:
rs = rs.loc[:,~rs.columns.duplicated()]
print('Any duplicated columns? {}'.format(rs.columns.duplicated().any()))

Any duplicated columns? False


In [11]:
rs.head()

Unnamed: 0,site_id_l,interview_age,rel_family_id,src_subject_id,eventname,demo_sex_v2,acs_raked_propensity_score,race_ethnicity,demo_prnt_marital_v2,demo_prnt_ed_v2,demo_prtnr_ed_v2,demo_comb_income_v2,demo_comb_income_v2_l,latent_factor_ss_general_ses,latent_factor_ss_social,latent_factor_ss_perinatal,rsfmri_c_ngd_sa_ngd_sa,rsfmri_c_ngd_sa_ngd_dt,rsfmri_c_ngd_sa_ngd_fo,rsfmri_cor_ngd_sa_scs_aglh,rsfmri_cor_ngd_sa_scs_agrh,rsfmri_cor_ngd_sa_scs_thplh,rsfmri_cor_ngd_sa_scs_thprh,rsfmri_cor_ngd_smh_scs_aarh,rsfmri_cor_ngd_smh_scs_aalh,rsfmri_cor_ngd_sa_scs_plrh,rsfmri_cor_ngd_sa_scs_pllh,aeq_positive_expectancies_ss,aeq_negative_expectancies_ss,aeq_negative_expectancies_nt,aeq_positive_expectancies_nt,aeq_section_q01,aeq_section_q02,aeq_section_q03,aeq_section_q04,aeq_section_q05,aeq_section_q06,aeq_section_q07,meeq_positive_expectancies_ss,meeq_negative_expectancies_ss,meeq_negative_expectancies_nt,meeq_positive_expectancies_nt,meeq_section_q01,meeq_section_q02,meeq_section_q03,meeq_section_q04,meeq_section_q05,meeq_section_q06,pnh_substance,pnh_help,pnh_how_much_help,pnh_encourage,pnh_how_much_encourage,pnh_art_involve,pnh_ss_protective_scale,pnh_ss_protective_scale_nt,peerinfluence_q1,peerinfluence_q2,peerinfluence_q3,peerinfluence_q4,peerinfluence_q5,peerinfluence_q6,peerinfluence_q7,peerinfluence_q8,peerinfluence_q9,peerinfluence_q10,peerinfluence_ss_mean,famhx_ss_fath_prob_alc_p,famhx_ss_patgf_prob_alc_p,famhx_ss_patgm_prob_alc_p,famhx_ss_moth_prob_alc_p,famhx_ss_matgf_prob_alc_p,famhx_ss_matgm_prob_alc_p,famhx_ss_fulsiby1_prob_alc_p,famhx_ss_fulsiby2_prob_alc_p,famhx_ss_fulsiby3_prob_alc_p,famhx_ss_fulsiby4_prob_alc_p,famhx_ss_fulsiby5_prob_alc_p,famhx_ss_momdad_alc_p,famhx_ss_parent_alc_p,famhx_ss_fath_prob_dg_p,famhx_ss_patgf_prob_dg_p,famhx_ss_patgm_prob_dg_p,famhx_ss_moth_prob_dg_p,famhx_ss_matgf_prob_dg_p,famhx_ss_matgm_prob_dg_p,famhx_ss_fulsiby1_prob_dg_p,famhx_ss_fulsiby2_prob_dg_p,famhx_ss_fulsiby3_prob_dg_p,famhx_ss_fulsiby4_prob_dg_p,famhx_ss_fulsiby5_prob_dg_p,famhx_ss_momdad_dg_p,famhx_ss_parent_dg_p,neighborhood_crime_y,neighborhood_po_y,neighborhood_po_times_y,neighborhood_po_exp_y,neighborhood_po_phy_y,neighb_po_phy_times_y,neighb_po_used_slur_y,neighb_po_slur_times_y,neighb_po_stopped_y,neighb_po_stopped_times_y,neighb_po_arrested_y,neighb_po_service_y,neighb_po_service_times_y,neighb_po_fam_friend_exp_y,neighb_po_fam_friend_phy_y,neighb_po_fam_friend_times_y,neighb_po_exp_slur_y,neighb_po_exp_slur_times_y,neighb_po_stopped_2_y,neighb_po_stopped_times_2_y,neighb_po_community_y,neighb_po_fair_y,neighb_po_respect_y,su_risk_p_1,su_risk_p_2,su_risk_p_3,su_risk_p_4,su_risk_p_5,su_risk_p_6,su_risk_p_7,su_risk_p_8,su_risk_p_9,su_risk_p_10,su_risk_p_11,su_risk_p_12,su_risk_p_13,su_crpf_avail_1,su_crpf_avail_2,su_crpf_avail_3,su_crpf_avail_4,su_crpf_avail_5,su_crpf_avail_6,su_crpf_avail_7,su_crpf_avail_8,su_crpf_avail_9,su_crpf_avail_10,su_crpf_avail_11,su_crpf_avail_12,su_crpf_avail_13,peer_deviance_1_4bbe5d,peer_deviance_2_dd1457,peer_deviance_3_e1ec2e,peer_deviance_4_b6c588,peer_deviance_5_bffa44,peer_deviance_6_69562e,peer_deviance_7_beb683,peer_deviance_8_35702e,peer_deviance_9_6dd4ef,peer_deviance_1_l,peer_deviance_2_l,peer_deviance_3_l,peer_deviance_4_l,peer_deviance_5_l,peer_deviance_6_l,peer_deviance_7_l,peer_deviance_8_l,peer_deviance_9_l
0,site06,131.0,8781.0,NDAR_INV003RTV85,baseline_year_1_arm_1,2.0,466.092707,1.0,1.0,13.0,13.0,8.0,,0.1155,-0.9315,-0.035,0.511476,0.110278,0.215066,0.175114,-0.104169,0.247253,0.117783,0.264452,-0.058992,0.096238,-0.082889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,,,,,,
1,site06,136.0,,NDAR_INV003RTV85,6_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,site06,143.0,,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,,533.38182,1.0,,,,,8.0,,,,,,,,,,,,,,,3.0,7.0,,,1.0,1.0,1.0,5.0,1.0,1.0,5.0,3.0,15.0,,,5.0,1.0,1.0,1.0,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0
3,site06,148.0,,NDAR_INV003RTV85,18_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,site06,157.0,,NDAR_INV003RTV85,2_year_follow_up_y_arm_1,,,,,,,,8.0,,,,,,,,,,,,,,,10.0,13.0,,,4.0,2.0,4.0,2.0,4.0,4.0,5.0,5.0,11.0,,,3.0,3.0,1.0,1.0,3.0,5.0,0.0,0.0,,0.0,,,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,999.0,1.0,1.0,1.0,,999.0,,,,1.0,1.0,1.0,1.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0


### Drop columns with duplicate VALUES

In [12]:
print(rs.shape)
rs = dropDuplicateCols(rs)
print(rs.shape)

(90312, 160)
(90312, 160)


In [13]:
if 'rsfmri_c_ngd_dt_ngd_dt' in rs.columns.tolist():
    print('included')

In [14]:
# check number of missing values in each column
rs.isnull().sum()

site_id_l                            0
interview_age                        8
rel_family_id                    78444
src_subject_id                       0
eventname                            0
demo_sex_v2                      78444
acs_raked_propensity_score       67224
race_ethnicity                   67233
demo_prnt_marital_v2             78446
demo_prnt_ed_v2                  78444
demo_prtnr_ed_v2                 80842
demo_comb_income_v2              78446
demo_comb_income_v2_l            53380
latent_factor_ss_general_ses     82161
latent_factor_ss_social          82161
latent_factor_ss_perinatal       82161
rsfmri_c_ngd_sa_ngd_sa           68183
rsfmri_c_ngd_sa_ngd_dt           68195
rsfmri_c_ngd_sa_ngd_fo           68187
rsfmri_cor_ngd_sa_scs_aglh       68183
rsfmri_cor_ngd_sa_scs_agrh       68197
rsfmri_cor_ngd_sa_scs_thplh      68184
rsfmri_cor_ngd_sa_scs_thprh      68227
rsfmri_cor_ngd_smh_scs_aarh      68183
rsfmri_cor_ngd_smh_scs_aalh      68190
rsfmri_cor_ngd_sa_scs_plr

In [15]:
rs.eventname.value_counts()

eventname
baseline_year_1_arm_1       11868
6_month_follow_up_arm_1     11389
1_year_follow_up_y_arm_1    11220
18_month_follow_up_arm_1    11083
2_year_follow_up_y_arm_1    10973
3_year_follow_up_y_arm_1    10336
30_month_follow_up_arm_1    10232
42_month_follow_up_arm_1     8457
4_year_follow_up_y_arm_1     4754
Name: count, dtype: int64

### Check if any MRI variables present
* if so, append the appropriate ABCD Quality Control Variables
* for scanner movement, artifacts, etc

In [18]:
print(rs.shape)
rs = anyMRIvars(rs)
print(rs.shape)

(90312, 160)

Pulling from table: mri_y_qc_incl.csv

Prior to merge df size is (90312, 160) and other df shape is (22939, 3)
 merging...
Any duplicated columns? False
New df size is (90312, 161)


Pulling from table: mri_y_adm_info.csv

Prior to merge df size is (90312, 161) and other df shape is (22939, 3)
 merging...
Any duplicated columns? False
New df size is (90312, 162)


Pulling from table: mri_y_qc_motion.csv

Prior to merge df size is (90312, 162) and other df shape is (22460, 3)
 merging...
Any duplicated columns? False
New df size is (90312, 163)

(90312, 163)


In [19]:
rs['eventname'].value_counts()

eventname
baseline_year_1_arm_1       11868
6_month_follow_up_arm_1     11389
1_year_follow_up_y_arm_1    11220
18_month_follow_up_arm_1    11083
2_year_follow_up_y_arm_1    10973
3_year_follow_up_y_arm_1    10336
30_month_follow_up_arm_1    10232
42_month_follow_up_arm_1     8457
4_year_follow_up_y_arm_1     4754
Name: count, dtype: int64

### Check if data present at certain waves for variable of interest
* change variable name in final brackets

In [20]:
rs.head()

Unnamed: 0,site_id_l,interview_age,rel_family_id,src_subject_id,eventname,demo_sex_v2,acs_raked_propensity_score,race_ethnicity,demo_prnt_marital_v2,demo_prnt_ed_v2,demo_prtnr_ed_v2,demo_comb_income_v2,demo_comb_income_v2_l,latent_factor_ss_general_ses,latent_factor_ss_social,latent_factor_ss_perinatal,rsfmri_c_ngd_sa_ngd_sa,rsfmri_c_ngd_sa_ngd_dt,rsfmri_c_ngd_sa_ngd_fo,rsfmri_cor_ngd_sa_scs_aglh,rsfmri_cor_ngd_sa_scs_agrh,rsfmri_cor_ngd_sa_scs_thplh,rsfmri_cor_ngd_sa_scs_thprh,rsfmri_cor_ngd_smh_scs_aarh,rsfmri_cor_ngd_smh_scs_aalh,rsfmri_cor_ngd_sa_scs_plrh,rsfmri_cor_ngd_sa_scs_pllh,aeq_positive_expectancies_ss,aeq_negative_expectancies_ss,aeq_negative_expectancies_nt,aeq_positive_expectancies_nt,aeq_section_q01,aeq_section_q02,aeq_section_q03,aeq_section_q04,aeq_section_q05,aeq_section_q06,aeq_section_q07,meeq_positive_expectancies_ss,meeq_negative_expectancies_ss,meeq_negative_expectancies_nt,meeq_positive_expectancies_nt,meeq_section_q01,meeq_section_q02,meeq_section_q03,meeq_section_q04,meeq_section_q05,meeq_section_q06,pnh_substance,pnh_help,pnh_how_much_help,pnh_encourage,pnh_how_much_encourage,pnh_art_involve,pnh_ss_protective_scale,pnh_ss_protective_scale_nt,peerinfluence_q1,peerinfluence_q2,peerinfluence_q3,peerinfluence_q4,peerinfluence_q5,peerinfluence_q6,peerinfluence_q7,peerinfluence_q8,peerinfluence_q9,peerinfluence_q10,peerinfluence_ss_mean,famhx_ss_fath_prob_alc_p,famhx_ss_patgf_prob_alc_p,famhx_ss_patgm_prob_alc_p,famhx_ss_moth_prob_alc_p,famhx_ss_matgf_prob_alc_p,famhx_ss_matgm_prob_alc_p,famhx_ss_fulsiby1_prob_alc_p,famhx_ss_fulsiby2_prob_alc_p,famhx_ss_fulsiby3_prob_alc_p,famhx_ss_fulsiby4_prob_alc_p,famhx_ss_fulsiby5_prob_alc_p,famhx_ss_momdad_alc_p,famhx_ss_parent_alc_p,famhx_ss_fath_prob_dg_p,famhx_ss_patgf_prob_dg_p,famhx_ss_patgm_prob_dg_p,famhx_ss_moth_prob_dg_p,famhx_ss_matgf_prob_dg_p,famhx_ss_matgm_prob_dg_p,famhx_ss_fulsiby1_prob_dg_p,famhx_ss_fulsiby2_prob_dg_p,famhx_ss_fulsiby3_prob_dg_p,famhx_ss_fulsiby4_prob_dg_p,famhx_ss_fulsiby5_prob_dg_p,famhx_ss_momdad_dg_p,famhx_ss_parent_dg_p,neighborhood_crime_y,neighborhood_po_y,neighborhood_po_times_y,neighborhood_po_exp_y,neighborhood_po_phy_y,neighb_po_phy_times_y,neighb_po_used_slur_y,neighb_po_slur_times_y,neighb_po_stopped_y,neighb_po_stopped_times_y,neighb_po_arrested_y,neighb_po_service_y,neighb_po_service_times_y,neighb_po_fam_friend_exp_y,neighb_po_fam_friend_phy_y,neighb_po_fam_friend_times_y,neighb_po_exp_slur_y,neighb_po_exp_slur_times_y,neighb_po_stopped_2_y,neighb_po_stopped_times_2_y,neighb_po_community_y,neighb_po_fair_y,neighb_po_respect_y,su_risk_p_1,su_risk_p_2,su_risk_p_3,su_risk_p_4,su_risk_p_5,su_risk_p_6,su_risk_p_7,su_risk_p_8,su_risk_p_9,su_risk_p_10,su_risk_p_11,su_risk_p_12,su_risk_p_13,su_crpf_avail_1,su_crpf_avail_2,su_crpf_avail_3,su_crpf_avail_4,su_crpf_avail_5,su_crpf_avail_6,su_crpf_avail_7,su_crpf_avail_8,su_crpf_avail_9,su_crpf_avail_10,su_crpf_avail_11,su_crpf_avail_12,su_crpf_avail_13,peer_deviance_1_4bbe5d,peer_deviance_2_dd1457,peer_deviance_3_e1ec2e,peer_deviance_4_b6c588,peer_deviance_5_bffa44,peer_deviance_6_69562e,peer_deviance_7_beb683,peer_deviance_8_35702e,peer_deviance_9_6dd4ef,peer_deviance_1_l,peer_deviance_2_l,peer_deviance_3_l,peer_deviance_4_l,peer_deviance_5_l,peer_deviance_6_l,peer_deviance_7_l,peer_deviance_8_l,peer_deviance_9_l,imgincl_rsfmri_include,mri_info_deviceserialnumber,rsfmri_meanmotion
0,site06,131.0,8781.0,NDAR_INV003RTV85,baseline_year_1_arm_1,2.0,466.092707,1.0,1.0,13.0,13.0,8.0,,0.1155,-0.9315,-0.035,0.511476,0.110278,0.215066,0.175114,-0.104169,0.247253,0.117783,0.264452,-0.058992,0.096238,-0.082889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,,,,,,,1.0,HASH96a0c182,0.150697
1,site06,136.0,,NDAR_INV003RTV85,6_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,site06,143.0,,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,,533.38182,1.0,,,,,8.0,,,,,,,,,,,,,,,3.0,7.0,,,1.0,1.0,1.0,5.0,1.0,1.0,5.0,3.0,15.0,,,5.0,1.0,1.0,1.0,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,
3,site06,148.0,,NDAR_INV003RTV85,18_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,site06,157.0,,NDAR_INV003RTV85,2_year_follow_up_y_arm_1,,,,,,,,8.0,,,,,,,,,,,,,,,10.0,13.0,,,4.0,2.0,4.0,2.0,4.0,4.0,5.0,5.0,11.0,,,3.0,3.0,1.0,1.0,3.0,5.0,0.0,0.0,,0.0,,,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,999.0,1.0,1.0,1.0,,999.0,,,,1.0,1.0,1.0,1.0,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,,,


In [21]:
rs.describe()

Unnamed: 0,interview_age,rel_family_id,demo_sex_v2,acs_raked_propensity_score,race_ethnicity,demo_prnt_marital_v2,demo_prnt_ed_v2,demo_prtnr_ed_v2,demo_comb_income_v2,demo_comb_income_v2_l,latent_factor_ss_general_ses,latent_factor_ss_social,latent_factor_ss_perinatal,rsfmri_c_ngd_sa_ngd_sa,rsfmri_c_ngd_sa_ngd_dt,rsfmri_c_ngd_sa_ngd_fo,rsfmri_cor_ngd_sa_scs_aglh,rsfmri_cor_ngd_sa_scs_agrh,rsfmri_cor_ngd_sa_scs_thplh,rsfmri_cor_ngd_sa_scs_thprh,rsfmri_cor_ngd_smh_scs_aarh,rsfmri_cor_ngd_smh_scs_aalh,rsfmri_cor_ngd_sa_scs_plrh,rsfmri_cor_ngd_sa_scs_pllh,aeq_positive_expectancies_ss,aeq_negative_expectancies_ss,aeq_negative_expectancies_nt,aeq_positive_expectancies_nt,aeq_section_q01,aeq_section_q02,aeq_section_q03,aeq_section_q04,aeq_section_q05,aeq_section_q06,aeq_section_q07,meeq_positive_expectancies_ss,meeq_negative_expectancies_ss,meeq_negative_expectancies_nt,meeq_positive_expectancies_nt,meeq_section_q01,meeq_section_q02,meeq_section_q03,meeq_section_q04,meeq_section_q05,meeq_section_q06,pnh_substance,pnh_help,pnh_how_much_help,pnh_encourage,pnh_how_much_encourage,pnh_art_involve,pnh_ss_protective_scale,pnh_ss_protective_scale_nt,peerinfluence_q1,peerinfluence_q2,peerinfluence_q3,peerinfluence_q4,peerinfluence_q5,peerinfluence_q6,peerinfluence_q7,peerinfluence_q8,peerinfluence_q9,peerinfluence_q10,peerinfluence_ss_mean,famhx_ss_fath_prob_alc_p,famhx_ss_patgf_prob_alc_p,famhx_ss_patgm_prob_alc_p,famhx_ss_moth_prob_alc_p,famhx_ss_matgf_prob_alc_p,famhx_ss_matgm_prob_alc_p,famhx_ss_fulsiby1_prob_alc_p,famhx_ss_fulsiby2_prob_alc_p,famhx_ss_fulsiby3_prob_alc_p,famhx_ss_fulsiby4_prob_alc_p,famhx_ss_fulsiby5_prob_alc_p,famhx_ss_momdad_alc_p,famhx_ss_parent_alc_p,famhx_ss_fath_prob_dg_p,famhx_ss_patgf_prob_dg_p,famhx_ss_patgm_prob_dg_p,famhx_ss_moth_prob_dg_p,famhx_ss_matgf_prob_dg_p,famhx_ss_matgm_prob_dg_p,famhx_ss_fulsiby1_prob_dg_p,famhx_ss_fulsiby2_prob_dg_p,famhx_ss_fulsiby3_prob_dg_p,famhx_ss_fulsiby4_prob_dg_p,famhx_ss_fulsiby5_prob_dg_p,famhx_ss_momdad_dg_p,famhx_ss_parent_dg_p,neighborhood_crime_y,neighborhood_po_y,neighborhood_po_times_y,neighborhood_po_exp_y,neighborhood_po_phy_y,neighb_po_phy_times_y,neighb_po_used_slur_y,neighb_po_slur_times_y,neighb_po_stopped_y,neighb_po_stopped_times_y,neighb_po_arrested_y,neighb_po_service_y,neighb_po_service_times_y,neighb_po_fam_friend_exp_y,neighb_po_fam_friend_phy_y,neighb_po_fam_friend_times_y,neighb_po_exp_slur_y,neighb_po_exp_slur_times_y,neighb_po_stopped_2_y,neighb_po_stopped_times_2_y,neighb_po_community_y,neighb_po_fair_y,neighb_po_respect_y,su_risk_p_1,su_risk_p_2,su_risk_p_3,su_risk_p_4,su_risk_p_5,su_risk_p_6,su_risk_p_7,su_risk_p_8,su_risk_p_9,su_risk_p_10,su_risk_p_11,su_risk_p_12,su_risk_p_13,su_crpf_avail_1,su_crpf_avail_2,su_crpf_avail_3,su_crpf_avail_4,su_crpf_avail_5,su_crpf_avail_6,su_crpf_avail_7,su_crpf_avail_8,su_crpf_avail_9,su_crpf_avail_10,su_crpf_avail_11,su_crpf_avail_12,su_crpf_avail_13,peer_deviance_1_4bbe5d,peer_deviance_2_dd1457,peer_deviance_3_e1ec2e,peer_deviance_4_b6c588,peer_deviance_5_bffa44,peer_deviance_6_69562e,peer_deviance_7_beb683,peer_deviance_8_35702e,peer_deviance_9_6dd4ef,peer_deviance_1_l,peer_deviance_2_l,peer_deviance_3_l,peer_deviance_4_l,peer_deviance_5_l,peer_deviance_6_l,peer_deviance_7_l,peer_deviance_8_l,peer_deviance_9_l,imgincl_rsfmri_include,rsfmri_meanmotion
count,90304.0,11868.0,11868.0,23088.0,23079.0,11866.0,11868.0,9470.0,11866.0,36932.0,8151.0,8151.0,8151.0,22129.0,22117.0,22125.0,22129.0,22115.0,22128.0,22085.0,22129.0,22122.0,22127.0,22126.0,32185.0,32185.0,0.0,0.0,32185.0,32185.0,32185.0,32185.0,32184.0,32185.0,32185.0,29141.0,29141.0,0.0,0.0,29141.0,29141.0,29140.0,29140.0,29140.0,29140.0,26001.0,26002.0,17311.0,26003.0,14686.0,14686.0,26000.0,26063.0,4744.0,4744.0,4742.0,4741.0,4742.0,4741.0,4741.0,4744.0,4745.0,4745.0,4745.0,11390.0,11004.0,11151.0,11491.0,11286.0,11396.0,4891.0,1624.0,502.0,121.0,26.0,11420.0,11420.0,11379.0,11113.0,11231.0,11507.0,11345.0,11411.0,4881.0,1624.0,505.0,122.0,26.0,11429.0,11429.0,38739.0,4745.0,835.0,4740.0,266.0,33.0,266.0,59.0,266.0,108.0,2646.0,4745.0,1049.0,4745.0,601.0,142.0,601.0,152.0,601.0,256.0,4744.0,4744.0,4744.0,48608.0,48608.0,48608.0,48608.0,48608.0,48608.0,28540.0,28559.0,28545.0,25510.0,25510.0,25510.0,25510.0,25865.0,25815.0,25815.0,25032.0,18995.0,25032.0,5616.0,5616.0,5616.0,22330.0,22720.0,22719.0,22720.0,11072.0,11411.0,11411.0,11595.0,6595.0,3034.0,1315.0,11595.0,11072.0,36745.0,36914.0,36914.0,36918.0,33786.0,27571.0,19172.0,37002.0,33784.0,22939.0,22225.0
mean,140.532258,5955.899141,1.478851,710.340651,2.031674,8.236895,17.684193,22.917318,82.498904,78.1457,2.8e-05,0.002777,0.002908,0.384872,0.07491,0.088978,0.000214,0.003045,0.110264,0.122612,0.201034,-0.031944,0.005759,-0.037161,6.457449,12.300451,,,2.215349,1.822464,3.882554,1.429268,4.103001,2.419636,4.315023,7.15123,11.999279,,,3.897842,2.603994,2.083253,2.464139,4.07766,4.024056,1.308411,1.331513,6.760441,1.129331,6.735871,0.405829,12.074731,5.0,1.237563,1.292791,1.585829,1.866484,1.704766,1.069395,1.828728,1.484401,1.81117,1.358061,1.523894,0.128973,0.113413,0.053538,0.042381,0.131756,0.061864,0.000409,0.0,0.0,0.0,0.0,0.151664,0.198161,0.091924,0.034284,0.027602,0.041453,0.04989,0.040049,0.0,0.001232,0.0,0.0,0.0,0.111208,0.155569,4.086657,0.175764,0.992814,0.056118,0.12406,1.30303,0.221805,1.135593,0.406015,1.342593,0.007181,412.874394,1.154433,371.30411,0.236273,1.091549,0.252912,1.236842,0.425957,1.140625,2.977445,3.22871,3.304384,1.294766,0.505061,0.475765,0.456694,0.312006,0.622305,0.804695,0.758465,136.927728,1.002548,0.535084,0.554841,0.650216,187.238585,157.707922,181.564478,192.517498,227.324822,698.909716,225.300214,246.460648,232.933939,234.683341,230.351452,253.365245,265.458011,0.025289,0.034879,0.01113,0.010522,0.015163,0.028016,0.009886,0.009142,0.015625,0.029691,0.056591,0.030828,0.019449,0.067572,0.010881,0.008658,0.021539,0.090161,0.835302,0.255469
std,16.714658,3430.136718,0.500079,440.937045,1.325221,68.720314,28.892921,78.518592,248.275498,239.778675,0.93684,0.848087,0.9063,0.126979,0.064136,0.065527,0.134057,0.078123,0.125542,0.048163,0.108285,0.07344,0.109215,0.084472,2.819173,2.838749,,,1.186793,1.04678,1.20136,0.816254,1.266835,1.307762,1.119991,2.975504,2.619125,,,1.079269,1.248473,1.060574,1.181517,1.107262,1.065031,1.487743,0.943469,2.198518,0.991621,2.197879,0.491068,8.07654,0.0,0.425635,0.455091,0.49263,0.340168,0.456196,0.254151,0.376786,0.499809,0.391415,0.479481,0.144474,0.335185,0.317112,0.225113,0.201466,0.33824,0.240919,0.02022,0.0,0.0,0.0,0.0,0.35871,0.574936,0.288931,0.181966,0.163837,0.199344,0.217727,0.196083,0.0,0.035082,0.0,0.0,0.0,0.314404,0.556676,1.019533,0.380659,0.950802,0.230174,0.330272,1.07485,0.416243,0.936936,0.492013,0.948802,0.096955,491.758667,0.975356,482.689092,0.425146,1.051206,0.435043,1.065639,0.494899,0.988041,1.083153,0.831193,0.792978,1.357313,1.073384,1.082229,1.077193,1.00784,0.809159,1.468849,1.484494,340.866643,1.371044,1.193139,1.189893,1.263539,388.100021,362.489336,383.612218,392.678845,417.817881,457.734116,416.153069,429.565912,422.154311,422.176471,419.49806,433.597214,440.228874,0.190298,0.223532,0.132258,0.121345,0.164522,0.221361,0.143004,0.116379,0.154509,0.204231,0.281101,0.215919,0.168651,0.332036,0.127294,0.107261,0.181123,0.367853,0.370916,0.332463
min,107.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,-3.9395,-5.4215,-4.902,-0.292921,-0.487964,-0.578405,-1.072404,-0.847553,-1.087117,-0.374167,-0.369524,-0.662933,-1.33116,-1.073557,3.0,3.0,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,,,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018316
25%,128.0,2977.75,1.0,421.841839,1.0,1.0,15.0,15.0,6.0,7.0,-0.4735,-0.50575,-0.28225,0.300022,0.032919,0.046546,-0.083821,-0.046602,0.029206,0.091472,0.133709,-0.078027,-0.061938,-0.088605,4.0,11.0,,,1.0,1.0,3.0,1.0,4.0,1.0,4.0,4.0,10.0,,,3.0,1.0,1.0,1.0,3.0,3.0,0.0,0.0,5.0,0.0,5.0,0.0,6.0,5.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.085363
50%,140.0,5969.0,1.0,636.637457,1.0,1.0,18.0,18.0,8.0,9.0,0.256,0.096,0.2335,0.381497,0.073313,0.088098,0.001459,0.000942,0.109747,0.121223,0.202919,-0.032252,0.006145,-0.035885,6.0,13.0,,,2.0,1.0,4.0,1.0,5.0,3.0,5.0,8.0,12.0,,,4.0,3.0,2.0,3.0,4.0,4.0,0.0,2.0,7.0,2.0,7.0,0.0,12.0,5.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,3.0,3.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,999.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.14377
75%,153.0,8911.0,2.0,884.89193,3.0,3.0,19.0,18.0,9.0,9.0,0.676,0.608,0.58775,0.46513,0.115481,0.130616,0.086774,0.050768,0.190977,0.151893,0.270674,0.013993,0.072687,0.01503,9.0,15.0,,,3.0,3.0,5.0,2.0,5.0,3.0,5.0,9.0,14.0,,,5.0,4.0,3.0,3.0,5.0,5.0,3.0,2.0,8.0,2.0,8.0,1.0,19.0,5.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0,2.0,0.0,999.0,2.0,999.0,0.0,1.0,1.0,2.0,1.0,1.25,4.0,4.0,4.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,8.0,2.0,0.0,0.0,1.0,3.0,3.0,3.0,3.0,2.0,999.0,4.0,4.0,2.0,4.0,4.0,999.0,999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.286412
max,189.0,11883.0,3.0,2665.925049,5.0,777.0,777.0,999.0,999.0,999.0,1.685,2.311,2.4675,1.558666,0.577563,0.66274,0.883545,0.864875,0.981295,0.448206,0.978651,0.555979,1.120865,0.876687,15.0,15.0,,,5.0,5.0,5.0,5.0,5.0,5.0,5.0,15.0,15.0,,,5.0,5.0,5.0,5.0,5.0,5.0,3.0,2.0,10.0,2.0,10.0,1.0,27.0,5.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,5.0,1.0,4.0,1.0,1.0,4.0,1.0,4.0,1.0,4.0,2.0,999.0,4.0,999.0,1.0,4.0,1.0,4.0,1.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,999.0,4.0,4.0,4.0,4.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,999.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,5.878254


### DONT run, unless Only MR waves are desired
### Waves with MRI data to retain

In [22]:
# waves = ['baseline_year_1_arm_1', '2_year_follow_up_y_arm_1', '4_year_follow_up_y_arm_1']
# rs = rs[rs['eventname'].isin(waves)]
# print(rs.shape)
# rs['eventname'].value_counts()

### Check missing in Family ID

In [23]:
rs[rs['eventname']=='baseline_year_1_arm_1']['rel_family_id'].isnull().sum(), rs['rel_family_id'].isnull().sum()

(0, 78444)

#### 0 baseline participants are missing family id
* so we can assign family id to other waves if needed

### Convert scanID string to 32 unique float numbers

In [24]:
rs['mri_info_deviceserialnumber'].nunique()

32

In [25]:
unique_vals = rs['mri_info_deviceserialnumber'].unique()
rs['mri_info_deviceserialnumber'].replace(to_replace=unique_vals,
           value= list(range(len(unique_vals))),
           inplace=True)
rs['mri_info_deviceserialnumber'].dtype

dtype('int64')

### Include participants who passed Qaulity Control in ABCD preprocessing
* 'imgincl_rsfmri_include' & 'imgincl_nback_include' are provided by ABCD
* after preprocessing, raters assessed images and those receiving a 0 in these variables,
* had extremely noisey, unrecoverable data
* 1- data to include; 0- unusable
* 0 in either warrants exclusion

In [26]:
mr_qc = [c for c in rs.columns if 'include' in c]
mr_qc

['imgincl_rsfmri_include']

In [27]:
mr_waves = ['baseline_year_1_arm_1', '2_year_follow_up_y_arm_1', '4_year_follow_up_y_arm_1']
for mri in mr_qc:
    # number of subs with MR waves
    n_subs_mr = rs.loc[rs['eventname'].isin(mr_waves), 'eventname'].value_counts().sum()
    n_subs_mr_good = (rs[mri]==1).sum()
    n_subs_mr_bad = n_subs_mr - n_subs_mr_good
    print('{0}: {1} subs failed MRI Image Quality Control'.format(mri.split('_')[1], n_subs_mr_bad))

rsfmri: 8434 subs failed MRI Image Quality Control


In [28]:
print(rs.shape)
# edit which 'imgincl_..._inlcude' to use e.g., 'imgincl_rsfmri_include'
# keep desired non-MR waves OR MR waves where 'imgincl_mid_include' == 1
rs = rs[(~rs['eventname'].isin(mr_waves)) | (rs['eventname'].isin(mr_waves) & rs['imgincl_rsfmri_include']==1)]
rs.shape

(90312, 163)


(81878, 163)

In [29]:
rs.loc[rs['eventname'].isin(mr_waves), 'eventname'].value_counts().sum()

19161

In [30]:
rs.eventname.value_counts()

eventname
6_month_follow_up_arm_1     11389
1_year_follow_up_y_arm_1    11220
18_month_follow_up_arm_1    11083
3_year_follow_up_y_arm_1    10336
30_month_follow_up_arm_1    10232
baseline_year_1_arm_1        9374
42_month_follow_up_arm_1     8457
2_year_follow_up_y_arm_1     6971
4_year_follow_up_y_arm_1     2816
Name: count, dtype: int64

In [31]:
rs['eventname'].value_counts()

eventname
6_month_follow_up_arm_1     11389
1_year_follow_up_y_arm_1    11220
18_month_follow_up_arm_1    11083
3_year_follow_up_y_arm_1    10336
30_month_follow_up_arm_1    10232
baseline_year_1_arm_1        9374
42_month_follow_up_arm_1     8457
2_year_follow_up_y_arm_1     6971
4_year_follow_up_y_arm_1     2816
Name: count, dtype: int64

In [32]:
9374 + 6971 + 2816

19161

In [33]:
based = 11868-9374
based

2494

In [34]:
twod = 10973 - 6971
twod

4002

In [35]:
fourd = 4754 - 2816
fourd

1938

In [36]:
based + twod + fourd

8434

In [37]:
# troublshoot
wave2c = rs[rs['eventname']=='6_month_follow_up_arm_1']

In [38]:
wave2.shape, wave2c.shape

((11389, 160), (11389, 163))

### Rename Connectivty columns
* Replace ngd with '_' in rsFC variables
* for resting state fMRI

In [39]:
# # get all column names that have 'ngd' in the name
# ngd = [c for c in rs.columns if 'ngd' in c or 'scs' in c]

# if ngd:
#     # split on '_", get the first and 3rd strings,
#     # join them together with '_' in between
#     cor = ['_'.join([c.split('_')[0], c.split('_')[2]]) for c in ngd]
#     # zip them together in a dictionary for renaming in pandas
#     ncdict = dict(zip(ngd, cor))
#     # rename in pandas
#     rs.rename(ncdict, axis=1, inplace=True)

### If they provided new names for variables

In [40]:
tvars.columns

Index(['Table', 'Variable', 'NEWNAME', 'Description'], dtype='object')

In [41]:
# create dictionary that maps the new names to original varialble names
req_redict = dict(zip(tvars['Variable'], tvars['NEWNAME']))
req_redict

{'site_id_l': 'siteid',
 'interview_age': 'yage',
 'demo_sex_v2': 'ysex',
 'latent_factor_ss_general_ses': 'general_lf',
 'latent_factor_ss_social': 'social_lf',
 'latent_factor_ss_perinatal': 'perinatal_lf',
 'acs_raked_propensity_score': 'ppensity',
 'race_ethnicity': 'yrace',
 'demo_prnt_marital_v2': 'marital',
 'demo_prnt_ed_v2': 'pedu',
 'demo_prtnr_ed_v2': 'pedu2',
 'demo_comb_income_v2': 'income',
 'demo_comb_income_v2_l': 'incomel',
 'rel_family_id': 'familyid',
 'rsfmri_c_ngd_sa_ngd_sa': 'rsSN',
 'rsfmri_c_ngd_sa_ngd_dt': 'rsSN_dt',
 'rsfmri_c_ngd_sa_ngd_fo': 'rsSN_fo',
 'rsfmri_cor_ngd_sa_scs_aglh': 'rsSN_lamyg',
 'rsfmri_cor_ngd_sa_scs_agrh': 'rsSN_Ramyg',
 'rsfmri_cor_ngd_sa_scs_thplh': 'rsSN_Lth',
 'rsfmri_cor_ngd_sa_scs_thprh': 'rsSN_Rth',
 'rsfmri_cor_ngd_smh_scs_aarh': 'rsSN_Racc',
 'rsfmri_cor_ngd_smh_scs_aalh': 'rsSN_Lacc',
 'rsfmri_cor_ngd_sa_scs_plrh': 'rsSN_Rpl',
 'rsfmri_cor_ngd_sa_scs_pllh': 'rsSN_Lpl',
 'aeq_positive_expectancies_ss': 'aeq_Pos',
 'aeq_negative_e

In [42]:
rs['mri_info_deviceserialnumber']

0         0
1         1
2         1
3         1
5         1
         ..
90307    13
90308     1
90309     1
90310     1
90311    13
Name: mri_info_deviceserialnumber, Length: 81878, dtype: int64

In [43]:
redict = {
    'src_subject_id': 'subID',
    'rel_family_id': 'famID',
    'interview_age': 'age',
    'race_ethnicity': 'race',
    'mri_info_deviceserialnumber': 'scanID',
    'rsfmri_meanmotion': 'Motrs',
    'tfmri_mid_all_meanmotion': 'Motm',
    'tfmri_nback_all_meanmotion': 'Motnb',
    'demo_comb_income_v2': 'income', 
    'demo_prnt_ed_v2': 'pedu', 
    'demo_prtnr_ed_v2': 'spedu', 
    'demo_gender_id_v2': 'gender',
    'acs_raked_propensity_score': 'prpensity',
    'reshist_addr1_adi_edu_l': 'LowEdu1',
    'reshist_addr1_adi_sp': 'SingPH1',
    'reshist_addr1_adi_unemp': 'UnempR1',         

    # Large Loss Vs Neutral
    'tfmri_ma_aclvn_b_scs_aalh': 'NAL_lln',
    'tfmri_ma_aclvn_b_scs_aarh': 'NAR_lln',
    'tfmri_ma_aclvn_b_scs_aylh': 'AmygL_lln',
    'tfmri_ma_aclvn_b_scs_ayrh': 'AmygR_lln',
    'tfmri_ma_aclvn_b_scs_hpuslh': 'HipcL_lln',
    'tfmri_ma_aclvn_b_scs_hpusrh': 'HipcR_lln',
    'tfmri_ma_aclvn_b_scs_tplh': 'ThalL_lln',
    'tfmri_ma_aclvn_b_scs_tprh': 'ThalR_lln',
    'tfmri_ma_allvn_b_cds_lobofrlh': 'LtOrFrL_lln',
    'tfmri_ma_allvn_b_cds_lobofrrh': 'LtOrFrR_lln',
    'tfmri_ma_allvn_b_cds_mobofrlh': 'MedOrFrL_lln',
    'tfmri_ma_allvn_b_cds_mobofrrh': 'MedOrFrR_lln',
    'tfmri_ma_allvn_b_cds_roatcgelh': 'rACCL_lln',
    'tfmri_ma_allvn_b_cds_roatcgerh': 'rACCR_lln',
    'tfmri_ma_allvn_b_cds_clatcgelh': 'cACCL_lln',
    'tfmri_ma_allvn_b_cds_clatcgerh': 'cACCR_lln',
    'midabwdp1083': 'aInslL_lln',
    'midabwdp1157': 'aInslR_lln',

    # Large Reward Vs Neutral
    'tfmri_ma_alrvn_b_scs_aalh': 'NAL_lrn',
    'tfmri_ma_alrvn_b_scs_aarh': 'NAR_lrn',
    'tfmri_ma_alrvn_b_scs_aylh': 'AmygL_lrn',
    'tfmri_ma_alrvn_b_scs_ayrh': 'AmygR_lrn',
    'tfmri_ma_alrvn_b_scs_hpuslh': 'HipcL_lrn',
    'tfmri_ma_alrvn_b_scs_hpusrh': 'HipcR_lrn',
    'tfmri_ma_alrvn_b_scs_tplh': 'ThalL_lrn',
    'tfmri_ma_alrvn_b_scs_tprh': 'ThalR_lrn',    
    'tfmri_ma_alrvn_b_cds_clatcgelh': 'cACCL_lrn',
    'tfmri_ma_alrvn_b_cds_clatcgerh': 'cACCR_lrn',
    'tfmri_ma_alrvn_b_cds_lobofrlh': 'LtOrFrL_lrn',
    'tfmri_ma_alrvn_b_cds_lobofrrh': 'LtOrFrR_lrn',
    'tfmri_ma_alrvn_b_cds_mobofrlh': 'MedOrFrL_lrn',
    'tfmri_ma_alrvn_b_cds_mobofrrh': 'MedOrFrR_lrn',
    'tfmri_ma_alrvn_b_cds_roatcgelh': 'rACCL_lrn',
    'tfmri_ma_alrvn_b_cds_roatcgerh': 'rACCR_lrn',
    'midabwdp639': 'aInslL_lrn',
    'midabwdp713': 'aInslR_lrn',
}
if req_redict:
    req_redict['src_subject_id'] = 'subid'
    req_redict['mri_info_deviceserialnumber'] = 'scanid'
    rs = rs.rename(columns=req_redict)
else:
    rs = rs.rename(columns=redict)

In [44]:
rs.head()

Unnamed: 0,siteid,yage,familyid,subid,eventname,ysex,ppensity,yrace,marital,pedu,pedu2,income,incomel,general_lf,social_lf,perinatal_lf,rsSN,rsSN_dt,rsSN_fo,rsSN_lamyg,rsSN_Ramyg,rsSN_Lth,rsSN_Rth,rsSN_Racc,rsSN_Lacc,rsSN_Rpl,rsSN_Lpl,aeq_Pos,aeq_Neg,aeq_NegT,aeq_PosT,aeq1,aeq2,aeq3,aeq4,aeq5,aeq6,aeq7,meq_Pos,meq_Neg,meq_NegT,meq_PosT,meq1,meq2,meq3,meq4,meq5,meq6,pnh_sub,pnh_help,pnh_hhelp,pnh_enc,pnh_henc,pnh_art,pnh_prosum,pnh_prototal,peerinf1,peerinf2,peerinf3,peerinf4,peerinf5,peerinf6,peerinf7,peerinf8,peerinf9,peerinf10,peerinf_mean,fath_alc,fath_alc_fa,fath_alc_ma,moth_alc,moth_alc_fa,moth_alc_ma,sib1_alc,sib2_alc,sib3_alc,sib4_alc,sib5_alc,faORma_alc,Parent_alc,fath_drug,fath_drug_fa,fath_drug_ma,moth_drug,moth_drug_fa,moth_drug_ma,sib1_drug,sib2_drug,sib3_drug,sib4_drug,sib5_drug,faORma_drug,Parent_drug,neigh_crime,neigh_po,neigh_po_t,neigh_po_exp,neigh_po_phy,neigh_po_phyT,neigh_po_slur,neigh_po_slurT,neigh_po_stop,neigh_po_stopT,neigh_po_arrest,neigh_po_famservice,neigh_po_famserviceT,neigh_po_famexp,neigh_po_famagg,neigh_po_famaggT,neigh_po_famslur,neigh_po_famslurT,neigh_po_famstop,neigh_po_famstopT,neigh_po_famtrust,neigh_po_fair,neigh_po_resp,su_risk1,su_risk2,su_risk3,su_risk4,su_risk5,su_risk6,su_risk7,su_risk8,su_risk9,su_risk10,su_risk11,su_risk12,su_risk13,su_avail1,su_avail2,su_avail3,su_avail4,su_avail5,su_avail6,su_avail7,su_avail8,su_avail9,su_avail10,su_avail11,su_avail12,su_avail13,peer_dev_ob1,peer_dev_ob2,peer_dev_ob3,peer_dev_ob4,peer_dev_ob5,peer_dev_ob6,peer_dev_ob7,peer_dev_ob8,peer_dev_ob9,peer_dev_per1,peer_dev_per2,peer_dev_per3,peer_dev_per4,peer_dev_per5,peer_dev_per6,peer_dev_per7,peer_dev_per8,peer_dev_per9,imgincl_rsfmri_include,scanid,rsfmri_meanmotion
0,site06,131.0,8781.0,NDAR_INV003RTV85,baseline_year_1_arm_1,2.0,466.092707,1.0,1.0,13.0,13.0,8.0,,0.1155,-0.9315,-0.035,0.511476,0.110278,0.215066,0.175114,-0.104169,0.247253,0.117783,0.264452,-0.058992,0.096238,-0.082889,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,0.0,5.0,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,,,,0.0,0.0,,,,,,,,,,1.0,0,0.150697
1,site06,136.0,,NDAR_INV003RTV85,6_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,
2,site06,143.0,,NDAR_INV003RTV85,1_year_follow_up_y_arm_1,,533.38182,1.0,,,,,8.0,,,,,,,,,,,,,,,3.0,7.0,,,1.0,1.0,1.0,5.0,1.0,1.0,5.0,3.0,15.0,,,5.0,1.0,1.0,1.0,5.0,5.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,,,,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,1,
3,site06,148.0,,NDAR_INV003RTV85,18_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,
5,site06,159.0,,NDAR_INV003RTV85,30_month_follow_up_arm_1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,


### only if not already renamed
### Create  shorter names for correlation variables

In [45]:
corr_cols = [c for c in rs.columns if '_cor_' in c or '_c_' in c]
corr_cols

[]

In [46]:

# FOR resting fmri CORRELATION cols
corr_cols = [c for c in rs.columns if '_cor_' in c or '_c_' in c]
# This wont run if corr_cols is empty
# if corr_cols not empty
if corr_cols:
    # columns for df
    corr_cols_cfa = [c for c in corr_cols if '/' not in c]
    corr_cols_cfa = [c for c in corr_cols_cfa if 'site_id_l' not in c]
    # create shorter variable names for viewing in R
    corr_cols_cfa_sn = ['_'.join(c.split('_')[-3:]) for c in corr_cols_cfa]
    rcols = dict(zip(corr_cols_cfa, corr_cols_cfa_sn))
    rs.rename(columns=rcols, inplace=True)

# Desikan regins and FOR task fmri ACTIVATION cols
# maybe implement programatic names in th future
# tfmri_dsk_cols = 
# if tfmri_dsk_cols:
    # # columns for cfa's
    # tfmri_dsk_cols = [c for c in tfmri_dsk_cols if 'site_id_l' not in c]
    # # create shorter variable names for viewing in R
    # tfmri_dsk_cols_sn = ['_'.join(c.split('_')[-3:]) for c in tfmri_dsk_cols]
    # # zip long names and short names together in Dictionary
    # rcols = dict(zip(tfmri_dsk_cols, tfmri_dsk_cols_sn))
    # rs.rename(columns=rcols, inplace=True)

### Naming key

In [47]:
## CHANGE name of filename ('tfmri_mid_var_name_key.csv') in to_csv function
# convert rename dictionary to dataframe  
names = pd.DataFrame(np.column_stack([list(redict.keys()), list(redict.values())]), columns=['Variable', 'Name'])
# merge with tvars
nkey = tvars.merge(names, how='left', on='Variable')
# get unnamed extra columns
udrop = [c for c in nkey.columns if 'Unnamed' in c]
# drop unnamed extra columns
nkey = nkey.drop(columns = udrop)
# reorder columns for readiblity
nkey = nkey[['Table', 'Variable', 'Name', 'Description']]
# export to csv
nkey.to_csv(fpath + 'rsfmri_var_name_key.csv', index=False)
nkey.head()

Unnamed: 0,Table,Variable,Name,Description
0,abcd_y_lt,site_id_l,,Site ID at each event
1,abcd_y_lt,interview_age,age,Participant's age in month at start of the event
2,abcd_p_demo,demo_sex_v2,,"(coded: 1 = male, 2 = female) What sex was the..."
3,abcd_y_lf,latent_factor_ss_general_ses,,"General latent factor of economic, social, and..."
4,abcd_y_lf,latent_factor_ss_social,,Latent factor for youth perceived social suppor


In [48]:
tvars

Unnamed: 0,Table,Variable,NEWNAME,Description
0,abcd_y_lt,site_id_l,siteid,Site ID at each event
1,abcd_y_lt,interview_age,yage,Participant's age in month at start of the event
2,abcd_p_demo,demo_sex_v2,ysex,"(coded: 1 = male, 2 = female) What sex was the..."
3,abcd_y_lf,latent_factor_ss_general_ses,general_lf,"General latent factor of economic, social, and..."
4,abcd_y_lf,latent_factor_ss_social,social_lf,Latent factor for youth perceived social suppor
5,abcd_y_lf,latent_factor_ss_perinatal,perinatal_lf,Latent factor for perinatal health
6,abcd_p_demo,acs_raked_propensity_score,ppensity,Imputed raked propensity weight. The raked pro...
7,abcd_p_demo,race_ethnicity,yrace,"(coded: 1=white, 2=black, 3=hispanic, 4=asian..."
8,abcd_p_demo,demo_prnt_marital_v2,marital,"(coded: 1=married, 2=widowed, 3=divorced, 4=se..."
9,abcd_p_demo,demo_prnt_ed_v2,pedu,What is the highest grade or level of school y...


### Save overall df to csv output

In [49]:
rs.shape

(81878, 163)

In [50]:
rs.to_csv(fpath + 'abcd5.1_rsfmri_rabeeh_8-6-24.csv', index=False)

#### For troubleshooting

In [39]:
### troubleshoot
df = rs.copy()
table_file = 'su_y_nic_exp.csv'
table_key = tvars
how_merge='outer'
dpath = '/home/cglab/projects/abcd/data/abcd-data-release-5.1/'
# find filepath, which is full path and name of file
filepath = findFile(table_file, path=dpath)
dat = pd.read_csv(filepath, low_memory=False)
# get table name, which is the string before the period
table = table_file.split('.')[0]
# get column names sub and event which will need for merging dataframes
# it's the same for each df so overwriting is fine
se_nms = dat.columns[:2].values.tolist()
# derivative variables desired
derivative_cols = table_key[table_key['Table']==table]['Variable'].values.tolist()
derivative_cols += se_nms
# strip leading and ending spaces
derivative_cols = [c.strip() for c in derivative_cols]
# merge with overall with INNER join bc we dont want to exclude participants who have task mri data but not resting or vice versa
print('Prior to merge df size is {0} and other df shape is {1}\n merging...'.format(df.shape, dat[derivative_cols].shape))
if 'src_subject_id' not in df.columns:
    # for first table assign it to df
    df = dat[derivative_cols].copy()
else:
    # all others are merged
    df = df.merge(dat[derivative_cols], how=how_merge, on=['src_subject_id', 'eventname'])
print('Any duplicated columns? {}'.format(df.columns.duplicated().any()))
print('New df size is {}\n'.format(df.shape))

KeyError: "['ceq_positive_expectancies_ss', 'ceq_negative_expectancies_ss', 'ceq_positive_expectancies_nt', 'ceq_negative_expectancies_nt'] not in index"

In [26]:
dat[derivative_cols]

Unnamed: 0,aeq_positive_expectancies_ss,aeq_negative_expectancies_ss,aeq_negative_expectancies_nt,aeq_positive_expectancies_nt,aeq_section_q01,aeq_section_q02,aeq_section_q03,aeq_section_q04,aeq_section_q05,aeq_section_q06,aeq_section_q07,src_subject_id,eventname
0,3.0,7.0,,,1.0,1.0,1.0,5.0,1.0,1.0,5.0,NDAR_INV003RTV85,1_year_follow_up_y_arm_1
1,10.0,13.0,,,4.0,2.0,4.0,2.0,4.0,4.0,5.0,NDAR_INV003RTV85,2_year_follow_up_y_arm_1
2,7.0,12.0,,,3.0,2.0,3.0,1.0,5.0,2.0,4.0,NDAR_INV003RTV85,3_year_follow_up_y_arm_1
3,3.0,3.0,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,NDAR_INV005V6D2C,1_year_follow_up_y_arm_1
4,4.0,3.0,,,2.0,1.0,1.0,1.0,1.0,1.0,1.0,NDAR_INV005V6D2C,2_year_follow_up_y_arm_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35574,6.0,14.0,,,1.0,1.0,4.0,1.0,5.0,4.0,5.0,NDAR_INVZZZNB0XC,3_year_follow_up_y_arm_1
35575,6.0,14.0,,,2.0,2.0,4.0,2.0,5.0,2.0,5.0,NDAR_INVZZZP87KR,1_year_follow_up_y_arm_1
35576,7.0,12.0,,,3.0,2.0,3.0,1.0,4.0,2.0,5.0,NDAR_INVZZZP87KR,2_year_follow_up_y_arm_1
35577,4.0,14.0,,,1.0,1.0,4.0,1.0,5.0,2.0,5.0,NDAR_INVZZZP87KR,3_year_follow_up_y_arm_1


In [19]:
df = pd.DataFrame()
table_file = 'nc_y_nihtb.csv'
table_key = svars1
how_merge = 'outer'

dat = pd.read_csv(dpath + table_file)
# get table name, which is the string before the period
table = table_file.split('.')[0]
# get column names sub and event which will need for merging dataframes
# it's the same for each df so overwriting is fine
se_nms = [c for c in dat.columns if c in ['src_subject_id', 'eventname']]
# derivative variables desired
derivative_cols = table_key[table_key['Table']==table]['Variable'].values.tolist()
derivative_cols += se_nms

print(len(derivative_cols))
derivative_cols

68


['rsfmri_cor_ngd_sa_scs_aalh',
 'rsfmri_cor_ngd_sa_scs_aarh',
 'rsfmri_cor_ngd_sa_scs_aglh',
 'rsfmri_cor_ngd_sa_scs_agrh',
 'rsfmri_cor_ngd_sa_scs_bs',
 'rsfmri_cor_ngd_sa_scs_cdelh',
 'rsfmri_cor_ngd_sa_scs_cderh',
 'rsfmri_cor_ngd_sa_scs_crcxlh',
 'rsfmri_cor_ngd_sa_scs_crcxrh',
 'rsfmri_cor_ngd_sa_scs_hplh',
 'rsfmri_cor_ngd_sa_scs_hprh',
 'rsfmri_cor_ngd_sa_scs_pllh',
 'rsfmri_cor_ngd_sa_scs_plrh',
 'rsfmri_cor_ngd_sa_scs_ptlh',
 'rsfmri_cor_ngd_sa_scs_ptrh',
 'rsfmri_cor_ngd_sa_scs_thplh',
 'rsfmri_cor_ngd_sa_scs_thprh',
 'rsfmri_cor_ngd_sa_scs_vtdclh',
 'rsfmri_cor_ngd_sa_scs_vtdcrh',
 'rsfmri_cor_ngd_df_scs_aalh',
 'rsfmri_cor_ngd_df_scs_aarh',
 'rsfmri_cor_ngd_df_scs_aglh',
 'rsfmri_cor_ngd_df_scs_agrh',
 'rsfmri_cor_ngd_df_scs_bs',
 'rsfmri_cor_ngd_df_scs_cdelh',
 'rsfmri_cor_ngd_df_scs_cderh',
 'rsfmri_cor_ngd_df_scs_crcxlh',
 'rsfmri_cor_ngd_df_scs_crcxrh',
 'rsfmri_cor_ngd_df_scs_hplh',
 'rsfmri_cor_ngd_df_scs_hprh',
 'rsfmri_cor_ngd_df_scs_pllh',
 'rsfmri_cor_ngd_df_scs_p

In [None]:
print('\nPrior to merge df size is {0} and other df shape is {1}\n merging...'.format(df.shape, dat[derivative_cols].shape))
if 'src_subject_id' not in df.columns:
    # for first table assign it to df
    df = dat[derivative_cols].copy()
else:
    # all others are merged
    df = df.merge(dat[derivative_cols], how=how_merge, on=['src_subject_id', 'eventname'])
print('Any duplicated columns? {}'.format(df.columns.duplicated().any()))
print('New df size is {}'.format(df.shape))

## test dropDuplicateCols

In [27]:
df = rs.copy()
dup_pairs = [(i, j) for i,j in combinations(df, 2) if df[i].equals(df[j])]
cols_to_remove = []
for i, j in dup_pairs:
    print('{0} is identical to {1}'.format(i.upper(), j.upper()))
    cols_to_remove.append(i)

df.drop(columns=cols_to_remove, inplace=True)

RSFMRI_C_NGD_SA_NGD_DT is identical to RSFMRI_C_NGD_DT_NGD_SA
RSFMRI_C_NGD_SA_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_SA
RSFMRI_C_NGD_DT_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_DT
RSFMRI_C_NGD_AD_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_AD
RSFMRI_C_NGD_CGC_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_CGC
RSFMRI_C_NGD_CA_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_CA
RSFMRI_C_NGD_DLA_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_DLA
RSFMRI_C_NGD_FO_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_FO
RSFMRI_C_NGD_N_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_N
RSFMRI_C_NGD_RSPLTP_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_RSPLTP
RSFMRI_C_NGD_SMH_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_SMH
RSFMRI_C_NGD_SMM_NGD_VTA is identical to RSFMRI_C_NGD_VTA_NGD_SMM
RSFMRI_C_NGD_VTA_NGD_VS is identical to RSFMRI_C_NGD_VS_NGD_VTA


In [22]:
cols_to_remove

['rsfmri_c_ngd_sa_ngd_dt',
 'rsfmri_c_ngd_sa_ngd_vta',
 'rsfmri_c_ngd_dt_ngd_vta',
 'rsfmri_c_ngd_ad_ngd_vta',
 'rsfmri_c_ngd_cgc_ngd_vta',
 'rsfmri_c_ngd_ca_ngd_vta',
 'rsfmri_c_ngd_dla_ngd_vta',
 'rsfmri_c_ngd_fo_ngd_vta',
 'rsfmri_c_ngd_n_ngd_vta',
 'rsfmri_c_ngd_rspltp_ngd_vta',
 'rsfmri_c_ngd_smh_ngd_vta',
 'rsfmri_c_ngd_smm_ngd_vta',
 'rsfmri_c_ngd_vta_ngd_vs']

## test anyMRIvars function

In [73]:
df = rs.copy()

In [None]:
# create empty list to store QC vars needed
qc_vars = []
# if any columns are found that startwith 'rsfmri_', append resting QC vars & the csv file they're found in,..
# ... creating a list of tuples which will be combined and converted to dataframe below
# this relies on the implicit boolean nature of lists, if empty their FALSE
if [col for col in df.columns if col.startswith('rsfmri_')]:
    qc_vars += [('mri_y_qc_incl', 'imgincl_rsfmri_include'), ('mri_y_qc_motion', 'rsfmri_meanmotion'), ('mri_y_adm_info', 'mri_info_deviceserialnumber')]

# if any columns are found that startwith 'rsfmri_', append resting QC vars & the csv file they're found in
if [col for col in df.columns if col.startswith('tfmri_nback_') or col.startswith('tfabwdp')  or col.startswith('tnbasem')]:
    qc_vars += [('mri_y_qc_incl', 'imgincl_nback_include'), ('mri_y_qc_motion', 'tfmri_nback_all_meanmotion'), ('mri_y_adm_info', 'mri_info_deviceserialnumber')]

# if qc_vars is NOT empty
if qc_vars:
    # remove duplicates from qc_vars
    # calling 'set' function on qc_vars returns only unique items
    # then calling 'list' function converts it back to a list
    qc_vars = list(set(qc_vars))
    # set up table_key dataframe for pulling QC vars
    # includes all QC vars selected above with cooresponding table name
    qc_key = pd.DataFrame(qc_vars, columns=['Table', 'Variable'])
    # get list of unique QC tables from qc_key tuple ex. ('mri_y_qc_incl', 'imgincl_rsfmri_include')
    qc_tables = list(set([table for table, var in qc_vars]))
    # append QC variables to dataframe
    for table in qc_tables:
        print('\nPulling from table: {0}.csv\n'.format(table))
        df = getDerivatives(df, table + '.csv', qc_key, how_merge='left')

In [75]:
qc_vars

[('mri_y_qc_incl', 'imgincl_rsfmri_include'),
 ('mri_y_qc_motion', 'rsfmri_meanmotion'),
 ('mri_y_adm_info', 'mri_info_deviceserialnumber'),
 ('mri_y_qc_incl', 'imgincl_nback_include'),
 ('mri_y_qc_motion', 'tfmri_nback_all_meanmotion'),
 ('mri_y_adm_info', 'mri_info_deviceserialnumber')]

In [76]:
qc_vars = list(set(qc_vars))
qc_vars

[('mri_y_qc_incl', 'imgincl_rsfmri_include'),
 ('mri_y_qc_motion', 'rsfmri_meanmotion'),
 ('mri_y_qc_motion', 'tfmri_nback_all_meanmotion'),
 ('mri_y_adm_info', 'mri_info_deviceserialnumber'),
 ('mri_y_qc_incl', 'imgincl_nback_include')]

In [72]:
[c for c in df.columns if 'meanmotion' in c or 'deviceserialnumber' in c or 'imgincl' in c]

['rsfmri_meanmotion',
 'tfmri_nback_all_meanmotion',
 'mri_info_deviceserialnumber',
 'imgincl_rsfmri_include',
 'imgincl_nback_include']

In [42]:
qc_key

Unnamed: 0,Table,Variable
0,mri_y_qc_incl.csv,imgincl_rsfmri_include
1,mri_y_qc_motion.csv,rsfmri_meanmotion
2,mri_y_qc_incl.csv,imgincl_nback_include
3,mri_y_adm_info.csv,mri_info_deviceserialnumber
4,mri_y_qc_motion.csv,tfmri_nback_all_meanmotion


In [33]:
qc_vars

[('mri_y_qc_incl.csv', 'imgincl_rsfmri_include'),
 ('mri_y_qc_motion.csv', 'rsfmri_meanmotion'),
 ('mri_y_adm_info.csv', 'mri_info_deviceserialnumber')]

In [27]:
qc_key

Unnamed: 0,Table,Variable
0,mri_y_qc_incl,rsfmri_meanmotion
1,mri_y_qc_incl,imgincl_rsfmri_include
2,mri_y_qc_incl,mri_info_deviceserialnumber
