In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

from functools import reduce
from itertools import combinations

from scipy import stats

# pandas display settings: wide console output, up to 200 rows/columns shown
for _opt, _val in (
    ('display.width', 500),
    ('display.max_columns', 200),
    ('display.max_rows', 200),
):
    pd.set_option(_opt, _val)

In [2]:
# machine switch read by the data-path cell: truthy picks the /Users/... path, falsy the /home/... path
home = 0

In [3]:
# data directory depends on which machine the notebook runs on (see `home`);
# the trailing slash matters because file names are appended by string concatenation
dpath = (
    '/Users/cglab/projects/abcd/data/abcd5.1-rser/'
    if home
    else '/home/cglab/projects/abcd/data/abcd5.1-rser/'
)

#### Functions

In [4]:
def get_deriviatives(df, table_file, table_key, how):
    """Load the requested variables from one table and add them to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accumulated so far. If it has no ``src_subject_id`` column
        (e.g. an empty frame), the selected table columns become the result.
    table_file : str
        CSV file name, read from the module-level ``dpath`` directory. The
        text before the first period is used as the table name.
    table_key : pandas.DataFrame
        Lookup with ``Table`` and ``Variable`` columns listing which
        variables to take from which table.
    how : str
        Join type forwarded to ``DataFrame.merge`` (ignored for the first
        table, which is simply copied).

    Returns
    -------
    pandas.DataFrame
        The selected columns of the table, either on their own (first table)
        or merged onto ``df`` on ``src_subject_id`` and ``eventname``.
    """
    dat = pd.read_csv(dpath + table_file)
    # table name is the string before the first period
    table = table_file.split('.')[0]
    # the first two columns are taken to be the subject and event identifiers
    # needed for merging; this is positional, so re-check it for future data
    se_nms = dat.columns[:2].values.tolist()
    # variables requested for this table
    wanted = table_key.loc[table_key['Table'] == table, 'Variable'].values.tolist()
    # order-preserving de-duplication: an identifier column that is also listed
    # in table_key must not be selected twice (it would yield duplicate columns)
    deriviative_cols = list(dict.fromkeys(wanted + se_nms))
    selected = dat[deriviative_cols]
    # the join type is chosen by the caller via `how`
    print('Prior to merge rs df size is {0} and other df shape is {1}'.format(df.shape, selected.shape))
    if 'src_subject_id' not in df.columns:
        # first table: nothing to merge with yet, so it becomes the frame
        df = selected.copy()
    else:
        # every later table is merged on subject and event
        df = df.merge(selected, how=how, on=['src_subject_id', 'eventname'])
    print('Any duplicated columns? {}'.format(df.columns.duplicated().any()))
    print('New rs df size is {}'.format(df.shape))
    return df

# Function to map boolean columns to race IDs
def get_race_id(row, mapping):
    """Return the ID of the first selected race column in ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Indexed by the column names used as keys of ``mapping``.
    mapping : dict
        Column name -> race ID, checked in the dict's iteration order.

    Returns
    -------
    The ID for the first column whose value is truthy and not missing, or
    ``None`` when no column is selected. If several columns are selected,
    only the first one in ``mapping`` order is reported.
    """
    for race, race_id in mapping.items():
        value = row[race]
        # NaN is truthy in Python, so a missing answer has to be excluded
        # explicitly or it would be reported as a selected race
        if pd.notna(value) and value:
            return race_id
    return None  # no race column selected

### Load Race, Ethnicity Variables

### get race table with variable names

In [5]:
# variable lookup table (Table / Variable / Description ...), read from the working directory
tvars = pd.read_csv('abcd5.1_race_vars.csv')
tvars.tail()

Unnamed: 0,Table,Variable,Description,Options,extra,extra2
15,abcd_p_demo,demo_race_a_p___25,What race do you consider the child to be? Ple...,0 = No; 1 = Yes,demo_prim == 1 || demo_prim == 2 || demo_prim ...,pdem02
16,abcd_p_demo,demo_race_a_p___77,What race do you consider the child to be? Ple...,0 = No; 1 = Yes,demo_prim == 1 || demo_prim == 2 || demo_prim ...,pdem02
17,abcd_p_demo,demo_race_a_p___99,What race do you consider the child to be? Ple...,,,
18,abcd_p_demo,race_ethnicity,Race Ethnicity (Child),1 = White; 2 = Black; 3 = Hispanic; 4 = Asian;...,,acspsw03
19,abcd_p_demo,acs_raked_propensity_score,Imputed raked propensity weight. The raked pro...,,,


In [6]:
# start from an empty frame so the helper treats abcd_p_demo as the first table
demo = get_deriviatives(pd.DataFrame(), 'abcd_p_demo.csv', tvars, 'left')

Prior to merge rs df size is (0, 0) and other df shape is (48807, 22)
Any duplicated columns? False
New rs df size is (48807, 22)


### Explore Dataframe

In [7]:
# peek at the first rows
demo.head(n=5)

Unnamed: 0,demo_race_a_p___10,demo_race_a_p___11,demo_race_a_p___12,demo_race_a_p___13,demo_race_a_p___14,demo_race_a_p___15,demo_race_a_p___16,demo_race_a_p___17,demo_race_a_p___18,demo_race_a_p___19,demo_race_a_p___20,demo_race_a_p___21,demo_race_a_p___22,demo_race_a_p___23,demo_race_a_p___24,demo_race_a_p___25,demo_race_a_p___77,demo_race_a_p___99,race_ethnicity,acs_raked_propensity_score,src_subject_id,eventname
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,466.092707,NDAR_INV003RTV85,baseline_year_1_arm_1
1,,,,,,,,,,,,,,,,,,,1.0,533.38182,NDAR_INV003RTV85,1_year_follow_up_y_arm_1
2,,,,,,,,,,,,,,,,,,,,,NDAR_INV003RTV85,2_year_follow_up_y_arm_1
3,,,,,,,,,,,,,,,,,,,,,NDAR_INV003RTV85,3_year_follow_up_y_arm_1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,520.488325,NDAR_INV005V6D2C,baseline_year_1_arm_1


In [8]:
# summary statistics for the numeric columns
demo.describe()

Unnamed: 0,demo_race_a_p___10,demo_race_a_p___11,demo_race_a_p___12,demo_race_a_p___13,demo_race_a_p___14,demo_race_a_p___15,demo_race_a_p___16,demo_race_a_p___17,demo_race_a_p___18,demo_race_a_p___19,demo_race_a_p___20,demo_race_a_p___21,demo_race_a_p___22,demo_race_a_p___23,demo_race_a_p___24,demo_race_a_p___25,demo_race_a_p___77,demo_race_a_p___99,race_ethnicity,acs_raked_propensity_score
count,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,11845.0,23079.0,23088.0
mean,0.742676,0.212579,0.034276,0.000422,0.001942,0.000169,0.001013,0.003377,0.009624,0.017138,0.014099,0.007092,0.008442,0.005319,0.007598,0.067539,0.004981,0.00878,2.031674,710.340651
std,0.437178,0.40915,0.181945,0.020542,0.044024,0.012994,0.031814,0.058016,0.097634,0.129791,0.117903,0.083916,0.091498,0.072738,0.086839,0.250964,0.070403,0.093294,1.325221,440.937045
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,421.841839
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,636.637457
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,884.89193
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,2665.925049


In [9]:
# missing values per column
demo.isna().sum()

demo_race_a_p___10            36962
demo_race_a_p___11            36962
demo_race_a_p___12            36962
demo_race_a_p___13            36962
demo_race_a_p___14            36962
demo_race_a_p___15            36962
demo_race_a_p___16            36962
demo_race_a_p___17            36962
demo_race_a_p___18            36962
demo_race_a_p___19            36962
demo_race_a_p___20            36962
demo_race_a_p___21            36962
demo_race_a_p___22            36962
demo_race_a_p___23            36962
demo_race_a_p___24            36962
demo_race_a_p___25            36962
demo_race_a_p___77            36962
demo_race_a_p___99            36962
race_ethnicity                25728
acs_raked_propensity_score    25719
src_subject_id                    0
eventname                         0
dtype: int64

#### Sum across all columns
* check if anything > 1, which would indicate the parent selected multiple races for the child

In [10]:
# race checkbox columns all share this prefix; matching the full prefix (rather than
# any name containing 'demo') keeps other demo_* variables out of the race sum
dvars = [c for c in demo.columns if c.startswith('demo_race_a_p___')]
print(len(dvars))
dvars

18


['demo_race_a_p___10',
 'demo_race_a_p___11',
 'demo_race_a_p___12',
 'demo_race_a_p___13',
 'demo_race_a_p___14',
 'demo_race_a_p___15',
 'demo_race_a_p___16',
 'demo_race_a_p___17',
 'demo_race_a_p___18',
 'demo_race_a_p___19',
 'demo_race_a_p___20',
 'demo_race_a_p___21',
 'demo_race_a_p___22',
 'demo_race_a_p___23',
 'demo_race_a_p___24',
 'demo_race_a_p___25',
 'demo_race_a_p___77',
 'demo_race_a_p___99']

In [11]:
# number of race boxes ticked per row (row-wise sum over the checkbox columns)
demo['race_sum'] = demo.loc[:, dvars].sum(axis='columns')
demo['race_sum'].value_counts()

race_sum
0.0    36963
1.0    10365
2.0     1253
3.0      192
4.0       30
5.0        4
Name: count, dtype: int64

In [12]:
# select rows where race is specified; .copy() makes the result an independent
# frame so later column assignments do not write into a slice of the original
# (which raises SettingWithCopyWarning)
demo = demo[demo['race_sum']>0].copy()
print(demo.shape)
demo.eventname.value_counts()

(11844, 23)


eventname
baseline_year_1_arm_1    11844
Name: count, dtype: int64

### Convert race columns to single column with race ID

In [13]:
# Mapping of race columns to unique IDs: the ID is whatever follows the '___'
# separator in the column name, so it does not depend on the code being two characters
race_ids = [c.rsplit('___', 1)[-1] for c in dvars]
race_to_id = dict(zip(dvars, race_ids))
race_to_id

{'demo_race_a_p___10': '10',
 'demo_race_a_p___11': '11',
 'demo_race_a_p___12': '12',
 'demo_race_a_p___13': '13',
 'demo_race_a_p___14': '14',
 'demo_race_a_p___15': '15',
 'demo_race_a_p___16': '16',
 'demo_race_a_p___17': '17',
 'demo_race_a_p___18': '18',
 'demo_race_a_p___19': '19',
 'demo_race_a_p___20': '20',
 'demo_race_a_p___21': '21',
 'demo_race_a_p___22': '22',
 'demo_race_a_p___23': '23',
 'demo_race_a_p___24': '24',
 'demo_race_a_p___25': '25',
 'demo_race_a_p___77': '77',
 'demo_race_a_p___99': '99'}

In [14]:
# Row-wise: pick the first selected race column per row, then store its ID as a number
_race_labels = demo.apply(get_race_id, axis=1, mapping=race_to_id)
demo['race_sep_id'] = pd.to_numeric(_race_labels)
print(demo['race_sep_id'].isnull().sum())
demo.describe()

0


Unnamed: 0,demo_race_a_p___10,demo_race_a_p___11,demo_race_a_p___12,demo_race_a_p___13,demo_race_a_p___14,demo_race_a_p___15,demo_race_a_p___16,demo_race_a_p___17,demo_race_a_p___18,demo_race_a_p___19,demo_race_a_p___20,demo_race_a_p___21,demo_race_a_p___22,demo_race_a_p___23,demo_race_a_p___24,demo_race_a_p___25,demo_race_a_p___77,demo_race_a_p___99,race_ethnicity,acs_raked_propensity_score,race_sum,race_sep_id
count,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0,11844.0
mean,0.742739,0.212597,0.034279,0.000422,0.001942,0.000169,0.001013,0.003377,0.009625,0.017139,0.0141,0.007092,0.008443,0.005319,0.007599,0.067545,0.004981,0.008781,2.03926,691.246118,1.147163,12.103343
std,0.437143,0.409162,0.181952,0.020543,0.044026,0.012994,0.031816,0.058018,0.097639,0.129797,0.117908,0.08392,0.091501,0.072741,0.086843,0.250974,0.070406,0.093298,1.323498,351.175617,0.420943,9.496839
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,161.361068,1.0,10.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,448.839001,1.0,10.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,619.30591,1.0,10.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,821.716308,1.0,11.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1778.916737,5.0,99.0


In [15]:
# frequency of each race ID
demo['race_sep_id'].value_counts()

race_sep_id
10    8797
11    1991
25     525
99      91
19      90
12      69
18      58
77      55
20      53
24      35
22      22
23      21
17      15
21      13
16       4
14       4
15       1
Name: count, dtype: int64

#### Check subjects who responded with multiple races

In [16]:
# rows with more than one race box ticked
demo.loc[demo['race_sum'].gt(1)].shape

(1479, 24)

### Actually, race_ethnicity variable is preferred
* includes pretty much everything we need

In [17]:
# frequency of each race_ethnicity code
demo['race_ethnicity'].value_counts()

race_ethnicity
1.0    6168
3.0    2400
2.0    1781
5.0    1243
4.0     252
Name: count, dtype: int64

In [18]:
# rows per event
demo['eventname'].value_counts()

eventname
baseline_year_1_arm_1    11844
Name: count, dtype: int64

In [19]:
# shorten the column names, then keep only the merge key, race and propensity weight
demo = demo.rename(
    columns={
        'src_subject_id': 'subID',
        'race_ethnicity': 'race',
        'acs_raked_propensity_score': 'propesit',
    }
)[['subID', 'race', 'propesit']]
demo.shape

(11844, 3)

### Load full RSER dataframe

In [20]:
# low_memory=False: read the file in one pass instead of in chunks
rser = pd.read_csv(
    dpath + 'abcd5.1_tfmri_nback_insula_subc_net_ders_gses_sfam_thrt_demo_noscl_nopt_w1_qc_lfa_w1_full.csv',
    low_memory=False,
)
rser.shape

(7173, 264)

In [21]:
# column dtypes of the loaded frame
rser.dtypes

Aware      float64
NoAcpt     float64
Implse     float64
Goals      float64
Threat     float64
            ...   
SFconP8    float64
SFconP9    float64
pedu       float64
pedu2      float64
income     float64
Length: 264, dtype: object

In [22]:
# missing values in the two amygdala columns before the merge
rser.loc[:, ['AmygL1', 'AmygR1']].isna().sum()

AmygL1    0
AmygR1    0
dtype: int64

In [23]:
# column names containing 'mot', case-insensitive
[name for name in rser.columns if 'mot' in name.lower()]

['rsfmri_meanmotion',
 'MotT1',
 'ders_emotion_overwhelm_p',
 'ders_upset_emotion_overwhelm_p',
 'Sders_emotion_overwhelm_p',
 'Sders_upset_emotion_overwhelm_p']

### Merge race / propensity df with RSER df

In [24]:
# right join keeps every RSER row; race/propesit are NaN where demo has no matching subID
rs = demo.merge(rser, how="right", on="subID")
# the row count is only preserved if no RSER row matches more than one demo row
assert len(rs) == len(rser), 'merge changed the number of RSER rows; check demo for duplicate subID'
rs.shape

(7173, 266)

In [25]:
# missing values in the two amygdala columns after the merge
rs.loc[:, ['AmygL1', 'AmygR1']].isna().sum()

AmygL1    0
AmygR1    0
dtype: int64

In [26]:
# column names containing 'mot', case-insensitive
[name for name in rs.columns if 'mot' in name.lower()]

['rsfmri_meanmotion',
 'MotT1',
 'ders_emotion_overwhelm_p',
 'ders_upset_emotion_overwhelm_p',
 'Sders_emotion_overwhelm_p',
 'Sders_upset_emotion_overwhelm_p']

In [27]:
# missing values in AmygL1
rs['AmygL1'].isna().sum()

0

In [28]:
# column names containing 'Mot' (case-sensitive)
[col for col in rs.columns if 'Mot' in col]

['MotT1']

In [29]:
# column names containing 'Sal' (case-sensitive)
[col for col in rs.columns if 'Sal' in col]

[]

### Inspect Age

In [30]:
# missing values in subject ID and age
rs.loc[:, ['subID', 'interview_age']].isna().sum()

subID            0
interview_age    0
dtype: int64

great, no missing age data

#### Export merged data 
* including full dataframe and latent factor summary scores

In [31]:
# write the merged frame next to the input data, without the index column
rs.to_csv(
    dpath + 'abcd5.1_tfmri_nback_insula_subc_net_ders_gses_sfam_thrt_demo_noscl_nopt_wide_qc_lfa_ra_w1_full.csv',
    index=False,
)