# PSM Matching for ABIDE
Here are the samples I want
1) full sample, w site: diagnosis ratio 1, age,  motion

2) ADOS sample, w site: age, motion

3) SRS sample, w site: diagnosis ratio 1, age, motion

4) full sample, w site: diagnosis ratio 1, age, motion, FIQ

5) ? maybe FIQ on the other ones as well, depending on how nice that goes

In [1]:
library(MatchIt)

Loading required package: MASS


In [2]:
# Set up the input paths
qc_maybe = '/data1/abide/Pheno/new/abide_qc_maybe.csv'
qc_pass = '/data1/abide/Pheno/new/abide_qc_pass.csv'

# Set up the output paths
abide_full_maybe = '/data1/abide/Pheno/new/abide_full_maybe.csv'
abide_ados_maybe = '/data1/abide/Pheno/new/abide_ados_maybe.csv'
abide_srs_maybe = '/data1/abide/Pheno/new/abide_srs_maybe.csv'
abide_fiq_maybe = '/data1/abide/Pheno/new/abide_fiq_maybe.csv'

abide_full_pass = '/data1/abide/Pheno/new/abide_full_pass.csv'
abide_ados_pass = '/data1/abide/Pheno/new/abide_ados_pass.csv'
abide_srs_pass = '/data1/abide/Pheno/new/abide_srs_pass.csv'
abide_fiq_pass = '/data1/abide/Pheno/new/abide_fiq_pass.csv'

# Set up the config paths
caliper_maybe = '/home/surchs/GDrive/PhD/Pheno/assignments/abide_calipers_maybe.csv'
caliper_pass = '/home/surchs/GDrive/PhD/Pheno/assignments/abide_calipers_pass.csv'

## QC Maybe

In [3]:
# Read the data
data = read.csv(qc_maybe)
# Remap the values of the diagnosis column
# This will change the meaning, now ASD = 1, TDC = 0
data$DX_GROUP[data$DX_GROUP == 2] <- 0
# Make a subset for only male participants
data_male = subset(data, SEX==1)
# Make a copy for non-nan values for FIQ and SRS
data_fiq = data_male[!is.na(data_male$FIQ),]
data_srs = data_male[!is.na(data_male$SRS_RAW_TOTAL),]
data_ados = data_male[!is.na(data_male$ADOS_sb_sev),]
# Load the caliper dataframe
caliper = read.csv(caliper_maybe)

In [4]:
# Full sample
tmp = caliper[!is.na(caliper$Full),]
for (ind in seq_along(tmp$Sites)){
    
    site = tmp$Sites[ind]
    cal = tmp$Full[ind]
    
    covariates = c('SITE_ID','SUB_ID', 'DX_GROUP', 'AGE_AT_SCAN', 'FD_scrubbed', 'frames_scrubbed')#, 'FIQ')
    slice_data = subset(data_male, SITE_ID==site, select=covariates)
    matched = matchit(DX_GROUP ~ FD_scrubbed + AGE_AT_SCAN, data=slice_data, method="nearest", caliper=cal, ratio=1)
    data_matched = match.data(matched)
    
    if (ind == 1){
        base = data_matched
    }
    else {
        base = merge(base, data_matched, all = TRUE)
    }
}
# Save it
write.csv(file=abide_full_maybe, x=base)

In matchit2nearest(structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

In [5]:
# SRS sample
tmp = caliper[!is.na(caliper$SRS),]
for (ind in seq_along(tmp$Sites)){
    
    site = tmp$Sites[ind]
    cal = tmp$SRS[ind]
    
    covariates = c('SITE_ID','SUB_ID', 'DX_GROUP', 'AGE_AT_SCAN', 'FD_scrubbed', 'frames_scrubbed')#, 'FIQ')
    slice_data = subset(data_srs, SITE_ID==site, select=covariates)
    matched = matchit(DX_GROUP ~ FD_scrubbed + AGE_AT_SCAN, data=slice_data, method="nearest", caliper=cal, ratio=1)
    data_matched = match.data(matched)
    
    if (ind == 1){
        base = data_matched
    }
    else {
        base = merge(base, data_matched, all = TRUE)
    }
}
# Save it
write.csv(file=abide_srs_maybe, x=base)

In matchit2nearest(structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

In [7]:
# SRS sample
tmp = caliper[!is.na(caliper$FIQ),]
for (ind in seq_along(tmp$Sites)){
    
    site = tmp$Sites[ind]
    cal = tmp$FIQ[ind]
    covariates = c('SITE_ID','SUB_ID', 'DX_GROUP', 'AGE_AT_SCAN', 'FD_scrubbed', 'frames_scrubbed', 'FIQ')
    slice_data = subset(data_fiq, SITE_ID==site, select=covariates)
    matched = matchit(DX_GROUP ~ FD_scrubbed + AGE_AT_SCAN + FIQ, data=slice_data, method="nearest", caliper=cal, ratio=1)
    data_matched = match.data(matched)
    
    if (ind == 1){
        base = data_matched
    }
    else {
        base = merge(base, data_matched, all = TRUE)
    }
}
# Save it
write.csv(file=abide_fiq_maybe, x=base)

[1] "PITT for 0.20"


In matchit2nearest(structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

[1] "OHSU for 0.20"
[1] "SDSU for 0.40"
[1] "TRINITY for 0.40"
[1] "UM_1 for 0.20"


In matchit2nearest(structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

[1] "UM_2 for 0.40"
[1] "USM for 0.20"


In matchit2nearest(structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

[1] "YALE for 0.10"


In matchit2nearest(structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

[1] "CMU for 0.40"


In matchit2nearest(structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

[1] "LEUVEN_1 for 0.10"
[1] "KKI for 0.10"
[1] "NYU for 0.40"
[1] "STANFORD for 0.40"
[1] "UCLA_1 for 0.40"


In matchit2nearest(structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, : Fewer control than treated units and matching without replacement.  Not all treated units will receive a match.  Treated units will be matched in the order specified by m.order: largest

[1] "MAX_MUN for 0.20"


## QC Pass

In [9]:
# Read the data
data = read.csv(qc_pass)
# Remap the values of the diagnosis column
# This will change the meaning, now ASD = 1, TDC = 0
data$DX_GROUP[data$DX_GROUP == 2] <- 0
# Make a subset for only male participants
data_male = subset(data, SEX==1)
# Make a copy for non-nan values for FIQ and SRS
data_fiq = data_male[!is.na(data_male$FIQ),]
data_srs = data_male[!is.na(data_male$SRS_RAW_TOTAL),]
data_ados = data_male[!is.na(data_male$ADOS_sb_sev),]
# Load the caliper dataframe
caliper = read.csv(caliper_pass)

In [10]:
# Full sample
tmp = caliper[!is.na(caliper$Full),]
for (ind in seq_along(tmp$Sites)){
    
    site = tmp$Sites[ind]
    cal = tmp$Full[ind]
    
    covariates = c('SITE_ID','SUB_ID', 'DX_GROUP', 'AGE_AT_SCAN', 'FD_scrubbed', 'frames_scrubbed')#, 'FIQ')
    slice_data = subset(data_male, SITE_ID==site, select=covariates)
    matched = matchit(DX_GROUP ~ FD_scrubbed + AGE_AT_SCAN, data=slice_data, method="nearest", caliper=cal, ratio=1)
    data_matched = match.data(matched)
    
    if (ind == 1){
        base = data_matched
    }
    else {
        base = merge(base, data_matched, all = TRUE)
    }
}
# Save it
write.csv(file=abide_full_pass, x=base)

ERROR: Error in Ops.factor(SITE_ID, site): level sets of factors are different


In [13]:
site

In [18]:
names(data_male)

In [15]:
table(data_male$SITE_ID)


 CALTECH      CMU      KKI LEUVEN_1 LEUVEN_2  MAX_MUN      NYU     OHSU 
       2        8       42       28        0       25      134       28 
    OLIN     SDSU STANFORD  TRINITY   UCLA_1   UCLA_2     UM_1     UM_2 
      29       29       28       49       63       24       26        3 
     USM     YALE 
      66       37 

In [16]:
slice_data = subset(data_male, SITE_ID==site, select=covariates)

ERROR: Error in Ops.factor(SITE_ID, site): level sets of factors are different
