# PSM Test for the ABIDE dataset
Implementing Hien's suggestions as follows:

Next, in each of the matched data sets, sort the rows by
*match*. The smaller the *match* value, the more closely
matched the observations are. Now, delete as many of the rows
with the largest *match* values as you require for sufficient
data reduction and matching. Delete the same proportion from each
group to ensure a balanced design.

In [1]:
### Load python libraries
import os
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from matplotlib import pyplot as plt

In [2]:
%matplotlib inline

In [3]:
# Location of the merged ABIDE phenotype table that is loaded below
pheno_path = '/data1/abide/Pheno/full_merged_pheno.csv'
# Project label; not referenced by any of the cells visible here
proj_name = 'abide_site'

In [4]:
# Load the full phenotype table from the CSV at pheno_path
pheno = pd.read_csv(pheno_path)

In [5]:
# Restrict the phenotype table to rows with SEX == 1 from the selected sites
# NOTE(review): SEX == 1 presumably codes male participants - confirm against
# the ABIDE phenotype key
include = ['USM', 'NYU', 'UCLA_1', 'UCLA_2']
keep_rows = (pheno['SEX'] == 1) & pheno['SITE_ID'].isin(include)
pheno = pheno[keep_rows]

In [6]:
def split_pheno(pheno, var, val, out_path):
    """
    Write the rows of pheno whose `var` column equals `val` to a CSV file.

    The file is named 'pheno_<var>_<val>_split.csv' and is created inside
    out_path. The DataFrame index is written along with the data.

    Returns the full path of the file that was written.
    """
    is_selected = pheno[var] == val
    save_path = os.path.join(
        out_path, 'pheno_{}_{}_split.csv'.format(var, val))
    pheno[is_selected].to_csv(save_path)

    return save_path

In [7]:
def site_psm(in_file, site_name, out_path):
    """
    Run the within-site matching for one site through R.

    The inputs are bound as variables in the R global environment and an
    R script is executed that loads optmatch, changes the R working
    directory to out_path, reads in_file, converts DX_GROUP to a factor,
    calls fullmatch() on a match_on() distance plus a caliper, and writes
    the data with added `match` and `keep` columns (keep is 1 where a
    match was assigned, 0 otherwise) to 'matching_<site_name>_psm.csv'.

    Returns the path of that output file inside out_path.
    """
    out_name = "{}_psm.csv".format(site_name)

    # Everything the R script below reads from the R global environment
    r_bindings = {
        "work_dir": out_path,
        "CSV_file": in_file,
        "categories": robjects.StrVector(['DX_GROUP']),
        "Mah_formula": 'DX_GROUP ~ AGE_AT_SCAN + FD_scrubbed',
        "cal_width": 20,
        "PSM_formula": 'DX_GROUP ~ AGE_AT_SCAN + FD_scrubbed',
        "out_name": out_name,
    }
    for r_name, r_value in r_bindings.items():
        robjects.globalenv[r_name] = r_value

    robjects.r('''
      # Load R Librarires
      library(optmatch)

      # Set Working directory
      setwd(work_dir)

      # Read CSV
      data <- read.csv(CSV_file)

      # Make all categories factors
      for (ff in 1:length(categories)) {
        data[[categories[ff]]] <- as.factor(data[[categories[ff]]])
      }

      # Performs Matching
      Matching <- fullmatch(
      match_on( as.formula(Mah_formula),
               data = data ) + 
        caliper( match_on( as.formula(PSM_formula), 
                         data = data ),
                width = cal_width ),
      data = data )

      # Make a data frame with a column with 1 to keep an observation and 0 to leave out
      save_data <- cbind(data,match=Matching)
      save_data$keep <- as.numeric(!is.na(save_data$match))
      write.csv(save_data,file=paste('matching_',out_name,sep=''))
    ''')

    # The R script wrote relative to its working directory (out_path)
    return os.path.join(out_path, 'matching_{}'.format(out_name))

In [8]:
# Write one phenotype CSV per site, run the within-site matching on it and
# remember where each site's matching output was saved
pheno_dir = '/data1/abide/Pheno/'
split_dict = {}
for site in pheno['SITE_ID'].unique():
    print(site)
    tmp = split_pheno(pheno, 'SITE_ID', site, pheno_dir)
    split_dict[site] = site_psm(tmp, site, pheno_dir)

NYU
UCLA_1
UCLA_2
USM



  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)


In [9]:
# Recombine the per-site matching outputs into one table.
# All files are read first and concatenated in a single call: this avoids
# re-copying the accumulated table on every iteration and avoids seeding the
# result with an empty DataFrame. Row indices are kept as read from each
# file, so index values repeat across sites.
site_frames = [pd.read_csv(csv_path) for csv_path in split_dict.values()]
# pd.concat raises on an empty list, so fall back to an empty table
data = pd.concat(site_frames) if site_frames else pd.DataFrame()

In [10]:
# Show the recombined table as the cell output
data

Unnamed: 0.1,Unnamed: 0,X,SITE_ID,SUB_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,HANDEDNESS_CATEGORY,HANDEDNESS_SCORES,...,AGE_AT_MPRAGE,BMI,frames_scrubbed,frames_OK,FD,FD_scrubbed,ADOS_SOCOM_SEV,ADOS_STBEH_SEV,match,keep
0,1,696,UCLA_2,51291,1,1,16.4700,1,R,,...,16.63,,8,112,0.153825,0.135741,6.0,5.0,1.10,1
1,2,697,UCLA_2,51292,1,1,12.2400,1,R,,...,12.24,,80,40,1.005779,0.205233,,,1.50,1
2,3,698,UCLA_2,51293,1,1,13.0800,1,R,,...,13.08,,0,120,0.114250,0.114250,,,1.90,1
3,4,699,UCLA_2,51294,1,1,11.7000,1,R,,...,12.06,,41,79,0.376118,0.192170,7.0,7.0,1.30,1
4,5,700,UCLA_2,51295,1,1,10.0400,1,L,,...,10.33,,32,88,0.281112,0.208634,,,1.40,1
5,6,701,UCLA_2,51296,1,1,11.1600,1,R,,...,11.16,,80,40,1.521789,0.427303,,,1.20,1
6,7,702,UCLA_2,51297,1,1,14.2700,1,L,,...,14.27,,78,42,0.558917,0.279952,,,1.20,1
7,8,703,UCLA_2,51298,1,1,10.5700,1,R,,...,10.69,,38,82,0.309270,0.206249,,,1.10,1
8,9,704,UCLA_2,51299,1,1,14.7700,1,R,,...,14.77,,49,71,0.645691,0.166244,,,1.80,1
9,10,705,UCLA_2,51300,1,1,14.0800,1,L,,...,14.08,,80,40,1.283017,0.177643,7.0,5.0,1.70,1
