# ESM Sample

In [1]:
# Imports
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from matplotlib import pyplot as plt

In [2]:
#Set up
%matplotlib inline
pandas2ri.activate()

In [3]:
# Paths
pheno_path = '/data1/abide/Pheno//full_merged_pheno.csv' # ABIDE with ADOS_sev
out_path = '/data1/abide/Pheno/psm_pheno.csv' # ABIDE with ADOS_sev

In [173]:
# Get the pheno
pheno = pd.read_csv(pheno_path)
# Kill whitespace
pheno.columns = [x.strip(' ') for x in pheno.columns]
# select the columns that need to go into the PSM
cols = ['SITE_ID', 'SUB_ID', 'AGE_AT_SCAN', 
        'FD', 'FD_scrubbed', 'EYE_STATUS_AT_SCAN',
        'FIQ', 'VIQ', 'PIQ', 'DX_GROUP']
use_pheno = pheno[cols]
use_pheno['DX_GROUP'].replace(to_replace=[2,1], value=[1,0], inplace=True)
# Group the pheno information by site - so we can run PSM individually on each site
group_site = use_pheno.groupby('SITE_ID')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [174]:
use_pheno.head()

Unnamed: 0,SITE_ID,SUB_ID,AGE_AT_SCAN,FD,FD_scrubbed,EYE_STATUS_AT_SCAN,FIQ,VIQ,PIQ,DX_GROUP
0,CALTECH,51456,55.4,0.434593,0.328121,2,126.0,118.0,128.0,0
1,CALTECH,51457,22.9,0.20716,0.170893,2,107.0,119.0,93.0,0
2,CALTECH,51458,39.2,0.322805,0.288872,2,93.0,80.0,108.0,0
3,CALTECH,51459,22.8,0.152963,0.152963,2,106.0,94.0,118.0,0
4,CALTECH,51460,34.6,0.207277,0.207277,2,133.0,135.0,122.0,0


In [7]:
nyu_pheno = use_pheno[use_pheno.SITE_ID=='NYU']

In [172]:
# Set up the PSM function
def run_psm(pheno, cal_width=0.5):
    '''
    Function that calls the PSM script in R and returns a dataframe 
    that includes a column for matching and keeping
    '''
    categories = ['DX_GROUP']
    # Declare Formula for Mahalanobis distance matching (using the names from CSV)
    ## Format: Disease variable ~  What to Match by 1 + What to Match by 2 + ...
    ## All variables matched by must be NUMERIC
    Mah_formula = 'DX_GROUP ~ FD_scrubbed' 
    # Declare Formula for PSM distance matching (using the names from CSV)
    ## Format: Disease variable ~  What to Match by 1 + What to Match by 2 + ...
    PSM_formula = 'DX_GROUP ~ FD_scrubbed'
    
    # Store data in R
    robjects.globalenv["data"] = pandas2ri.py2ri(pheno)
    robjects.globalenv["categories"] = robjects.StrVector(categories)
    robjects.globalenv["Mah_formula"] = Mah_formula
    robjects.globalenv["cal_width"] = cal_width
    robjects.globalenv["PSM_formula"] = PSM_formula
    
    # Run R script
    tmp = robjects.r(
    '''
      # Load R Librarires
      library(optmatch)

      # Make all categories factors
      for (ff in 1:length(categories)) {
        data[[categories[ff]]] <- as.factor(data[[categories[ff]]])
      }

      # Performs Matching
      Matching <- fullmatch(
      match_on( as.formula(Mah_formula),
               data = data ) + 
        caliper( match_on( as.formula(PSM_formula), 
                         data = data ),
                width = cal_width ),
      data = data )

      # Make a data frame with a column with 1 to keep an observation and 0 to leave out
      save_data <- cbind(data,match=Matching)
      save_data$keep <- as.numeric(!is.na(save_data$match))
      save_data$match = as.numeric(as.character(save_data$match))
    ''')
    
    # Get the results back
    data = pandas2ri.ri2py(robjects.r['save_data'])
    
    return data

In [175]:
matched = run_psm(nyu_pheno, 2)

In [176]:
matched.DX_GROUP.value_counts()

1    105
0     79
Name: DX_GROUP, dtype: int64

In [177]:
# Method 1 - take the 20 highest match scores in each group and make a new sample
control_ind = [index for index, row in matched[matched.DX_GROUP.astype(float)==1].iterrows()]
control_row = [row.match for index, row in matched[matched.DX_GROUP.astype(float)==1].iterrows()]
control_sort = [(x,y) for (x,y) in sorted(zip(control_row, control_ind))]

patient_ind = [index for index, row in matched[matched.DX_GROUP.astype(float)==0].iterrows()]
patient_row = [row.match for index, row in matched[matched.DX_GROUP.astype(float)==0].iterrows()]
patient_sort = [(x,y) for (x,y) in sorted(zip(patient_row, patient_ind))]

# Get the 20 highest scores for both
control20high = control_sort[::-1][:20]
patient20high = patient_sort[::-1][:20]

# Get the indices back from that
control20high_ind = [y for (x,y) in control20high]
patient20high_ind = [y for (x,y) in patient20high]

# Get the 20 lowest scores for both
control20low = control_sort[:20]
patient20low = patient_sort[:20]

# Get the indices back from that
control20low_ind = [y for (x,y) in control20low]
patient20low_ind = [y for (x,y) in patient20low]

# Get the corresponding samples
high_matched = pd.concat([matched.loc[control20high_ind], matched.loc[patient20high_ind]])
high_matched.drop(high_matched.columns[[10, 11]], axis=1, inplace=True)
low_matched = pd.concat([matched.loc[control20low_ind], matched.loc[patient20low_ind]])
low_matched.drop(low_matched.columns[[10, 11]], axis=1, inplace=True)

In [178]:
low_rematch = run_psm(low_matched, 0.2)

In [179]:
high_rematch = run_psm(high_matched, 0.2)

In [180]:
high_rematch.head()

Unnamed: 0,SITE_ID,SUB_ID,AGE_AT_SCAN,FD,FD_scrubbed,EYE_STATUS_AT_SCAN,FIQ,VIQ,PIQ,DX_GROUP,match,keep
387,NYU,51149,20.56,0.261704,0.220838,1,113.0,107.0,118.0,1,1.16,1.0
386,NYU,51148,20.3,0.248065,0.227824,1,107.0,108.0,104.0,1,1.15,1.0
384,NYU,51146,20.02,0.147987,0.140063,1,106.0,96.0,116.0,1,1.14,1.0
381,NYU,51129,17.7,0.205037,0.186266,1,102.0,111.0,92.0,1,1.13,1.0
378,NYU,51126,16.31,0.255284,0.216417,1,81.0,83.0,83.0,1,1.12,1.0


In [181]:
low_rematch.head()

Unnamed: 0,SITE_ID,SUB_ID,AGE_AT_SCAN,FD,FD_scrubbed,EYE_STATUS_AT_SCAN,FIQ,VIQ,PIQ,DX_GROUP,match,keep
291,NYU,51036,8.04,0.201879,0.170828,1,101.0,102.0,99.0,1,1.1,1.0
300,NYU,51047,12.1,0.199353,0.179035,1,123.0,119.0,119.0,1,1.2,1.0
317,NYU,51064,7.26,0.177189,0.170715,2,119.0,127.0,108.0,1,1.1,1.0
379,NYU,51127,16.55,0.436847,0.171271,1,91.0,88.0,96.0,1,1.1,1.0
390,NYU,51152,23.35,0.158098,0.146764,1,139.0,140.0,129.0,1,1.18,1.0


In [182]:
# Ok, so they both didn't work let's try matching the raw match values
# First, get the sorted raw values for controls
control_match_sorted = [x for (x,y) in control_sort]
control_index_sorted = [y for (x,y) in control_sort]
keep_control = list()
keep_patient = list()
# Now loop through the patients and find the matching controls while popping the corresponding indices
for (patient_match, patient_index) in patient_sort:
    # Get the nearest matching patient
    tmp = np.argmin(np.abs(patient_match - np.array(control_match_sorted)))
    # Get the diff
    dif = np.abs(patient_match - control_match_sorted[tmp])
    if dif > 0.1:
        continue
    # Get the control index
    control_index = control_index_sorted[tmp]
    # Put that in a list
    keep_patient.append(patient_index)
    keep_control.append(control_index)
    # And pop it from the lists
    control_match_sorted.pop(tmp)
    control_index_sorted.pop(tmp)
    
# Get the match
new_match = pd.concat([matched.loc[keep_patient], matched.loc[keep_control]])
new_match.drop(new_match.columns[[10, 11]], axis=1, inplace=True)

In [121]:
# Run another match
new_rematch = run_psm(new_match, 0.2)

In [161]:
from scipy import stats as st

In [None]:
# Now take a look at this guy
new_rematch.boxplot('AGE_AT_SCAN', by='DX_GROUP')

In [184]:
for name, group in new_rematch.groupby('DX_GROUP'):
    y = group.AGE_AT_SCAN.values
    a = st.shapiro(y)
    # See if significant
    if a[1]<0.01:
        # Not normal
        pass
    else:
        # normal
        
    print('for {}, p={:.2e}'.format(name, a[1]))

for 0, p=3.70e-08
for 1, p=1.28e-05


In [166]:
a

(0.8981843590736389, 1.284561130887596e-05)

In [123]:
new_rematch.head()

Unnamed: 0,SITE_ID,SUB_ID,AGE_AT_SCAN,FD,FD_scrubbed,EYE_STATUS_AT_SCAN,FIQ,VIQ,PIQ,DX_GROUP,match,keep
260,NYU,51001,10.71,0.23634,0.177639,1,87.0,85.0,94.0,0,1.24,1.0
262,NYU,51003,8.51,0.193305,0.161452,1,120.0,106.0,133.0,0,,0.0
220,NYU,50960,38.76,0.187822,0.143725,2,114.0,108.0,119.0,0,,0.0
223,NYU,50964,12.75,0.243671,0.207482,1,106.0,108.0,101.0,0,1.37,1.0
290,NYU,51035,10.27,0.316139,0.14474,1,100.0,95.0,106.0,0,,0.0


In [None]:
# Ok, this also doesn't work, now let's try what Hien suggested
sort_match = matched.sort_values('match', ascending=False)
n_rows = sort_match.shape[0]

In [158]:
# Drop by unique
for i in sort_match.match.unique():
    tmp = sort_match[sort_match.match<i].DX_GROUP.value_counts()
    print('drop below {}, get {}/{}'.format(i, tmp[0], tmp[1]))

drop below 1.99, get 104/78
drop below 1.94, get 103/76
drop below 1.9, get 102/75
drop below 1.88, get 101/74
drop below 1.87, get 99/73
drop below 1.84, get 98/72
drop below 1.83, get 97/71
drop below 1.82, get 96/70
drop below 1.8, get 94/68
drop below 1.79, get 91/67
drop below 1.76, get 89/66
drop below 1.75, get 87/65
drop below 1.74, get 86/64
drop below 1.73, get 85/63
drop below 1.71, get 84/62
drop below 1.7, get 81/61
drop below 1.65, get 77/60
drop below 1.64, get 76/59
drop below 1.62, get 75/58
drop below 1.61, get 74/57
drop below 1.6, get 71/54
drop below 1.59, get 70/53
drop below 1.58, get 69/52
drop below 1.57, get 68/51
drop below 1.56, get 67/49
drop below 1.55, get 66/48
drop below 1.54, get 65/46
drop below 1.52, get 64/45
drop below 1.51, get 63/44
drop below 1.5, get 61/39
drop below 1.48, get 60/37
drop below 1.47, get 59/36
drop below 1.46, get 58/35
drop below 1.45, get 57/34
drop below 1.43, get 55/33
drop below 1.42, get 54/32
drop below 1.41, get 53/31
dr

IndexError: index out of bounds

In [151]:
# Drop case by case and look at the ratio
for i in np.arange(184):
    tmp = sort_match.iloc[i:].DX_GROUP.value_counts()
    print('drop {}, get {}/{}'.format(i, tmp[0], tmp[1]))

drop 0, get 105/79
drop 1, get 105/78
drop 2, get 104/78
drop 3, get 103/78
drop 4, get 103/77
drop 5, get 103/76
drop 6, get 103/75
drop 7, get 102/75
drop 8, get 101/75
drop 9, get 101/74
drop 10, get 101/73
drop 11, get 100/73
drop 12, get 99/73
drop 13, get 98/73
drop 14, get 98/72
drop 15, get 97/72
drop 16, get 97/71
drop 17, get 97/70
drop 18, get 96/70
drop 19, get 96/69
drop 20, get 95/69
drop 21, get 95/68
drop 22, get 94/68
drop 23, get 93/68
drop 24, get 92/68
drop 25, get 91/68
drop 26, get 91/67
drop 27, get 90/67
drop 28, get 89/67
drop 29, get 89/66
drop 30, get 88/66
drop 31, get 88/65
drop 32, get 87/65
drop 33, get 87/64
drop 34, get 86/64
drop 35, get 85/64
drop 36, get 85/63
drop 37, get 85/62
drop 38, get 84/62
drop 39, get 83/62
drop 40, get 82/62
drop 41, get 81/62
drop 42, get 81/61
drop 43, get 80/61
drop 44, get 80/60
drop 45, get 79/60
drop 46, get 78/60
drop 47, get 77/60
drop 48, get 77/59
drop 49, get 76/59
drop 50, get 75/59
drop 51, get 75/58
drop 52, g

IndexError: index out of bounds

In [144]:
test_matched = sort_match.iloc[160:]

In [146]:
test_matched.drop(test_matched.columns[[10, 11]], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [148]:
test_rematch = run_psm(test_matched, 0.2)

In [149]:
test_rematch

Unnamed: 0,SITE_ID,SUB_ID,AGE_AT_SCAN,FD,FD_scrubbed,EYE_STATUS_AT_SCAN,FIQ,VIQ,PIQ,DX_GROUP,match,keep
393,NYU,51155,30.78,0.128875,0.11959,1,104.0,106.0,100.0,1,,0.0
284,NYU,51028,29.18,0.132931,0.12127,1,80.0,73.0,92.0,0,,0.0
277,NYU,51020,28.58,0.288787,0.206328,1,107.0,108.0,103.0,0,,0.0
370,NYU,51118,29.02,0.201811,0.196253,1,122.0,115.0,124.0,1,,0.0
298,NYU,51045,11.56,0.292517,0.25131,1,80.0,85.0,79.0,1,1.1,1.0
265,NYU,51008,12.37,0.321424,0.250506,2,128.0,125.0,126.0,0,1.1,1.0
230,NYU,50971,10.24,0.205634,0.163792,2,122.0,111.0,128.0,0,1.3,1.0
372,NYU,51120,10.19,0.167225,0.162492,1,115.0,119.0,106.0,1,1.3,1.0
229,NYU,50970,8.9,0.164245,0.159378,1,99.0,99.0,99.0,0,,0.0
373,NYU,51121,10.74,0.165076,0.154962,1,113.0,103.0,121.0,1,1.3,1.0
