In [1]:
import pandas as pd
import numpy as np
import os
import nltools as nlt
import nilearn as nil
import nibabel as nib
import warnings
import glob
import random
import pickle
import dev_wtp_io_utils
import gc #garbage collection
from nilearn import plotting


  warn("Fetchers from the nilearn.datasets module will be "


In [2]:
pd.set_option('display.max_rows', 99)

### Load brain data

In [3]:
test_train_set = pd.read_csv("../data/train_test_markers_20210601T183243.csv")

In [4]:
with open('../data/Brain_Data_2sns_60subs.pkl', 'rb') as pkl_file:
    Brain_Data_allsubs = pickle.load(pkl_file)
    
dev_wtp_io_utils.check_BD_against_test_train_set(Brain_Data_allsubs,test_train_set)

checked for intersection and no intersection between the brain data and the subjects was found.
there were 60 subjects overlapping between the subjects marked for train data and the training dump file itself.


### Preprocess

In [5]:
Brain_Data_allsubs.Y = Brain_Data_allsubs.X.response.copy()
print(Brain_Data_allsubs.Y.value_counts())
Brain_Data_allsubs.Y[Brain_Data_allsubs.Y=='NULL']=None
print(Brain_Data_allsubs.Y.value_counts())
print(Brain_Data_allsubs.Y.isnull().value_counts())
Brain_Data_allsubs_nn = Brain_Data_allsubs[Brain_Data_allsubs.Y.isnull()==False]
print(len(Brain_Data_allsubs_nn))
print(len(Brain_Data_allsubs))

5.0    1164
6.0    1018
7.0     904
8.0     604
Name: response, dtype: int64
5.0    1164
6.0    1018
7.0     904
8.0     604
Name: response, dtype: int64
False    3690
True      150
Name: response, dtype: int64
3690
3840


In [6]:
all_subs_nn_nifti = Brain_Data_allsubs_nn.to_nifti()
all_subs_nn_nifti_Y = Brain_Data_allsubs_nn.Y
all_subs_nn_nifti_groups = Brain_Data_allsubs_nn.X.subject
all_subs_nn_nifti_groups

0       DEV001
1       DEV001
2       DEV001
3       DEV001
4       DEV001
         ...  
3685    DEV089
3686    DEV089
3687    DEV089
3688    DEV089
3689    DEV089
Name: subject, Length: 3690, dtype: object

### Predict

Regressing in nilearn:
 - https://nilearn.github.io/decoding/estimator_choice.html
 - http://www.ncbi.nlm.nih.gov/pubmed/20691790







OK, so that's how you do it. It's pretty straightforward.

So...we won't look at nested cross-validation juuust yet, because the next step is to work out how to train on one set and predict on another. that will definitely require a custom pipeline. Let's get started...

In [7]:
del Brain_Data_allsubs
gc.collect()

15

In [8]:
from nilearn.decoding import DecoderRegressor
dRegressor = DecoderRegressor(estimator = 'ridge_regressor', standardize= True,scoring="r2")

As a control, we'll try this again, this time just training and testing on individual values:

In [9]:
from pympler.asizeof import asizeof
import sys
def asizeof_fmt(obj, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    num = asizeof(obj)
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

# for name, size in sorted(((name, asizeof(value)) for name, value in locals().items()),
#                          key= lambda x: -x[1])[:10]:


In [10]:
asizeof_fmt(Brain_Data_allsubs_nn)

'3.3 GiB'

In [13]:
asizeof_fmt(all_subs_nn_nifti)

'12.4 GiB'

In [12]:
all_subs_nn_nifti.slicer[0:]

<nibabel.nifti1.Nifti1Image at 0x2aaab4d709a0>

In [14]:
all_subs_nn_nifti.shape

(91, 109, 91, 3690)

In [16]:
all_subs_nn_nifti.slicer[0:500].shape

(91, 109, 91, 3690)

In [19]:
first_subs_nifti = all_subs_nn_nifti.slicer[...,0:500]
first_subs_nifti_Y = all_subs_nn_nifti_Y[0:500]
first_subs_nifti = nil.image.clean_img(first_subs_nifti,detrend=False,standardize=True)

In [24]:
all_subs_nn_nifti_Y

0       7.0
1       6.0
2       6.0
3       5.0
4       7.0
       ... 
3685    5.0
3686    5.0
3687    5.0
3688    5.0
3689    7.0
Name: response, Length: 3690, dtype: float64

In [20]:
first_subs_nifti.shape

(91, 109, 91, 500)

In [26]:

from sklearn.model_selection import KFold,GroupKFold
#def cv_train_test_different_sets(averaged_X,averaged_Y, averaged_groups, individual_X,individual_groups, Y, cv,group_list)
"""
averaged_X: values grouped
averaged_groups: group allocations for the averaged dataset
individual_X: values grouped into averages for testing
cv: a Grouped cross-validator
group_list: name of the groups
"""
individual_X = first_subs_nifti#all_subs_nn_nifti
individual_y = first_subs_nifti_Y #all_subs_nn_nifti_Y
individual_groups = all_subs_nn_nifti_groups[0:500]
cv=KFold(n_splits=5)



groups_array = np.array(list(set(individual_groups)))

#the CV that the inner Regressor uses
cv_inner = GroupKFold(3)

#we actually use KFold on the group names themselves, then filter across that
#that's equivalent to doing a GroupedKFold on the data.
test_scores = []
for train_i,test_i in cv.split(groups_array):
    train_group_items, test_group_items = groups_array[train_i], groups_array[test_i]
    print('In order to test on a training group of ' +
          str(len(train_group_items)) + ' items, holding out the following subjects:' +
          str(test_group_items))
    
    #select training data from the averages
    print('selecting training data')
    train_selector = [i for i, x in enumerate(individual_groups) if x in train_group_items]
    train_y = individual_y[train_selector]
    train_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in train_selector])
    train_groups = individual_groups[train_selector]
    print(asizeof_fmt(train_X))
    print(train_X.shape)
    
    #select testing data from the individual values
    print('selecting test data')
    test_selector = [i for i, x in enumerate(individual_groups) if x in test_group_items]
    test_y = individual_y[test_selector]
    test_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in test_selector])
    
    test_groups = individual_groups[test_selector]
    print(test_X.shape)
    
    print("regressing")
    regressor = DecoderRegressor(standardize= True,cv=cv_inner, scoring="r2")
    print(asizeof_fmt(train_X))
    regressor.fit(y=train_y,X=train_X,groups=train_groups)
    
    print("predicting")
    #now predict on our test split
    test_score = regressor.score(test_X,test_y)
    test_scores = test_scores+[test_score]
    print('test score was:')
    print(test_score)
    
    del test_X
    del train_X
    gc.collect() #clean up. this is big data we're working with
    #https://stackoverflow.com/questions/1316767/how-can-i-explicitly-free-memory-in-python

In order to test on a training group of 7 items, holding out the following subjects:['DEV005' 'DEV014']
selecting training data
2.5 GiB
(91, 109, 91, 374)
selecting test data
(91, 109, 91, 126)
regressing
2.5 GiB
predicting
test score was:
0.12762489867306293
In order to test on a training group of 7 items, holding out the following subjects:['DEV013' 'DEV009']
selecting training data
2.5 GiB
(91, 109, 91, 375)
selecting test data
(91, 109, 91, 125)
regressing
2.5 GiB
predicting
test score was:
-0.002948134494224819
In order to test on a training group of 7 items, holding out the following subjects:['DEV010' 'DEV001']
selecting training data
2.5 GiB
(91, 109, 91, 379)
selecting test data
(91, 109, 91, 121)
regressing
2.5 GiB
predicting
test score was:
0.3263347191308281
In order to test on a training group of 7 items, holding out the following subjects:['DEV006' 'DEV015']
selecting training data
2.9 GiB
(91, 109, 91, 434)
selecting test data
(91, 109, 91, 66)
regressing
2.9 GiB
predict

In [30]:

from sklearn.model_selection import KFold,GroupKFold
#def cv_train_test_different_sets(averaged_X,averaged_Y, averaged_groups, individual_X,individual_groups, Y, cv,group_list)
"""
averaged_X: values grouped
averaged_groups: group allocations for the averaged dataset
individual_X: values grouped into averages for testing
cv: a Grouped cross-validator
group_list: name of the groups
"""
individual_X = first_subs_nifti#all_subs_nn_nifti
individual_y = first_subs_nifti_Y #all_subs_nn_nifti_Y
individual_groups = all_subs_nn_nifti_groups[0:500]
cv=KFold(n_splits=5)



groups_array = np.array(list(set(individual_groups)))

#the CV that the inner Regressor uses
cv_inner = GroupKFold(3)

#we actually use KFold on the group names themselves, then filter across that
#that's equivalent to doing a GroupedKFold on the data.
test_scores = []
for train_i,test_i in cv.split(groups_array):
    train_group_items, test_group_items = groups_array[train_i], groups_array[test_i]
    print('In order to test on a training group of ' +
          str(len(train_group_items)) + ' items, holding out the following subjects:' +
          str(test_group_items))
    
    #select training data from the averages
    print('selecting training data')
    train_selector = [i for i, x in enumerate(individual_groups) if x in train_group_items]
    train_y = individual_y[train_selector]
    train_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in train_selector])
    train_groups = individual_groups[train_selector]
    print(asizeof_fmt(train_X))
    print(train_X.shape)
    
    #select testing data from the individual values
    print('selecting test data')
    test_selector = [i for i, x in enumerate(individual_groups) if x in test_group_items]
    test_y = individual_y[test_selector]
    test_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in test_selector])
    
    test_groups = individual_groups[test_selector]
    print(test_X.shape)
    
    print("regressing")
    regressor = DecoderRegressor(standardize= True,cv=cv_inner, scoring="r2")
    print(asizeof_fmt(train_X))
    regressor.fit(y=train_y,X=train_X,groups=train_groups)
    
    print("predicting")
    #now predict on our test split
    test_score = regressor.score(test_X,test_y)
    test_scores = test_scores+[test_score]
    print('test score was:')
    print(test_score)
    
    del test_X
    del train_X
    gc.collect() #clean up. this is big data we're working with
    #https://stackoverflow.com/questions/1316767/how-can-i-explicitly-free-memory-in-python

In order to test on a training group of 7 items, holding out the following subjects:['DEV005' 'DEV014']
selecting training data
2.5 GiB
(91, 109, 91, 374)
selecting test data
(91, 109, 91, 126)
regressing
2.5 GiB
predicting
test score was:
0.12762490054536846
In order to test on a training group of 7 items, holding out the following subjects:['DEV013' 'DEV009']
selecting training data
2.5 GiB
(91, 109, 91, 375)
selecting test data
(91, 109, 91, 125)
regressing
2.5 GiB
predicting
test score was:
-0.002948134659435997
In order to test on a training group of 7 items, holding out the following subjects:['DEV010' 'DEV001']
selecting training data
2.5 GiB
(91, 109, 91, 379)
selecting test data
(91, 109, 91, 121)
regressing
2.5 GiB
predicting
test score was:
0.3263347157390941
In order to test on a training group of 7 items, holding out the following subjects:['DEV006' 'DEV015']
selecting training data
2.9 GiB
(91, 109, 91, 434)
selecting test data
(91, 109, 91, 66)
regressing
2.9 GiB
predict

In [45]:
from sklearn.model_selection import KFold,GroupKFold
def cv_train_test_different_sets(
    individual_X,individual_groups, individual_y, 
    averaged_X = None,averaged_y = None, averaged_groups = None,
    cv = None):
    """
    averaged_X: values grouped
    averaged_groups: group allocations for the averaged dataset
    individual_X: values grouped into averages for testing
    cv: a Grouped cross-validator
    group_list: name of the groups
    """
    if cv is None:
        cv=KFold(n_splits=5)

    groups_array = np.array(list(set(individual_groups)))
    assert(set(averaged_groups)==set(individual_groups))

    #the CV that the inner Regressor uses
    cv_inner = GroupKFold(3)

    #we actually use KFold on the group names themselves, then filter across that
    #that's equivalent to doing a GroupedKFold on the data.
    test_scores = []
    for train_i,test_i in cv.split(groups_array):
        train_group_items, test_group_items = groups_array[train_i], groups_array[test_i]
        print('In order to test on a training group of ' +
              str(len(train_group_items)) + ' items, holding out the following subjects:' +
              str(test_group_items))
        
        
        #select training data from the averages
        print('selecting training data')
        train_selector = [i for i, x in enumerate(averaged_groups) if x in train_group_items]
        train_y = averaged_y[train_selector]
        train_X = nib.funcs.concat_images([averaged_X.slicer[...,s] for s in train_selector])
        train_groups = averaged_groups[train_selector]
        print(train_X.shape)
        print(asizeof_fmt(train_X))

        #select testing data from the individual values
        print('selecting test data')
        test_selector = [i for i, x in enumerate(individual_groups) if x in test_group_items]
        test_y = individual_y[test_selector]
        test_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in test_selector])
        test_groups = individual_groups[test_selector]
        print(asizeof_fmt(test_X))
        print(test_X.shape)


        print("regressing")
        regressor = DecoderRegressor(standardize= True,cv=cv_inner, scoring="r2")
        print(asizeof_fmt(train_X))
        regressor.fit(y=train_y,X=train_X,groups=train_groups)

        print("predicting")
        #now predict on our test split
        test_score = regressor.score(test_X,test_y)
        test_scores = test_scores+[test_score]
        print('test score was:')
        print(test_score)

        del test_X
        del train_X
        gc.collect() #clean up. this is big data we're working with
        #https://stackoverflow.com/questions/1316767/how-can-i-explicitly-free-memory-in-python

    return(test_scores)





In [46]:
def cv_train_test_same_sets(
    individual_X,individual_groups, individual_y, 
    #averaged_X = None,averaged_y = None, averaged_groups = None,
    cv = None):
    """
    averaged_X: values grouped
    averaged_groups: group allocations for the averaged dataset
    individual_X: values grouped into averages for testing
    cv: a Grouped cross-validator
    """
    #individual_X = first_subs_nifti#all_subs_nn_nifti
    #individual_y = first_subs_nifti_Y #all_subs_nn_nifti_Y
    #individual_groups = all_subs_nn_nifti_groups[0:500]
    if cv is None:
        cv=KFold(n_splits=5)



    groups_array = np.array(list(set(individual_groups)))

    #the CV that the inner Regressor uses
    cv_inner = GroupKFold(3)

    #we actually use KFold on the group names themselves, then filter across that
    #that's equivalent to doing a GroupedKFold on the data.
    test_scores = []
    for train_i,test_i in cv.split(groups_array):
        train_group_items, test_group_items = groups_array[train_i], groups_array[test_i]
        print('In order to test on a training group of ' +
              str(len(train_group_items)) + ' items, holding out the following subjects:' +
              str(test_group_items))

        #select training data from the averages
        print('selecting training data')
        train_selector = [i for i, x in enumerate(individual_groups) if x in train_group_items]
        train_y = individual_y[train_selector]
        train_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in train_selector])
        train_groups = individual_groups[train_selector]
        print(asizeof_fmt(train_X))
        print(train_X.shape)

        #select testing data from the individual values
        print('selecting test data')
        test_selector = [i for i, x in enumerate(individual_groups) if x in test_group_items]
        test_y = individual_y[test_selector]
        test_X = nib.funcs.concat_images([individual_X.slicer[...,s] for s in test_selector])

        test_groups = individual_groups[test_selector]
        print(test_X.shape)

        print("regressing")
        regressor = DecoderRegressor(standardize= True,cv=cv_inner, scoring="r2")
        print(asizeof_fmt(train_X))
        regressor.fit(y=train_y,X=train_X,groups=train_groups)

        print("predicting")
        #now predict on our test split
        test_score = regressor.score(test_X,test_y)
        test_scores = test_scores+[test_score]
        print('test score was:')
        print(test_score)

        del test_X
        del train_X
        gc.collect() #clean up. this is big data we're working with
    return(test_scores)
        #https://stackoverflow.com/questions/1316767/how-can-i-explicitly-free-memory-in-python

In [43]:
first_subs_nifti = all_subs_nn_nifti.slicer[...,0:500]
first_subs_nifti_Y = all_subs_nn_nifti_Y[0:500]
first_subs_nifti = nil.image.clean_img(first_subs_nifti,detrend=False,standardize=True)

In [44]:
test_scores = cv_train_test_different_sets(
    individual_X=first_subs_nifti,
    individual_y=first_subs_nifti_Y,
    individual_groups=all_subs_nn_nifti_groups[0:500])

In order to test on a training group of 7 items, holding out the following subjects:['DEV005' 'DEV014']
selecting training data
selecting test data
(91, 109, 91, 126)
regressing


KeyboardInterrupt: 

In [None]:
test_scores = cv_train_test_same_sets(
    individual_X=first_subs_nifti,
    individual_y=first_subs_nifti_Y,
    individual_groups=all_subs_nn_nifti_groups[0:500])

In [None]:
#### Nested cross-validation

See for instance: http://nilearn.github.io/auto_examples/02_decoding/plot_haxby_grid_search.html


Issues here:

 - Need to get the cross-validation right. We have got it working but there is an inner validation that selects a decoder for each group. Does that make sense? I think so, but just need to consider it a bit carefully.
 - What are the individual methods that inner validation is running? Should keep track of that
 - We really need to compare this to a baseline. So we need to run an individual-level analysis
