In [1]:
import pandas as pd
import numpy as np
import os
import nltools as nlt
import nilearn as nil
import nibabel as nib
import warnings
import glob
import random
import pickle
import dev_wtp_io_utils
from nilearn import plotting


  warn("Fetchers from the nilearn.datasets module will be "


In [2]:
pd.set_option('display.max_rows', 99)

### Load brain data

In [4]:
with open('../data/Brain_Data_2sns_60subs.pkl', 'rb') as pkl_file:
    Brain_Data_allsubs = pickle.load(pkl_file)

In [5]:
test_train_set = pd.read_csv("../data/train_test_markers_20210601T183243.csv")

In [6]:
dev_wtp_io_utils.check_BD_against_test_train_set(Brain_Data_allsubs,test_train_set)

checked for intersection and no intersection between the brain data and the subjects was found.
there were 60 subjects overlapping between the subjects marked for train data and the training dump file itself.


### Preprocess

In [7]:
Brain_Data_allsubs.Y = Brain_Data_allsubs.X.response.copy()
print(Brain_Data_allsubs.Y.value_counts())
Brain_Data_allsubs.Y[Brain_Data_allsubs.Y=='NULL']=None
print(Brain_Data_allsubs.Y.value_counts())
print(Brain_Data_allsubs.Y.isnull().value_counts())
Brain_Data_allsubs_nn = Brain_Data_allsubs[Brain_Data_allsubs.Y.isnull()==False]
print(len(Brain_Data_allsubs_nn))
print(len(Brain_Data_allsubs))

5.0    1164
6.0    1018
7.0     904
8.0     604
Name: response, dtype: int64
5.0    1164
6.0    1018
7.0     904
8.0     604
Name: response, dtype: int64
False    3690
True      150
Name: response, dtype: int64
3690
3840


In [10]:
all_subs_nn_nifti = Brain_Data_allsubs_nn.to_nifti()

In [14]:
all_subs_nn_nifti_Y = Brain_Data_allsubs_nn.Y

In [32]:
all_subs_nn_nifti_groups = Brain_Data_allsubs_nn.X.subject

In [33]:
all_subs_nn_nifti_groups

0       DEV001
1       DEV001
2       DEV001
3       DEV001
4       DEV001
         ...  
3685    DEV089
3686    DEV089
3687    DEV089
3688    DEV089
3689    DEV089
Name: subject, Length: 3690, dtype: object

### Predict

Regressing in nilearn:
 - https://nilearn.github.io/decoding/estimator_choice.html
 - http://www.ncbi.nlm.nih.gov/pubmed/20691790







In [13]:


from nilearn.decoding import DecoderRegressor

regressor = DecoderRegressor(estimator = 'ridge_regressor', standardize= True,cv=5)

In [None]:
#output["r_xval"] = pearsonr(output["Y"], output["yfit_xval"])[0]

In [18]:
reg_results = regressor.fit(all_subs_nn_nifti, all_subs_nn_nifti_Y)#default scoring is 42



In [22]:
regressor.cv_scores_

{'beta': [-0.7752011400140235,
  -1.0197575102413023,
  -0.5455072414813338,
  -0.5094201119299464,
  -0.6835421929486358,
  -0.5216117331328245,
  -0.3277013750651854,
  -1.2561362345534581,
  -0.5259166290404353,
  -1.2392101173224317]}

In [24]:
prediction_score = -np.mean(regressor.cv_scores_['beta'])

print("=== DECODER ===")
print("explained variance for the cross-validation: %f" % prediction_score)
print("")

=== DECODER ===
explained variance for the cross-validation: 0.740400



But this cross-validation won't have respected subject divisions so it's likely to have data across the subjects. To do that properly we would need a cross-validation generator...

https://scikit-learn.org/stable/modules/cross_validation.html

In [29]:
#e.g.,...

from sklearn.model_selection import KFold

kf = KFold(n_splits=5)

regressor_kf = DecoderRegressor(estimator = 'ridge_regressor', standardize= True,cv=kf,scoring="r2")
reg_results = regressor.fit(all_subs_nn_nifti, all_subs_nn_nifti_Y)



In [30]:
prediction_score = -np.mean(regressor.cv_scores_['beta'])

print("=== DECODER ===")
print("explained variance for the cross-validation: %f" % prediction_score)
print("")

=== DECODER ===
explained variance for the cross-validation: 0.740400



But that still doens't resepct grouped data. to do that we'll need the GroupKFold.

But how do we specify it when passing it to DecoderRegressor? We have to be able to specify the groups.


We might not be able to use the built-in cross-validation; have to build our own perhaps.
Considering that the ultimate aim is to train on one thing and predict on another, that's probably inevitable anyway?

Interesting thing about this is that we can probably/might as well go back to nltools.

In [37]:
from sklearn.model_selection import GroupKFold

kf = GroupKFold(n_splits=5)

regressor_kf = DecoderRegressor(estimator = 'ridge_regressor', standardize= True,cv=kf,scoring="r2")
reg_results = regressor_kf.fit(all_subs_nn_nifti, all_subs_nn_nifti_Y,groups=all_subs_nn_nifti_groups)

In [39]:
regressor_kf.cv_scores_['beta']

[-0.29641380042067,
 -0.8979007845484785,
 -0.7959601680162349,
 -0.5116243845130992,
 -0.5946075353340565]

In [40]:
prediction_score = -np.mean(regressor_kf.cv_scores_['beta'])

print("=== DECODER ===")
print("explained variance for the cross-validation: %f" % prediction_score)
print("")

=== DECODER ===
explained variance for the cross-validation: 0.619301



OK, so that's how you do it. It's pretty straightforward.

So...we won't look at nested cross-validation juuust yet, because the next step is to work out how to train on one set and predict on another. that will definitely require a custom pipeline. Let's get started...

In [None]:
#def train_test_cv(averaged_X,averaged_groups, individual_X,individual_groups, Y, cv,group_list)
"""
averaged_X: values not grouped
averaged_groups: group allocations for the averaged dataset
individual_X: values grouped
cv: a Grouped cross-validator
group_list: name of the groups
"""
averaged_groups = all_subs_nn_nifti_groups
cv=kf

#what we're really doing here is splitting on subjects, so let's make that explicit
averaged_groups_set = set(averaged_groups)
for groups in kf()

In [None]:
#### Nested cross-validation

See for instance: http://nilearn.github.io/auto_examples/02_decoding/plot_haxby_grid_search.html