In [4]:
import math
import pandas as pd
import numpy as np
import os
import nltools as nlt
import nilearn as nil
import nibabel as nib
import warnings
import glob
import random
from sys import getsizeof

import pickle
from operator import itemgetter

In [2]:
#get the behavioral data and the list of subjects allocated for training
wtpw1_behavdesign_clean = pd.read_csv("../data/wtpw1_behavdesign_clean.csv")
test_train_df = pd.read_csv("../data/train_test_markers_20210601T183243.csv")
train_subjs = test_train_df.loc[test_train_df.SplitGroup=='Train','sub_label'].tolist()

A prior load file `load_multisubject_brain_data.ipynb` loaded subjects from betas that are already generated (see  https://docs.google.com/presentation/d/1K-nFrZYE6rR8t0myNyacB7frBzV3B1--nMqPhVkwL8E/edit#slide=id.gd9fcc4129a_0_0 for this process). We're going one level up the chain to look at the raw images those betas were generated from. We want to use the behavioral data to do another extraction from them.

This file uses `load_filter_test_train_split` and replaces `load_multisubject_brain_data.ipynb` for loading raw data instead of the betas.

Betaseries output files are stored in `/gpfs/projects/sanlab/shared/DEV/nonbids_data/fMRI/fx/models/WTP/wave1/betaseries/sub-DEV049/`, processed by files in `/gpfs/projects/sanlab/shared/DEV/DEV_scripts/fMRI/fx/models/WTP`.

These are loaded from folders including: `/projects/sanlab/shared/DEV/bids_data/derivatives/fmriprep/sub-DEV082/ses-wave1/func/`.

### Example subject

For a given subject label, wave, and run, we can load their WTP file....




In [3]:
def get_wtp_filepath_for_run(sub_label,wave,run):
    folder_path = (
        "/gpfs/projects/sanlab/shared/DEV/bids_data/derivatives/fmriprep/sub-"+
        sub_label+
        "/ses-wave"+str(wave)+"/func/"
    )
    filename = (
        's6_sub-' + sub_label + '_ses-wave' + str(wave) + 
        '_task-WTP_acq-' + str(run) +'_bold_space-MNI152NLin2009cAsym_preproc.nii'
    )
    
    return(folder_path+filename)

...and we can load it into a brain data file...

In [4]:
sub_label = 'DEV081'
subj_raw_data_bd = nlt.Brain_Data(get_wtp_filepath_for_run(sub_label,1,1))


In [5]:
subj_raw_data_nii = nib.load(get_wtp_filepath_for_run(sub_label,1,1))

...but now we have to figure out what to do with that! What sort of pre-processing do we need to do, and how should we do it?

We probably don't want to run a haemodynamic convolution because we don't wanna mix up the pre- and post- periods.

I think the SPM script does:

 - high-pass filter
 - convolution
 - hrf
 - serial correlations
 - ...?
 
What if we just used literally raw data--any scans with a time within the period? If we're skipping hrf convolution...we might as well skip this other stuff? Might need to come back and add it later but let's see how we go.

The time period is 4 seconds. Therefore if we get an image taken in the first 2 seconds we can guarantee it was completed before the subject was given a chance to select. The signal will be weak but if it works it'll be helpful.

So going back to our example subject, we get their design data:

In [6]:
print(sub_label)
subj_behav_design = wtpw1_behavdesign_clean[
    (wtpw1_behavdesign_clean.subject==sub_label) &
    (wtpw1_behavdesign_clean.wave==1) &
    (wtpw1_behavdesign_clean.run=='run1')
]

DEV081


because this has TR=2, and because we want the image taken in the first two seconds of each event, we can just grab the image two seconds after the start of the event.

In [7]:
def extract_events_from_nii(subj_raw_data_nii, subj_behav_design):
    slice_series = []
    for event_i, event_r in subj_behav_design.iterrows():
        #print(event_r['onset'])
        #the event tr is the first TR AFTER the onset; so we need to find the point and round-up

        event_tr = (int)(np.ceil(event_r['onset']/2))
        if event_tr<subj_raw_data_nii.shape[3]:
            slice_series = slice_series + [subj_raw_data_nii.slicer[...,event_tr]]
        else:
            raise IndexError("The behavioral design refers to a slice (" + str(event_tr) + ") that is not present in the dataset. This may indicate bad data or a truncated run. The dataset has shape: " + str(subj_raw_data_nii.shape))

        #print(event_tr)

    event_related_nii = nib.funcs.concat_images(slice_series)
    return(event_related_nii)

extract_events_from_nii(subj_raw_data_nii, subj_behav_design)

<nibabel.nifti1.Nifti1Image at 0x2aaaeb71d730>

In [8]:
wtpw1_behavdesign_clean[wtpw1_behavdesign_clean.subject=='DEV081']

Unnamed: 0,beta,type,task,event_id,isi_pre,onset,duration,food_pic,food_num,cond,health_cond,liking_cond,liking_rating,response,isi_post,end,run,wave,subject
125,beta_0001.nii,run1,WTP betas,1,3.000000,6.081474,6.565242,Fritos.bmp,1,unhealthy_liked,unhealthy,liked,4,,3.000000,12.646716,run1,1,DEV081
307,beta_0002.nii,run1,WTP betas,2,3.000000,18.686977,6.523985,CoconutChips.bmp,2,healthy_liked,healthy,liked,4,8.0,1.780283,25.210962,run1,1,DEV081
489,beta_0003.nii,run1,WTP betas,3,1.780283,30.036748,6.523213,OatmealBites.bmp,3,healthy_liked,healthy,liked,4,5.0,0.095117,36.559961,run1,1,DEV081
671,beta_0004.nii,run1,WTP betas,4,0.095117,39.693001,6.526969,JellyBeans.bmp,4,unhealthy_disliked,unhealthy,disliked,1,5.0,2.680966,46.219969,run1,1,DEV081
853,beta_0005.nii,run1,WTP betas,5,2.680966,51.942914,6.531816,PeanutButterCup.bmp,5,unhealthy_liked,unhealthy,liked,4,8.0,0.319097,58.474729,run1,1,DEV081
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10863,beta_0075.nii,run4,WTP betas,12,0.925946,127.198770,6.531133,chilifritos.bmp,12,unhealthy_disliked,unhealthy,disliked,1,5.0,0.670865,133.729903,run4,1,DEV081
11045,beta_0076.nii,run4,WTP betas,13,0.670865,137.447447,6.531707,Kiwi.bmp,13,healthy_disliked,healthy,disliked,1,5.0,0.859947,143.979153,run4,1,DEV081
11227,beta_0077.nii,run4,WTP betas,14,0.859947,147.879711,6.531721,KaleChips.bmp,14,healthy_disliked,healthy,disliked,1,5.0,0.333113,154.411431,run4,1,DEV081
11409,beta_0078.nii,run4,WTP betas,15,0.333113,157.794491,6.531670,VeggieStraws.bmp,15,healthy_liked,healthy,liked,4,7.0,1.421652,164.326160,run4,1,DEV081


In [9]:
np.sum(wtpw1_behavdesign_clean.subject=='DEV081')
#def get_raw_betas(run_data,betas):



64

Now let's put it together -- create one function that will run through that subject again:

In [10]:
def get_event_related_Brain_Data_for_sub_run(subj_label,wave,run,all_behav_design):
    #load the raw run file
    subj_raw_data_nii = nib.load(get_wtp_filepath_for_run(sub_label,1,1))
    
    #subset the behavioral data
    subj_behav_design = all_behav_design[
        (all_behav_design.subject==sub_label) &
        (all_behav_design.wave==1) &
        (all_behav_design.run=='run' + str(run))
    ]
    
    #go through the event file an extract the appropriate nii for each event
    event_related_nii = extract_events_from_nii(subj_raw_data_nii, subj_behav_design)
    
    #if as_Brain_Data:
    #create the nlt brain_Data file
    event_related_bd = nlt.Brain_Data(event_related_nii)
    event_related_bd.X = subj_behav_design
    return(event_related_bd)
    
    #return(event_related_nii)
    
    
sub_041 = get_event_related_Brain_Data_for_sub_run('DEV041',1,1,wtpw1_behavdesign_clean)
    
 
    

Now let's expand that one more level by looping through all of a subject's runs:

In [11]:
def get_event_related_Brain_Data_for_sub_all_runs(subj_label,wave,all_behav_design):
    event_related_run_list = []
    #loop through each run
    for run in [1,2,3,4]:
        event_related_nii_run = (
            get_event_related_Brain_Data_for_sub_run(
                subj_label,wave,run, all_behav_design)
        )
        event_related_run_list = event_related_run_list + [event_related_nii_run]
    
    #concatenate the data from each run into a single file
    subj_run_Brain_Data = nlt.utils.concatenate(event_related_run_list)
    
    return(subj_run_Brain_Data)

### Load all subjects - attempt 1

...and one level further still, by iterating through a list of subjects from the training set list.

This takes a long time, because it iterates through and runs the Brain_Data operation (which is quite slow) for every subject. That's unnecessary.

### Reorganising this: loading subjects attempt 2

This is too slow and messy. We're going to do another way that iterates through all the data BEFORE creating the bd file. This may end up faster because wrapping the nifti in the brain data format is the slowest step.

In [41]:
def get_event_related_Brain_Data_for_all_subs_all_runs_fast(subj_list, wave,all_behav_design):
    """
    Developed in load_multisubject_raw_data.ipynb. Gets raw brain data from raw files and concatenates into a Brain_Data file.
    """
    training_data_list = []
    behavioral_design_list_in_order = []
    for sub_label in subj_list:
        print(sub_label + " (",end='',flush=True)

        #def get_event_related_Brain_Data_for_sub_all_runs(subj_label,wave,all_behav_design):
        #loop through each run
        for run in [1,2,3,4]:
            print(str(run) + " ", end='',flush=True)
            #def get_event_related_Brain_Data_for_sub_run(subj_label,wave,run,all_behav_design):
            #load the raw run file
            raw_filepath = get_wtp_filepath_for_run(sub_label,wave,run)
            #print(raw_filepath)
            subj_raw_data_nii = nib.load(raw_filepath)

            #subset the behavioral data
            subj_behav_design = all_behav_design[
                (all_behav_design.subject==sub_label) &
                (all_behav_design.wave==wave) &
                (all_behav_design.run=='run' + str(run))
            ]
            
            
            #go through the event file and extract the appropriate nii for each event
            try:
                
                event_related_nii = extract_events_from_nii(subj_raw_data_nii, subj_behav_design)
                
                #this is a good place to convert the data from 64 to 32bit. we don't need 64-bit float. might only need 16-bit
                #https://stackoverflow.com/questions/44397617/change-data-type-in-numpy-and-nibabel/45589431
#                 event_related_nii_32b = event_related_nii
#                 event_related_nii_32b
#                 event_related_nii.get_fdata().astype(np.float32)
                training_data_list = training_data_list + [event_related_nii]
                behavioral_design_list_in_order= behavioral_design_list_in_order + [subj_behav_design]
            except IndexError:
                print("For subject " + sub_label + ", run " + str(run) + ", there was a mismatch between behavioral and data. Skipping this run.")
                
        print(")")
                
            
    print("extracted all data. concatenating...",flush=True)
    #concatenate the data from each run into a single file
    all_nii = nib.funcs.concat_images(training_data_list,axis=3)
    del training_data_list
    print("...concatenated.",flush=True)
    behavioral_design = pd.concat(behavioral_design_list_in_order)
    behavioral_design.reset_index(inplace=True,drop=True)
    
    print("combining into a Brain_Data file....",flush=True)
    #combine as a Brain_Data file.
    all_bd = nlt.Brain_Data(all_nii)
    all_bd.X = behavioral_design
    print("...done.",flush=True)
    
    return(all_bd)



In [17]:
training_Brain_Data_5 = get_event_related_Brain_Data_for_all_subs_all_runs_fast(train_subjs[0:5],1,wtpw1_behavdesign_clean)

DEV001 (1 2 3 4 )
DEV005 (1 2 3 4 )
DEV006 (1 2 3 4 )
DEV009 (1 2 3 4 )
DEV010 (1 2 3 4 )
extracted all data. concatenating...
combining into a Brain_Data file....
...done.


In [18]:
#training_Brain_Data_20 = get_event_related_Brain_Data_for_all_subs_all_runs_fast(train_subjs[0:20],1,wtpw1_behavdesign_clean)

In [19]:
training_Brain_Data = get_event_related_Brain_Data_for_all_subs_all_runs_fast(train_subjs,1,wtpw1_behavdesign_clean)

DEV001 (1 2 3 4 )
DEV005 (1 2 3 4 )
DEV006 (1 2 3 4 )
DEV009 (1 2 3 4 )
DEV010 (1 2 3 4 )
DEV012 (1 2 3 4 )
DEV013 (1 2 3 4 )
DEV014 (1 2 3 4 )
DEV015 (1 2 3 4 )
DEV016 (1 2 3 4 )
DEV017 (1 2 3 4 )
DEV018 (1 2 3 4 )
DEV019 (1 2 3 4 )
DEV021 (1 2 3 4 )
DEV022 (1 2 3 4 )
DEV023 (1 2 3 4 )
DEV024 (1 2 3 4 )
DEV025 (1 2 For subject DEV025, run 2, there was a mismatch between behavioral and data. Skipping this run.
3 4 )
DEV026 (1 2 3 4 )
DEV027 (1 2 3 4 )
DEV028 (1 2 3 4 )
DEV029 (1 2 3 4 )
DEV030 (1 2 3 4 )
DEV034 (1 2 3 4 )
DEV035 (1 2 3 4 )
DEV036 (1 2 3 4 )
DEV039 (1 2 3 4 )
DEV040 (1 2 3 4 )
DEV041 (1 2 3 4 )
DEV042 (1 2 3 4 )
DEV043 (1 2 3 4 )
DEV046 (1 2 3 4 )
DEV048 (1 2 3 4 )
DEV049 (1 2 3 4 )
DEV050 (1 2 3 4 )
DEV051 (1 2 3 4 )
DEV052 (1 2 3 4 )
DEV053 (1 2 3 4 )
DEV055 (1 2 3 4 )
DEV056 (1 2 3 4 )
DEV057 (1 2 3 4 )
DEV058 (1 2 3 4 )
DEV059 (1 2 3 4 )
DEV060 (1 2 3 4 )
DEV061 (1 2 3 4 )
DEV062 (1 2 3 4 )
DEV068 (1 2 3 4 )
DEV069 (1 2 3 For subject DEV069, run 3, there was a misma

MemoryError: Unable to allocate 30.6 GiB for an array with shape (97, 115, 97, 3792) and data type float64

In [None]:
from nilearn import plotting

plotting.plot_stat_map(training_Brain_Data_20.mean().to_nifti())

In [6]:
from sys import getsizeof

getsizeof(training_Brain_Data_5.to_nifti().get_fdata())/ math.pow(10,9)

2.310730384

That's about 2.3 GB for 5 subjects. So for 60 subjects, we can expect



In [5]:
getsizeof(training_Brain_Data_5.to_nifti().get_fdata()) /5 * 60 / math.pow(10,9)

27.728764608

### Save

Now we have a complete set of data based on raw extractions of events that have not had high pass filter or HRF applied.

We might really need to work out how to apply high-pass filter, but I think extracting without application of HRF is probably the right approach.

It seems haphazard. If the data was anything but a 4 second event with 2 second TRs where we want the last TR that doesn't go into the choice phase, we'd have a problem. But because that's precisely what we want, this code will grab exactly the right data.

In [None]:
with open('../data/Brain_Data_75subs.pkl', 'rb') as pkl_file:
    Brain_Data_allsubs = pickle.load(pkl_file)
    
with open('../data/Brain_Data_raw_5subs.pkl', 'rb') as pkl_file:
    Brain_Data_raw_allsubs = pickle.load(pkl_file)
    
Brain_Data_raw_allsubs.dtype()

Brain_Data_allsubs.dtype()

In [None]:
with open('../data/Brain_Data_raw_20subs.pkl', 'wb') as pkl_file:
    pickle.dump(training_Brain_Data_20,pkl_file)

In [20]:
with open('../data/Brain_Data_raw_5subs.pkl', 'wb') as pkl_file:
    pickle.dump(training_Brain_Data_5,pkl_file)

In [2]:
training_Brain_Data_5.dtype()

dtype('float64')

In [12]:
training_Brain_Data_5.to_nifti().get_fdata().astype(np.float32).dtype

dtype('float32')

In [40]:
test_nifti=training_Brain_Data_5.to_nifti()
print(test_nifti.get_fdata().dtype)
#test_nifti.get_fdata().astype(np.float32)
test_nifti.header.set_data_dtype(np.float64)
test_nifti_out = nib.Nifti1Image(
    test_nifti.get_fdata().astype(np.float64),
    test_nifti.affine,
    header=test_nifti.header)
print(test_nifti_out.get_fdata().dtype)
#print(test_nifti.get_fdata().dtype)

float64
float64


In [38]:
np.unique(test_nifti_out.get_fdata())

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100., 101., 102., 103., 104., 105., 106., 107., 108., 109.,
       110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120.,
       121., 122., 123., 124., 125., 126., 127., 128., 129., 130., 131.,
       132., 133., 134., 135., 136., 137., 138., 139., 140., 141., 142.,
       143., 144., 145., 146., 147., 148., 149., 15

In [31]:
datatyped32=test_nifti.get_fdata().astype(np.float32)
print(datatyped32.dtype)

float32


In [29]:
print(test_nifti.header)

<class 'nibabel.nifti1.Nifti1Header'> object, endian='<'
sizeof_hdr      : 348
data_type       : b''
db_name         : b''
extents         : 0
session_error   : 0
regular         : b''
dim_info        : 0
dim             : [  4  91 109  91 320   1   1   1]
intent_p1       : 0.0
intent_p2       : 0.0
intent_p3       : 0.0
intent_code     : none
datatype        : float32
bitpix          : 32
slice_start     : 0
pixdim          : [-1.  2.  2.  2.  1.  1.  1.  1.]
vox_offset      : 0.0
scl_slope       : nan
scl_inter       : nan
slice_end       : 0
slice_code      : unknown
xyzt_units      : 0
cal_max         : 0.0
cal_min         : 0.0
slice_duration  : 0.0
toffset         : 0.0
glmax           : 0
glmin           : 0
descrip         : b''
aux_file        : b''
qform_code      : unknown
sform_code      : aligned
quatern_b       : 0.0
quatern_c       : 1.0
quatern_d       : 0.0
qoffset_x       : 90.0
qoffset_y       : -126.0
qoffset_z       : -72.0
srow_x          : [-2.  0.  0. 90.]
srow_

In [3]:
getsizeof(training_Brain_Data_5.to_nifti().get_fdata()) /5 * 60 / math.pow(10,9)

NameError: name 'getsizeof' is not defined

### Standalone script

In [1]:
import math
import pandas as pd
import numpy as np
import os
import nltools as nlt
import nilearn as nil
import nibabel as nib
import warnings
import glob
import random
import pickle
from operator import itemgetter
import dev_wtp_io_utils

#get the behavioral data and the list of subjects allocated for training
wtpw1_behavdesign_clean = pd.read_csv("../data/wtpw1_behavdesign_clean.csv")
test_train_df = pd.read_csv("../data/train_test_markers_20210601T183243.csv")
train_subjs = test_train_df.loc[test_train_df.SplitGroup=='Train','sub_label'].tolist()

training_Brain_Data_5 = dev_wtp_io_utils.get_event_related_Brain_Data_for_all_subs_all_runs_fast(train_subjs[0:5],1,wtpw1_behavdesign_clean)

with open('../data/Brain_Data_raw_5subs.pkl', 'wb') as pkl_file:
    pickle.dump(training_Brain_Data_5,pkl_file)

  warn("Fetchers from the nilearn.datasets module will be "


DEV001 (1 2 3 4 )
DEV005 (1 2 3 4 )
DEV006 (1 2 3 4 )
DEV009 (1 2 3 4 )
DEV010 (1 2 3 4 )
extracted all data. concatenating...
combining into a Brain_Data file....
...done.
