 # Metadata Organization
 ## Imports

In [1]:
import pandas as pd 
import numpy as np
import os.path
import glob
import pathlib
import functools
import time
import re
import gc
from nilearn.input_data import NiftiMasker
import nibabel as nib
from nilearn import image
from joblib import Parallel, delayed


 ## Load configs (all patterns/files/folderpaths)

In [2]:
# Project-local settings module. The resulting `configs` object is read below for
# the data folder, file-name patterns, participants file, mask file, slice range
# and the output pickle path.
import configurations
# NOTE(review): the string presumably selects a named configuration profile — confirm in configurations.Config.
configs = configurations.Config('sub-xxx-resamp-intersected')


 ## Decorator to print the runtime of a function call

In [3]:
def timer(func):
    """Decorator that announces a call and reports how long it took.

    Before invoking ``func`` it prints ``Calling '<name>'``; after ``func``
    returns it prints ``Finished '<name>' in <seconds> secs`` (four decimals,
    measured with ``time.perf_counter``). The wrapped function's return value
    is passed through unchanged. If ``func`` raises, the exception propagates
    and no "Finished" line is printed.
    """
    @functools.wraps(func)
    def timed_call(*args, **kwargs):
        print('Calling {!r}'.format(func.__name__))
        began = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - began
        print('Finished {!r} in {:.4f} secs'.format(func.__name__, elapsed))
        return result

    return timed_call


 ## Function to find all file paths matching a subject/session/folder/pattern glob

In [4]:
@timer
def find_paths(relDataFolder, subj, sess, func, patt):
    """Return the paths under ``relDataFolder`` matching ``subj/sess/func/patt``.

    Parameters
    ----------
    relDataFolder : str or path-like
        Root folder the glob is evaluated in.
    subj, sess, func, patt : str
        Glob components for the subject folder, session folder, sub-folder
        and file name; they are joined into a single relative pattern.

    Returns
    -------
    list of pathlib.Path
        Matching paths in sorted order; an empty list when nothing matches.
    """
    pattern = os.path.join(subj, sess, func, patt)
    # The glob yields entries in whatever order the filesystem lists them.
    # Sorting makes the row order of everything built from this list
    # reproducible across machines and runs.
    return sorted(pathlib.Path(relDataFolder).glob(pattern))


 ## Find all the regressor file paths

In [5]:
# Collect, for every subject and session folder, the files in 'func' whose
# name matches configs.confoundsFilePattern.
# NOTE(review): `regressor_paths` is not referenced again in the cells visible
# here; the confound files used later are derived from the image paths instead.
regressor_paths = find_paths(
    configs.dataDir, 'sub-*', 'ses-*', 'func', configs.confoundsFilePattern
)
regressor_paths


Calling 'find_paths'
Finished 'find_paths' in 0.0247 secs


ath('../data/preprocessed/sub-9054/ses-1/func/sub-9054_ses-1_task-faces_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-1/func/sub-9054_ses-1_task-hands_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-1/func/sub-9054_ses-1_task-rest_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-1/func/sub-9054_ses-1_task-sleepiness_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-2/func/sub-9054_ses-2_task-arrows_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-2/func/sub-9054_ses-2_task-faces_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-2/func/sub-9054_ses-2_task-hands_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-2/func/sub-9054_ses-2_task-rest_desc-confounds_regressors.tsv'),
 WindowsPath('../data/preprocessed/sub-9054/ses-2/func/sub-9054_ses-2_task-sleepiness_desc-co

 ## Find all the BOLD NII file paths

In [6]:
# Collect, for every subject and session folder, the image files in 'func'
# whose name matches configs.maskedImagePattern. This list drives the
# per-image metadata table built further down.
nii_paths = find_paths(
    configs.dataDir, 'sub-*', 'ses-*', 'func', configs.maskedImagePattern
)
nii_paths


Calling 'find_paths'
Finished 'find_paths' in 0.0224 secs


[WindowsPath('../data/preprocessed/sub-9001/ses-1/func/sub-9001_ses-1_task-arrows_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii.gz'),
 WindowsPath('../data/preprocessed/sub-9001/ses-1/func/sub-9001_ses-1_task-faces_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii.gz'),
 WindowsPath('../data/preprocessed/sub-9001/ses-1/func/sub-9001_ses-1_task-hands_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii.gz'),
 WindowsPath('../data/preprocessed/sub-9001/ses-1/func/sub-9001_ses-1_task-rest_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii.gz'),
 WindowsPath('../data/preprocessed/sub-9001/ses-1/func/sub-9001_ses-1_task-sleepiness_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii.gz'),
 WindowsPath('../data/preprocessed/sub-9001/ses-2/func/sub-9001_ses-2_task-arrows_spa

 ## Read the participants.tsv file to find summaries of the subjects

In [7]:
# Tab-separated per-participant summary table. Only 'participant_id' and
# 'Sl_cond' are used by the cells below.
# NOTE(review): in the displayed output, values such as BMI1=1978997095 and
# KSQ_SleepQualityIndex=475 look like decimals that lost their separator —
# confirm the file's number formatting before using those columns.
participant_info_df = pd.read_csv(configs.participantsSummaryFile, sep='\t')
participant_info_df


Unnamed: 0,participant_id,Sex,AgeGroup,BMI1,BMI2,EducationLevel,HADS_Anxiety,HADS_Depression,ISI,KSQ_SleepQualityIndex,...,PPIR_IR15,PPIR_IR40,BADD_Total,BADD_Activation,BADD_Attention,BADD_Effort,BADD_Affect,BADD_Memory,Sl_cond,Trial
0,sub-9001,Male,Young,1978997095,1978997095,Studerar för närvarande på universitet/högskola,0,1,12,475,...,11.0,29.0,16.0,6.0,2.0,2.0,6.0,0.0,2,B
1,sub-9002,Male,Old,2179944511,2146915048,Har avslutat gymnasieskolan,2,3,9,55,...,10.0,32.0,33.0,5.0,8.0,6.0,8.0,6.0,2,B
2,sub-9003,Male,Old,2049861496,2049861496,Har examen från universitet/högskola,2,3,10,525,...,12.0,26.0,13.0,4.0,4.0,4.0,1.0,0.0,1,B
3,sub-9004,Female,Old,2294811574,2294811574,Har examen från universitet/högskola,1,2,11,425,...,10.0,28.0,24.0,3.0,7.0,5.0,6.0,3.0,1,B
4,sub-9005,Male,Old,2475546432,2475546432,Har avslutat gymnasieskolan,0,0,9,6,...,8.0,28.0,30.0,9.0,5.0,7.0,3.0,6.0,2,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,sub-9094,Female,Old,308112461,3011099051,Har avslutat gymnasieskolan,5,1,9,5,...,8.0,23.0,30.0,5.0,10.0,5.0,5.0,5.0,1,A
82,sub-9095,Male,Old,2717310162,,Har avslutat gymnasieskolan,2,1,13,475,...,10.0,24.0,,,,,,,2,A
83,sub-9096,Female,Old,2405693475,,Har avslutat gymnasieskolan,0,1,7,525,...,11.0,27.0,9.0,2.0,4.0,2.0,1.0,0.0,1,A
84,sub-9098,Female,Old,2734375,26953125,Har examen från universitet/högskola,0,1,9,5,...,17.0,31.0,10.0,4.0,5.0,1.0,0.0,0.0,2,A


 ## Get a mapping Dataframe of subject and which session is the sleep deprived one

In [8]:
@timer
def map_sleepdep(participant_info):
    """Build a subject -> sleep-deprived-session lookup table.

    Takes the 'participant_id' and 'Sl_cond' columns of ``participant_info``,
    removes every occurrence of the text 'sub-' from the string values, and
    returns them as a new DataFrame with the columns renamed to 'subject'
    and 'sleepdep_session'. The input frame is left unchanged.
    """
    renamed = participant_info.loc[:, ['participant_id', 'Sl_cond']].rename(
        columns={'participant_id': 'subject', 'Sl_cond': 'sleepdep_session'}
    )
    # Regex replacement is applied to all string cells of the two selected columns.
    return renamed.replace('sub-', '', regex=True)

# One row per participant: bare subject id plus the value of 'Sl_cond',
# which the cells below treat as the number of the sleep-deprived session.
sleepdep_map = map_sleepdep(participant_info_df)
sleepdep_map


Calling 'map_sleepdep'
Finished 'map_sleepdep' in 0.0026 secs


Unnamed: 0,subject,sleepdep_session
0,9001,2
1,9002,2
2,9003,1
3,9004,1
4,9005,2
...,...,...
81,9094,1
82,9095,2
83,9096,1
84,9098,2


 ## Get Dataframe of subject, session, task, path

In [9]:
@timer
def get_bids_components(paths):
    """Build one metadata row per image path from its file name.

    The first three ``key-value`` pairs of each file name (separated by
    underscores) are read as subject, session and task. The confounds file is
    assumed to sit next to the image and to be named
    ``sub-<subject>_ses-<session>_task-<task>_desc-confounds_regressors.tsv``.

    Parameters
    ----------
    paths : iterable of pathlib.Path or str
        Image file paths.

    Returns
    -------
    pandas.DataFrame
        Columns 'subject', 'session', 'task', 'path', 'confound_path' (all
        strings) and 'sleepdep', initialised to 0 as a placeholder for the
        label. Empty (but with these columns) when ``paths`` is empty.

    Raises
    ------
    ValueError
        If a file name does not contain three leading ``key-value`` pairs.
    """
    # Raw string: '\-' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. The pattern is unchanged.
    entity_pattern = re.compile(
        r'[a-z0-9]+\-([a-z0-9]+)_[a-z0-9]+\-([a-z0-9]+)_[a-z0-9]+\-([a-z0-9]+)'
    )
    columns = ['subject', 'session', 'task', 'path', 'confound_path', 'sleepdep']

    rows = []
    for path in paths:
        path = pathlib.Path(path)  # also accept plain strings
        match = entity_pattern.search(path.stem)
        if match is None:
            # Name the offending file instead of failing on None.group().
            raise ValueError(
                f'Cannot extract subject/session/task from file name: {path.name!r}'
            )
        subject, session, task = match.groups()
        confound_file = path.with_name(
            f'sub-{subject}_ses-{session}_task-{task}_desc-confounds_regressors.tsv'
        )
        rows.append([subject, session, task, str(path), str(confound_file), 0])

    return pd.DataFrame(rows, columns=columns)

# One row per image found above: subject/session/task parsed from the file
# name, the image path, the sibling confounds path, and a 'sleepdep' column
# that is still all zeros at this point.
bids_comp_df = get_bids_components(nii_paths)
bids_comp_df


Calling 'get_bids_components'
Finished 'get_bids_components' in 0.0019 secs


Unnamed: 0,subject,session,task,path,confound_path,sleepdep
0,9001,1,arrows,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0
1,9001,1,faces,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0
2,9001,1,hands,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0
3,9001,1,rest,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0
4,9001,1,sleepiness,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0
5,9001,2,arrows,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,0
6,9001,2,faces,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,0
7,9001,2,hands,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,0
8,9001,2,rest,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,0
9,9001,2,sleepiness,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,0


 ## Merge sleepdep_map into bids_comp_df to form one dataframe

In [10]:
# Attach each subject's sleep-deprived session number to every image row.
# The join key is spelled out so that a column added to either frame later
# cannot silently become part of the key. validate='many_to_one' makes the
# merge fail loudly if a subject appears twice in sleepdep_map, which would
# otherwise duplicate image rows.
# Images whose subject is missing from sleepdep_map keep a NaN
# 'sleepdep_session' because this is a left join.
sleep_bids_comb_df = bids_comp_df.merge(
    sleepdep_map, how='left', on='subject', validate='many_to_one'
)


 ## Set response column 'sleepdep' to 1 where 'session' equals 'sleepdep_session'

In [11]:
# Label a row 1 when its session is the subject's sleep-deprived session,
# otherwise 0. Both columns are cast to int first because 'session' is parsed
# from file names as text.
# This is one whole-column assignment rather than a per-row
# `df['sleepdep'].iloc[i] = 1`: that chained form triggers
# SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
sleep_bids_comb_df['sleepdep'] = (
    sleep_bids_comb_df['session'].astype(int)
    == sleep_bids_comb_df['sleepdep_session'].astype(int)
).astype(int)
sleep_bids_comb_df


Unnamed: 0,subject,session,task,path,confound_path,sleepdep,sleepdep_session
0,9001,1,arrows,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0,2
1,9001,1,faces,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0,2
2,9001,1,hands,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0,2
3,9001,1,rest,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0,2
4,9001,1,sleepiness,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,..\data\preprocessed\sub-9001\ses-1\func\sub-9...,0,2
5,9001,2,arrows,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,1,2
6,9001,2,faces,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,1,2
7,9001,2,hands,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,1,2
8,9001,2,rest,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,1,2
9,9001,2,sleepiness,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,..\data\preprocessed\sub-9001\ses-2\func\sub-9...,1,2


 ## Get confounds that can be used to further clean up the signal or for prediction

In [12]:
def get_important_confounds(regressor_paths, important_reg_list, start, end):
    """Flatten the selected confound columns of each file into one wide row.

    For every path in ``regressor_paths`` the tab-separated file is read, the
    columns in ``important_reg_list`` and the rows labelled ``start`` through
    ``end - 1`` are kept, and that window is stacked into a single row.

    Returns a DataFrame with one row per input file (index 0..n-1, in input
    order) and columns named ``'<regressor>-<row label>'``.
    """
    per_file_rows = []
    for tsv_path in regressor_paths:
        confounds = pd.read_csv(tsv_path, sep="\t")
        # .loc slices by label and includes the end label, hence end-1.
        window = confounds.loc[start:end-1, important_reg_list]
        # stack -> Series indexed by (row label, regressor); transpose to a single row.
        per_file_rows.append(window.stack(0).to_frame().T)

    combined = pd.concat(per_file_rows, ignore_index=True)
    combined.columns = [
        f'{regressor}-{row_label}' for row_label, regressor in combined.columns.values
    ]
    return combined

# Confound columns to keep: nine base signals, each in four variants (raw,
# first derivative, squared, squared first derivative). The resulting order is
# all raw columns first, then each further variant in turn.
important_reg_list = [
    signal + variant
    for variant in ('', '_derivative1', '_power2', '_derivative1_power2')
    for signal in ('csf', 'white_matter', 'global_signal',
                   'trans_x', 'trans_y', 'trans_z',
                   'rot_x', 'rot_y', 'rot_z')
]

# One wide row of confounds per image, in the same row order as
# sleep_bids_comb_df, restricted to the configured start/end window.
important_confounds_df = get_important_confounds(
    regressor_paths=sleep_bids_comb_df['confound_path'],
    important_reg_list=important_reg_list,
    start=configs.startSlice,
    end=configs.endSlice,
)


 ## Load the masker data file to prepare to apply to images

In [13]:
# Masker built from the mask image at configs.maskDataFile; it is passed to
# the voxel-extraction helpers below. standardize=False is set explicitly.
masker = NiftiMasker(mask_img=configs.maskDataFile, standardize=False)


 ## Helper to generate raw voxel df from a given path + masker and print shape for sanity

In [14]:
@timer
def gen_one_voxel_df(filepath, masker, start, end):
    """Return the masked voxels of one image as a single-row float32 DataFrame.

    The image at ``filepath`` is indexed with ``slice(start, end)``, passed
    through ``masker.fit_transform``, and the resulting 2-D array is flattened
    row by row into one row. The shapes of the flattened row, the 2-D masked
    array and the image header are printed as a sanity check.

    Parameters
    ----------
    filepath : str or path-like
        Image file to load.
    masker : object with ``fit_transform``
        Masker applied to the indexed image (a NiftiMasker in this notebook).
    start, end : int
        Bounds of the index slice; ``end`` is exclusive.

    Returns
    -------
    pandas.DataFrame
        Shape ``(1, rows * columns of the masked array)``, dtype float32.
    """
    masked_array = masker.fit_transform(image.index_img(filepath, slice(start, end)))
    # The target shape is passed positionally: the `newshape=` keyword of
    # np.reshape is deprecated in NumPy 2.1. Reshaping to (1, -1) in C order
    # gives the same element order as ravel() followed by reshape.
    reshaped_array = pd.DataFrame(np.reshape(masked_array, (1, -1)), dtype='float32')
    # nib.load is used only to read the shape stored in the file header.
    raw_shape = nib.load(filepath).header.get_data_shape()
    print(
        f'> Shape of raw voxels for file "{pathlib.Path(filepath).stem}" is: \n'
        f'\t 1-D (UnMasked+Sliced): {reshaped_array.shape}\n'
        f'\t 2-D (UnMasked+Sliced): {masked_array.shape}\n'
        f'\t 4-D (Raw header)     : {raw_shape}'
    )
    return reshaped_array


 ## Function to generate the raw voxel df from all masked images listed in the metadata

In [15]:
@timer
def get_voxels_df(metadata_df, masker, start, end):
    """Stack the flattened voxels of every image in ``metadata_df`` into one frame.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must provide a 'path' column (image files) and a 'sleepdep' column
        (label per image).
    masker, start, end
        Forwarded unchanged to ``gen_one_voxel_df`` for every image.

    Returns
    -------
    pandas.DataFrame
        One row per image in the order of ``metadata_df``, voxel columns named
        '0', '1', ... (strings), followed by a 'sleepdep' column.
    """
    rawvoxels_list = []
    print() # Print to add a spacer for aesthetics

    # Images are processed one at a time, in the row order of metadata_df.
    for filepath in metadata_df['path']:
        rawvoxels_list.append(gen_one_voxel_df(filepath, masker, start, end))
        print() # Print to add a spacer for aesthetics

    print() # Print to add a spacer for aesthetics
    voxels = pd.concat(rawvoxels_list, ignore_index=True)
    # Column labels become strings. Assigning the labels directly avoids the
    # full copy of this very wide frame that DataFrame.rename would make.
    voxels.columns = voxels.columns.map(str)
    # Assign by position, not by index label: `voxels` has a fresh 0..n-1
    # index, so a filtered or re-indexed metadata_df would otherwise produce
    # NaN or misassigned labels.
    voxels['sleepdep'] = metadata_df['sleepdep'].to_numpy()
    return voxels


 ## Garbage collect

In [16]:
# Run a full garbage-collection pass before the voxel extraction below; the
# number shown as the cell output is gc.collect()'s return value.
gc.collect()


160

 ## Get/Generate raw voxels dataframe from all images with Y column label included

In [17]:
# Extract the voxel rows for every image. voxels_df already carries the
# 'sleepdep' label column.
voxels_df = get_voxels_df(
    metadata_df=sleep_bids_comb_df,
    masker=masker,
    start=configs.startSlice,
    end=configs.endSlice,
)
# Place the confound columns to the right of the voxel + label columns; the
# two frames are matched on their row index.
X = pd.concat([voxels_df, important_confounds_df], axis='columns')


Calling 'get_voxels_df'

Calling 'gen_one_voxel_df'
> Shape of raw voxels for file "sub-9001_ses-1_task-arrows_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii" is: 
	 1-D (UnMasked+Sliced): (1, 3634160)
	 2-D (UnMasked+Sliced): (40, 90854)
	 4-D (Raw header)     : (87, 103, 65, 352)
Finished 'gen_one_voxel_df' in 6.1580 secs

Calling 'gen_one_voxel_df'
> Shape of raw voxels for file "sub-9001_ses-1_task-faces_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii" is: 
	 1-D (UnMasked+Sliced): (1, 3634160)
	 2-D (UnMasked+Sliced): (40, 90854)
	 4-D (Raw header)     : (87, 103, 65, 165)
Finished 'gen_one_voxel_df' in 3.0791 secs

Calling 'gen_one_voxel_df'
> Shape of raw voxels for file "sub-9001_ses-1_task-hands_space-MNI152NLin2009cAsym_desc-preproc_bold_masked_(sub-9001-9072_resamp_intersected)_bold.nii" is: 
	 1-D (UnMasked+Sliced): (1, 3634160)
	 2-D (UnMasked+Sliced): (40, 90854)
	 4-D (Raw head

 ## Separately get the Y label

In [18]:
# Label vector taken from the metadata table (the same values are also stored
# in X's 'sleepdep' column).
Y = sleep_bids_comb_df['sleepdep']


 ## Save raw dataframe with Y column included to a file

In [20]:
# Persist X (voxel columns, the 'sleepdep' label and the confound columns)
# as a pickle at the path given by configs.rawVoxelFile.
X.to_pickle(configs.rawVoxelFile)
