# Functions to generate data lists

In [None]:
def generate_augm_params(max_augm_params):
    """Draw one random, non-identity set of augmentation parameters.

    Args:
        max_augm_params: mapping with the keys ``'shift'`` (largest absolute
            integer shift allowed along each axis) and ``'blur'`` (exclusive
            upper bound of the blur sigma).

    Returns:
        Tuple ``(shift_x, shift_y, shift_z, blur_sigma)``. Every shift is
        drawn uniformly from ``[-shift, shift]`` (both ends included) and
        ``blur_sigma`` from ``[0, blur)`` in steps of ``blur / 1000``.
        The all-zero combination (no shift, no blur) is never returned.

    Raises:
        ValueError: if neither a shift nor a blur can ever be produced
            (``shift == 0`` and ``blur <= 0``), which would otherwise make
            the rejection loop spin forever.
    """
    import numpy.random as rnd
    max_shift = max_augm_params['shift']
    max_blur = max_augm_params['blur']
    if max_shift == 0 and max_blur <= 0:
        raise ValueError(
            "max_augm_params allows neither shift nor blur; "
            "cannot generate a non-identity augmentation"
        )
    while True:
        # numpy's randint excludes the upper bound, hence the +1 to make the
        # range symmetric around zero.
        shift_x = rnd.randint(-max_shift, max_shift + 1)
        shift_y = rnd.randint(-max_shift, max_shift + 1)
        shift_z = rnd.randint(-max_shift, max_shift + 1)
        blur_sigma = float(rnd.randint(1000)) / 1000 * max_blur
        # Reject only the identity transform. Testing the *sum* would also
        # throw away valid augmentations such as (-1, -1, 0, 0.5).
        if shift_x != 0 or shift_y != 0 or shift_z != 0 or blur_sigma > 0:
            return (shift_x, shift_y, shift_z, blur_sigma)

In [None]:
def generate_augm_lists(dirs_with_labels, new_size, max_augm_params, default_augm_params=None):
    """Extend a list of samples to ``new_size`` entries via random augmentation.

    Every input entry is a list; the returned entries are copies of it with
    one extra trailing element holding the augmentation parameters.

    Args:
        dirs_with_labels: list of list-valued sample descriptors.
        new_size: desired number of output entries, or ``None`` to keep the
            input size and attach only the default parameters.
        max_augm_params: limits forwarded to ``generate_augm_params`` for the
            randomly augmented copies. Not used when no copies are needed.
        default_augm_params: parameters attached to the unaugmented copy of
            each sample. ``None`` means the identity ``(0, 0, 0, 0.0)``.

    Returns:
        A new list. Each original sample appears once with the default
        parameters, followed by ``new_size // len(dirs_with_labels) - 1``
        augmented copies; randomly chosen samples are then augmented until
        ``new_size`` entries exist. If ``new_size`` is smaller than the input
        the originals are still all kept, so the result can be longer than
        ``new_size``. An empty input always yields an empty list.
    """
    import numpy.random as rnd

    # Fall back to the identity transform so every entry carries a usable
    # parameter tuple, whichever branch below produces it.
    identity = (0, 0, 0, 0.0) if default_augm_params is None else default_augm_params

    n_src = len(dirs_with_labels)
    if n_src == 0:
        # Nothing to augment; also avoids dividing by zero below.
        return []
    if new_size is None or n_src == new_size:
        return [dwl + [identity] for dwl in dirs_with_labels]

    augm_coeff = new_size // n_src
    res = []
    for dwl in dirs_with_labels:
        res.append(dwl + [identity])
        for _ in range(augm_coeff - 1):
            res.append(dwl + [generate_augm_params(max_augm_params)])
    # Top up with augmented copies of random samples to hit new_size exactly.
    while len(res) < new_size:
        dwl = dirs_with_labels[rnd.randint(n_src)]
        res.append(dwl + [generate_augm_params(max_augm_params)])
    return res

In [None]:
def generate_lists_from_adni2(adni_root, max_augm_params, augm_factor, valid_prc=0.25, test_prc=0.25, shuffle_data=True, debug=True):
    """Split the ADNI patients into train/validation/test lists.

    The dataset root must contain the class sub-directories ``AD``, ``MCI``
    and ``NC``; every sub-directory inside them is treated as one patient.

    Args:
        adni_root: path of the dataset root; the class sub-directory is
            appended to it by plain string concatenation.
        max_augm_params: augmentation limits passed to
            ``generate_augm_lists`` for the train and validation sets.
        augm_factor: multiplier applied to the largest per-class train and
            validation size to obtain the balanced per-class target size.
        valid_prc: fraction of each class used for validation.
        test_prc: fraction of the *smallest* class used for testing; the
            resulting count is the same for every class.
        shuffle_data: shuffle each of the three output lists when true.
        debug: print every generated entry when true.

    Returns:
        Tuple ``(train_lists, valid_lists, test_lists)``. Each entry is
        ``[label, smri_path, md_path, augm_params]`` where the two paths
        start at the class sub-directory (e.g. ``'/AD/<patient>/SMRI/'``)
        and do not include ``adni_root``. Test entries always carry the
        identity parameters ``(0, 0, 0, 0.0)``.
    """
    import os
    import numpy.random as rnd

    stage_dirs = {
        'AD': '/AD/',
        'MCI': '/MCI/',
        'NC': '/NC/'
    }

    stage_dirs_root = {k: adni_root + v for k, v in stage_dirs.items()}

    default_augm = (0, 0, 0, 0.0)

    # List every class directory exactly once so the reported sizes and the
    # actual split cannot disagree. Only sub-directories are patients; stray
    # files are skipped. Sorting makes the split depend solely on the RNG
    # state instead of on filesystem enumeration order.
    patient_dirs_by_class = {
        k: sorted(
            d for d in os.listdir(root)
            if os.path.isdir(os.path.join(root, d))
        )
        for k, root in stage_dirs_root.items()
    }

    class_size = {k: len(dirs) for k, dirs in patient_dirs_by_class.items()}
    print('source patients:', class_size)

    # The test set is balanced by construction: same count for each class.
    ts = int(min(class_size.values()) * test_prc)
    test_size = {k: ts for k in stage_dirs_root}
    valid_size = {k: int(class_size[k] * valid_prc) for k in stage_dirs_root}
    train_size = {k: class_size[k] - test_size[k] - valid_size[k] for k in stage_dirs_root}

    print('source patients used for train:', train_size)
    print('source patients used for validation:', valid_size)
    print('source patients used for test', test_size)

    # Train/validation are balanced by augmenting every class up to the
    # size of the largest class times augm_factor.
    train_size_balanced = int(max(train_size.values()) * augm_factor)
    valid_size_balanced = int(max(valid_size.values()) * augm_factor)
    print('train data will be augmented to %d samples by each class' % train_size_balanced)
    print('validation data will be augmented to %d samples by each class' % valid_size_balanced)
    print('test data will be augmented to %d samples by each class' % ts)

    def _entries(label, stage_dir, dirs):
        # One [label, SMRI path, MD path] descriptor per patient directory.
        return [[label, stage_dir + d + '/SMRI/', stage_dir + d + '/MD/'] for d in dirs]

    train_lists_out = []
    valid_lists_out = []
    test_lists_out = []

    for k in stage_dirs_root:
        stage_dir = stage_dirs[k]
        patient_dirs = patient_dirs_by_class[k]
        rnd.shuffle(patient_dirs)

        n_test = test_size[k]
        n_valid = valid_size[k]
        test_dirs = patient_dirs[:n_test]
        valid_dirs = patient_dirs[n_test:n_test + n_valid]
        train_dirs = patient_dirs[n_test + n_valid:]

        train_lists = _entries(k, stage_dir, train_dirs)
        valid_lists = _entries(k, stage_dir, valid_dirs)
        test_lists = _entries(k, stage_dir, test_dirs)

        train_lists_out += generate_augm_lists(train_lists, train_size_balanced, max_augm_params)
        valid_lists_out += generate_augm_lists(valid_lists, valid_size_balanced, max_augm_params)
        # The test set is never augmented.
        test_lists_out += generate_augm_lists(test_lists, None, None, default_augm_params=default_augm)

    if shuffle_data:
        rnd.shuffle(train_lists_out)
        rnd.shuffle(valid_lists_out)
        rnd.shuffle(test_lists_out)

    if debug:
        print('### train lists (%d instances):' % len(train_lists_out))
        for i in train_lists_out: print(i)
        print('### valid lists (%d instances):' % len(valid_lists_out))
        for i in valid_lists_out: print(i)
        print('### test lists (%d instances):' % len(test_lists_out))
        for i in test_lists_out: print(i)

    return (train_lists_out, valid_lists_out, test_lists_out)

# An example of how to do data preprocessing

Preprocessing parameters

In [None]:
import numpy as np

# Settings consumed by generate_lists() / generate_lists_from_adni2().
lists_params = dict(
    # Dataset root; must contain the AD/, MCI/ and NC/ class directories.
    adni_root='/home/xubiker/ADNI_Multimodal/dataset/',
    # Limits for random augmentation: per-axis shift and blur sigma.
    max_augm=dict(shift=2, blur=1.2),
    # Fraction of the smallest class held out for testing.
    test_prc=0.25,
    # Fraction of each class held out for validation.
    valid_prc=0.25,
    # Multiplier on the largest class size when balancing by augmentation.
    augm_factor=2,
)

In [None]:
def save_params(params, file_path):
    """Serialize ``params`` with pickle into ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    from pickle import dump

    with open(file_path, mode='wb') as out_file:
        dump(params, out_file)

Generate the train/validation/test lists and save them to disk.

In [None]:
def generate_lists(lists_file_path, params, debug=True):
    """Build the train/validation/test lists and store them in one file.

    Args:
        lists_file_path: destination handed to ``ex_utils.save_pickle``.
        params: mapping with the keys ``'adni_root'``, ``'max_augm'``,
            ``'test_prc'``, ``'valid_prc'`` and ``'augm_factor'``.
        debug: forwarded to ``generate_lists_from_adni2``.
    """
    import ex_utils

    # Output lists are always shuffled here.
    train_list, valid_list, test_list = generate_lists_from_adni2(
        adni_root=params['adni_root'],
        max_augm_params=params['max_augm'],
        augm_factor=params['augm_factor'],
        valid_prc=params['valid_prc'],
        test_prc=params['test_prc'],
        shuffle_data=True,
        debug=debug,
    )
    splits = (train_list, valid_list, test_list)
    ex_utils.save_pickle(splits, lists_file_path)

In [None]:
import ex_utils
# Store the parameters first so the settings that produced 'lists.pkl' are
# kept next to it (presumably ex_utils.save_pickle pickles the object to the
# given path -- confirm against ex_utils).
ex_utils.save_pickle(lists_params, 'params.pkl')
# Build the train/validation/test lists and write them to 'lists.pkl';
# debug=True prints every generated entry.
generate_lists('lists.pkl', lists_params, debug=True)