# Data inspection and preprocessing - 4, Preprocessing ENG
Code is meant to be **read**! Respect [PEP 8](https://peps.python.org/pep-0008/)!

This code has been written by Leonardo Pollina @TNE, EPFL

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import h5py
import copy

###### Variables definition for loading

In [2]:
data_path = '../../Data/'
fs = 24400 # Hz

# Subjects
n_subjects = 2
subjs_info_loading = {'Subject2' : {'Name' : 'p2-t2', 'Vars' : ['Baseline', 'Angio36', 'RR15', 'TV500']},
                      'Subject21' : {'Name' : 'p6-t6', 'Vars' : ['Baseline', 'Angio36', 'RR15', 'TV125']}
                     }
subjs_info_final = {'Subject2' : {'Name' : 'p2-t2', 'Vars' : ['Baseline', 'Angio36', 'RRC', 'TVC']},
                    'Subject21' : {'Name' : 'p6-t6', 'Vars' : ['Baseline', 'Angio36', 'RRC', 'TVC']}
                   }
animals_labels = ['p2-t2', 'p6-t6']

###### Loading functions

In [3]:
def load_data_all_subjects(subjs_info_loading, subjs_info_final, fs, type_data = 'Field_Data_Neuro'):
    subjects_names = list(subjs_info_loading.keys())
    subjs_info = subjs_info_final
    all_data = {}
    print('Start loading ... \n')
    for subject in subjects_names:
        name_subj, data_struct = load_data_one_subject(subject, subjs_info_loading, fs, type_data = type_data)
        all_data[name_subj] = data_struct
        print('================== %s loaded. =================='%subject)
    return all_data


def load_data_one_subject(subject, subjs_info_loading, fs, type_data = 'Field_Data_Neuro'):
    name_subj_to_stock = subjs_info_loading[subject]['Name']
    vars_to_load = subjs_info_loading[subject]['Vars']
    data_struct = {}
    
    file = h5py.File('data\\' + subject + '.mat','r')
    
    # To get the names of the fields after decoding ASCII
    all_field_names = get_field_names(file)

    for var in vars_to_load:
        id_field = np.where(all_field_names == var)[0]
        curr_reference_data1 = file['Vagus_Data_Stimuli'][type_data][id_field][0][0]
        curr_reference_data2 = file[curr_reference_data1][0][0]
        final_data = np.transpose(np.asarray(file[curr_reference_data2]))
        
        if var == 'TV800' or var == 'TV500' or var == 'TV125': 
            var_name = 'TVC'
        elif var == 'RR15' or var == 'RR20': 
            var_name = 'RRC'
        else:
            var_name = var
            
        data_struct[var_name] = {}
        data_struct[var_name]['Data'] = final_data
        n_time_pts = np.shape(final_data)[-1]
        data_struct[var_name]['Time_pts'] = np.linspace(0, n_time_pts/fs, n_time_pts)
        
    return name_subj_to_stock, data_struct

def get_field_names(file):
    n_fields,_ = np.shape(file['Vagus_Data_Stimuli']['stimuli_name'])
    all_field_names = []
    for field in range(n_fields):
        curr_reference_field = file['Vagus_Data_Stimuli']['stimuli_name'][field][0]
        curr_field_ASCII = file[curr_reference_field]
        curr_field = decode_ASCII(curr_field_ASCII)
        all_field_names.append(curr_field)
    return np.asarray(all_field_names)


def decode_ASCII(numbers_array):
    name = ''
    squeezed_numbers = np.squeeze(numbers_array)
    for n in squeezed_numbers:
        name += chr(n)
    return name

In [4]:
data = load_data_all_subjects(subjs_info_loading, subjs_info_final, fs)

Start loading ... 



In [5]:
for key1 in data.keys():
    print('=========== %s ==========='%key1)
    for key2 in data[key1].keys():
        print('--- %s'%key2)
        for key3 in data[key1][key2].keys():
            print(' - %s'%key3)
            print('Shape : ', np.shape(data[key1][key2][key3]))

--- Baseline
 - Data
Shape :  (8, 7335936)
 - Time_pts
Shape :  (7335936,)
--- Angio36
 - Data
Shape :  (8, 14780416)
 - Time_pts
Shape :  (14780416,)
--- RRC
 - Data
Shape :  (8, 2979840)
 - Time_pts
Shape :  (2979840,)
--- TVC
 - Data
Shape :  (8, 2983936)
 - Time_pts
Shape :  (2983936,)
--- Baseline
 - Data
Shape :  (16, 7720960)
 - Time_pts
Shape :  (7720960,)
--- Angio36
 - Data
Shape :  (16, 14940160)
 - Time_pts
Shape :  (14940160,)
--- RRC
 - Data
Shape :  (16, 3043328)
 - Time_pts
Shape :  (3043328,)
--- TVC
 - Data
Shape :  (16, 3033088)
 - Time_pts
Shape :  (3033088,)


In [6]:
def cut_all_data_to_established_duration_per_challenge(data):
    '''
    This function is a wrap-up to the function 'cut_data_one_pig_to_established_duration_per_challenge'.
    '''
    
    new_data_struct = {}
    for pig in data.keys():
        print('================== Working on Pig %s =================='%pig)
        data_curr_pig = data[pig]
        data_curr_pig_cut = cut_data_one_pig_to_established_duration_per_challenge(data_curr_pig)
        new_data_struct[pig] = data_curr_pig_cut
        
    return new_data_struct


def cut_data_one_pig_to_established_duration_per_challenge(data_one_pig):
    ''' 
    This function is used to cut the data for each challenge to the duration shown in Suppl. Table 1 in Vallone et al., 2021. 
    We take the first part of the data for each challenge (arbitrary choice). 
    '''
    new_struct = copy.deepcopy(data_one_pig)
    
    dur_baseline = 5 #min
    dur_RRC = 2 #min
    dur_TVC = 2 #min
    
    for challenge in new_struct.keys():
        data_curr_chal = new_struct[challenge]['Data']
        time_pts_curr_chal = new_struct[challenge]['Time_pts']
        t_end_sec = time_pts_curr_chal[-1]
        
        if challenge == 'Baseline': t_end_sec = dur_baseline * 60
        elif challenge == 'RRC': t_end_sec = dur_RRC * 60
        elif challenge == 'TVC': t_end_sec = dur_TVC * 60
            
#         print('Challenge %s , t_end_sec %0.3f'%(challenge, t_end_sec))
            
        id_t_end_curr_chal = find_specific_time_index(time_pts_curr_chal, t_end_sec)
        new_struct[challenge]['Data'] = data_curr_chal[:,:id_t_end_curr_chal]
        new_struct[challenge]['Time_pts'] = time_pts_curr_chal[:id_t_end_curr_chal]
    
    return new_struct

def find_specific_time_index(time_pts, t):
    t_id = np.argmin(np.abs(time_pts - t))
    return t_id

In [7]:
cut_data = cut_all_data_to_established_duration_per_challenge(data)



In [8]:
cut_data

{'p2-t2': {'Baseline': {'Data': array([[-1.12648413e-05, -1.23800410e-05, -1.35547416e-05, ...,
            1.32435453e-05,  1.45472741e-05,  1.51437080e-05],
          [-3.74680121e-06, -3.48188632e-06, -3.23318136e-06, ...,
            1.11843274e-05,  1.05915515e-05,  1.00407251e-05],
          [-5.41470808e-06, -5.14782141e-06, -4.60008687e-06, ...,
           -4.05208084e-06, -3.27755629e-06, -1.95015400e-06],
          ...,
          [-1.73591343e-05, -1.76173908e-05, -1.75757450e-05, ...,
            1.15151252e-05,  1.08488775e-05,  1.04782011e-05],
          [-1.38157031e-06, -7.61102342e-07, -3.68625166e-07, ...,
            1.41303290e-05,  1.47788178e-05,  1.62204087e-05],
          [-1.40569618e-05, -1.37505831e-05, -1.31897295e-05, ...,
            3.02282547e-06,  4.61986292e-06,  5.35063418e-06]]),
   'Time_pts': array([0.00000000e+00, 4.09836121e-05, 8.19672243e-05, ...,
          2.99999877e+02, 2.99999918e+02, 2.99999959e+02])},
  'Angio36': {'Data': array([[ 1.50161