## Function definitions - Loading data

Functions defined in this script and their description:
* **get_file_list** - returns a sorted list of the files in a given path that match a given pattern. Optionally removes the path and/or a part of the file name, such as the extension.
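* **loadrundataset** - given a key and a dataset, returns the loaded minitrees data (e.g. *data_run1 = loadrundataset('run1', dataset_run1, savenumber, trees, pre)*). Also checks for an existing cache file for the given key and savenumber (savenumber equal to the run number in 2018) and loads it if present.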
* **loadsingledataset** - loads a single dataset. Requires a dataset; optionally accepts tmakers (the treemakers to use) and pre (the preselection).
* **process_runs** - given a dictionary where each key has a dataset, returns a modified version of that dictionary with the processed data also stored under each key: *NG_runs = {'run1':dataset_run1, 'run2':dataset_run2, 'run3':dataset_run3, 'run4':dataset_run4}* -> *NG_runs = {'run1':[dataset_run1,data_run1], ...}*

In [None]:
# Announce that the cell defining the data-loading helpers is being run.
print('Loading data loading functions.')

In [1]:
# From Erik Hogenbirk's notes and functions
def get_file_list(path, pattern, remove_string='', remove_path=True):
    '''
    Get a sorted list of the files in path that match pattern.

    path          : prefix that is concatenated directly with pattern to build
                    the glob expression, so it should normally end with a
                    path separator.
    pattern       : glob pattern appended to path (e.g. '*.hdf5').
    remove_string : if non-empty, every occurrence of this string is removed
                    from each result (e.g. the extension).
    remove_path   : if True, the leading path is stripped from each result.

    Returns the names as a numpy array sorted with np.sort.
    '''
    names = []
    for f in glob.glob(path + pattern):
        # Strip only the leading prefix: a plain str.replace would also delete
        # later occurrences of `path` inside the remainder of the file name.
        if remove_path and f.startswith(path):
            f = f[len(path):]
        # Truthiness test so that both '' and None mean "remove nothing".
        if remove_string:
            f = f.replace(remove_string, '')
        names.append(f)
    return np.sort(names)

In [None]:
def loadrundataset(key,dataset, savenumber, trees, pre):
    '''
    Load the minitree data for every run in dataset, using a cache file when available.

    key        : label used in the cache file name.
    dataset    : object whose 'number' column (accessed as dataset['number'].values)
                 holds the run numbers to load.
    savenumber : integer used in the cache file name ('cachedata_<key>_<savenumber>.hdf5').
    trees      : 'normal', 'doublescatter' or 'all' to pick a predefined list of
                 treemakers; any other value is passed to hax as the treemakers.
    pre        : preselection passed to hax.minitrees.load.

    If the cache file already exists in cachefolder it is loaded and returned.
    Otherwise each run is loaded with hax.minitrees.load, runs that fail to
    load are reported and skipped, and the concatenated result is written to
    the cache file before being returned.

    Raises ValueError if no run could be loaded.
    '''
    # Build the cache path once; it is used for the check, load, message and save.
    cache_path = cachefolder + 'cachedata_%s_%d.hdf5' %(key,savenumber)

    if os.path.isfile(cache_path):
        _data = hax.minitrees.load_cache_file(cache_path)
        print ('Loaded cache file: %s' %(cache_path))
        return _data

    if trees == 'normal':
        tmakers = ['Fundamentals', 'Basics', 'Extended', 'Corrections']
    elif trees == 'doublescatter':
        tmakers = ['Fundamentals','FlashIdentification', 'Basics', 'Extended', 'Corrections','TailCut','TotalProperties', 'Proximity','PositionReconstruction','CorrectedDoubleS1Scatter']
    elif trees == 'all':
        tmakers = ['Fundamentals','FlashIdentification', 'Basics', 'Extended', 'Corrections','TailCut','TotalProperties', 'Proximity','PositionReconstruction']
    else:
        # Anything else is taken to be an explicit treemaker specification.
        tmakers = trees

    dfs = []
    run_numbers = dataset['number'].values
    for counter,dset in enumerate(run_numbers):
        try:
            df_temp = hax.minitrees.load(dset,
                                         treemakers = tmakers,
                                         preselection = pre)
        except Exception:
            # Best effort: skip runs that cannot be loaded. Catching Exception
            # (not a bare except) lets Ctrl-C / SystemExit still abort the loop.
            print('bad dataset: %d'%dset)
            continue
        dfs.append(df_temp)
        # Progress report, only emitted for successfully loaded runs.
        if counter %10 ==0:
            print (counter,'/',len(run_numbers))

    if not dfs:
        # pd.concat would fail on an empty list with a less helpful message.
        raise ValueError('No datasets could be loaded for %s' % key)

    _data = pd.concat(dfs)
    hax.minitrees.save_cache_file(_data, cache_path)
    return _data

In [None]:
def loadsingledataset(dataset, 
              tmakers = ['Fundamentals','FlashIdentification', 'Basics', 'Extended', 'Corrections','TailCut','TotalProperties', 'Proximity','PositionReconstruction'],
              pre = ['cs1 < 200', 'cs1 > 0', 'cs2 > 0']):
    '''
    Load the minitrees of every run listed in dataset and return them concatenated.

    dataset : object whose 'number' column (dataset['number'].values) holds
              the run numbers to load.
    tmakers : treemakers passed to hax.minitrees.load.
    pre     : preselection passed to hax.minitrees.load.

    Unlike loadrundataset, no cache file is used and a run that fails to load
    is not skipped: the exception propagates to the caller.
    '''
    numbers = dataset['number'].values
    total = len(numbers)
    frames = []
    for index, number in enumerate(numbers):
        # Progress report every tenth run, then the run being loaded.
        if not index % 10:
            print (index,'/',total)
        print ('Loading dataset nr:',number)
        frames.append(hax.minitrees.load(number,
                                         treemakers = tmakers,
                                         preselection = pre))
    return pd.concat(frames)

In [None]:
def process_runs(run_dict, savenumber, trees = 'normal', pre = ['cs1 < 200', 'cs1 > 0', 'cs2 > 0']):
    '''
    Load the data of every dataset in run_dict and store it next to the dataset.

    run_dict   : dictionary mapping a key to either a dataset or a list/tuple
                 whose first element is the dataset (e.g. the output of a
                 previous call to this function).
    savenumber : passed to loadrundataset (used there in the cache file name).
    trees      : passed to loadrundataset to select the treemakers.
    pre        : passed to loadrundataset as the preselection.

    The dictionary is modified in place so that each value becomes
    [dataset, data], and the same dictionary is returned.
    '''
    # Iterate over a snapshot of the keys since the values are reassigned below.
    for key in list(run_dict):
        entry = run_dict[key]
        # Accept both a bare dataset and an already wrapped [dataset, ...] entry.
        if isinstance(entry, (list, tuple)):
            _dataset = entry[0]
        else:
            _dataset = entry
        _data = loadrundataset(key,_dataset, trees = trees, pre = pre, savenumber = savenumber)
        # Tag both objects with their key.
        # NOTE(review): if these are pandas DataFrames with a 'name' column,
        # this assignment writes to that column - confirm this is intended.
        _dataset.name=key
        _data.name=key
        run_dict[key] = [_dataset,_data]
    return run_dict