# merge_seisan_like_sets

This script takes two seisan-like datasets and merges them into a new set.

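This script works only with single-level (flat) sets, i.e. files whose datasets all sit at the top level. Merging is performed either on the keys that are present in both files or on the key lists specified in the `# Init parameters` section.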

In [1]:
import h5py as h5
import numpy as np

from utils.h5_tools import write_batch

In [2]:
# Init parameters

# Input files: the two sets to be merged
path_1 = 'data/2014_2019_global_norm.h5'
path_2 = 'data/2020_2021_global_norm.h5'

# Output file: destination of the merged set
save_path = 'data/2014_2021_global_norm.h5'

# Maximum number of rows transferred per write
batch_size = 20000

# Optional explicit key lists, paired by position (keys_1[i] <-> keys_2[i]).
# Leave both as None to merge every key found in both files.
keys_1 = None  # example: ['X', 'Y', 'ID']
keys_2 = None  # example: ['X', 'Y', 'Z']

In [3]:
# Read data
f1 = h5.File(path_1, 'r')
f2 = h5.File(path_2, 'r')

# Read sets
# sets_1[i] and sets_2[i] are merged together and stored under sets_keys[i].
sets_1 = []
sets_2 = []
sets_keys = keys_1

if keys_1 and keys_2:

    # Explicit key lists are paired by position, so they must be the same
    # length. Check up front: otherwise the mismatch only surfaces during the
    # transfer (or extra keys are silently ignored), after the output file
    # has already been partly written.
    if len(keys_1) != len(keys_2):
        f1.close()
        f2.close()
        raise ValueError(
            'keys_1 and keys_2 must have the same length, got {} and {}'.format(
                len(keys_1), len(keys_2)))

    # The merged datasets are stored under the names given in keys_1.
    sets_1 = [f1[k] for k in keys_1]
    sets_2 = [f2[k] for k in keys_2]

else:

    # No complete pair of key lists was given: merge every top-level key
    # that exists in both files.
    sets_keys = []
    for k1 in f1.keys():
        if k1 in f2:

            sets_1.append(f1[k1])
            sets_2.append(f2[k1])

            sets_keys.append(k1)

## Sets info:

In [4]:
# Notebook display: show the dataset handles collected from each input file
sets_1, sets_2

([<HDF5 dataset "ID": shape (7446,), type "|O">,
  <HDF5 dataset "X": shape (7446, 400, 3), type "<f8">,
  <HDF5 dataset "Y": shape (7446,), type "<f8">],
 [<HDF5 dataset "ID": shape (1331,), type "|O">,
  <HDF5 dataset "X": shape (1331, 400, 3), type "<f8">,
  <HDF5 dataset "Y": shape (1331,), type "<f8">])

## Merge:

In [5]:
def transfer_set_list(sets, sets_keys, batch_size, save_path):
    """Transfer datasets to ``save_path`` in batches along the first axis.

    For every key ``sets_keys[i]`` the dataset ``sets[i]`` is read in
    consecutive slices of at most ``batch_size`` rows, and each slice is
    passed to ``write_batch`` together with ``save_path`` and the key.
    Datasets whose dtype is the NumPy object dtype are written with
    ``string=True``.

    Args:
        sets: Indexable collection of datasets; each must expose ``shape``,
            ``dtype`` and support slicing along its first axis.
        sets_keys: Destination key for each dataset, aligned with ``sets``
            by position.
        batch_size: Maximum number of rows per ``write_batch`` call; must be
            a positive integer.
        save_path: Output path forwarded unchanged to ``write_batch``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(
            'batch_size must be a positive integer, got {!r}'.format(batch_size))

    string_type = np.dtype(object)  # data type of string NumPy arrays

    for i, k in enumerate(sets_keys):

        s = sets[i]
        s_length = s.shape[0]

        # The dtype does not change between batches, so test it only once.
        is_string = s.dtype == string_type

        # Stepping by batch_size visits every row exactly once. The final
        # batch is simply shorter when s_length is not a multiple of
        # batch_size, and is a full batch when it is an exact multiple.
        for start_pos in range(0, s_length, batch_size):

            end_pos = min(start_pos + batch_size, s_length)
            c_data = s[start_pos:end_pos]

            if is_string:
                write_batch(save_path, k, c_data, string=True)
            else:
                write_batch(save_path, k, c_data)

In [6]:
# Transfer the first file's datasets, then the second's, to the same output
# path under the same keys (write_batch is assumed to append to an existing
# key -- confirm in utils.h5_tools).
for source_sets in (sets_1, sets_2):
    transfer_set_list(source_sets, sets_keys, batch_size, save_path)

In [7]:
# Release both input file handles now that the transfer is finished
for handle in (f1, f2):
    handle.close()