# merge_shuffle_dataset

Merges (and shuffles) together multiple datasets in SP format.

In [1]:
import random
import h5py as h5
import numpy as np

# Modifying sys.path to be able to load project packages
import sys
sys.path.append('../')

# Load project packages
from utils.h5_tools import write_batch

In [2]:
# Destination HDF5 file that receives the merged dataset.
out = 'C:/data/datasets/combined_fixed.h5'
# Number of traces buffered in memory before they are flushed with write_batch.
batch_size = 100000
# A progress line is printed every `report_step` converted traces.
report_step = 100

# Seed for NumPy's global random generator (index selection and shuffling).
seed = 42

# Shape of a single trace: (n_samples, n_channels).
n_samples = 400
n_channels = 3

# Numeric label value -> human-readable class name.
label_aliases = {0: 'p_wave', 1: 's_wave', 2: 'noise'}

`datasets` is a list of dictionaries; each dictionary describes one dataset to include in the merge.

Dataset dictionary fields:
- `data_key` - data key, default "X"
- `label_key` - label key, default "Y", set it to `False` if you do not want to use labels (must specify `force_label`)
- `path` - path to the dataset file
- `normalize` - boolean; if set to `True`, each record is normalized by its maximum absolute value
- `force_label` - if set, this value is assigned as the label of every record, regardless of the original label value
- `select_label` - if set (numerical value), only records with that label value are used for the merge
- `length` - if set to an integer, only that number of entries is selected from the dataset (`-1` or unset selects everything)

In [3]:
# Source datasets for the merge, one dictionary per file.
# The first two keep their stored labels; the last one has every record
# relabelled as 2 ('noise' in label_aliases) through `force_label`.
datasets = [
    dict(path='C:/data/datasets/stead_converted.h5', normalize=True),
    dict(path='C:/data/datasets/scsn_ps_2000_2017_shuf.hdf5', normalize=True),
    dict(path='C:/data/datasets/meier_converted_noise.h5', normalize=True, force_label=2),
]

In [4]:
# Process default values and errors.
# Every dataset dictionary is validated and completed in place so that the
# following cells can read 'data_key', 'label_key' and 'normalize' without
# checking for their presence. Re-running this cell is harmless.
for x in datasets:
    if 'select_label' in x and x['select_label'] not in label_aliases:
        raise AttributeError(f'Dataset {x["path"]} specified selection label {x["select_label"]} is not in label_aliases!')
    if 'label_key' in x and not x['label_key'] and 'force_label' not in x:
        raise AttributeError(f'Dataset {x["path"]}: provide either "label_key" or "force_label"')
    # Selecting by label needs the stored labels; without a label key the
    # index-generation cell would fail later with an opaque KeyError on 'Y'.
    if 'select_label' in x and 'label_key' in x and not x['label_key']:
        raise AttributeError(f'Dataset {x["path"]}: "select_label" requires a "label_key"')
    x.setdefault('data_key', 'X')
    x.setdefault('label_key', 'Y')
    # 'normalize' is read unconditionally further down; default to "off"
    # instead of failing with KeyError when it is omitted.
    x.setdefault('normalize', False)

## Open datasets

In [5]:
# Open every source file, collect per-dataset statistics and resolve how many
# records each dataset contributes to the merged output.
actual_total_length = 0  # records present in all source files together
total_length = 0         # records selected for the merged output

print('Datasets info: ')
for x in datasets:

    # The file handles are kept open on purpose: the following cells read
    # records through x['X'] / x['Y'], and the conversion cell closes them.
    x['file'] = h5.File(x['path'], 'r')
    x['X'] = x['file'][x['data_key']]
    if x['label_key']:
        x['Y'] = x['file'][x['label_key']]
        x['actual_length'] = x['Y'].shape[0]
    else:
        x['actual_length'] = x['X'].shape[0]

    # Count the records of every known label. The label column is read from
    # disk once and counted in memory, rather than re-read for each alias.
    labels_count = {}
    if x['label_key']:
        y_values = x['Y'][:]
        for k in label_aliases:
            l_count = int(np.count_nonzero(y_values == k))
            if l_count:
                labels_count[k] = l_count
    else:
        # No stored labels: every record carries the forced label.
        labels_count[x['force_label']] = x['actual_length']

    # Number of records that are eligible for selection.
    if 'select_label' in x:
        # .get(): a label with no records is absent from labels_count.
        x['actual_label_length'] = labels_count.get(x['select_label'], 0)
        available = x['actual_label_length']
    else:
        available = x['actual_length']

    # An unset (or -1) length means "everything eligible". With a
    # select_label that is the label's record count, not the whole dataset;
    # otherwise the check below would reject any mixed-label dataset.
    if 'length' not in x or x['length'] == -1:
        x['length'] = available
    elif x['length'] > x['actual_length']:
        raise AttributeError(f'Dataset {x["path"]} specified selection length '\
                             f'{x["length"]} is larger than actual dataset length {x["actual_length"]}')
    elif x['length'] > available:
        # Only reachable with a select_label (otherwise available equals
        # actual_length and the previous branch has already fired).
        raise AttributeError(f'Dataset {x["path"]} specified selection length '\
                             f'{x["length"]} is larger than actual label {label_aliases[x["select_label"]]} '\
                             f' items count {x["actual_label_length"]}')

    actual_total_length += x['actual_length']
    total_length += x['length']

    print(f'\ndataset "{x["path"]}:"')
    print('  --data info--')
    print(f'\t---length: {x["actual_length"]}')
    for key, count in labels_count.items():
        print(f'\t---{label_aliases[key]} count: {count}')

    print(' --selection--')
    print(f'\t---selection length: {x["length"]}')
    if 'select_label' in x:
        print(f'\t---select label: {label_aliases[x["select_label"]]}')
    else:
        print('\t---select label: all')
    if 'force_label' in x:
        print(f'\t---force_label: {x["force_label"]}')

    print('  --data processing--')
    # .get(): 'normalize' is optional, a missing key means no normalization.
    if x.get('normalize', False):
        print('\t---normalize!')

print(f'Total length: {total_length}')

Datasets info: 

dataset "C:/data/datasets/stead_converted.h5:"
  --data info--
	---length: 2531314
	---p_wave count: 1030231
	---s_wave count: 1030231
	---noise count: 470852
 --selection--
	---selection length: 2531314
	---select label: all
  --data processing--
	---normalize!

dataset "C:/data/datasets/scsn_ps_2000_2017_shuf.hdf5:"
  --data info--
	---length: 4773750
	---p_wave count: 1591250
	---s_wave count: 1591250
	---noise count: 1591250
 --selection--
	---selection length: 4773750
	---select label: all
  --data processing--
	---normalize!

dataset "C:/data/datasets/meier_converted_noise.h5:"
  --data info--
	---length: 945571
	---noise count: 945571
 --selection--
	---selection length: 945571
	---select label: all
	---force_label: 2
  --data processing--
	---normalize!
Total length: 8250635


## Generate and shuffle indexes

In [6]:
# Build the merge plan: one row per output record, holding
# (record index inside its source dataset, position of that dataset in `datasets`).
np.random.seed(seed)
new_labels = np.zeros((total_length, 2), dtype=int)

offset = 0
for ds_idx, ds in enumerate(datasets):

    wanted = ds['length']
    candidates = np.arange(ds['actual_length'])

    # Restrict the candidates to the requested label, if any, and remember
    # how many records are eligible.
    if 'select_label' in ds:
        candidates = candidates[ds['Y'][:] == ds['select_label']]
        pool_size = ds['actual_label_length']
    else:
        pool_size = ds['actual_length']

    # Draw a random subset only when more records are eligible than requested.
    if pool_size > wanted:
        np.random.shuffle(candidates)
        candidates = candidates[:wanted]

    rows = slice(offset, offset + wanted)
    new_labels[rows, 0] = candidates
    new_labels[rows, 1] = ds_idx
    offset += wanted

# Shuffles whole rows, so every record index stays paired with its dataset.
np.random.shuffle(new_labels)

## Convert

In [None]:
# Stream the records in the shuffled order of `new_labels` into in-memory
# buffers and flush them to `out` every `batch_size` records.
# NOTE(review): write_batch is a project helper not visible here; presumably it
# appends to `out`, so re-running this cell against an existing output file
# may duplicate data - confirm before re-running.
batch_X = np.zeros((batch_size, n_samples, n_channels))
batch_Y = np.zeros(batch_size, dtype=int)

current_idx = 0  # fill level of the current batch
total = 0        # records converted so far

try:
    for i in range(total_length):

        d_point = new_labels[i, 0]            # record index inside the source dataset
        d_data = datasets[new_labels[i, 1]]   # source dataset dictionary
        X = d_data['X'][d_point]

        if 'force_label' in d_data:
            label = int(d_data['force_label'])
        else:
            label = int(d_data['Y'][d_point])

        if d_data.get('normalize', False):
            a_max = np.max(np.abs(X))
            # Skip all-zero traces (division would yield NaN) and divide out of
            # place: an in-place `/=` raises for integer-typed traces.
            if a_max > 0:
                X = X / a_max

        batch_X[current_idx] = X
        batch_Y[current_idx] = label
        current_idx += 1
        total += 1

        if current_idx >= batch_size:
            write_batch(out, 'X', batch_X)
            write_batch(out, 'Y', batch_Y)
            print(f'---Batch written to: {out}.. Saved total: {total} traces.')
            current_idx = 0

        if not total%report_step:
            print(f'\tConverted {total} traces.')

    # Flush the last, partially filled batch.
    if current_idx != 0:
        write_batch(out, 'X', batch_X[:current_idx])
        write_batch(out, 'Y', batch_Y[:current_idx])
        print(f'---Batch written to: {out}.. Saved total: {total} traces.')
finally:
    # Release the source files even if the conversion fails or is interrupted.
    for x in datasets:
        source_file = x.get('file')
        if source_file is not None:
            source_file.close()

	Converted 100 traces.
	Converted 200 traces.
	Converted 300 traces.
	Converted 400 traces.
	Converted 500 traces.
	Converted 600 traces.
	Converted 700 traces.
	Converted 800 traces.
	Converted 900 traces.
	Converted 1000 traces.
	Converted 1100 traces.
	Converted 1200 traces.
	Converted 1300 traces.
	Converted 1400 traces.
	Converted 1500 traces.
	Converted 1600 traces.
	Converted 1700 traces.
	Converted 1800 traces.
	Converted 1900 traces.
	Converted 2000 traces.
	Converted 2100 traces.
	Converted 2200 traces.
	Converted 2300 traces.
	Converted 2400 traces.
	Converted 2500 traces.
	Converted 2600 traces.
	Converted 2700 traces.
	Converted 2800 traces.
	Converted 2900 traces.
	Converted 3000 traces.
	Converted 3100 traces.
	Converted 3200 traces.
	Converted 3300 traces.
	Converted 3400 traces.
	Converted 3500 traces.
	Converted 3600 traces.
	Converted 3700 traces.
	Converted 3800 traces.
	Converted 3900 traces.
	Converted 4000 traces.
	Converted 4100 traces.
	Converted 4200 traces.
	

	Converted 33400 traces.
	Converted 33500 traces.
	Converted 33600 traces.
	Converted 33700 traces.
	Converted 33800 traces.
	Converted 33900 traces.
	Converted 34000 traces.
	Converted 34100 traces.
	Converted 34200 traces.
	Converted 34300 traces.
	Converted 34400 traces.
	Converted 34500 traces.
	Converted 34600 traces.
	Converted 34700 traces.
	Converted 34800 traces.
	Converted 34900 traces.
	Converted 35000 traces.
	Converted 35100 traces.
	Converted 35200 traces.
	Converted 35300 traces.
	Converted 35400 traces.
	Converted 35500 traces.
	Converted 35600 traces.
	Converted 35700 traces.
	Converted 35800 traces.
	Converted 35900 traces.
	Converted 36000 traces.
	Converted 36100 traces.
	Converted 36200 traces.
	Converted 36300 traces.
	Converted 36400 traces.
	Converted 36500 traces.
	Converted 36600 traces.
	Converted 36700 traces.
	Converted 36800 traces.
	Converted 36900 traces.
	Converted 37000 traces.
	Converted 37100 traces.
	Converted 37200 traces.
	Converted 37300 traces.


	Converted 66200 traces.
	Converted 66300 traces.
	Converted 66400 traces.
	Converted 66500 traces.
	Converted 66600 traces.
	Converted 66700 traces.
	Converted 66800 traces.
	Converted 66900 traces.
	Converted 67000 traces.
	Converted 67100 traces.
	Converted 67200 traces.
	Converted 67300 traces.
	Converted 67400 traces.
	Converted 67500 traces.
	Converted 67600 traces.
	Converted 67700 traces.
	Converted 67800 traces.
	Converted 67900 traces.
	Converted 68000 traces.
	Converted 68100 traces.
	Converted 68200 traces.
	Converted 68300 traces.
	Converted 68400 traces.
	Converted 68500 traces.
	Converted 68600 traces.
	Converted 68700 traces.
	Converted 68800 traces.
	Converted 68900 traces.
	Converted 69000 traces.
	Converted 69100 traces.
	Converted 69200 traces.
	Converted 69300 traces.
	Converted 69400 traces.
	Converted 69500 traces.
	Converted 69600 traces.
	Converted 69700 traces.
	Converted 69800 traces.
	Converted 69900 traces.
	Converted 70000 traces.
	Converted 70100 traces.


	Converted 99000 traces.
	Converted 99100 traces.
	Converted 99200 traces.
	Converted 99300 traces.
	Converted 99400 traces.
	Converted 99500 traces.
	Converted 99600 traces.
	Converted 99700 traces.
	Converted 99800 traces.
	Converted 99900 traces.
---Batch written to: C:/data/datasets/combined_fixed.h5.. Saved total: 100000 traces.
	Converted 100000 traces.
	Converted 100100 traces.
	Converted 100200 traces.
	Converted 100300 traces.
	Converted 100400 traces.
	Converted 100500 traces.
	Converted 100600 traces.
	Converted 100700 traces.
	Converted 100800 traces.
	Converted 100900 traces.
	Converted 101000 traces.
	Converted 101100 traces.
	Converted 101200 traces.
	Converted 101300 traces.
	Converted 101400 traces.
	Converted 101500 traces.
	Converted 101600 traces.
	Converted 101700 traces.
	Converted 101800 traces.
	Converted 101900 traces.
	Converted 102000 traces.
	Converted 102100 traces.
	Converted 102200 traces.
	Converted 102300 traces.
	Converted 102400 traces.
	Converted 102

	Converted 130300 traces.
	Converted 130400 traces.
	Converted 130500 traces.
	Converted 130600 traces.
	Converted 130700 traces.
	Converted 130800 traces.
	Converted 130900 traces.
	Converted 131000 traces.
	Converted 131100 traces.
	Converted 131200 traces.
	Converted 131300 traces.
	Converted 131400 traces.
	Converted 131500 traces.
	Converted 131600 traces.
	Converted 131700 traces.
	Converted 131800 traces.
	Converted 131900 traces.
	Converted 132000 traces.
	Converted 132100 traces.
	Converted 132200 traces.
	Converted 132300 traces.
	Converted 132400 traces.
	Converted 132500 traces.
	Converted 132600 traces.
	Converted 132700 traces.
	Converted 132800 traces.
	Converted 132900 traces.
	Converted 133000 traces.
	Converted 133100 traces.
	Converted 133200 traces.
	Converted 133300 traces.
	Converted 133400 traces.
	Converted 133500 traces.
	Converted 133600 traces.
	Converted 133700 traces.
	Converted 133800 traces.
	Converted 133900 traces.
	Converted 134000 traces.
	Converted 1

	Converted 161900 traces.
	Converted 162000 traces.
	Converted 162100 traces.
	Converted 162200 traces.
	Converted 162300 traces.
	Converted 162400 traces.
	Converted 162500 traces.
	Converted 162600 traces.
	Converted 162700 traces.
	Converted 162800 traces.
	Converted 162900 traces.
	Converted 163000 traces.
	Converted 163100 traces.
	Converted 163200 traces.
	Converted 163300 traces.
	Converted 163400 traces.
	Converted 163500 traces.
	Converted 163600 traces.
	Converted 163700 traces.
	Converted 163800 traces.
	Converted 163900 traces.
	Converted 164000 traces.
	Converted 164100 traces.
	Converted 164200 traces.
	Converted 164300 traces.
	Converted 164400 traces.
	Converted 164500 traces.
	Converted 164600 traces.
	Converted 164700 traces.
	Converted 164800 traces.
	Converted 164900 traces.
	Converted 165000 traces.
	Converted 165100 traces.
	Converted 165200 traces.
	Converted 165300 traces.
	Converted 165400 traces.
	Converted 165500 traces.
	Converted 165600 traces.
	Converted 1

	Converted 193500 traces.
	Converted 193600 traces.
	Converted 193700 traces.
	Converted 193800 traces.
	Converted 193900 traces.
	Converted 194000 traces.
	Converted 194100 traces.
	Converted 194200 traces.
	Converted 194300 traces.
	Converted 194400 traces.
	Converted 194500 traces.
	Converted 194600 traces.
	Converted 194700 traces.
	Converted 194800 traces.
	Converted 194900 traces.
	Converted 195000 traces.
	Converted 195100 traces.
	Converted 195200 traces.
	Converted 195300 traces.
	Converted 195400 traces.
	Converted 195500 traces.
	Converted 195600 traces.
	Converted 195700 traces.
	Converted 195800 traces.
	Converted 195900 traces.
	Converted 196000 traces.
	Converted 196100 traces.
	Converted 196200 traces.
	Converted 196300 traces.
	Converted 196400 traces.
	Converted 196500 traces.
	Converted 196600 traces.
	Converted 196700 traces.
	Converted 196800 traces.
	Converted 196900 traces.
	Converted 197000 traces.
	Converted 197100 traces.
	Converted 197200 traces.
	Converted 1

	Converted 224700 traces.
	Converted 224800 traces.
	Converted 224900 traces.
	Converted 225000 traces.
	Converted 225100 traces.
	Converted 225200 traces.
	Converted 225300 traces.
	Converted 225400 traces.
	Converted 225500 traces.
	Converted 225600 traces.
	Converted 225700 traces.
	Converted 225800 traces.
	Converted 225900 traces.
	Converted 226000 traces.
	Converted 226100 traces.
	Converted 226200 traces.
	Converted 226300 traces.
	Converted 226400 traces.
	Converted 226500 traces.
	Converted 226600 traces.
	Converted 226700 traces.
	Converted 226800 traces.
	Converted 226900 traces.
	Converted 227000 traces.
	Converted 227100 traces.
	Converted 227200 traces.
	Converted 227300 traces.
	Converted 227400 traces.
	Converted 227500 traces.
	Converted 227600 traces.
	Converted 227700 traces.
	Converted 227800 traces.
	Converted 227900 traces.
	Converted 228000 traces.
	Converted 228100 traces.
	Converted 228200 traces.
	Converted 228300 traces.
	Converted 228400 traces.
	Converted 2

	Converted 256300 traces.
	Converted 256400 traces.
	Converted 256500 traces.
	Converted 256600 traces.
	Converted 256700 traces.
	Converted 256800 traces.
	Converted 256900 traces.
	Converted 257000 traces.
	Converted 257100 traces.
	Converted 257200 traces.
	Converted 257300 traces.
	Converted 257400 traces.
	Converted 257500 traces.
	Converted 257600 traces.
	Converted 257700 traces.
	Converted 257800 traces.
	Converted 257900 traces.
	Converted 258000 traces.
	Converted 258100 traces.
	Converted 258200 traces.
	Converted 258300 traces.
	Converted 258400 traces.
	Converted 258500 traces.
	Converted 258600 traces.
	Converted 258700 traces.
	Converted 258800 traces.
	Converted 258900 traces.
	Converted 259000 traces.
	Converted 259100 traces.
	Converted 259200 traces.
	Converted 259300 traces.
	Converted 259400 traces.
	Converted 259500 traces.
	Converted 259600 traces.
	Converted 259700 traces.
	Converted 259800 traces.
	Converted 259900 traces.
	Converted 260000 traces.
	Converted 2

	Converted 287900 traces.
	Converted 288000 traces.
	Converted 288100 traces.
	Converted 288200 traces.
	Converted 288300 traces.
	Converted 288400 traces.
	Converted 288500 traces.
	Converted 288600 traces.
	Converted 288700 traces.
	Converted 288800 traces.
	Converted 288900 traces.
	Converted 289000 traces.
	Converted 289100 traces.
	Converted 289200 traces.
	Converted 289300 traces.
	Converted 289400 traces.
	Converted 289500 traces.
	Converted 289600 traces.
	Converted 289700 traces.
	Converted 289800 traces.
	Converted 289900 traces.
	Converted 290000 traces.
	Converted 290100 traces.
	Converted 290200 traces.
	Converted 290300 traces.
	Converted 290400 traces.
	Converted 290500 traces.
	Converted 290600 traces.
	Converted 290700 traces.
	Converted 290800 traces.
	Converted 290900 traces.
	Converted 291000 traces.
	Converted 291100 traces.
	Converted 291200 traces.
	Converted 291300 traces.
	Converted 291400 traces.
	Converted 291500 traces.
	Converted 291600 traces.
	Converted 2

	Converted 319100 traces.
	Converted 319200 traces.
	Converted 319300 traces.
	Converted 319400 traces.
	Converted 319500 traces.
	Converted 319600 traces.
	Converted 319700 traces.
	Converted 319800 traces.
	Converted 319900 traces.
	Converted 320000 traces.
	Converted 320100 traces.
	Converted 320200 traces.
	Converted 320300 traces.
	Converted 320400 traces.
	Converted 320500 traces.
	Converted 320600 traces.
	Converted 320700 traces.
	Converted 320800 traces.
	Converted 320900 traces.
	Converted 321000 traces.
	Converted 321100 traces.
	Converted 321200 traces.
	Converted 321300 traces.
	Converted 321400 traces.
	Converted 321500 traces.
	Converted 321600 traces.
	Converted 321700 traces.
	Converted 321800 traces.
	Converted 321900 traces.
	Converted 322000 traces.
	Converted 322100 traces.
	Converted 322200 traces.
	Converted 322300 traces.
	Converted 322400 traces.
	Converted 322500 traces.
	Converted 322600 traces.
	Converted 322700 traces.
	Converted 322800 traces.
	Converted 3

	Converted 350700 traces.
	Converted 350800 traces.
	Converted 350900 traces.
	Converted 351000 traces.
	Converted 351100 traces.
	Converted 351200 traces.
	Converted 351300 traces.
	Converted 351400 traces.
	Converted 351500 traces.
	Converted 351600 traces.
	Converted 351700 traces.
	Converted 351800 traces.
	Converted 351900 traces.
	Converted 352000 traces.
	Converted 352100 traces.
	Converted 352200 traces.
	Converted 352300 traces.
	Converted 352400 traces.
	Converted 352500 traces.
	Converted 352600 traces.
	Converted 352700 traces.
	Converted 352800 traces.
	Converted 352900 traces.
	Converted 353000 traces.
	Converted 353100 traces.
	Converted 353200 traces.
	Converted 353300 traces.
	Converted 353400 traces.
	Converted 353500 traces.
	Converted 353600 traces.
	Converted 353700 traces.
	Converted 353800 traces.
	Converted 353900 traces.
	Converted 354000 traces.
	Converted 354100 traces.
	Converted 354200 traces.
	Converted 354300 traces.
	Converted 354400 traces.
	Converted 3

	Converted 382300 traces.
	Converted 382400 traces.
	Converted 382500 traces.
	Converted 382600 traces.
	Converted 382700 traces.
	Converted 382800 traces.
	Converted 382900 traces.
	Converted 383000 traces.
	Converted 383100 traces.
	Converted 383200 traces.
	Converted 383300 traces.
	Converted 383400 traces.
	Converted 383500 traces.
	Converted 383600 traces.
	Converted 383700 traces.
	Converted 383800 traces.
	Converted 383900 traces.
	Converted 384000 traces.
	Converted 384100 traces.
	Converted 384200 traces.
	Converted 384300 traces.
	Converted 384400 traces.
	Converted 384500 traces.
	Converted 384600 traces.
	Converted 384700 traces.
	Converted 384800 traces.
	Converted 384900 traces.
	Converted 385000 traces.
	Converted 385100 traces.
	Converted 385200 traces.
	Converted 385300 traces.
	Converted 385400 traces.
	Converted 385500 traces.
	Converted 385600 traces.
	Converted 385700 traces.
	Converted 385800 traces.
	Converted 385900 traces.
	Converted 386000 traces.
	Converted 3

	Converted 413500 traces.
	Converted 413600 traces.
	Converted 413700 traces.
	Converted 413800 traces.
	Converted 413900 traces.
	Converted 414000 traces.
	Converted 414100 traces.
	Converted 414200 traces.
	Converted 414300 traces.
	Converted 414400 traces.
	Converted 414500 traces.
	Converted 414600 traces.
	Converted 414700 traces.
	Converted 414800 traces.
	Converted 414900 traces.
	Converted 415000 traces.
	Converted 415100 traces.
	Converted 415200 traces.
	Converted 415300 traces.
	Converted 415400 traces.
	Converted 415500 traces.
	Converted 415600 traces.
	Converted 415700 traces.
	Converted 415800 traces.
	Converted 415900 traces.
	Converted 416000 traces.
	Converted 416100 traces.
	Converted 416200 traces.
	Converted 416300 traces.
	Converted 416400 traces.
	Converted 416500 traces.
	Converted 416600 traces.
	Converted 416700 traces.
	Converted 416800 traces.
	Converted 416900 traces.
	Converted 417000 traces.
	Converted 417100 traces.
	Converted 417200 traces.
	Converted 4

	Converted 445100 traces.
	Converted 445200 traces.
	Converted 445300 traces.
	Converted 445400 traces.
	Converted 445500 traces.
	Converted 445600 traces.
	Converted 445700 traces.
	Converted 445800 traces.
	Converted 445900 traces.
	Converted 446000 traces.
	Converted 446100 traces.
	Converted 446200 traces.
	Converted 446300 traces.
	Converted 446400 traces.
	Converted 446500 traces.
	Converted 446600 traces.
	Converted 446700 traces.
	Converted 446800 traces.
	Converted 446900 traces.
	Converted 447000 traces.
	Converted 447100 traces.
	Converted 447200 traces.
	Converted 447300 traces.
	Converted 447400 traces.
	Converted 447500 traces.
	Converted 447600 traces.
	Converted 447700 traces.
	Converted 447800 traces.
	Converted 447900 traces.
	Converted 448000 traces.
	Converted 448100 traces.
	Converted 448200 traces.
