# convert_idg_to_h5

This script converts the IDG MATLAB dataset to an .h5 dataset, in a form similar to the SP and GPD datasets.

In [1]:
import os
import scipy.io
import obspy as obs
from pathlib import Path
import h5py as h5
import numpy as np

# Modifying sys.path to be able to load project packages
import sys
sys.path.append('../')

# Load project packages
from utils.h5_tools import write_batch

In [4]:
# --- Input / output locations ---
path_in = 'C:/data/datasets/data_for_ML/events_data/'  # directory with the input .mat files
path_out = 'C:/data/datasets/data_for_ML_convertet/events.h5'  # resulting .h5 file

# --- Where to find the waveform and the pick times inside a loaded .mat file ---
# Each list is a chain of keys/indices applied one after another.
data_keys = ['data_record', 0]
time_keys = ['time_in', 0]

# --- Labelling ---
label = 0  # value stored in 'Y' for every extracted pick

# --- Signal processing ---
preprocess = True  # detrend, highpass-filter and normalize each record

in_df = 10000.0  # sampling rate assigned to the input record
out_df = 100.0  # sampling rate the record is resampled to

pick_length = 400  # length of each extracted window, in samples

In [13]:
def get_content(data, keys):
    """
    Walks into a nested container by applying every key in turn.

    Arguments:
        data -- nested container (anything that supports indexing with `[]`).
        keys -- iterable of keys/indices, applied from first to last.

    Returns:
        The element reached after the last key; `data` itself if `keys` is empty.

    Example:
        get_content(dataset, ["data_record", "Z"]) is the same as
        dataset["data_record"]["Z"].
    """
    node = data
    for key in keys:
        # Descend one level per key
        node = node[key]
    return node


def add_channels(data, n_channels=3):
    """
    Builds a multi-channel array by copying the same data into every channel.

    Arguments:
        data       -- array whose first dimension is the sample axis.
        n_channels -- number of channels (columns) in the result, 3 by default.

    Returns:
        Float array of shape (n_samples, n_channels); every column holds a copy of `data`.
    """
    stacked = np.zeros((data.shape[0], n_channels))
    # Rows of the transposed view are the columns of `stacked`,
    # so writing into them fills the channels in place.
    for channel in stacked.T:
        channel[:] = data[:]
    return stacked


def convert_file(path_in, data_keys, time_keys, preprocess=True):
    """
    Cuts fixed-length pick windows out of the record stored in one .mat file.

    The record is wrapped in an obspy Trace with sampling rate `in_df`, optionally
    detrended and highpass-filtered, resampled to `out_df`, optionally normalized,
    and then one window of `pick_length` samples is cut around every pick time.
    `in_df`, `out_df` and `pick_length` are module-level settings.

    Arguments:
        path_in    -- path to the .mat file.
        data_keys  -- chain of keys/indices leading to the waveform inside the
                      loaded .mat content (see get_content).
        time_keys  -- chain of keys/indices leading to the pick times. Each time is
                      multiplied by `out_df` to get a sample position, so it is
                      presumably given in seconds from the start of the record.
        preprocess -- if True, apply a linear detrend and a highpass filter (freq=2.)
                      before resampling, and normalize the trace afterwards.

    Returns:
        List with one multi-channel array (built by add_channels) per pick time.
        The list is empty if the resampled record is shorter than `pick_length`.
    """
    import warnings

    dataset = scipy.io.loadmat(path_in)

    # Extract the waveform and the pick times from the loaded content
    data = get_content(dataset, data_keys)
    time = get_content(dataset, time_keys)

    trace = obs.Trace(data)
    trace.stats.sampling_rate = in_df

    if preprocess:
        trace.detrend(type="linear")
        trace.filter(type="highpass", freq=2.)

    trace.resample(out_df)

    # Normalization is done after resampling, on the final samples
    if preprocess:
        trace.normalize()

    data = trace.data
    n_total = len(data)

    # A full window cannot be cut from a record shorter than the window itself;
    # clamping would produce a negative start index and a wrong-length slice.
    if n_total < pick_length:
        warnings.warn(
            "Record in {} has {} samples after resampling, fewer than pick_length={}; "
            "no picks extracted.".format(path_in, n_total, pick_length)
        )
        return []

    l_picks = []
    for t in time:

        # Pick position in samples of the resampled trace
        n_sample = int(t * out_df)

        # Center the window on the pick, shifting it inside the record if it
        # would stick out at either end
        start = max(int(n_sample - pick_length / 2), 0)
        end = int(start + pick_length)
        if end > n_total:
            end = n_total
            start = int(end - pick_length)

        l_picks.append(add_channels(data[start:end]))

    return l_picks

In [None]:
# Collect input file names in sorted order: os.listdir returns entries in
# arbitrary order, which would make the sample order in the output file
# differ from run to run.
files = sorted(os.listdir(path_in))

# Create the directory of the output file if it does not exist yet
Path(path_out).parent.mkdir(parents=True, exist_ok=True)

# Gather pick windows from every input file
X = []
for f_in in files:
    f_in = os.path.join(path_in, f_in)
    if not os.path.isfile(f_in):
        continue  # skip sub-directories, they cannot be loaded as .mat files
    X.extend(convert_file(f_in, data_keys, time_keys, preprocess=preprocess))

# Stack picks into one array and give every pick the same label
X = np.array(X)
Y = np.full(X.shape[0], label, dtype=int)

write_batch(path_out, 'X', X)
write_batch(path_out, 'Y', Y)

s 4513; e 4913; t 47.1319; n_sample 4713; len 6000;
s 401; e 801; t 6.0117; n_sample 601; len 6000;
s 2078; e 2478; t 22.7863; n_sample 2278; len 6000;
s 212; e 612; t 4.1254; n_sample 412; len 6000;
s 4512; e 4912; t 47.1244; n_sample 4712; len 6000;
s 400; e 800; t 6.0038; n_sample 600; len 6000;
s 2077; e 2477; t 22.7785; n_sample 2277; len 6000;
s 211; e 611; t 4.1163; n_sample 411; len 6000;
s 4512; e 4912; t 47.1215; n_sample 4712; len 6000;
s 400; e 800; t 6.0001; n_sample 600; len 6000;
s 2077; e 2477; t 22.7743; n_sample 2277; len 6000;
s 211; e 611; t 4.1127; n_sample 411; len 6000;
s 877; e 1277; t 10.7741; n_sample 1077; len 6000;
s 3633; e 4033; t 38.3365; n_sample 3833; len 6000;
s 5333; e 5733; t 55.3326; n_sample 5533; len 6000;
s 5379; e 5779; t 55.7988; n_sample 5579; len 6000;
s 3015; e 3415; t 32.1526; n_sample 3215; len 6000;
s 876; e 1276; t 10.7673; n_sample 1076; len 6000;
s 3632; e 4032; t 38.329; n_sample 3832; len 6000;
s 5332; e 5732; t 55.3235; n_sample 553

s 128; e 528; t 3.2864; n_sample 328; len 6000;
s 1608; e 2008; t 18.0801; n_sample 1808; len 6000;
s 1148; e 1548; t 13.4837; n_sample 1348; len 6000;
s 5335; e 5735; t 55.3544; n_sample 5535; len 6000;
s 2780; e 3180; t 29.8053; n_sample 2980; len 6000;
s 2688; e 3088; t 28.8805; n_sample 2888; len 6000;
s 0; e 400; t 0.4855; n_sample 48; len 6000;
s 956; e 1356; t 11.5633; n_sample 1156; len 6000;
s 1881; e 2281; t 20.8198; n_sample 2081; len 6000;
s 2871; e 3271; t 30.7123; n_sample 3071; len 6000;
s 88; e 488; t 2.8812; n_sample 288; len 6000;
s 127; e 527; t 3.2791; n_sample 327; len 6000;
s 1607; e 2007; t 18.0727; n_sample 1807; len 6000;
s 1147; e 1547; t 13.4759; n_sample 1347; len 6000;
s 5334; e 5734; t 55.3486; n_sample 5534; len 6000;
s 2779; e 3179; t 29.7967; n_sample 2979; len 6000;
s 2687; e 3087; t 28.8728; n_sample 2887; len 6000;
s 0; e 400; t 0.4843; n_sample 48; len 6000;
s 955; e 1355; t 11.5597; n_sample 1155; len 6000;
s 1882; e 2282; t 20.8287; n_sample 2082;