# convert_idg_to_h5

This script converts the IDG MATLAB dataset to an .h5 dataset, similar to the SP and GPD datasets.

In [5]:
import os
import scipy.io
import obspy as obs
from pathlib import Path
import h5py as h5
import numpy as np

# Modifying sys.path to be able to load project packages
import sys
sys.path.append('../')

# Load project packages
from utils.h5_tools import write_batch

In [6]:
# Input directory (one file per recording) and output .h5 file
path_in = 'C:/data/datasets/data_for_ML/events_data/'
path_out = 'C:/data/datasets/data_for_ML_convertet/events.h5'

# Class label stored in 'Y' for every pick extracted from this dataset
label = 0

# Key chains used to reach the data record and the pick times
# inside each loaded .mat structure
data_keys = ['data_record', 0]
time_keys = ['time_in', 0]

# Switches detrending, high-pass filtering and normalization on or off
preprocess = True

# Sampling rate assigned to the raw trace, and the rate it is resampled to
in_df = 10000.0
out_df = 100.0

# Length of every extracted pick window, in samples
pick_length = 400

In [9]:
def get_content(data, keys):
    """
    Descends into a nested container, applying the given keys one after another.
    Usage Example:
        get_content(dataset, ["data_record", "Z"])
        gives the same object as dataset["data_record"]["Z"].
    With no keys at all, data itself is returned.
    """
    node = data
    for key in keys:
        # Each key indexes whatever the previous key produced
        node = node[key]
    return node


def add_channels(data, n_channels=3, normalize=True):
    """
    Converts a 1-D signal to a multi-channel array by duplicating it for each channel.

    The input array is never modified: normalization works on a new array, so
    the function is safe to call on a view (slice) of a longer trace.

    Arguments:
        data: 1-D array-like of samples (any real numeric dtype).
        n_channels: number of identical channels (columns) in the result.
        normalize: if True, divide the samples by their maximum value.

    Returns:
        float array of shape (n_samples, n_channels).
    """
    # Float conversion lets integer input be normalized as well
    samples = np.asarray(data, dtype=float)

    if normalize and samples.size:
        # NOTE: scales by the largest value, not by the largest absolute value
        peak = np.max(samples)
        # A zero peak would turn the whole pick into NaN/inf; leave it unscaled
        if peak != 0:
            # Out-of-place division: the caller's buffer stays untouched
            samples = samples / peak

    return np.repeat(samples[:, np.newaxis], n_channels, axis=1)


def convert_file(path_in, data_keys, time_keys, preprocess=True):
    """
    Loads one .mat file and cuts fixed-length, multi-channel picks out of its record.

    The record is wrapped in an ObsPy Trace with the module-level sampling rate
    `in_df`, optionally preprocessed, and resampled to the module-level `out_df`.
    For every entry of the time array, a window of `pick_length` samples
    (module-level) is cut around it and expanded with add_channels().

    Arguments:
        path_in: path to the .mat file.
        data_keys: series of keys locating the data record (see get_content).
        time_keys: series of keys locating the pick times (see get_content);
            presumably seconds from the start of the record - TODO confirm.
        preprocess: if True, apply a linear detrend and a 2 Hz high-pass filter
            before resampling, and normalize the trace afterwards.

    Returns:
        list with one array per pick time, as produced by add_channels().

    Raises:
        ValueError: if a pick is requested from a resampled trace that is
            shorter than `pick_length` samples.
    """
    dataset = scipy.io.loadmat(path_in)

    record = get_content(dataset, data_keys)
    pick_times = get_content(dataset, time_keys)

    trace = obs.Trace(record)
    trace.stats.sampling_rate = in_df

    if preprocess:
        trace.detrend(type="linear")
        trace.filter(type="highpass", freq=2.)

    trace.resample(out_df)

    if preprocess:
        trace.normalize()

    samples = trace.data
    n_total = len(samples)

    l_picks = []
    for t in pick_times:

        # Pick position converted to a sample index of the resampled trace
        n_sample = int(t * out_df)

        # Window centred on the pick, shifted right if it starts before the trace
        start = max(int(n_sample - pick_length / 2), 0)
        end = int(start + pick_length)

        # Window running past the end of the trace is shifted left instead
        if end > n_total:
            end = n_total
            start = int(end - pick_length)
            if start < 0:
                # A negative start would silently slice the wrong samples
                raise ValueError(
                    "Trace in '{}' has {} samples, fewer than pick_length={}".format(
                        path_in, n_total, pick_length
                    )
                )

        # Copy the window: add_channels may rescale its input, and a plain slice
        # is a view on the trace, which would corrupt later overlapping picks
        window = samples[start:end].copy()

        l_picks.append(add_channels(window))

    return l_picks

In [10]:
# Input file names, sorted so the order of picks in the output is reproducible
# (os.listdir returns entries in arbitrary order); sub-directories are skipped
files = sorted(
    name for name in os.listdir(path_in)
    if os.path.isfile(os.path.join(path_in, name))
)

# Create the directory that will hold the output file if it doesn't exist.
# NOTE(review): write_batch is a project helper; presumably it does not create
# missing parent directories itself - creating them here is harmless either way.
Path(path_out).parent.mkdir(parents=True, exist_ok=True)

# Collect the picks of every input file
X = []
for f_in in files:
    f_in = os.path.join(path_in, f_in)
    X.extend(convert_file(f_in, data_keys, time_keys, preprocess=preprocess))

# One label per pick; every pick of this dataset gets the same class label
X = np.array(X)
Y = np.full(X.shape[0], label, dtype=int)

write_batch(path_out, 'X', X)
write_batch(path_out, 'Y', Y)