# convert_idg_mat_dataset

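This script converts a dataset organized as a folder containing a collection of .mat files (each with a field for the data and a field for the pick position in seconds) into miniSEED files, and writes a CSV file listing the picks.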

In [1]:
import os
import scipy.io
import obspy as obs
from pathlib import Path
import pandas as pd

In [2]:
# --- Locations -------------------------------------------------------------
path_in = 'C:/data/datasets/data_for_ML/events_data/'  # directory holding the input .mat files
path_out = 'C:/data/datasets/data_for_ML_convertet/events_data/'  # directory receiving all generated files
csv_file_out = 'picks.csv'  # name of the picks table written into path_out
ml_file = 'input.txt'  # name of the model input list written into path_out

# --- Layout of the .mat files ------------------------------------------------
# Series of keys (see get_content) leading to the waveform and to the pick time.
data_keys = ['data_record']
time_keys = ['time_in']

# --- Signal processing -------------------------------------------------------
preprocess = False  # forwarded to convert_file(preprocess=...)
in_df = 10000.  # sampling rate (Hz) assigned to every input trace
out_df = 100.  # sampling rate (Hz) the traces are resampled to

In [3]:
# TODO: check that path_out is not a file (this check is not implemented yet)

In [4]:
def generate_ml_input_file(path, df):
    """
    Generates input file for https://github.com/Syler1984/seismo-ml-models-integration.

    Every row of `df` produces one line in the output file: the value of the
    row's 'file' column written three times, separated by single spaces.
    (Presumably the consumer expects one path per channel and the same file
    is reused for all of them - confirm against the consumer's format.)

    Arguments:
        path -- path of the text file to create (an existing file is overwritten)
        df -- pandas DataFrame with a 'file' column

    Raises:
        KeyError -- if `df` has no 'file' column (raised before the file is opened).
    """
    # Iterate over the column itself: DataFrame.iterrows() builds a Series for
    # every row and may upcast values when the columns have different dtypes.
    lines = [f'{wave} {wave} {wave}\n' for wave in df['file']]

    with open(path, 'w') as f:
        f.writelines(lines)

In [5]:
def get_content(data, keys):
    """
    Descends into a nested container by applying each key of `keys` in order.

    Arguments:
        data -- outermost container (anything supporting `container[key]`)
        keys -- series of keys/indices; an empty series returns `data` itself

    Example:
        get_content(dataset, ["data_record", "Z"])
        is equivalent to dataset["data_record"]["Z"].
    """
    node = data
    for key in keys:
        # Each step replaces the current node with the child selected by key.
        node = node[key]
    return node

In [6]:
def convert_file(path_in, dir_out, data_keys, time_keys, preprocess=True, *, rate_in=None, rate_out=None):
    """
    Converts seismic data from .mat dataset to miniSEED format.

    The waveform is loaded from the .mat file, optionally preprocessed,
    resampled and written to `dir_out` as `<input file name without extension>.mseed`.

    Arguments:
        path_in -- path to the input .mat file
        dir_out -- directory the .mseed file is written to (it is not created here)
        data_keys -- series of keys locating the waveform in the loaded .mat content (see get_content)
        time_keys -- series of keys locating the pick time entry in the loaded .mat content
        preprocess -- if True: linear detrend and 2 Hz high-pass before resampling,
                      normalization after resampling
        rate_in -- sampling rate assigned to the input trace;
                   None (default) uses the module-level `in_df`
        rate_out -- sampling rate to resample to;
                    None (default) uses the module-level `out_df`

    Returns:
        dict with keys 'file' (path of the written miniSEED file) and
        'time' (the single entry of the time array, exactly as loaded).

    Raises:
        AttributeError -- if the data or the time array does not hold exactly one entry.
    """
    # Fall back to the notebook-wide settings so existing calls behave as before.
    if rate_in is None:
        rate_in = in_df
    if rate_out is None:
        rate_out = out_df

    dataset = scipy.io.loadmat(path_in)

    # Check if there are multiple data or time entries
    data = get_content(dataset, data_keys)
    time = get_content(dataset, time_keys)

    if len(data) != 1:
        raise AttributeError(f'In file "{path_in}" data array length is not equal one entry!')
    if len(time) != 1:
        raise AttributeError(f'In file "{path_in}" time array length is not equal one entry!')

    # Unwrap the single entry of each array.
    data = data[0]
    time = time[0]

    trace = obs.Trace(data)
    trace.stats.sampling_rate = rate_in

    if preprocess:
        trace.detrend(type="linear")
        trace.filter(type="highpass", freq=2.)

    trace.resample(rate_out)

    # Normalization is deliberately applied after resampling, as a separate step.
    if preprocess:
        trace.normalize()

    stream = obs.Stream(traces=[trace])

    # Generate output file path: same base name as the input, .mseed extension.
    # NOTE: names are not made unique, so inputs sharing a base name overwrite each other.
    base_name = os.path.splitext(os.path.basename(path_in))[0]
    path_out = os.path.join(dir_out, f'{base_name}.mseed')

    # Save stream
    stream.write(path_out, format='MSEED')

    return {'file': path_out, 'time': time}

In [7]:
# Load input file names. os.listdir returns entries in arbitrary order, so sort
# them to keep the row order of the generated outputs the same between runs.
files = sorted(os.listdir(path_in))
Path(path_out).mkdir(parents=True, exist_ok=True)  # create the output directory if it doesn't exist

In [8]:
# Start from an empty picks table with the two columns returned by convert_file: 'file' and 'time'.
df = pd.DataFrame({'file': [], 'time': []})

In [9]:
# Convert every input file and collect the returned {'file', 'time'} records.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 (and copied
# the whole table on every call), so the rows are gathered first and added to
# the existing table with a single concat.
records = [
    convert_file(os.path.join(path_in, f_in), path_out, data_keys, time_keys, preprocess=preprocess)
    for f_in in files
]
df = pd.concat([df, pd.DataFrame(records, columns=['file', 'time'])], ignore_index=True)

In [11]:
# Save the picks table into the output directory; index=False keeps the row index out of the CSV.
# NOTE(review): the 'time' cells are written in their string form - the read-back
# check in this notebook shows them as text such as "[ 47.1319]" - confirm that
# consumers of the CSV can parse that.
df_out = os.path.join(path_out, csv_file_out)
df.to_csv(path_or_buf=df_out, index=False)

In [12]:
# Write the model input list (one line per converted file) into the output directory.
ml_out = os.path.join(path_out, ml_file)
generate_ml_input_file(path=ml_out, df=df)

## Check .csv output

In [13]:
# Display the in-memory picks table as the cell output.
df

Unnamed: 0,file,time
0,C:/data/datasets/data_for_ML_convertet/events_...,[47.1319]
1,C:/data/datasets/data_for_ML_convertet/events_...,"[6.0117, 22.7863]"
2,C:/data/datasets/data_for_ML_convertet/events_...,[4.1254]
3,C:/data/datasets/data_for_ML_convertet/events_...,[47.1244]
4,C:/data/datasets/data_for_ML_convertet/events_...,"[6.0038, 22.7785]"
...,...,...
157,C:/data/datasets/data_for_ML_convertet/events_...,[39.552]
158,C:/data/datasets/data_for_ML_convertet/events_...,[0.7377]
159,C:/data/datasets/data_for_ML_convertet/events_...,[39.5442]
160,C:/data/datasets/data_for_ML_convertet/events_...,[0.7344]


In [14]:
# Read the saved CSV back and display it to inspect what was actually written.
df_save_test = pd.read_csv(df_out)
df_save_test

Unnamed: 0,file,time
0,C:/data/datasets/data_for_ML_convertet/events_...,[ 47.1319]
1,C:/data/datasets/data_for_ML_convertet/events_...,[ 6.0117 22.7863]
2,C:/data/datasets/data_for_ML_convertet/events_...,[ 4.1254]
3,C:/data/datasets/data_for_ML_convertet/events_...,[ 47.1244]
4,C:/data/datasets/data_for_ML_convertet/events_...,[ 6.0038 22.7785]
...,...,...
157,C:/data/datasets/data_for_ML_convertet/events_...,[ 39.552]
158,C:/data/datasets/data_for_ML_convertet/events_...,[ 0.7377]
159,C:/data/datasets/data_for_ML_convertet/events_...,[ 39.5442]
160,C:/data/datasets/data_for_ML_convertet/events_...,[ 0.7344]
