In [1]:
import os
import sys
import time
from pathlib import Path

import h5py
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from PIL import Image

from col_dtypes import ColDataTypes

In [2]:
data_folder = "../../data/" # root data folder
dpi = 50 # spectrogram image resolution

chunk_id = 2 # chunk 1 is noise, chunks 2-6 are earthquake signals
assert chunk_id > 0 and chunk_id <= 6

In [3]:
col_dtypes = ColDataTypes()
dtypes = col_dtypes.get_initial_dtype_dict()
date_cols = col_dtypes.get_date_cols()

csv_pth = os.path.join(data_folder, f'raw/chunk{chunk_id}/chunk{chunk_id}.csv') # chunk metadata
eqpath = os.path.join(data_folder, f'raw/chunk{chunk_id}/chunk{chunk_id}.hdf5') # chunk earthquake data

chunk = pd.read_csv(csv_pth, dtype=dtypes, parse_dates=date_cols, encoding='utf-8')
print(eqpath)

processed_csv_save_pth = os.path.join(data_folder, f'raw/chunk{chunk_id}/chunk{chunk_id}_processed.csv') # processed metadata save pth

chunk.columns

../../data/raw/chunk2/chunk2.hdf5


Index(['network_code', 'receiver_code', 'receiver_type', 'receiver_latitude',
       'receiver_longitude', 'receiver_elevation_m', 'p_arrival_sample',
       'p_status', 'p_weight', 'p_travel_sec', 's_arrival_sample', 's_status',
       's_weight', 'source_id', 'source_origin_time',
       'source_origin_uncertainty_sec', 'source_latitude', 'source_longitude',
       'source_error_sec', 'source_gap_deg',
       'source_horizontal_uncertainty_km', 'source_depth_km',
       'source_depth_uncertainty_km', 'source_magnitude',
       'source_magnitude_type', 'source_magnitude_author',
       'source_mechanism_strike_dip_rake', 'source_distance_deg',
       'source_distance_km', 'back_azimuth_deg', 'snr_db', 'coda_end_sample',
       'trace_start_time', 'trace_category', 'trace_name'],
      dtype='object')

# Data Cleaning

In [4]:
# Get chunk w relevent columns
arrival_df = chunk
# Convert 'coda_end_sample' to int
arrival_df.loc[:, ['coda_end_sample']] = arrival_df['coda_end_sample'].apply(lambda x: int(x[2:-3]))
arrival_df.astype({
    'p_arrival_sample': 'int16',
    's_arrival_sample': 'int16',
    })
# Check if 'p_status' == 's_status'
arrival_df.loc[:, ['same_status?']] = arrival_df[['p_status', 's_status']].apply(lambda x: x['p_status'] == x['s_status'], axis=1)

# New columns

In [5]:
arrival_df.loc[:, ['p_duration']] = arrival_df.apply(lambda col: col['s_arrival_sample'] - col['p_arrival_sample'], axis=1)
arrival_df.loc[:, ['s_duration']] = arrival_df.apply(lambda col: col['coda_end_sample'] - col['s_arrival_sample'], axis=1)

In [6]:
# Find min/max p and s wave durations
arrival_df[['p_duration', 's_duration']].describe()

Unnamed: 0,p_duration,s_duration
count,200000.0,200000.0
mean,543.476347,959.112945
std,438.93768,630.377232
min,1.0,8.0
25%,246.0,531.0
50%,408.0,758.0
75%,699.0,1156.0
max,4744.0,4859.0


In [7]:
# only accept waveforms that satisfy 'min_acceptable_duration' 
min_acceptable_duration = 500 # 100 == 1 second
acceptable_duration_cond = (arrival_df['p_duration'] >= min_acceptable_duration) \
                           & (arrival_df['s_duration'] >= min_acceptable_duration)
arrival_df = arrival_df.loc[acceptable_duration_cond, :].reset_index(drop=True)

arrival_df.loc[:, ['p_duration', 's_duration']]

Unnamed: 0,p_duration,s_duration
0,1194.0,1002.0
1,1163.0,3745.0
2,1178.0,1436.0
3,1186.0,1066.0
4,1197.0,1237.0
...,...,...
81745,612.0,960.0
81746,686.0,1032.0
81747,848.0,1034.0
81748,752.0,682.0


In [8]:
# Save resultant df
arrival_df.to_csv(processed_csv_save_pth, index=False)
arrival_df.loc[:, ['p_arrival_sample', 's_arrival_sample', 'p_status', 's_status', 'coda_end_sample']]

Unnamed: 0,p_arrival_sample,s_arrival_sample,p_status,s_status,coda_end_sample
0,700.0,1894.0,manual,manual,2896
1,600.0,1763.0,manual,manual,5508
2,500.0,1678.0,manual,manual,3114
3,900.0,2086.0,manual,manual,3152
4,700.0,1897.0,manual,manual,3134
...,...,...,...,...,...
81745,700.0,1312.0,manual,manual,2272
81746,598.0,1284.0,autopicker,autopicker,2316
81747,500.0,1348.0,manual,manual,2382
81748,692.0,1444.0,autopicker,autopicker,2126
