In [1]:
import os
import sys
import time
from pathlib import Path

import h5py
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from PIL import Image

from col_dtypes import ColDataTypes

In [2]:
data_folder = "../../data/" # root data folder
dpi = 50 # spectrogram image resolution

chunk_id = 4 # chunk 1 is noise, chunks 2-6 are earthquake signals
assert chunk_id > 0 and chunk_id <= 6

print(f'Analysing chunk {chunk_id}')

In [3]:
col_dtypes = ColDataTypes()
dtypes = col_dtypes.get_initial_dtype_dict()
date_cols = col_dtypes.get_date_cols()

csv_pth = os.path.join(data_folder, f'raw/chunk{chunk_id}/chunk{chunk_id}.csv') # chunk metadata
eqpath = os.path.join(data_folder, f'raw/chunk{chunk_id}/chunk{chunk_id}.hdf5') # chunk earthquake data

chunk = pd.read_csv(csv_pth, dtype=dtypes, parse_dates=date_cols, encoding='utf-8')
print(eqpath)

processed_csv_save_pth = os.path.join(data_folder, f'raw/chunk{chunk_id}/chunk{chunk_id}_processed.csv') # processed metadata save pth

chunk.columns

../../data/raw/chunk4/chunk4.hdf5


Index(['network_code', 'receiver_code', 'receiver_type', 'receiver_latitude',
       'receiver_longitude', 'receiver_elevation_m', 'p_arrival_sample',
       'p_status', 'p_weight', 'p_travel_sec', 's_arrival_sample', 's_status',
       's_weight', 'source_id', 'source_origin_time',
       'source_origin_uncertainty_sec', 'source_latitude', 'source_longitude',
       'source_error_sec', 'source_gap_deg',
       'source_horizontal_uncertainty_km', 'source_depth_km',
       'source_depth_uncertainty_km', 'source_magnitude',
       'source_magnitude_type', 'source_magnitude_author',
       'source_mechanism_strike_dip_rake', 'source_distance_deg',
       'source_distance_km', 'back_azimuth_deg', 'snr_db', 'coda_end_sample',
       'trace_start_time', 'trace_category', 'trace_name'],
      dtype='object')

# Data Cleaning

In [4]:
# Get chunk w relevent columns
arrival_df = chunk
# Convert 'coda_end_sample' to int
arrival_df.loc[:, ['coda_end_sample']] = arrival_df['coda_end_sample'].apply(lambda x: int(x[2:-3]))
arrival_df.astype({
    'p_arrival_sample': 'int16',
    's_arrival_sample': 'int16',
    })
# Check if 'p_status' == 's_status'
arrival_df.loc[:, ['same_status?']] = arrival_df[['p_status', 's_status']].apply(lambda x: x['p_status'] == x['s_status'], axis=1)

# New columns

In [5]:
arrival_df.loc[:, ['p_duration']] = arrival_df.apply(lambda col: col['s_arrival_sample'] - col['p_arrival_sample'], axis=1)
arrival_df.loc[:, ['s_duration']] = arrival_df.apply(lambda col: col['coda_end_sample'] - col['s_arrival_sample'], axis=1)

In [6]:
# Find min/max p and s wave durations
arrival_df[['p_duration', 's_duration']].describe()

Unnamed: 0,p_duration,s_duration
count,200000.0,200000.0
mean,553.252827,1187.294256
std,633.714357,696.185176
min,1.0,1.0
25%,72.0,753.9015
50%,336.0,992.1015
75%,845.48525,1387.8
max,4791.0,5046.0


In [7]:
# only accept waveforms that satisfy 'min_acceptable_duration' 
min_acceptable_duration = 500 # 100 == 1 second
acceptable_duration_cond = (arrival_df['p_duration'] >= min_acceptable_duration) \
                           & (arrival_df['s_duration'] >= min_acceptable_duration)
arrival_df = arrival_df.loc[acceptable_duration_cond, :].reset_index(drop=True)

arrival_df.loc[:, ['p_duration', 's_duration']]

Unnamed: 0,p_duration,s_duration
0,1299.0,1500.0
1,894.0,1805.0
2,1720.0,1252.0
3,1587.0,1011.0
4,1032.0,1767.0
...,...,...
81869,503.0,1425.0
81870,617.0,1156.0
81871,664.0,1132.0
81872,515.0,915.0


In [8]:
# Save resultant df
arrival_df.to_csv(processed_csv_save_pth, index=False)
arrival_df.loc[:, ['p_arrival_sample', 's_arrival_sample', 'p_status', 's_status', 'coda_end_sample']]

Unnamed: 0,p_arrival_sample,s_arrival_sample,p_status,s_status,coda_end_sample
0,400.0,1699.0,manual,manual,3199
1,500.0,1394.0,manual,manual,3199
2,700.0,2420.0,automatic,automatic,3672
3,1000.0,2587.0,automatic,automatic,3598
4,400.0,1432.0,manual,manual,3199
...,...,...,...,...,...
81869,600.0,1103.0,manual,manual,2528
81870,600.0,1217.0,manual,manual,2373
81871,600.0,1264.0,manual,manual,2396
81872,600.0,1115.0,manual,manual,2030


# Analyse Number of Datapoints at Different Duration Cutoff

In [11]:
# 0.1s/1s/2s/3s/4s/5s/6s
acceptable_duration = [10, 100, 200, 300, 400, 500, 600]

for i in acceptable_duration:
    duration_cond = (arrival_df['p_duration'] >= i) \
                  & (arrival_df['s_duration'] >= i)
    temp_df = arrival_df.loc[duration_cond, :].reset_index(drop=True)
    print(f'Acceptable duration of {i/100}s', temp_df.shape)

Acceptable duration of 0.1s (81874, 38)
Acceptable duration of 1.0s (81874, 38)
Acceptable duration of 2.0s (81874, 38)
Acceptable duration of 3.0s (81874, 38)
Acceptable duration of 4.0s (81874, 38)
Acceptable duration of 5.0s (81874, 38)
Acceptable duration of 6.0s (71646, 38)
