In [1]:
import os
import pandas as pd
import h5py
import numpy as np
import matplotlib.pyplot as plt


## **1. Data directory**

In [2]:
# Directory that holds the DiTing metadata CSV parts read by this notebook.
SOURCE_DIR = "/root/data/Datasets/Diting50hz"
# Directory the filtered metadata CSV is written to (currently the same location).
SAVE_DIR = "/root/data/Datasets/Diting50hz"

## **2. Read DiTing dataset**

In [3]:

# Paths of the 28 metadata parts: DiTing330km_part_0.csv ... DiTing330km_part_27.csv.
part_paths = [
    os.path.join(SOURCE_DIR, f"DiTing330km_part_{part_no}.csv")
    for part_no in range(28)
]

# Read every part with all values as strings (the first CSV column becomes the
# row index) and stack the parts into a single table. The row labels of each
# part are kept as-is, so they are not unique across parts.
meta_data = pd.concat(
    pd.read_csv(part_path, dtype=str, low_memory=False, index_col=0)
    for part_path in part_paths
)
    

## **3. Filtering**
#### **3.1 Remove spaces and "-" placeholders**

In [4]:
# Remove every space character from all string (object-dtype) columns.
# `regex=False` pins the literal match so the result does not depend on the
# pandas version's default for `regex`.
for k in meta_data.columns:
    if meta_data[k].dtype in [object, np.object_, "object", "O"]:
        meta_data[k] = meta_data[k].str.replace(" ", "", regex=False)

# A residual that is exactly "-" becomes an empty string (values such as
# "-0.04" are untouched, because Series.replace matches whole values here).
# The result is assigned back to the frame: calling
# `meta_data[k].replace(..., inplace=True)` operates on a column selection
# (chained assignment) and does not update `meta_data` under pandas
# Copy-on-Write.
for k in ["P_residual", "S_residual"]:
    meta_data[k] = meta_data[k].replace("-", "")
meta_data


Unnamed: 0,part,key,ev_id,evmag,mag_type,p_pick,p_clarity,p_motion,s_pick,net,...,N_P_amplitude_snr,N_P_power_snr,N_S_amplitude_snr,N_S_power_snr,E_P_amplitude_snr,E_P_power_snr,E_S_amplitude_snr,E_S_power_snr,P_residual,S_residual
0,0,000001.0004,1,1.5,ML,1743,E,,2615,AA,...,1.5,3.481,2.279,10.628,1.5,3.481,2.279,10.628,0.10,2.24
1,0,000001.0003,1,1.5,ML,1215,E,,1399,AA,...,1.307,0.067,4.273,16.639,1.307,0.067,4.273,16.639,-0.04,1.17
2,0,000001.0001,1,1.5,ML,2196,E,,2939,AA,...,0.948,-0.895,1.526,2.751,0.948,-0.895,1.526,2.751,-0.14,1.70
3,0,000001.0002,1,1.5,ML,2572,,,3710,AA,...,3.51,4.594,1.054,1.272,3.51,4.594,1.054,1.272,-0.62,1.97
4,0,000002.0007,2,1.0,ML,662,,R,733,AA,...,3.202,2.393,3.603,10.445,3.202,2.393,3.603,10.445,0.00,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34743,27,904402.0380,904402,1.5,ML,2147,,,2704,AA,...,1.485,2.548,1.082,-0.947,1.485,2.548,1.082,-0.947,-0.17,-0.59
34744,27,904402.0296,904402,1.5,ML,2154,E,,3075,AA,...,0.9,-1.654,0.845,-3.816,0.9,-1.654,0.845,-3.816,-1.39,-0.04
34745,27,904402.0018,904402,1.5,ML,1416,,,1835,AA,...,1.628,4.829,1.17,-1.104,1.628,4.829,1.17,-1.104,-0.19,-0.35
34746,27,904403.0021,904403,0.7,ML,1154,I,R,1345,AA,...,1.197,-0.612,1.03,-2.467,1.197,-0.612,1.03,-2.467,-0.00,-0.02


In [5]:
# Show the column names of the concatenated metadata table.
meta_data.columns

Index(['part', 'key', 'ev_id', 'evmag', 'mag_type', 'p_pick', 'p_clarity',
       'p_motion', 's_pick', 'net', 'sta_id', 'dis', 'st_mag', 'baz',
       'Z_P_amplitude_snr', 'Z_P_power_snr', 'Z_S_amplitude_snr',
       'Z_S_power_snr', 'N_P_amplitude_snr', 'N_P_power_snr',
       'N_S_amplitude_snr', 'N_S_power_snr', 'E_P_amplitude_snr',
       'E_P_power_snr', 'E_S_amplitude_snr', 'E_S_power_snr', 'P_residual',
       'S_residual'],
      dtype='object')

#### **3.2 Drop empty rows**

In [6]:
# Build a boolean row mask that is True only where every column holds a
# non-null and (for string columns) non-empty value.
inds = meta_data["key"].notna()
for col in meta_data.columns:
    series = meta_data[col]
    if series.dtype in [object, np.object_, "object", "O"]:
        # Object-dtype column: an empty string disqualifies the row.
        inds &= series.ne("")
    else:
        # Report any column that is not object-dtype, together with its dtype.
        print(col, series.dtype)
    inds &= series.notna()
# Count rejected (False) versus retained (True) rows.
inds.value_counts()


False    2456534
True      278214
Name: key, dtype: int64

#### **3.3 Convert data type**

In [7]:
# Target dtype of every column, grouped by kind: integer fields, fields kept
# as strings, and floating-point fields.
type_map = {
    **dict.fromkeys(
        ["part", "ev_id", "p_pick", "s_pick", "sta_id"],
        np.int64,
    ),
    **dict.fromkeys(
        ["key", "mag_type", "p_clarity", "p_motion", "net"],
        str,
    ),
    **dict.fromkeys(
        [
            "evmag",
            "dis",
            "st_mag",
            "baz",
            "Z_P_amplitude_snr",
            "Z_P_power_snr",
            "Z_S_amplitude_snr",
            "Z_S_power_snr",
            "N_P_amplitude_snr",
            "N_P_power_snr",
            "N_S_amplitude_snr",
            "N_S_power_snr",
            "E_P_amplitude_snr",
            "E_P_power_snr",
            "E_S_amplitude_snr",
            "E_S_power_snr",
            "P_residual",
            "S_residual",
        ],
        np.float64,
    ),
}

# Take an independent copy of the rows selected by `inds`, then cast each
# column from string to its target dtype.
meta_data_light = meta_data[inds].copy(deep=True).astype(type_map)
meta_data_light


Unnamed: 0,part,key,ev_id,evmag,mag_type,p_pick,p_clarity,p_motion,s_pick,net,...,N_P_amplitude_snr,N_P_power_snr,N_S_amplitude_snr,N_S_power_snr,E_P_amplitude_snr,E_P_power_snr,E_S_amplitude_snr,E_S_power_snr,P_residual,S_residual
5,0,000002.0006,2,1.0,ML,1039,I,R,1094,AA,...,16.712,16.508,0.359,-3.852,16.712,16.508,0.359,-3.852,-0.01,0.03
18,0,000005.0021,5,1.6,ML,1176,I,U,1459,AA,...,4.173,3.536,1.079,-2.055,4.173,3.536,1.079,-2.055,-0.11,0.02
19,0,000005.0017,5,1.6,ML,1550,I,U,1798,AA,...,1.163,0.496,1.169,0.109,1.163,0.496,1.169,0.109,0.02,0.36
28,0,000008.0007,8,0.7,ML,1279,I,R,1344,AA,...,2.004,-0.409,1.986,3.997,2.004,-0.409,1.986,3.997,0.05,0.02
29,0,000008.0006,8,0.7,ML,1439,I,R,1493,AA,...,2.378,-0.071,1.616,6.333,2.378,-0.071,1.616,6.333,-0.01,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34732,27,904399.0013,904399,1.4,ML,1300,I,U,1479,AA,...,37.495,29.615,0.887,-1.091,37.495,29.615,0.887,-1.091,0.78,2.32
34741,27,904402.0017,904402,1.5,ML,891,I,U,1118,AA,...,3.721,8.689,1.574,3.025,3.721,8.689,1.574,3.025,0.10,0.14
34742,27,904402.0021,904402,1.5,ML,1635,I,U,1825,AA,...,3.498,0.565,2.237,1.202,3.498,0.565,2.237,1.202,-0.01,-0.09
34746,27,904403.0021,904403,0.7,ML,1154,I,R,1345,AA,...,1.197,-0.612,1.030,-2.467,1.197,-0.612,1.030,-2.467,-0.00,-0.02


#### **3.4 Drop invalid data**

In [8]:
# Step 1: both picks must be positive and the p_motion label must not be "n".
positive_picks = meta_data_light["p_pick"].gt(0) & meta_data_light["s_pick"].gt(0)
motion_not_n = meta_data_light["p_motion"].ne("n")
meta_data_light = meta_data_light[positive_picks & motion_not_n]
print(meta_data_light.shape)

# Step 2: both picks must lie below 9000.
# NOTE(review): 50 * 180 is presumably 50 Hz x 180 s, i.e. the trace length in
# samples - confirm against the waveform files.
pick_limit = int(50 * 180)
below_limit = meta_data_light["p_pick"].lt(pick_limit) & meta_data_light["s_pick"].lt(pick_limit)
meta_data_light = meta_data_light[below_limit]
print(meta_data_light.shape)

# Step 3: the P pick must come strictly before the S pick.
p_before_s = meta_data_light["p_pick"].lt(meta_data_light["s_pick"])
meta_data_light = meta_data_light[p_before_s]
print(meta_data_light.shape)


(277738, 28)
(277738, 28)
(277736, 28)


In [9]:
# Replace the row labels inherited from the CSV parts with a fresh 0..n-1
# index; the old labels are discarded rather than kept as a column.
meta_data_light = meta_data_light.reset_index(drop=True)
meta_data_light


Unnamed: 0,part,key,ev_id,evmag,mag_type,p_pick,p_clarity,p_motion,s_pick,net,...,N_P_amplitude_snr,N_P_power_snr,N_S_amplitude_snr,N_S_power_snr,E_P_amplitude_snr,E_P_power_snr,E_S_amplitude_snr,E_S_power_snr,P_residual,S_residual
0,0,000002.0006,2,1.0,ML,1039,I,R,1094,AA,...,16.712,16.508,0.359,-3.852,16.712,16.508,0.359,-3.852,-0.01,0.03
1,0,000005.0021,5,1.6,ML,1176,I,U,1459,AA,...,4.173,3.536,1.079,-2.055,4.173,3.536,1.079,-2.055,-0.11,0.02
2,0,000005.0017,5,1.6,ML,1550,I,U,1798,AA,...,1.163,0.496,1.169,0.109,1.163,0.496,1.169,0.109,0.02,0.36
3,0,000008.0007,8,0.7,ML,1279,I,R,1344,AA,...,2.004,-0.409,1.986,3.997,2.004,-0.409,1.986,3.997,0.05,0.02
4,0,000008.0006,8,0.7,ML,1439,I,R,1493,AA,...,2.378,-0.071,1.616,6.333,2.378,-0.071,1.616,6.333,-0.01,0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277731,27,904399.0013,904399,1.4,ML,1300,I,U,1479,AA,...,37.495,29.615,0.887,-1.091,37.495,29.615,0.887,-1.091,0.78,2.32
277732,27,904402.0017,904402,1.5,ML,891,I,U,1118,AA,...,3.721,8.689,1.574,3.025,3.721,8.689,1.574,3.025,0.10,0.14
277733,27,904402.0021,904402,1.5,ML,1635,I,U,1825,AA,...,3.498,0.565,2.237,1.202,3.498,0.565,2.237,1.202,-0.01,-0.09
277734,27,904403.0021,904403,0.7,ML,1154,I,R,1345,AA,...,1.197,-0.612,1.030,-2.467,1.197,-0.612,1.030,-2.467,-0.00,-0.02


In [10]:
# Count each `p_motion` label in the full metadata table (before any rows are dropped).
meta_data['p_motion'].value_counts()

     2093198
U     334099
R     281777
D      13373
C      11776
n        525
Name: p_motion, dtype: int64

In [11]:
# Count each `p_motion` label in the filtered table, for comparison with the full table.
meta_data_light['p_motion'].value_counts()

U    146894
R    116579
D      7558
C      6705
Name: p_motion, dtype: int64

## **4. Save data**

In [14]:
# Create the output directory if needed. `exist_ok=True` replaces the separate
# existence check, so there is no window between checking and creating.
os.makedirs(SAVE_DIR, exist_ok=True)
# Write the filtered metadata; index=True stores the row index as the first CSV column.
meta_data_light.to_csv(os.path.join(SAVE_DIR, "DiTing330km_light.csv"), index=True)