## Collect and Format Data

Go through this notebook to download and format all data.
When you run this notebook to completion, it'll store the training, validation, and test data in the following files:

x_train.pic
y_train.pic
df_train.pic

x_val.pic
y_val.pic
df_val.pic

x_test.pic
y_test.pic
df_test.pic

x_rock.pic
y_rock.pic
df_rock.pic


The x_* files contain filtered and normalized waveforms.
The y_* files contain a 1 or 0 for explosion/rockburst or earthquake, respectively (y_rock.pic is all 1's).
The df_* files contain the metadata for each waveform.

In [None]:
import pandas as pd
import numpy as np

from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K
import matplotlib.pyplot as plt

#web scraping
import requests
import bs4
import lxml.etree as xml

#getting waveforms
import obspy
from obspy.clients.fdsn import Client
from obspy import UTCDateTime
from obspy import read

from obspy.signal.trigger import classic_sta_lta
from obspy.signal.trigger import plot_trigger

import sklearn.metrics as metrics
from sklearn.metrics import roc_curve

# Shared FDSN web-service client pointed at the "IRIS" data centre; the
# *_data_collect functions below call client.get_waveforms() on it.
client = Client("IRIS")

import pickle

In [None]:
def web_scrape_events(year):
    """Download the ISC reviewed event bulletin (ISF output) for one year.

    The year is fetched in six consecutive date windows and the lines of all
    responses are concatenated in request order. Every request is retried
    (after a 10 minute pause) until the response contains a ``<pre>`` block,
    because the ISC website is intermittently unavailable.

    Args:
        year: Year to query; it is inserted into the URL with ``str(year)``.

    Returns:
        list[str]: The lines of the last ``<pre>`` block of every response.
        Each response is cropped so that it starts at the first occurrence of
        ``str(year)`` found within its first 100 characters (if any).
    """
    # Imported locally so the function works even though the notebook never
    # imports ``time`` at the top (and even if a cell rebinds the name).
    import time

    year_str = str(year)
    base_url = "http://www.isc.ac.uk/cgi-bin/web-db-v4?out_format=ISF&request=REVIEWED&searchshape=GLOBAL&start_year=" + year_str + "&end_year=" + year_str

    # (start_month, end_month, start_day, end_day) for each request.
    windows = [
        (1, 1, 1, 15),
        (1, 1, 16, 31),
        (2, 3, 1, 1),
        (3, 4, 2, 30),
        (5, 8, 1, 31),
        (9, 12, 1, 31),
    ]

    a = []
    for start_month, end_month, start_day, end_day in windows:
        ev_url = (base_url
                  + "&start_month=" + str(start_month)
                  + "&end_month=" + str(end_month)
                  + "&start_day=" + str(start_day)
                  + "&end_day=" + str(end_day)
                  + "&start_time=00:00:00&end_time=23:59:59")

        # have to keep trying because the website goes down every now and then
        while True:
            try:
                ev_page = bs4.BeautifulSoup(requests.get(ev_url, {}).text, "lxml")
                pre_blocks = ev_page.find_all("pre")
            except requests.RequestException:
                pre_blocks = []
            if pre_blocks:
                break
            time.sleep(60*10)

        w = str(pre_blocks[-1])

        # Cropping off the text before the data
        for i in range(100):
            if w[i:i+4] == year_str:
                w = w[i:]
                break

        a += w.splitlines()

    return a


def man_event_collect(a):
    """Append the man-made events found in an ISF bulletin to the global ``man``.

    Only lines that are exactly 136 characters long are treated as origin
    lines; they are cut into 21 fixed-width fields. The second character of
    the event-type field marks an origin as an earthquake (``'e'``) or as
    man-made (``'m'``, ``'n'``, ``'h'``, ``'x'``, ``'i'``); origins authored
    by "ISC-EHB"/"EHB" and types ``''``/``'uk'`` do not vote. When an origin
    line is directly followed by a line whose third character is ``'#'`` (the
    prime marker), the event is stored if it was labelled man-made and never
    labelled an earthquake.

    Args:
        a: Bulletin lines, e.g. the list returned by ``web_scrape_events``.

    Side effects:
        Adds rows (labels ``shape[0] + 1``, i.e. 1-based) to the module-level
        DataFrame ``man``, which must already exist with 21 columns.
    """
    global man

    indices = [0, 11, 23, 30, 36, 45, 55, 61, 67, 71, 77, 83, 88, 93, 97, 104, 111, 113, 115, 118, 128]
    manmade_codes = ('m', 'n', 'h', 'x', 'i')

    e = False #Has this event been labeled an earthquake
    m = False #Has this event been labeled a manmade event

    i = 0
    while i < len(a):
        if len(a[i]) != 136:
            e = False
            m = False
            i += 1
            continue

        l = [a[i][j:k].strip() for j, k in zip(indices, indices[1:] + [None])]

        if l[18] != '' and l[18] != 'uk' and l[19] != "ISC-EHB" and l[19] != "EHB":
            # Slice instead of index so a one-character type cannot raise.
            code = l[18][1:2]
            if code == 'e':
                e = True
            elif code in manmade_codes:
                m = True
            else: #discard this event and move on
                e = False
                m = False
                # Jump to the next blank line, or to the end of the bulletin
                # when there is none (previously this looped forever).
                while i < len(a) and len(a[i]) != 0:
                    i += 1
                continue

        #cropping the f off the longitude
        if l[5].endswith('f'):
            l[5] = l[5][:-1]
        #cropping the f off the time
        if l[1].endswith('f'):
            l[1] = l[1][:-1]

        #Converting to floats
        for j in range(2, 8):
            if len(l[j]) > 0:
                l[j] = float(l[j])

        #Checking if this is the prime piece of data for the event, then we add it to the dataframe
        # (bounds-checked: the bulletin may end on an origin line)
        if i + 1 < len(a) and a[i+1][2:3] == '#':
            if m and not e:
                l[18] = 'man'
                man.loc[man.shape[0]+1] = l
            e = False #Reset boolean variables when we reach the last recording of this event
            m = False
        i += 1


def web_scrape_arrivals(month, year):
    """Download the ISC station-arrival listing (CSV output) for one month.

    The request is repeated (with a 5 minute pause) until the cropped
    response is non-empty, because the ISC website is intermittently
    unavailable. Note that a month with genuinely no data therefore never
    returns; callers are expected to skip such months.

    Args:
        month: Month number, 1-12 (int).
        year: Four-digit year (int).

    Returns:
        list[str]: Lines of the response with the first 6 and last 3 lines
        (the text surrounding the data) removed.
    """
    # Local imports: the notebook does not import these at the top.
    import calendar
    import time

    # Last day of the month, leap years included.
    days = calendar.monthrange(year, month)[1]

    arr_url = "http://www.isc.ac.uk/cgi-bin/web-db-v4?iscreview=on&out_format=CSV&ttime=on&ttres=on&tdef=on&phaselist=&sta_list=&stnsearch=GLOBAL&stn_ctr_lat=&stn_ctr_lon=&stn_radius=&max_stn_dist_units=deg&stn_top_lat=&stn_bot_lat=&stn_left_lon=&stn_right_lon=&stn_srn=&stn_grn=&bot_lat=&top_lat=&left_lon=&right_lon=&ctr_lat=&ctr_lon=&radius=&max_dist_units=deg&searchshape=GLOBAL&srn=&grn=&start_year=" + str(year) + "&start_month=" + str(month) + "&start_day=1&start_time=00%3A00%3A00&end_year=" + str(year) + "&end_month="+ str(month) + "&end_day=" + str(days) + "&end_time=23%3A59%3A59&min_dep=&max_dep=&min_mag=&max_mag=&req_mag_type=mb&req_mag_agcy=Any&request=STNARRIVALS"

    #keep trying because website goes down every now and then
    while True:
        try:
            arr_page = bs4.BeautifulSoup(requests.get(arr_url, {}).text, "lxml")
            w = str(arr_page.find_all("pre"))
            #Cropping off the text before the data
            a = w.splitlines()[6:-3]
        except requests.RequestException:
            a = []
        if len(a) > 0:
            return a
        time.sleep(60*5)

def man_data_collect(a):
    """Fetch waveforms for arrivals of the events in ``man`` and store them in ``data_man``.

    The arrival listing ``a`` is walked in step with the module-level table
    ``man``; the module-level counter ``event`` is the (1-based) label of the
    event currently being matched. For every arrival whose event time equals
    that event's time and whose distance field is at least 20, vertical
    channels ("??Z") of the station are requested from ``client`` for the
    window [arrival - 60 s - ext, arrival + 120 s]. Each usable trace is
    high-pass filtered, trimmed, decimated to 20 Hz, and kept only when its
    STA/LTA ratio reaches 2 somewhere between 50 s and 70 s.

    Args:
        a: Comma-separated arrival lines, e.g. from ``web_scrape_arrivals``.

    Side effects:
        Appends rows to the global ``data_man`` and advances the global
        ``event``. Stops early once ``event`` passes the last row of ``man``.
    """
    global data_man
    global event

    i = 0
    while i < len(a):
        if event > man.shape[0]:
            break

        l = [field.strip() for field in a[i].split(',')]

        # If event ID and station are the same as the previous line, move on because this arrival was already picked up.
        if i > 0:
            l2 = [field.strip() for field in a[i-1].split(',')]
            if l[0] == l2[0] and l[2] == l2[2]:
                i += 1
                continue

        eventtime = UTCDateTime(l[-8] + "T" + l[-7]) #time of event for current item in bulletin

        reftime = UTCDateTime(man.loc[event]["Date"] + "T" + man.loc[event]["Time"]) #time of event for current item in dataframe

        #if the bulletin's time is smaller or the distance detected is less than 20, need to move on to next item in bulletin.
        #(the bulletin's arrivals are ordered in increasing event time and distance detected)
        if eventtime < reftime or float(l[7]) < 20:
            i += 1
            continue
        elif eventtime > reftime:
            event += 1
            continue

        station = l[2]

        # Named ``arrival`` rather than ``time`` so the stdlib module name is not shadowed.
        arrival = UTCDateTime(l[11] + "T" + l[12])

        # Request a waveform from 60 seconds before to 120 seconds after the arrival time
        # Sometimes the waveforms are given with a sharp spike at the front so we query for a few extra seconds at the front and later cut off the excess

        ext = 9
        try:
            st = client.get_waveforms("*", station, "*", "??Z", starttime=arrival-60-ext, endtime=arrival + 120)
        except Exception:
            # Best effort: skip arrivals whose waveform cannot be downloaded,
            # but let KeyboardInterrupt/SystemExit through.
            i += 1
            continue

        # Order traces by "network.station.location.channel" (ties keep stream order)
        # so duplicates of the same station/channel end up next to each other.
        arr = []
        for ind, tr in enumerate(st):
            s = str(tr.stats.network) + "." + str(tr.stats.station) + "." + str(tr.stats.location) + "." + str(tr.stats.channel)
            arr.append([s, ind, tr])
        arr.sort()

        prevstation = ""
        prevchannel = ""

        for item in arr:
            tr = item[2]

            #sometimes if the sampling rate is 20, it is recorded as 19.99 so this fixes that
            sr = int(tr.stats.sampling_rate + 0.1)

            # only rates that are a whole multiple of 20 Hz can be decimated to 20 Hz
            if sr == 0 or sr % 20 != 0:
                continue

            network = tr.stats.network
            station = tr.stats.station
            location = tr.stats.location
            channel = tr.stats.channel

            if station == prevstation and channel == prevchannel:
                continue

            #some stations don't provide the full data when a very long interval is queried so we try reducing the extra amount
            ext = 9
            while tr.stats.npts < (180+ext)*sr and ext >= 3:
                ext -= 2
                try:
                    tr = client.get_waveforms(network, station, location, channel, starttime=arrival-60-ext, endtime=arrival + 120)[0]
                except Exception:
                    continue

                sr = int(tr.stats.sampling_rate + 0.1)

            if tr.stats.npts < (180+ext)*sr: # Can't take sample if it is too small.
                continue

            tr2 = tr.copy()

            #perform a highpass filter of 1 Hz
            tr2.filter('highpass', freq=1, corners=2, zerophase=True)

            #cut off the excess at the front
            tr2 = tr2.slice(tr2.stats.starttime+ext)

            #downsample to 20 Hz
            if sr != 20:
                tr2 = tr2.decimate(int(sr)//20, strict_length=False, no_filter=True)

            #find the sta/lta value throughout the waveform
            cft = classic_sta_lta(tr2, int(5 * 20), int(20 * 20))

            # the arrival time is at 60s. This makes sure the sta/lta gets to at least 2 somewhere in the interval 50-70 seconds
            # arrival is only taken if its size is reasonably significant
            peak = max(cft[50*20:70*20])
            if peak < 2:
                continue

            # One value per stored column; the last two entries are the first
            # 3600 raw samples and the first 3600 filtered 20 Hz samples.
            row = [l[0], l[-8], l[-7], l[11], l[12], float(l[-6]), float(l[-5]), l[-2], l[-1], tr.stats.network, station, tr.stats.location, tr.stats.channel, l[9], float(l[7]), True, sr, tr.data[:20*180], tr2.data[:20*180]]

            data_man.loc[data_man.shape[0] + 1] = row
            prevstation = station
            prevchannel = channel

        i += 1


## Collecting explosions

The loop below takes a few days to run. To speed up the process, you can split the year range into 4 parts, run each part in a separate notebook in parallel, store each partial dataframe, and then recombine the data.

In [None]:
# Column names of the per-event tables filled by the *_event_collect functions
# (one name per fixed-width field of an ISF origin line).
c = ['Date', 'Time', 'error', 'RMS', 'lat', 'long', 'Smaj', 'Smin', 'Az', 'Depth', 'Err', 'Ndef', 'Nsta', 'gap', 'mdist', 'Mdist', 'a', 'l', 'event type', 'Author', 'OrigID']
man = pd.DataFrame(columns = c) #manmade events

# Column names of the per-arrival tables. There must be exactly one name per
# element of the 19-element rows built in the *_data_collect functions
# (which also store network, location and the raw samples); with fewer names
# the row assignment raises "cannot set a row with mismatched columns".
data_cols = ["ISC EventID", "Event Date", "Event Time", "Arrival Date", "Arrival Time", "Lat", "Long", "MagType", "Mag", "Network", "Station", "Location", "Channel", "ISC Phase", "Distance (Deg)", "IsExplosion", "SampRate", "Raw Samples", "Samples"]
data_man = pd.DataFrame(columns = data_cols) #arrivals with waveforms and metadata
event = 1

for y in range (1978, 2019):
    print (y)
    # The event table is rebuilt for every year; `event` is the 1-based label
    # of the event currently being matched against the arrival listing.
    man = pd.DataFrame(columns = c)
    man_event_collect(web_scrape_events(y))
    print (man.shape[0])
    event = 1
    for m in range (1, 13):
        if y == 2018 and m == 12: #there is no data for December 2018
            break
        man_data_collect(web_scrape_arrivals(m, y))
    print ("Data: " + str(data_man.shape[0]))

data_man

In [None]:
def nat_event_collect(a):
    """Append the earthquakes found in an ISF bulletin to the global ``nat``.

    Only lines that are exactly 136 characters long are treated as origin
    lines; they are cut into 21 fixed-width fields. The second character of
    the event-type field marks an origin as an earthquake (``'e'``) or as
    man-made (``'m'``, ``'n'``, ``'h'``, ``'x'``, ``'i'``); origins authored
    by "ISC-EHB"/"EHB" and types ``''``/``'uk'`` do not vote. When an origin
    line is directly followed by a line whose third character is ``'#'`` (the
    prime marker), the event is stored if it was labelled an earthquake and
    never labelled man-made.

    Args:
        a: Bulletin lines, e.g. the list returned by ``web_scrape_events``.

    Side effects:
        Adds rows (labels ``shape[0] + 1``, i.e. 1-based) to the module-level
        DataFrame ``nat``, which must already exist with 21 columns.
    """
    global nat

    indices = [0, 11, 23, 30, 36, 45, 55, 61, 67, 71, 77, 83, 88, 93, 97, 104, 111, 113, 115, 118, 128]
    manmade_codes = ('m', 'n', 'h', 'x', 'i')

    e = False #Has this event been labeled an earthquake
    m = False #Has this event been labeled a manmade event

    i = 0
    while i < len(a):
        if len(a[i]) != 136:
            e = False
            m = False
            i += 1
            continue

        l = [a[i][j:k].strip() for j, k in zip(indices, indices[1:] + [None])]

        if l[18] != '' and l[18] != 'uk' and l[19] != "ISC-EHB" and l[19] != "EHB":
            # Slice instead of index so a one-character type cannot raise.
            code = l[18][1:2]
            if code == 'e':
                e = True
            elif code in manmade_codes:
                m = True
            else: #discard this event and move on
                e = False
                m = False
                # Jump to the next blank line, or to the end of the bulletin
                # when there is none (previously this looped forever).
                while i < len(a) and len(a[i]) != 0:
                    i += 1
                continue

        #cropping the f off the longitude
        if l[5].endswith('f'):
            l[5] = l[5][:-1]
        #cropping the f off the time
        if l[1].endswith('f'):
            l[1] = l[1][:-1]

        #Converting to floats
        for j in range(2, 8):
            if len(l[j]) > 0:
                l[j] = float(l[j])

        #Checking if this is the prime piece of data for the event, then we add it to the dataframe
        # (bounds-checked: the bulletin may end on an origin line)
        if i + 1 < len(a) and a[i+1][2:3] == '#':
            if e and not m:
                l[18] = 'nat'
                nat.loc[nat.shape[0]+1] = l
            e = False #Reset boolean variables when we reach the last recording of this event
            m = False
        i += 1

def nat_data_collect(a):
    """Fetch waveforms for arrivals of the events in ``nat`` and store them in ``data_nat_all``.

    The arrival listing ``a`` is walked in step with the module-level table
    ``nat``; the module-level counter ``event`` is the (1-based) label of the
    event currently being matched. For every arrival whose event time equals
    that event's time and whose distance field is at least 20, vertical
    channels ("??Z") of the station are requested from ``client`` for the
    window [arrival - 60 s - ext, arrival + 120 s]. Each usable trace is
    high-pass filtered, trimmed, decimated to 20 Hz, and kept only when its
    STA/LTA ratio reaches 2 somewhere between 50 s and 70 s. Unlike the
    explosion collector, at most one consecutive trace per station is kept.

    Args:
        a: Comma-separated arrival lines, e.g. from ``web_scrape_arrivals``.

    Side effects:
        Appends rows to the global ``data_nat_all`` and advances the global
        ``event``. Stops early once ``event`` passes the last row of ``nat``.
    """
    global data_nat_all
    global event

    i = 0
    # NOTE(review): the last three lines of `a` are never processed here, while
    # the man/rock collectors read up to len(a) - confirm this is intended.
    while i < len(a)-3:
        if event > nat.shape[0]:
            break

        l = [field.strip() for field in a[i].split(',')]

        # If event ID and station are the same as the previous line, move on because this arrival was already picked up.
        if i > 0:
            l2 = [field.strip() for field in a[i-1].split(',')]
            if l[0] == l2[0] and l[2] == l2[2]:
                i += 1
                continue

        eventtime = UTCDateTime(l[-8] + "T" + l[-7]) #time of event for current item in bulletin

        reftime = UTCDateTime(nat.loc[event]["Date"] + "T" + nat.loc[event]["Time"]) #time of event for current item in dataframe

        #if the bulletin's time is smaller or the distance detected is less than 20, need to move on to next item in bulletin.
        #(the bulletin's arrivals are ordered in increasing event time and distance detected)
        if eventtime < reftime or float(l[7]) < 20:
            i += 1
            continue
        elif eventtime > reftime:
            event += 1
            continue

        station = l[2]

        # Named ``arrival`` rather than ``time`` so the stdlib module name is not shadowed.
        arrival = UTCDateTime(l[11] + "T" + l[12])

        # Request a waveform from 60 seconds before to 120 seconds after the arrival time
        # Sometimes the waveforms are given with a sharp spike at the front so we query for a few extra seconds at the front and later cut off the excess

        ext = 9
        try:
            st = client.get_waveforms("*", station, "*", "??Z", starttime=arrival-60-ext, endtime=arrival + 120)
        except Exception:
            # Best effort: skip arrivals whose waveform cannot be downloaded,
            # but let KeyboardInterrupt/SystemExit through.
            i += 1
            continue

        prevstation = ""
        for tr in st:
            #sometimes if the sampling rate is 20, it is recorded as 19.99 so this fixes that
            sr = int(tr.stats.sampling_rate + 0.1)

            # only rates that are a whole multiple of 20 Hz can be decimated to 20 Hz
            if sr == 0 or sr % 20 != 0:
                continue

            network = tr.stats.network
            station = tr.stats.station
            location = tr.stats.location
            channel = tr.stats.channel

            if station == prevstation:
                continue

            #some stations don't provide the full data when a very long interval is queried so we try reducing the extra amount
            # Every trace in `st` was requested with 9 extra seconds, so the
            # allowance must restart at 9 for each trace; otherwise a value
            # reduced for an earlier trace would trim this one by the wrong
            # amount and the arrival would no longer sit at 60 s.
            ext = 9
            while tr.stats.npts < (180+ext)*sr and ext >= 3:
                ext -= 2
                try:
                    tr = client.get_waveforms(network, station, location, channel, starttime=arrival-60-ext, endtime=arrival + 120)[0]
                except Exception:
                    continue

                sr = int(tr.stats.sampling_rate + 0.1)

            if tr.stats.npts < (180+ext)*sr: # Can't take sample if it is too small.
                continue

            tr2 = tr.copy()

            #perform a highpass filter of 1 Hz
            tr2.filter('highpass', freq=1, corners=2, zerophase=True)

            #cut off the excess at the front
            tr2 = tr2.slice(tr2.stats.starttime+ext)

            #downsample to 20 Hz
            if sr != 20:
                tr2 = tr2.decimate(int(sr)//20, strict_length=False, no_filter=True)

            #find the sta/lta value throughout the waveform
            cft = classic_sta_lta(tr2, int(5 * 20), int(20 * 20))

            # the arrival time is at 60s, so this makes sure the sta/lta gets to at least 2 somewhere in the interval of 50-70 seconds
            peak = max(cft[50*20:70*20])
            if peak < 2:
                continue

            # One value per stored column; the last two entries are the first
            # 3600 raw samples and the first 3600 filtered 20 Hz samples.
            row = [l[0], l[-8], l[-7], l[11], l[12], float(l[-6]), float(l[-5]), l[-2], l[-1], tr.stats.network, station, tr.stats.location, tr.stats.channel, l[9], float(l[7]), False, sr, tr.data[:20*180], tr2.data[:20*180]]

            data_nat_all.loc[data_nat_all.shape[0] + 1] = row

            prevstation = station

        i += 1



## Collecting Earthquakes

There are far more earthquakes recorded than explosions, so we only need to collect earthquakes over a couple of months.

In [None]:
# Empty tables: `nat` holds the earthquake events of the year being processed,
# `data_nat_all` accumulates every earthquake arrival with its waveform.
nat = pd.DataFrame(columns=c)
data_nat_all = pd.DataFrame(columns=data_cols)
event = 1

for y in range(2018, 2019):
    print(y)

    # Rebuild the event table for this year and restart the event pointer.
    nat = pd.DataFrame(columns=c)
    bulletin = web_scrape_events(y)
    nat_event_collect(bulletin)
    print(nat.shape[0])
    event = 1

    for m in range(1, 3):
        # there is no data for December 2018
        if (y, m) == (2018, 12):
            break
        arrivals = web_scrape_arrivals(m, y)
        nat_data_collect(arrivals)

    print("Data: " + str(data_nat_all.shape[0]))

data_nat_all

## Equating the number of earthquakes and explosions in each distance range of 10 degrees

In [None]:
#Ranges of 10 from 20-180 degrees
man_arr = np.zeros(16) #counts number of samples in data_man for each distance range

for ind, man_row in data_man.iterrows():
    # (If samples should also be equalised per magnitude range, a magnitude
    # bin can be derived here from man_row["Mag"] / man_row["MagType"].)
    d = float(man_row["Distance (Deg)"])
    d_ind = int(d-20) // 10

    man_arr[d_ind] += 1


nat_arr = np.zeros(16) #counts number of earthquake samples accepted so far for each distance range

data_nat = pd.DataFrame(columns=data_cols)


for i in range (data_nat_all.shape[0]):
    if data_nat.shape[0] >= data_man.shape[0]:
        break

    # Positional access: data_nat_all is filled with 1-based labels, so
    # label-based .loc[i] would fail for i == 0 and never reach the last row.
    r = data_nat_all.iloc[i]

    if i > 0:
        # Compare against the previous *arrival* (not the event table `nat`).
        r2 = data_nat_all.iloc[i-1]
        # Avoid taking arrivals of the same event with the same station and channel
        if r['ISC EventID'] == r2['ISC EventID'] and r['Arrival Time'] == r2['Arrival Time'] and r['Station'] == r2['Station'] and r['Channel'] == r2['Channel']:
            continue

    sam = r["Samples"]
    # Only mb-magnitude arrivals with a complete (3600 sample) waveform are used.
    if r["Mag"] == '' or r["MagType"] != "mb" or sam.size != 3600 or max(sam) == 0:
        continue

    d = float(r["Distance (Deg)"])
    d_ind = int(d-20) // 10

    # This distance range already holds as many earthquakes as explosions.
    if nat_arr[d_ind] >= man_arr[d_ind]:
        continue

    nat_arr[d_ind] += 1

    # Store the earthquake arrival itself (previously the stale loop variable
    # of the explosion loop above was appended instead).
    data_nat.loc[data_nat.shape[0]] = r

print ("Nat done: " + str(data_nat.shape[0])) #This should be the same size as data_man.
#If it is smaller, you may need to collect earthquake data from more months above.


## Split data into train, test, and validation (8:1:1), keeping the same ratio in each distance range

In [None]:
# Shuffle both classes before splitting.
data_man = data_man.sample(frac=1)
data_nat = data_nat.sample(frac=1)

# split data in 8:1:1 ratio for each distance range
df_train = pd.DataFrame(columns=data_cols)
df_val = pd.DataFrame(columns=data_cols)
df_test = pd.DataFrame(columns=data_cols)


def _distribute(samples, bin_totals, train_taken, val_taken):
    """Append every row of `samples` to df_train, df_val or df_test.

    For each distance range, rows go to the training set until its quota is
    reached, then to the validation set until its quota is reached, and all
    remaining rows go to the test set.
    """
    for _, r in samples.iterrows():
        d_ind = int(float(r["Distance (Deg)"]) - 20) // 10

        total = bin_totals[d_ind]
        test = total // 10 + 1  # number of samples in this distance range reserved for the test set
        val = total // 10       # validation quota for this distance range
        train = total - test - val  # training quota for this distance range

        if train_taken[d_ind] < train:
            df_train.loc[df_train.shape[0]] = r
            train_taken[d_ind] += 1
        elif val_taken[d_ind] < val:
            df_val.loc[df_val.shape[0]] = r
            val_taken[d_ind] += 1
        else:
            df_test.loc[df_test.shape[0]] = r


# Explosions: per-range counters of rows placed in df_train / df_val so far.
train_man = [0] * 16
val_man = [0] * 16
_distribute(data_man, man_arr, train_man, val_man)

print(df_train.shape[0], df_val.shape[0], df_test.shape[0])

# Earthquakes: same procedure with their own counters.
train_nat = [0] * 16
val_nat = [0] * 16
_distribute(data_nat, nat_arr, train_nat, val_nat)


print(df_train.shape[0], df_val.shape[0], df_test.shape[0])

## Store data in x and y numpy arrays that can be used in a Keras model

In [None]:
def _arrays_from(frame, show_flat=False):
    """Build the normalized waveform array and the label array for `frame`.

    `frame` must have a 0-based index. Every waveform has its mean (sum/3600)
    subtracted and is divided by its (max - min) range. Returns (x, y) with x
    of shape (rows, samples, 1) and y as int32. With `show_flat`, rows whose
    waveform has max == min are printed.
    """
    x = np.empty((frame.shape[0], frame.loc[0]["Samples"].shape[0]))
    for i, row in frame.iterrows():
        x[i] = row["Samples"]

    y = np.asarray(np.array(frame["IsExplosion"])).astype('int32')

    # Work on the 3-D view first so the per-waveform statistics are taken over axis 1.
    x = x.reshape(x.shape[0], x.shape[1], 1)

    #Normalization:
    sums = x.sum(axis=1)
    sums = sums/3600
    mins = x.min(axis=1)
    maxs = x.max(axis=1)

    sums = sums.reshape((sums.shape[0], 1))
    mins = mins.reshape((mins.shape[0], 1))
    maxs = maxs.reshape((maxs.shape[0], 1))

    x = x.reshape((x.shape[0], x.shape[1]))
    x = -sums + x
    x /= (maxs-mins)
    x = x.reshape((x.shape[0], x.shape[1], 1))

    if show_flat:
        for i in range(maxs.shape[0]):
            if maxs[i] == mins[i]:
                print(frame.loc[i])

    return x, y


def _pickle_to(path, obj):
    """Write `obj` to `path` with pickle."""
    with open(path, "wb") as fp:
        pickle.dump(obj, fp)


# --- training set ---
df_train = df_train.sample(frac=1).reset_index(drop=True)
x, y = _arrays_from(df_train, show_flat=True)
_pickle_to("x_train.pic", x)
_pickle_to("y_train.pic", y)
_pickle_to("df_train.pic", df_train)

print("Train: " + str(x.shape[0]))

# --- validation set ---
df_val = df_val.sample(frac=1).reset_index(drop=True)
x, y = _arrays_from(df_val)
_pickle_to("x_val.pic", x)
_pickle_to("y_val.pic", y)
_pickle_to("df_val.pic", df_val)

# --- test set ---
df_test = df_test.sample(frac=1).reset_index(drop=True)
x, y = _arrays_from(df_test)
_pickle_to("x_test.pic", x)
_pickle_to("y_test.pic", y)
_pickle_to("df_test.pic", df_test)

print("Done")

## Rockburst Data

In [None]:
def rock_event_collect(a):
    """Append the rockbursts found in an ISF bulletin to the global ``rock``.

    Only lines that are exactly 136 characters long are treated as origin
    lines; they are cut into 21 fixed-width fields. The second character of
    the event-type field marks an origin as an earthquake (``'e'``), as
    man-made (``'m'``, ``'n'``, ``'h'``, ``'x'``, ``'i'``) or as a rockburst
    (``'r'``, which also counts as man-made); types ``''``/``'uk'`` do not
    vote. When an origin line is directly followed by a line whose third
    character is ``'#'`` (the prime marker), the event is stored if it was
    labelled a rockburst and never labelled an earthquake.

    Args:
        a: Bulletin lines, e.g. the list returned by ``web_scrape_events``.

    Side effects:
        Adds rows (labels ``shape[0] + 1``, i.e. 1-based) to the module-level
        DataFrame ``rock``, which must already exist with 21 columns. Stored
        rows carry the event type ``'man'``.
    """
    global rock

    indices = [0, 11, 23, 30, 36, 45, 55, 61, 67, 71, 77, 83, 88, 93, 97, 104, 111, 113, 115, 118, 128]
    manmade_codes = ('m', 'n', 'h', 'x', 'i')

    e = False #Has this event been labeled an earthquake
    m = False #Has this event been labeled a manmade event
    r = False #Has this event been labeled a rockburst

    i = 0
    while i < len(a):
        if len(a[i]) != 136:
            e = False
            m = False
            r = False
            i += 1
            continue

        l = [a[i][j:k].strip() for j, k in zip(indices, indices[1:] + [None])]

        #ISC-EHB and EHB can be unreliable in their assessments; they labeled multiple known nuclear explosions as earthquakes
        if l[18] != '' and l[18] != 'uk' and l[19] != "ISC-EHB" and l[19] != "EHB":
            # Slice instead of index so a one-character type cannot raise.
            code = l[18][1:2]
            if code == 'e':
                e = True
            elif code in manmade_codes:
                m = True
            elif code == 'r':
                m = True
                r = True
            else: #discard this event and move on
                e = False
                m = False
                r = False
                # Jump to the next blank line, or to the end of the bulletin
                # when there is none (previously this looped forever).
                while i < len(a) and len(a[i]) != 0:
                    i += 1
                continue

        #cropping the f off the longitude
        if l[5].endswith('f'):
            l[5] = l[5][:-1]
        #cropping the f off the time
        if l[1].endswith('f'):
            l[1] = l[1][:-1]

        #Converting to floats
        for j in range(2, 8):
            if len(l[j]) > 0:
                l[j] = float(l[j])

        #Checking if this is the prime piece of data for the event, then we add it to the dataframes
        # (bounds-checked: the bulletin may end on an origin line)
        if i + 1 < len(a) and a[i+1][2:3] == '#':
            if r and m and not e:
                l[18] = 'man'
                rock.loc[rock.shape[0]+1] = l

            #Reset boolean variables when we reach the last recording of this event.
            # `r` must be reset too, otherwise every man-made event after the
            # first rockburst would be stored as a rockburst.
            e = False
            m = False
            r = False
        i += 1

def rock_data_collect(a):
    """Fetch waveforms for arrivals of the events in ``rock`` and store them in ``data_rock``.

    The arrival listing ``a`` is walked in step with the module-level table
    ``rock``; the module-level counter ``event`` is the (1-based) label of the
    event currently being matched. For every arrival whose event time equals
    that event's time and whose distance field is at least 20, vertical
    channels ("??Z") of the station are requested from ``client`` for the
    window [arrival - 60 s - ext, arrival + 120 s]. Each usable trace is
    high-pass filtered, trimmed, decimated to 20 Hz, and kept only when its
    STA/LTA ratio reaches 2 somewhere between 50 s and 70 s.

    Args:
        a: Comma-separated arrival lines, e.g. from ``web_scrape_arrivals``.

    Side effects:
        Appends rows to the global ``data_rock`` and advances the global
        ``event``. Stops early once ``event`` passes the last row of ``rock``.
    """
    global data_rock
    global event

    i = 0
    while i < len(a):
        if event > rock.shape[0]:
            break

        l = [field.strip() for field in a[i].split(',')]

        # If event ID and station are the same as the previous line, move on because this arrival was already picked up.
        if i > 0:
            l2 = [field.strip() for field in a[i-1].split(',')]
            if l[0] == l2[0] and l[2] == l2[2]:
                i += 1
                continue

        eventtime = UTCDateTime(l[-8] + "T" + l[-7]) #time of event for current item in bulletin

        reftime = UTCDateTime(rock.loc[event]["Date"] + "T" + rock.loc[event]["Time"]) #time of event for current item in dataframe

        #if the bulletin's time is smaller or the distance detected is less than 20, need to move on to next item in bulletin.
        #(the bulletin's arrivals are ordered in increasing event time and distance detected)
        if eventtime < reftime or float(l[7]) < 20:
            i += 1
            continue
        elif eventtime > reftime:
            event += 1
            continue

        station = l[2]

        # Named ``arrival`` rather than ``time`` so the stdlib module name is not shadowed.
        arrival = UTCDateTime(l[11] + "T" + l[12])

        # Request a waveform from 60 seconds before to 120 seconds after the arrival time
        # Sometimes the waveforms are given with a sharp spike at the front so we query for a few extra seconds at the front and later cut off the excess

        ext = 9
        try:
            st = client.get_waveforms("*", station, "*", "??Z", starttime=arrival-60-ext, endtime=arrival + 120)
        except Exception:
            # Best effort: skip arrivals whose waveform cannot be downloaded,
            # but let KeyboardInterrupt/SystemExit through.
            i += 1
            continue

        # Order traces by "network.station.location.channel" (ties keep stream order)
        # so duplicates of the same station/channel end up next to each other.
        arr = []
        for ind, tr in enumerate(st):
            s = str(tr.stats.network) + "." + str(tr.stats.station) + "." + str(tr.stats.location) + "." + str(tr.stats.channel)
            arr.append([s, ind, tr])
        arr.sort()

        prevstation = ""
        prevchannel = ""

        for item in arr:
            tr = item[2]

            #sometimes if the sampling rate is 20, it is recorded as 19.99 so this fixes that
            sr = int(tr.stats.sampling_rate + 0.1)

            # only rates that are a whole multiple of 20 Hz can be decimated to 20 Hz
            if sr == 0 or sr % 20 != 0:
                continue

            network = tr.stats.network
            station = tr.stats.station
            location = tr.stats.location
            channel = tr.stats.channel

            if station == prevstation and channel == prevchannel:
                continue

            #some stations don't provide the full data when a very long interval is queried so we try reducing the extra amount
            ext = 9
            while tr.stats.npts < (180+ext)*sr and ext >= 3:
                ext -= 2
                try:
                    tr = client.get_waveforms(network, station, location, channel, starttime=arrival-60-ext, endtime=arrival + 120)[0]
                except Exception:
                    continue

                sr = int(tr.stats.sampling_rate + 0.1)

            if tr.stats.npts < (180+ext)*sr: # Can't take sample if it is too small.
                continue

            tr2 = tr.copy()

            #perform a highpass filter of 1 Hz
            tr2.filter('highpass', freq=1, corners=2, zerophase=True)

            #cut off the excess at the front
            tr2 = tr2.slice(tr2.stats.starttime+ext)

            #downsample to 20 Hz
            if sr != 20:
                tr2 = tr2.decimate(int(sr)//20, strict_length=False, no_filter=True)

            #find the sta/lta value throughout the waveform
            cft = classic_sta_lta(tr2, int(5 * 20), int(20 * 20))

            # the arrival time is at 60s. This makes sure the sta/lta gets to at least 2 somewhere in the interval 50-70 seconds
            # arrival is only taken if its size is reasonably significant
            peak = max(cft[50*20:70*20])
            if peak < 2:
                continue

            # One value per stored column; the last two entries are the first
            # 3600 raw samples and the first 3600 filtered 20 Hz samples.
            row = [l[0], l[-8], l[-7], l[11], l[12], float(l[-6]), float(l[-5]), l[-2], l[-1], tr.stats.network, station, tr.stats.location, tr.stats.channel, l[9], float(l[7]), True, sr, tr.data[:20*180], tr2.data[:20*180]]

            data_rock.loc[data_rock.shape[0] + 1] = row
            prevstation = station
            prevchannel = channel

        i += 1


In [None]:
# Empty tables: `rock` holds the rockburst events of the year being processed,
# `data_rock` accumulates every rockburst arrival with its waveform.
rock = pd.DataFrame(columns=c)
data_rock = pd.DataFrame(columns=data_cols)
event = 1

for y in range(2010, 2019):
    print(y)

    # Rebuild the event table for this year and restart the event pointer.
    rock = pd.DataFrame(columns=c)
    bulletin = web_scrape_events(y)
    rock_event_collect(bulletin)
    print(rock.shape[0])
    event = 1

    for m in range(1, 13):
        # there is no data for December 2018
        if (y, m) == (2018, 12):
            break
        arrivals = web_scrape_arrivals(m, y)
        rock_data_collect(arrivals)

    print("Data: " + str(data_rock.shape[0]))

data_rock

In [None]:
# One row of 3600 samples per rockburst arrival (sized from data_rock; the
# previously referenced name `df` is not defined anywhere in this notebook).
x_rock = np.empty((data_rock.shape[0], 3600))

# data_rock carries 1-based labels, so rows are placed by position rather
# than by label to keep x_rock aligned with data_rock's row order.
for pos, (_, row) in enumerate(data_rock.iterrows()):
    s = row["Samples"]
    # Flat waveforms are reported; NOTE(review): they still go through the
    # division below, where max - min is 0.
    if max(s) == min(s):
        print (row)
    x_rock[pos] = s

y_rock = np.array(data_rock["IsExplosion"])

# Add a trailing axis of length 1 to the waveform array.
x_rock = x_rock.reshape(x_rock.shape[0], x_rock.shape[1], 1)

y_rock = np.asarray(y_rock).astype('int32')

#Normalization: subtract each waveform's mean and divide by its max - min range
sums = x_rock.sum(axis=1)
sums = sums/3600
mins = x_rock.min(axis=1)
maxs = x_rock.max(axis=1)

sums = sums.reshape((sums.shape[0], 1))
mins = mins.reshape((mins.shape[0], 1))
maxs = maxs.reshape((maxs.shape[0], 1))

x_rock = x_rock.reshape((x_rock.shape[0], x_rock.shape[1]))
x_rock = -sums + x_rock
x_rock /= (maxs-mins)
x_rock = x_rock.reshape((x_rock.shape[0], x_rock.shape[1], 1))

with open("x_rock.pic", "wb") as fp:
    pickle.dump(x_rock, fp)
with open("y_rock.pic", "wb") as fp:
    pickle.dump(y_rock, fp)
with open("df_rock.pic", "wb") as fp:
    pickle.dump(data_rock, fp)