In [1]:
import matplotlib.pyplot as plt
import numpy as np
import h5py

In [2]:
### ----- Load the full dataset from HDF5 files ----- ###

# Only the 'magnitude' dataset of each split is needed here. Indexing with
# `[:]` reads the whole dataset into an in-memory array, so the arrays remain
# usable after the file is closed. The `with` blocks guarantee every file
# handle is released even if opening or reading one of the files fails.
with h5py.File('/hdd/mlaapde/decimated/training_data_full_decimate2.hdf5', 'r') as training_data:
    train_mags = training_data['magnitude'][:]

with h5py.File('/hdd/mlaapde/decimated/validation_data_full_decimate2.hdf5', 'r') as validation_data:
    valid_mags = validation_data['magnitude'][:]

with h5py.File('/hdd/mlaapde/decimated/testing_data.hdf5', 'r') as testing_data:
    test_mags = testing_data['magnitude'][:]

In [3]:
# Fixing the weird nan wave: the magnitudes stored at the two affected
# training indices are overwritten with the magnitudes of samples 0 and 1.
# NOTE(review): presumably this mirrors a substitution applied to the waveform
# arrays for the same two indices -- confirm against the training pipeline.
idx, idx2 = 741100, 741102

# Replacement values are read before anything is written.
copy_mag, copy_mag2 = train_mags[0], train_mags[1]

train_mags[idx], train_mags[idx2] = copy_mag, copy_mag2

In [4]:
# Total number of samples across the training, validation and testing splits.
sum(len(split_mags) for split_mags in (train_mags, valid_mags, test_mags))

3244974

In [5]:
# Magnitudes of the extended historical test set, read from a .npy file.
historic_test_mags = np.load(file = '/hdd/mlaapde/decimated/historic_test_mags.npy')

In [6]:
# Datasets stacked in panel (c) of Figure S1, in the same order as that
# panel's colours and labels: MLAAPDE testing data first, extended historical
# data second. The first entry must be `test_mags`; `valid_mags` is already
# drawn in the validation panel and does not match the testing-data label.
both_test = [test_mags, historic_test_mags]

In [9]:
### ----- FIGURE S1 ----- ###

def _draw_magnitude_panel(ax, mags, colors, labels, title, tag, tag_x, xticks,
                          stacked = False, xlabel = None, label_y_axis = False):
    """Draw one log-count magnitude histogram panel of Figure S1 on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    mags, colors, labels : forwarded to `ax.hist` as `x`, `color` and `label`
        (either a single dataset/colour/label or parallel lists of them).
    title : panel title.
    tag : panel letter drawn above the axes, e.g. '(a)'.
    tag_x : x position of `tag`, in data coordinates.
    xticks : x tick locations.
    stacked : forwarded to `ax.hist`.
    xlabel : x-axis label, or None to leave the axis unlabelled.
    label_y_axis : if True, draw the 'Log Count' label and y tick labels;
        otherwise hide the y tick labels (the panels touch, so only the
        left-most one needs them).
    """
    ax.grid()
    ax.hist(mags, bins = 50, color = colors, label = labels, stacked = stacked)
    ax.set_title(title, fontsize = 16)
    if xlabel is not None:
        ax.set_xlabel(xlabel, fontsize = 18)
    if label_y_axis:
        ax.set_ylabel('Log Count', fontsize = 18)
    ax.set_yscale('log', nonpositive = 'clip')
    ax.set_ylim(1, 10**6)
    if label_y_axis:
        ax.tick_params(axis = 'y', labelsize = 16)
    else:
        ax.tick_params(axis = 'y', labelleft = False)
    ax.set_xlim(0, 9.5)
    ax.set_xticks(xticks)
    ax.tick_params(axis = 'x', labelsize = 16)
    ax.legend(loc = 'upper left', fontsize = 13)
    # y = 2e6 lies above the upper y limit (1e6), so the tag sits over the panel.
    ax.text(tag_x, 2*10**6, tag, fontsize = 20)


fig, (ax_train, ax_valid, ax_test) = plt.subplots(1, 3, figsize = (16,8), facecolor = 'white', dpi = 300)

# Sample counts in the titles are computed from the arrays that are actually
# plotted, so the titles cannot drift out of sync with the data.
_draw_magnitude_panel(
    ax_train, train_mags,
    colors = '#001528',
    labels = 'Aug. 1, 2013 - Sept. 30, 2018',
    title = f'Training data ({len(train_mags):,} samples)',
    tag = '(a)', tag_x = -1.6,
    xticks = list(range(0, 10)),
    label_y_axis = True,
)

_draw_magnitude_panel(
    ax_valid, valid_mags,
    colors = '#2DADB4',
    labels = 'Oct. 1, 2018 - Dec. 31, 2019',
    title = f'Validation data ({len(valid_mags):,} samples)',
    tag = '(b)', tag_x = -0.1,
    xticks = list(range(1, 10)),  # 0 omitted: it would collide with the neighbouring panel's last tick
    xlabel = 'Catalog Magnitude',
)

_draw_magnitude_panel(
    ax_test, both_test,
    colors = ['#730114', '#f01f42'],
    labels = ['MLAAPDE testing data\n(Jan. 1, 2020 - Dec. 31, 2020)', 'Extended historical data\n(Jan. 1, 2000 - Jul. 31, 2013)'],
    title = f'Testing data ({sum(len(mags) for mags in both_test):,} samples)',
    tag = '(c)', tag_x = -0.1,
    xticks = list(range(1, 10)),
    stacked = True,
)

# No horizontal gap between panels: they share the same y range.
fig.subplots_adjust(wspace = 0)
fig.savefig('/home/sdybing/mlaapde/testdata_preds/all_test_rerun/revised_figures/figS1_datadistrib_hist.png', format = 'PNG')
plt.close(fig);
