In [1]:
import numpy as np
import matplotlib.pyplot as plt

# --- Run identifiers -------------------------------------------------------
project = 'july6'
traindate = '2023-07-27'
testdate = '2023-07-28'

# --- Directory layout for this training run --------------------------------
# Everything lives under one per-training-date folder; test outputs are read
# from its data/ subfolder and figures are written to its figures/ subfolder.
traindate_path = f'/Users/sydneydybing/gnss-picker/cnn_models_outputs/{project}_fq_train/models/traindate_{traindate}/'
test_outputs_path = f'{traindate_path}data/'
figure_save_dir = f'{traindate_path}figures/'

# --- Test-set arrays --------------------------------------------------------
# The four files share the '<testdate>_fqtest_<kind>.npy' naming pattern, so
# they are loaded in one pass and unpacked in a fixed order.
fqtest_data, fqtest_metadata, fqtest_target, fqtest_predictions = (
    np.load(f'{test_outputs_path}{testdate}_fqtest_{kind}.npy')
    for kind in ('data', 'metadata', 'target', 'predictions')
)

num_fqtest = len(fqtest_predictions) # Number of test predictions
best_thresh = 0.13 # From code 2


In [56]:
# Calculating PGDs from the waveforms
#
# For every test waveform the PGD is the largest value, over the time axis, of
# the three-component vector magnitude sqrt(N^2 + E^2 + Z^2). Components are
# read from the last axis of fqtest_data in the order N (0), E (1), Z (2).

north = fqtest_data[:, :, 0]
east = fqtest_data[:, :, 1]
vertical = fqtest_data[:, :, 2]

# Same arithmetic as doing it row by row, just applied to all rows at once.
displacement = np.sqrt((north)**2 + (east)**2 + (vertical)**2)

# Kept as a plain list (one value per waveform) because later cells treat
# `pgds` as a list.
pgds = list(np.max(displacement, axis=1))

pgd_array = np.array(pgds)
print(pgd_array.shape)
np.save(test_outputs_path + 'fqtest_data_pgds.npy', pgd_array)

(91738,)


In [26]:
# Calculating SNRs from the waveforms using the target Gaussian peak as the arrival time
#
# The SNR of a component is std(signal window) / std(noise window), where the
# noise window is the samples just before the P-wave arrival index and the
# signal window is the samples starting at it. The original notes describe
# both as 20-second windows, i.e. one sample per second -- TODO confirm the
# sample rate.
#
# Rows that cannot give a meaningful SNR are recorded with the string 'nan'
# (not np.nan): later cells compare against the literal 'nan', so the sentinel
# is kept as-is.

SNR_WINDOW = 20 # Length (in samples) of the noise and signal windows
SNR_MIN_PICK_INDEX = 10 # Picks at or before this index have too little pre-arrival noise and are skipped
SNR_SAMPLE_LENGTH = 128 # Number of samples per waveform assumed by the windowing
SNR_SKIPPED = ('nan', 'nan', 'nan') # What gets recorded for a row we can't compute


def _component_snrs(waveform, p_arrival_index):
    """Return the (N, E, Z) signal-to-noise ratios for one waveform.

    Parameters
    ----------
    waveform : 2D array indexed as [sample, component], with components
        0, 1, 2 being N, E, Z.
    p_arrival_index : int
        Sample index of the P-wave arrival.

    Returns
    -------
    tuple
        (SNR_N, SNR_E, SNR_Z). Each entry is a float, or all three are the
        string 'nan' when:
          * the pick is at or before SNR_MIN_PICK_INDEX (the noise window is
            missing or so short it throws the SNR off badly),
          * the pick is at or beyond SNR_SAMPLE_LENGTH (no signal window), or
          * the noise standard deviation of any component is exactly zero
            (the ratio would be inf).
    """
    if p_arrival_index <= SNR_MIN_PICK_INDEX or p_arrival_index >= SNR_SAMPLE_LENGTH:
        return SNR_SKIPPED

    # Clamp both windows to the record: an early pick shortens the noise
    # window (it starts at sample 0), a late pick shortens the signal window
    # (it stops at the end of the sample).
    preeq_std_start = max(p_arrival_index - SNR_WINDOW, 0)
    preeq_std_end = p_arrival_index
    posteq_std_start = p_arrival_index
    posteq_std_end = min(p_arrival_index + SNR_WINDOW, SNR_SAMPLE_LENGTH)

    # Standard deviations are taken one component at a time on 1D slices.
    stds_before = [np.std(waveform[preeq_std_start:preeq_std_end, comp]) for comp in range(3)]
    stds_after = [np.std(waveform[posteq_std_start:posteq_std_end, comp]) for comp in range(3)]

    # A zero denominator on any component invalidates the whole row.
    if any(std_before == 0 for std_before in stds_before):
        return SNR_SKIPPED

    # NOTE(review): if records are 128 samples long, a pick at index 127 leaves
    # a one-sample signal window whose std is 0, so the SNR comes out as 0
    # (and log10 of it as -inf downstream). Consider skipping such rows too.
    return tuple(std_after / std_before for std_after, std_before in zip(stds_after, stds_before))


targets_count = [] # Arrival indices of every row that has a target peak

SNRs_N = []
SNRs_E = []
SNRs_Z = []

for idx in range(len(fqtest_target)):

    target = fqtest_target[idx,:]
    target_range = target.max() - target.min()

    if target_range != 0:
        # The peak of the target marks the P-wave arrival
        target_max_idx = np.argmax(target)
        targets_count.append(target_max_idx)

        SNR_N, SNR_E, SNR_Z = _component_snrs(fqtest_data[idx], int(target_max_idx))

    else:
        # A flat target has no peak to use as an arrival, so there is no SNR
        SNR_N, SNR_E, SNR_Z = SNR_SKIPPED

    # One entry per row, so the SNR lists stay aligned with fqtest_data
    SNRs_N.append(SNR_N)
    SNRs_E.append(SNR_E)
    SNRs_Z.append(SNR_Z)

print(len(SNRs_N))
print(len(SNRs_E))
print(len(SNRs_Z))


In [None]:
# Write one .npy file per component. Each list is converted to an array first;
# because the lists mix floats with the string 'nan', the saved arrays hold
# strings.
for component, component_snrs in (('N', SNRs_N), ('E', SNRs_E), ('Z', SNRs_Z)):
    np.save(test_outputs_path + 'fqtest_data_SNRs_' + component + '.npy', np.array(component_snrs))

In [86]:
# Plots to check and make sure stuff looks right
#
# 2x2 figure: histogram of log PGD, log PGD vs magnitude, histogram of log SNR
# (N-S component) and log SNR vs magnitude. The figure is saved to disk and
# closed rather than shown inline.

# Magnitudes (metadata column 2) and SNRs both use the string 'nan' for
# missing values; float() turns that into a real NaN and leaves numbers alone.
testmags = [float(mag) for mag in fqtest_metadata[:, 2]]
testsnrs = [float(snr) for snr in SNRs_N]

# An SNR of exactly 0 gives log10 = -inf. Those are filtered out below, so the
# divide-by-zero warning is silenced deliberately here.
with np.errstate(divide = 'ignore'):
    logsnrs = np.log10(testsnrs)
logpgds = np.log10(pgds)

# Remove NaN and +/-inf SNRs to be able to make the histogram
finite_snr_mask = np.isfinite(logsnrs)
fixsnrs = list(logsnrs[finite_snr_mask])
numremoved = np.flatnonzero(~finite_snr_mask) # Positions of the dropped entries

print('Number of NaNs removed: ' + str(len(numremoved)))
print('Number of good SNRs left: ' + str(len(fixsnrs)))

fig, axes = plt.subplots(nrows = 2, ncols = 2, figsize = (15,10), dpi = 300, facecolor = 'white')
fig.suptitle('PGD and SNR distribution in FQ testing dataset', fontsize = 20)

ax_pgd_hist, ax_pgd_mag = axes[0]
ax_snr_hist, ax_snr_mag = axes[1]

ax_pgd_hist.hist(logpgds, bins = 50, color = 'deepskyblue', edgecolor = 'black')
ax_pgd_hist.set_xlabel('Log PGD (m)', fontsize = 16)
ax_pgd_hist.set_ylabel('Count', fontsize = 16)

ax_pgd_mag.scatter(testmags, logpgds, s = 0.5, color = 'deepskyblue')
ax_pgd_mag.set_xlabel('Magnitude', fontsize = 16)
ax_pgd_mag.set_ylabel('Log PGD (m)', fontsize = 16)

ax_snr_hist.hist(fixsnrs, bins = 50, color = 'limegreen', edgecolor = 'black')
ax_snr_hist.set_xlabel('Log SNR (N-S component)', fontsize = 16)
ax_snr_hist.set_ylabel('Count', fontsize = 16)

# The scatter uses the unfiltered values so they stay aligned with testmags
ax_snr_mag.scatter(testmags, logsnrs, s = 0.5, color = 'limegreen')
ax_snr_mag.set_xlabel('Magnitude', fontsize = 16)
ax_snr_mag.set_ylabel('Log SNR (N-S component)', fontsize = 16)

fig.savefig(figure_save_dir + '9_pgd_and_snr_distribution_fqtest_data.png', format = 'PNG')
plt.close(fig);

  logsnrs = np.log10(testsnrs)


Number of NaNs removed: 50161
Number of good SNRs left: 41577


In [91]:
# Make a list of indices for rows in the testing dataset that actually have earthquakes
#
# A row counts as an earthquake row when metadata column 0 is anything other
# than the string 'nan' (presumably the rupture name -- verify against the
# metadata layout).

rows_w_eqs = [
    row
    for row in range(len(fqtest_metadata))
    if not fqtest_metadata[row, 0] == 'nan'
]

print(len(rows_w_eqs))

45869


In [None]:
# This next section deals with calculating the averages for the entire set of 
# earthquakes and the set of earthquakes which the CNN got correct in testing.
#
# NOTE(review): `new_meta_array` is not defined anywhere in the visible part of
# this notebook, so this cell fails on a fresh kernel unless it is created
# elsewhere -- confirm where it comes from. The code below reads its column 6
# as the PGD.
# NOTE(review): `rows_w_eqs` is overwritten here with an array loaded from a
# different project directory (More_RealData), replacing any value computed
# earlier in the session -- confirm that this is intended for the FQ test set.

pgds_vector = np.array(pgds).reshape(len(pgds),1) # Turn the list of PGDs we made into a vector...


rows_w_eqs = np.load('/Users/sydneydybing/GNSS-CNN_repo/GNSS-CNN/More_RealData/rowsweqs.npy') # Rows that have earthquakes
correct_eq_inds = np.genfromtxt('/Users/sydneydybing/GNSS-CNN_repo/GNSS-CNN/newtrain_march/more_realdata_norm_testing/correct_indices.txt', dtype = 'int') # Rows that the CNN found earthquakes in

# Calculating average PGDs for the groups.
# (np.asarray(..., dtype = float) is used because np.asfarray was removed in NumPy 2.0.)

all_eq_pgds = np.asarray(new_meta_array[rows_w_eqs, 6], dtype = float) # The PGDs of all of the earthquakes
correct_eq_pgds = np.asarray(new_meta_array[correct_eq_inds, 6], dtype = float) # The PGDs of all the earthquakes the CNN found

all_eq_avg_PGD = np.mean(all_eq_pgds)
correct_eq_avg_PGD = np.mean(correct_eq_pgds)

print('Average PGD of all earthquakes: ' + str(round((all_eq_avg_PGD * 100),2)) + ' cm')
print('Average PGD of earthquakes the CNN correctly found: ' + str(round((correct_eq_avg_PGD * 100),2)) + ' cm')
print('-------------------------------------------------------------------')

# Calculating average SNRs for all three components of both groups.

SNR_N_vector = np.array(SNRs_N).reshape(len(SNRs_N),1) # Turns the lists of SNRs into vectors...
SNR_E_vector = np.array(SNRs_E).reshape(len(SNRs_E),1)
SNR_Z_vector = np.array(SNRs_Z).reshape(len(SNRs_Z),1)

new_meta_array_a = np.append(new_meta_array, SNR_N_vector, axis = 1) # ...And adds these to make another new metadata array
new_meta_array_b = np.append(new_meta_array_a, SNR_E_vector, axis = 1)
new_meta_array_2 = np.append(new_meta_array_b, SNR_Z_vector, axis = 1)

# print(new_meta_array_2[0]) # Columns: station, date, start time, end time, counter, gauss position, pgd, SNR N component, SNR E, SNR Z

np.save('/Users/sydneydybing/GNSS-CNN_repo/GNSS-CNN/More_RealData/real_metadata_w_gauss_pgd_snr.npy', new_meta_array_2)

# Because of the edge cases in the SNR loop, there are some nans in our SNR vectors.
# We can't calculate averages with nans, so we need to find the rows with nans and
# just remove them for the sake of this calculation. Only column 7 (SNR N) is
# checked when deciding which rows to drop.

h = np.where(new_meta_array_2[rows_w_eqs,7] == 'nan')[0] # Positions (within rows_w_eqs) of nans for all earthquakes
non_nan_rows_w_eqs = np.delete(rows_w_eqs, h) # Removes those rows

j = np.where(new_meta_array_2[correct_eq_inds,7] == 'nan')[0] # Positions (within correct_eq_inds) of nans for the earthquakes the CNN found
non_nan_correct_eq_inds = np.delete(correct_eq_inds, j) # Removes those rows

# Now I just grab the good SNRs out of the new metadata array and calculate the averages.

all_eq_SNR_N = np.asarray(new_meta_array_2[non_nan_rows_w_eqs, 7], dtype = float)
all_eq_SNR_E = np.asarray(new_meta_array_2[non_nan_rows_w_eqs, 8], dtype = float)
all_eq_SNR_Z = np.asarray(new_meta_array_2[non_nan_rows_w_eqs, 9], dtype = float)

correct_eq_SNR_N = np.asarray(new_meta_array_2[non_nan_correct_eq_inds, 7], dtype = float)
correct_eq_SNR_E = np.asarray(new_meta_array_2[non_nan_correct_eq_inds, 8], dtype = float)
correct_eq_SNR_Z = np.asarray(new_meta_array_2[non_nan_correct_eq_inds, 9], dtype = float)

all_eq_SNR_N_avg = np.mean(all_eq_SNR_N)
all_eq_SNR_E_avg = np.mean(all_eq_SNR_E)
all_eq_SNR_Z_avg = np.mean(all_eq_SNR_Z)

correct_eq_SNR_N_avg = np.mean(correct_eq_SNR_N)
correct_eq_SNR_E_avg = np.mean(correct_eq_SNR_E)
correct_eq_SNR_Z_avg = np.mean(correct_eq_SNR_Z)

# How many rows went into each average
print(len(all_eq_SNR_N))
print(len(correct_eq_SNR_N))

print('Average N-S component SNR of all earthquakes: ' + str(round(all_eq_SNR_N_avg,2)))
print('Average N-S component SNR of earthquakes the CNN correctly found: ' + str(round(correct_eq_SNR_N_avg,2)))
print('-------------------------------------------------------------------')

print('Average E-W component SNR of all earthquakes: ' + str(round(all_eq_SNR_E_avg,2)))
print('Average E-W component SNR of earthquakes the CNN correctly found: ' + str(round(correct_eq_SNR_E_avg,2)))
print('-------------------------------------------------------------------')

print('Average Z component SNR of all earthquakes: ' + str(round(all_eq_SNR_Z_avg,2)))
print('Average Z component SNR of earthquakes the CNN correctly found: ' + str(round(correct_eq_SNR_Z_avg,2)))




