## Notebook to compute ML picks for INGV events in Norcia Catalog

In [5]:
import time
import os
import obspy
from obspy import UTCDateTime
from obspy.core.event import  Event, Origin, Magnitude, Pick, WaveformStreamID
from obspy import Catalog
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
from obspy import read_events
from obspy import read_inventory
from obspy.clients.fdsn import Client
from obspy.clients.filesystem.sds import Client as sdsclient
from obspy.core import Trace, Stream, Stats
import glob

### Function to download waveforms

In [2]:
def download(starttime, endtime, inventory):
    """Read waveforms for every station of *inventory* from the local SDS archives.

    Stations of network ``YR`` are requested from the module-level ``sdsYR``
    client, every other network from ``sds``.  The channel pattern is the
    first two characters of the first channel code that *inventory* lists for
    the station code, followed by ``?``.

    Each station's data is merged with gaps filled by zeros, and traces whose
    sampling rate is not 100 are linearly interpolated to 100.  A request that
    raises is retried up to ``max_retry`` times with a 5 s pause, unless the
    error message starts with "No data available for request.".

    :param starttime: start of the requested window.
    :param endtime: end of the requested window.
    :param inventory: ObsPy inventory listing the networks/stations to read.
    :return: a merged ``obspy.Stream`` (empty when nothing could be read).
    """
    max_retry = 10
    no_data_message = "No data available for request."
    stream = obspy.Stream()
    for network in inventory:
        for station in network:
            retry = 0
            while retry < max_retry:
                try:
                    first_channel = inventory.select(station=station.code).get_contents()['channels'][0]
                    ch = first_channel.split('.')[-1][:2] + "?"
                    # YR data is served by a separate archive client
                    archive = sdsYR if network.code == "YR" else sds
                    tmp = archive.get_waveforms(
                        network=network.code,
                        station=station.code,
                        location="",
                        channel=ch,
                        starttime=starttime,
                        endtime=endtime
                    )
                    if len(tmp) > 0:
                        tmp.merge(method=0, fill_value=0)
                    for trace in tmp:
                        if trace.stats.sampling_rate != 100:
                            # Trace.interpolate works in place on the trace
                            trace.interpolate(100, method="linear")
                    stream += tmp
                    break
                except Exception as err:
                    print("Error {}.{}: {}".format(network.code, station.code, err))
                    if str(err).startswith(no_data_message):
                        # nothing to fetch for this station: retrying is pointless
                        break
                    retry += 1
                    time.sleep(5)
            if retry == max_retry:
                # the original message formatted an undefined name (`fname`),
                # which raised NameError instead of reporting the failure
                print(f"MAX {max_retry} retries reached : {network.code}.{station.code}")
    stream.merge(method=0, fill_value=0)
    return stream

In [3]:

def create_zero_stream(network, station, channels, start_time, end_time, sampling_rate):
    """Build a Stream of all-zero traces, one per entry of *channels*.

    Every trace starts at *start_time* and holds
    ``int((end_time - start_time) * sampling_rate)`` samples (none when that
    number is not positive).

    :param network: network code stored in each trace header.
    :param station: station code stored in each trace header.
    :param channels: iterable of channel codes; one trace is created for each.
    :param start_time: start of the synthetic window.
    :param end_time: end of the synthetic window.
    :param sampling_rate: sampling rate stored in each trace header.
    :return: the ``Stream`` holding the zero traces.
    """
    n_samples = max(int((end_time - start_time) * sampling_rate), 0)
    zero_stream = Stream()

    for channel_code in channels:
        header = Stats()
        header.network = network
        header.station = station
        header.channel = channel_code
        header.sampling_rate = sampling_rate
        header.starttime = UTCDateTime(start_time)

        # a fresh float array per trace, so traces never share their data
        zero_stream.append(Trace(data=np.zeros(n_samples), header=header))

    return zero_stream


# Print the resulting stream
# print(zero_stream)


In [4]:

def _closest_pick(picks, phase, reference):
    """Return ``(time, probability)`` of the *phase* pick nearest to *reference*.

    ``(None, None)`` is returned when *picks* contains no pick of that phase.
    On equal distance the first pick encountered is kept.
    """
    best_time = None
    best_prob = None
    best_delta = 1e30
    for p in picks:
        if p.phase != phase:
            continue
        peak = obspy.UTCDateTime(p.peak_time)
        delta = abs(reference - peak)
        if delta < best_delta:
            best_time, best_prob, best_delta = peak, p.peak_value, delta
    return best_time, best_prob


def findpicks(pddataframe, picker, threshold, inventory):
    """Run *picker* around each catalog pick and store the nearest ML picks.

    For every row a window from 20 before to 40 after the reference pick
    (the P pick, or the S pick when P is missing) is read with ``download``.
    When no waveform is returned, a zero-valued stream is built from the
    channels listed in the inventory so the picker can still be run.  If the
    inventory has no channel for the station either, the row gets ``None`` in
    every output column (the original code crashed in that case).

    The DataFrame is modified in place; these columns are added:
    ``id`` (id of the first trace), ``P pn`` / ``S pn`` (ML pick closest to
    the catalog pick) and ``Pproba`` / ``Sproba`` (its peak value).

    :param pddataframe: DataFrame with columns ``Station``, ``P pick``, ``S pick``.
    :param picker: object whose ``classify(...)`` result exposes ``.picks``
        (used here with the SeisBench PhaseNet models).
    :param threshold: value passed as both ``P_threshold`` and ``S_threshold``.
    :param inventory: ObsPy inventory used to select the station metadata.
    :return: None.
    """
    name = "pn"
    Plist = []
    Slist = []
    Pproblist = []
    Sproblist = []
    mseedlist = []

    for _, row in pddataframe.iterrows():
        sta = row['Station']
        Pori = row['P pick']
        Sori = row['S pick']

        reference = Pori if pd.notna(Pori) else Sori
        t0 = reference - 20
        t1 = reference + 40

        inv = inventory.select(station=sta, starttime=t0, endtime=t1)
        for net in inv:
            # NOTE(review): 8P is relabelled IV before reading, presumably
            # because the archive stores these stations under IV - confirm
            if net.code == "8P":
                net.code = "IV"

        mseed = download(t0, t1, inv)

        if len(mseed) == 0:
            # No data: fall back to a stream of zeros for the known channels
            network = None
            station = None
            channels = []
            for net in inv:
                network = net.code
                for station_meta in net:
                    station = station_meta.code
                    channels.extend(ch.code for ch in station_meta)
            if channels:
                mseed = create_zero_stream(network, station, channels, t0, t1, 100.0)

        if len(mseed) == 0:
            # Neither waveforms nor channel metadata: keep the row aligned
            # with empty results instead of failing on mseed[0]
            print(f"No waveforms and no channel metadata for station {sta}")
            mseedlist.append(None)
            Plist.append(None)
            Pproblist.append(None)
            Slist.append(None)
            Sproblist.append(None)
            continue

        mseedlist.append(mseed[0].get_id())
        for trace in mseed:
            if trace.stats.sampling_rate != 100:
                trace.resample(100.)

        picks = picker.classify(mseed, overlap=2800, stacking='max', P_threshold=threshold, S_threshold=threshold).picks

        if pd.notna(Pori):
            PP, Pprob = _closest_pick(picks, 'P', Pori)
        else:
            PP, Pprob = None, None
        Plist.append(PP)
        Pproblist.append(Pprob)

        if pd.notna(Sori):
            SS, Sprob = _closest_pick(picks, 'S', Sori)
        else:
            SS, Sprob = None, None
        Slist.append(SS)
        Sproblist.append(Sprob)

    namep = "P " + name
    names = "S " + name
    pddataframe['id'] = mseedlist
    pddataframe[namep] = Plist
    pddataframe[names] = Slist
    pddataframe['Pproba'] = Pproblist
    pddataframe['Sproba'] = Sproblist
    return


In [5]:
def write_event(df_local, origintime, eve_id, savelocaldirectory):
    """Write the ML picks of one event to a NonLinLoc observation file.

    A pick is written only when both the catalog pick and the ML pick exist
    and differ by at most ``timedelta``.  P picks are written with channel
    code "Z" and time error 0.02, S picks with channel code "N" and 0.04.
    The file is created with ObsPy's ``NLLOC_OBS`` writer and a
    ``PUBLIC_ID <resource id>`` line is then prepended to it.  When no pick
    qualifies, the resource id is appended to ``no_event.txt`` instead.

    :param df_local: DataFrame with columns ``Station``, ``id``, ``P pick``,
        ``S pick``, ``P pn`` and ``S pn``.
    :param origintime: origin time; only used to build the file name.
    :param eve_id: identifier assigned to the event's ``resource_id``.
    :param savelocaldirectory: sub-directory of the OBS output folder.
    :return: None.
    """
    timedelta = 2

    obsdir = "/home/jovyan/shared/users/spina/Norcia/github/OBS/"+savelocaldirectory+"/"
    os.makedirs(obsdir, exist_ok=True)

    e = Event()
    e.event_type = "Earthquake"
    e.resource_id = eve_id

    # (phase, channel code written to the file, time error)
    phase_specs = (
        ('P', 'Z', 0.02),
        ('S', 'N', 0.04),
    )

    for _, row in df_local.iterrows():
        for phase, channel_code, time_error in phase_specs:
            manual = row[phase + ' pick']
            automatic = row[phase + ' pn']
            # A missing ML pick is None/NaN, not '': the original `!= ''`
            # test let those through and the subtraction below then raised.
            if not (pd.notna(manual) and pd.notna(automatic) and (automatic != '')):
                continue
            if abs(manual - automatic) <= timedelta:
                wav_id = WaveformStreamID(
                    station_code=row['Station'],
                    channel_code=channel_code,
                    network_code=row['id'].split('.')[0]
                )
                e.picks.append(Pick(
                    time=automatic,
                    waveform_id=wav_id,
                    phase_hint=phase,
                    evaluation_mode="automatic",
                    time_errors=time_error
                ))

    if len(e.picks) > 0:
        fileOBS = obsdir + "Norcia_test_" + str(origintime) + "_" + ".phs"
        e.write(fileOBS, format="NLLOC_OBS")

        # Prepend the public id: read everything, rewind, write header + body
        with open(fileOBS, "r+") as f:
            s = f.read()
            f.seek(0)
            f.write("PUBLIC_ID "+str(e.resource_id)+"\n" + s)
    else:
        print('No picks for event',e.resource_id)
        with open('no_event.txt', 'a') as f:
            f.write(str(e.resource_id)+"\n")
# Example usage:
# replace the arguments with your actual data
# write_event(df_picks, origintime_value, event_id_value, save_directory_name)


In [6]:
def check_components(stream):
    """Tell whether *stream* holds exactly two horizontal-component traces.

    Traces are selected with the component pattern ``[E,N,1,2]``; the result
    is True only when that selection contains exactly two traces.
    """
    horizontals = stream.select(component='[E,N,1,2]')
    return len(horizontals) == 2

# Example usage
# stream = mseed  # Your ObsPy Stream object here
# has_en_components = check_components(stream)
# print(f"Stream has E or N components: {has_en_components}")


In [7]:
def check_data_length(stream, threshold=400):
    """Return True when every trace of *stream* has more than *threshold* samples.

    An empty stream yields True, since no trace is too short.
    """
    return all(len(trace.data) > threshold for trace in stream)

# # Example usage:
# if check_data_length(mseed):
#     print("All traces have data length larger than the threshold.")
# else:
#     print("Some traces have data length less than or equal to the threshold.")


In [8]:
def _phase_snr(pick_time, station_code, inventory, lenght, filt, buffer_seconds):
    """Return ``(best_snr, best_corner)`` for one pick on the horizontal traces.

    The signal window is ``[pick - buffer, pick + lenght - buffer]`` and the
    noise window ``[pick - lenght - buffer, pick - buffer]``.  For each
    high-pass corner in *filt* the value is ``20 * log10`` of the ratio
    between the 95th percentiles of the absolute amplitude in the two
    windows, averaged over the two horizontal traces.  The corner giving the
    largest value is returned.  ``("NTA", "NTA")`` is returned when the data
    do not pass ``check_components`` and ``check_data_length``.
    """
    start_signal = pick_time - buffer_seconds
    end_signal = pick_time + lenght - buffer_seconds
    start_noise = pick_time - (lenght + buffer_seconds)
    end_noise = pick_time - buffer_seconds

    # read one extra unit on both sides of the analysed windows
    t0 = start_noise - 1
    t1 = end_signal + 1
    inv = inventory.select(station=station_code, starttime=t0, endtime=t1)
    for net in inv:
        if net.code == "8P":
            net.code = "IV"

    mseed = download(t0, t1, inv)
    mseed = mseed.select(component="[E,N,1,2]")

    if not ((check_components(mseed)) and (check_data_length(mseed))):
        return "NTA", "NTA"

    snrlist = []
    for f in filt:
        snr = 0.
        stream_filt = mseed.copy().filter('highpass', freq=f)
        for t in stream_filt:
            tr = t.copy().trim(starttime=start_signal, endtime=end_signal)
            signal = np.absolute(tr.data)
            sig_95per = np.percentile(signal, 95)

            tr = t.copy().trim(starttime=start_noise, endtime=end_noise)
            noise = np.absolute(tr.data)
            noi_95per = np.percentile(noise, 95)
            snr += 20.0 * np.log10(sig_95per/noi_95per)
        # check_components guarantees exactly two traces
        snr = snr/2.
        snrlist.append(snr)
    snrindex = snrlist.index(max(snrlist))
    return snrlist[snrindex], filt[snrindex]


def SNR_dataframe(dataframe,inventory,lenght):
    """
    Add signal-to-noise estimates for the catalog P and S picks to *dataframe*.

    For every row and every available pick, waveforms are read with
    ``download`` and the SNR is evaluated on the horizontal components for a
    set of high-pass corners; the best value and its corner are kept.

    The DataFrame is modified in place.  Columns added: ``Psnr``, ``Pfreq``,
    ``Ssnr``, ``Sfreq`` (``None`` when the pick is missing, the string
    ``"NTA"`` when the data are not usable), plus ``P_terr`` and ``S_terr``,
    the difference between the catalog pick and the ``P pn`` / ``S pn`` pick.

    :param dataframe: DataFrame with columns ``Station``, ``P pick``,
        ``S pick``, ``P pn`` and ``S pn``.
    :param inventory: ObsPy inventory used to select the station metadata.
    :param lenght: length of the signal and of the noise window.
    :return: None.
    """
    filt=[0.05,0.1,0.2,0.5,1,2]
    buffer_samples = 0.2

    Psnr_final=[]
    Pfreq_final=[]
    Ssnr_final=[]
    Sfreq_final=[]

    for _, row in dataframe.iterrows():
        sta = row['Station']
        Pori = row['P pick']
        Sori = row['S pick']

        if pd.notna(Pori):
            best_snr, best_freq = _phase_snr(Pori, sta, inventory, lenght, filt, buffer_samples)
        else:
            best_snr, best_freq = None, None
        Psnr_final.append(best_snr)
        Pfreq_final.append(best_freq)

        if pd.notna(Sori):
            best_snr, best_freq = _phase_snr(Sori, sta, inventory, lenght, filt, buffer_samples)
        else:
            best_snr, best_freq = None, None
        Ssnr_final.append(best_snr)
        Sfreq_final.append(best_freq)

    dataframe['Psnr'] = Psnr_final
    dataframe['Pfreq'] = Pfreq_final
    dataframe['Ssnr'] = Ssnr_final
    dataframe['Sfreq'] = Sfreq_final

    dataframe['P_terr'] = dataframe['P pick'] - dataframe['P pn']
    dataframe['S_terr'] = dataframe['S pick'] - dataframe['S pn']

    return

In [9]:
def plot_residual_snr(local_dataframe):
    """Scatter the pick residual against log10(SNR), coloured by pick probability.

    Two panels are drawn, P on the left and S on the right.  Rows lacking the
    catalog pick, the ML pick, the probability or a numeric SNR are left out;
    this includes rows whose SNR is the string ``"NTA"``, which would
    otherwise make ``np.log10`` fail.

    :param local_dataframe: DataFrame with the columns ``P pick``, ``P pn``,
        ``Pproba``, ``Psnr``, ``P_terr`` and their S counterparts.
    :return: None (the figure is shown with ``plt.show()``).
    """
    fig, axs = plt.subplots(1, 2, figsize=(12, 6))

    for ax, phase, cmap in zip(axs, ('P', 'S'), ('viridis', 'plasma')):
        snr_col = phase + 'snr'
        log_col = 'log_' + snr_col
        proba_col = phase + 'proba'

        # Coerce first so non-numeric markers become NaN and are dropped;
        # .copy() gives an independent frame that is safe to add columns to.
        numeric_snr = pd.to_numeric(local_dataframe[snr_col], errors='coerce')
        df_phase = (
            local_dataframe.assign(**{snr_col: numeric_snr})
            .dropna(subset=[phase + ' pick', phase + ' pn', proba_col, snr_col])
            .copy()
        )
        df_phase[log_col] = np.log10(df_phase[snr_col])

        sc = ax.scatter(df_phase[phase + '_terr'], df_phase[log_col], c=df_phase[proba_col], cmap=cmap, edgecolor='black')
        ax.set_xlabel(phase + ' Residual (manual-phasnet)')
        ax.set_ylabel('Log ' + phase + ' SNR')
        ax.set_title(phase + ' Residual vs Log ' + phase + ' SNR')
        fig.colorbar(sc, ax=ax, label=phase + ' Probability')

    # Adjust layout
    plt.tight_layout()
    plt.show()
    
    # return


In [10]:

# FDSN web-service client for the INGV data centre
# (NOTE(review): not used by the code visible in this notebook - confirm it is still needed)
client = Client("INGV")
# Local SDS archive clients read by download(): `sdsYR` for network YR, `sds` for all the others
sds=sdsclient("/home/jovyan/data/sds/")
sdsYR=sdsclient("/home/jovyan/data/iris/")


In [11]:
# Time window of the analysis
starttime=UTCDateTime("2016-10-20T00:00:00")
endtime=UTCDateTime("2016-10-21T00:00:00")
# endtime=UTCDateTime("2016-12-01T00:00:00")
print(starttime,endtime)
# Number of whole days in the window (86400 s per day)
nday=int((endtime-starttime)/86400)

2016-10-20T00:00:00.000000Z 2016-10-21T00:00:00.000000Z


## Read inventory

In [12]:
# Merge every StationXML file found in ./INVENTORY into one inventory, `inv`
INVE='./INVENTORY/*.xml'

ii  = glob.glob(INVE)
inv=obspy.Inventory()

for e in ii:
    inv+=read_inventory(e)



In [13]:
# Codes of all stations in `inv` that have at least one channel matching "*Z"
# (`stalist` is rebuilt per event, as a list, in the main loop further down)
stalist=set()
# for net in invent_new.select(channel="*Z"):
for net in inv.select(channel="*Z"):
    for sta in net:
        stalist.add(sta.code)

## Read INGV catalog

In [4]:
# Read the INGV catalog and sort its events by the time of their first origin
catINGV = read_events("./catalog_ingv.xml")
catINGV = Catalog(sorted(catINGV, key=lambda e: e.origins[0].time))

### Load Seisbench model

In [15]:
import seisbench.models as sbm

# Two pretrained PhaseNet weight sets: "original" and "instance"
picker_pno = sbm.PhaseNet.from_pretrained("original")
picker_pni = sbm.PhaseNet.from_pretrained("instance")

# Move both models to the GPU (these calls need a CUDA device)
picker_pno.cuda()
picker_pni.cuda()
print(picker_pno.weights_docstring)

  model_weights = torch.load(f"{path_pt}")


Original PhaseNet model from Zhu et al. (2018). Originally published under MIT License. Original available at https://github.com/AI4EPS/PhaseNet/tree/master/model/190703-214543 . 

Converted to SeisBench by Jannes Münchmeyer (munchmej@univ-grenoble-alpes.fr) with help from Sacha Lapins, Yiyuan Zhong, and Jun Zhu


## Extract picks from INGV catalog corresponding to arrivals

In [16]:
## Define model a
# Picker and run parameters
picker=picker_pni
thresholds=0.1
dirsave='PN_INST_01'
offset=2

# The output directory is the same for every event: create it once
snr_dir='/home/jovyan/shared/users/spina/Norcia/github/NLLoc/SNR/'
os.makedirs(snr_dir, exist_ok=True)

for curreve in catINGV:
    staall = set()
    ingvpick = []
    data_dict = {}
    event_id_str = str(curreve.resource_id)
    evento_id = event_id_str.split("eventId=")[-1]
    ori = curreve.origins[0]
    arrivals = ori.arrivals
    picks=curreve.picks
    t = ori.time
    lon = ori.longitude
    lat = ori.latitude
    dep = ori.depth

    # Per-day inventory, selected by the julian day of the origin time
    inventory=read_inventory('./INVENTORY/inventory_ingv'+str(t.julday)+".xml")
    stalist = [sta.code for net in inventory.select(channel="*Z") for sta in net]
    stations_with_z = set(stalist)

    # Index the picks by resource id once (first occurrence wins, as the
    # original `[...][0]` lookup did) instead of scanning them per arrival
    picks_by_id = {}
    for p in picks:
        picks_by_id.setdefault(str(p.resource_id), p)

    for ar in arrivals:
        if ar.phase not in ['P','S','Pn','Sn','Pg','Sg']:
            continue
        pi = picks_by_id.get(str(ar.pick_id))
        if pi is None:
            # arrival pointing to a pick that is not in the event: skip it
            continue
        sta = pi.waveform_id.station_code
        staall.add(sta)
        if sta in stations_with_z:
            ingvpick.append(pi)

    # One entry per station; the phase that has no pick stays NaN
    for pick in ingvpick:
        station_code = pick.waveform_id.station_code
        if pick.phase_hint in ('P', 'Pg', 'Pn'):
            column = 'P pick'
        elif pick.phase_hint in ('S', 'Sg', 'Sn'):
            column = 'S pick'
        else:
            continue
        entry = data_dict.setdefault(
            station_code,
            {'Station': station_code, 'P pick': np.nan, 'S pick': np.nan},
        )
        entry[column] = pick.time

    if not data_dict:
        # A frame without columns would fail inside SNR_dataframe
        print('No usable picks for event', evento_id)
        continue

    # Convert the dictionary to a list of dictionaries and create a DataFrame
    df_picks = pd.DataFrame(list(data_dict.values()))

    # Now call phasenet to find the picks
    findpicks(df_picks,picker,thresholds,inventory)
    # NOTE(review): the SNR step uses the merged inventory `inv`, while
    # findpicks uses the per-day `inventory` - confirm this is intended
    SNR_dataframe(df_picks,inv,2)

    nname=snr_dir+str(ori.time)+'.csv'
    print(nname)
    df_picks.to_csv(nname, index=False)

    # To also write the NonLinLoc observation file for this event:
    # write_event(df_picks, ori.time, evento_id, dirsave)

    # print(len(ingvpick))

/home/jovyan/shared/users/spina/Norcia/githib/NLLoc/SNR/2016-10-20T23:40:14.010000Z.csv


In [20]:
 # Plot residual vs log SNR for the last DataFrame produced by the event loop
 plot_residual_snr(df_picks)

In [17]:
# Drop rows with missing values in the relevant columns
df_p = df_picks.dropna(subset=['P pick', 'P pn', 'Pproba', 'Psnr'])
df_s = df_picks.dropna(subset=['S pick', 'S pn', 'Sproba', 'Ssnr'])

# Calculate log of SNR and residuals
df_p['log_Psnr'] = np.log10(df_p['Psnr'])
df_s['log_Ssnr'] = np.log10(df_s['Ssnr'])
# df_p['P_terr'] = df_p['P pick'] - df_p['P pn']
# df_s['S_terr'] = df_s['S pick'] - df_s['S pn']

# Define the discrete color scale and normalization
cmap_p = plt.cm.viridis
cmap_s = plt.cm.plasma
bounds = np.arange(0, 1.1, 0.1)
norm = BoundaryNorm(boundaries=bounds, ncolors=cmap_p.N, clip=True)

# Create subplots
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Plot for P data
sc_p = axs[0].scatter(df_p['P_terr'], df_p['log_Psnr'], c=df_p['Pproba'], cmap=cmap_p, norm=norm, edgecolor='black')
axs[0].set_xlabel('P Residual (manual-phasnet)')
axs[0].set_ylabel('Log P SNR')
axs[0].set_title('P Residual vs Log P SNR')
cbar_p = fig.colorbar(sc_p, ax=axs[0], boundaries=bounds, ticks=bounds)
cbar_p.set_label('P Probability')

# Plot for S data
sc_s = axs[1].scatter(df_s['S_terr'], df_s['log_Ssnr'], c=df_s['Sproba'], cmap=cmap_s, norm=norm, edgecolor='black')
axs[1].set_xlabel('S Residual (manual-phasnet)')
axs[1].set_ylabel('Log S SNR')
axs[1].set_title('S Residual vs Log S SNR')
cbar_s = fig.colorbar(sc_s, ax=axs[1], boundaries=bounds, ticks=bounds)
cbar_s.set_label('S Probability')

# Adjust layout
plt.tight_layout()
plt.show()


In [18]:
# Keep the rows that have both a pick probability and a numeric SNR
# ("NTA" markers are coerced to NaN and dropped); .copy() makes the frames
# independent so that adding a column does not raise SettingWithCopyWarning.
df_p = (
    df_picks.assign(Psnr=pd.to_numeric(df_picks['Psnr'], errors='coerce'))
    .dropna(subset=['Pproba', 'Psnr'])
    .copy()
)
df_s = (
    df_picks.assign(Ssnr=pd.to_numeric(df_picks['Ssnr'], errors='coerce'))
    .dropna(subset=['Sproba', 'Ssnr'])
    .copy()
)

df_p['log_Psnr'] = np.log10(df_p['Psnr'])
df_s['log_Ssnr'] = np.log10(df_s['Ssnr'])

# Create the scatter plot.  The x axis is the pick probability, as stated by
# the axis label, legend and title; the original plotted the residuals
# (P_terr / S_terr) here by mistake.
plt.figure(figsize=(10, 6))
plt.scatter(df_p['Pproba'], df_p['log_Psnr'], color='blue', label='Pproba vs log(Psnr)')
plt.scatter(df_s['Sproba'], df_s['log_Ssnr'], color='red', label='Sproba vs log(Ssnr)')

# Add labels and title
plt.xlabel('Probability')
plt.ylabel('Log Signal-to-Noise Ratio (log SNR)')
plt.title('Scatter Plot of Probabilities vs Log SNR')
plt.legend()

# Display the plot
plt.show()