In [None]:
import numpy as np
import glob2
import datetime
from pathlib import Path
from tqdm.notebook import tqdm
import pickle
from matplotlib import pyplot as plt
from utils.detection.association_geodesic import squarize
import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
from collections import Counter

# Global matplotlib configuration: "classic" style, LaTeX-rendered text
# (pdflatex for the pgf backend), serif fonts, and explicit font sizes for
# titles, axis labels, tick labels and legends.
plt.style.use('classic')
plt.rcParams.update({
    "pgf.texsystem": "pdflatex",
    "text.usetex": True,
    "font.family": "serif",
    "font.size": 10,
    "axes.titlesize": 10,
    "axes.labelsize": 10,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "legend.fontsize": 8,
})
from matplotlib import rc
# Select Computer Modern as the serif face. The usetex call repeats the
# "text.usetex" value already set through rcParams above.
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)
import math
from numpy.linalg import LinAlgError
import pandas as pd
import cartopy.feature as cfeature
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
from collections import Counter
from utils.physics.geodesic.distance import distance_point_point
from utils.data_reading.sound_data.station import StationsCatalog
from utils.physics.sound_model.spherical_sound_model import GridSphericalSoundModel as GridSoundModel, MonthlyHomogeneousSphericalSoundModel as HomogeneousSoundModel
from utils.detection.association_geodesic import compute_candidates, update_valid_grid, update_results, load_detections, compute_grids

In [None]:
from scipy.interpolate import RegularGridInterpolator

# Paths. CATALOG_PATH is an absolute, machine-specific location; the detection
# directory is relative to the notebook's working directory.
CATALOG_PATH = "/media/plerolland/akoustik/MAHY"
Pn_DETECTIONS_DIR = f"../../../../data/detection/TiSSNet_Pn_raw_OBS-fixed"

# Station catalog, after applying the catalog's filter_out_undated() and
# filter_out_unlocated() filters.
STATIONS = StationsCatalog(CATALOG_PATH).filter_out_undated().filter_out_unlocated()

# Latitude / longitude bounds; the same values appear in the cache file names below.
LAT_BOUNDS = [-13.4, -12.4]
LON_BOUNDS = [45.25, 46.25]
# Cached grid tuple (grid coordinates, TDoA, max TDoA, TDoA uncertainties).
# NOTE(review): the trailing numbers in the file name look like grid parameters - confirm.
with open("../../../../data/detection/i_TiSSNet_raw_OBS-fixed/MAHY0/cache/grids_-13.4_-12.4_45.25_46.25_150_1_0.25_0.25.pkl", "rb") as f:
    GRID_TO_COORDS, TDoA, MAX_TDoA, TDoA_UNCERTAINTIES = pickle.load(f)
GRID_TO_COORDS = np.array(GRID_TO_COORDS)
# Bounds of the Pn grid: three (min, max) pairs.
# NOTE(review): presumably (depth in m, latitude, longitude), matching how rows of
# Pn_GRID_TO_COORDS are read as (depth, latitude, longitude) later - confirm.
Pn_BOUNDS = [(1_000,100_000), (-13.4,-12.4), (45.25,46.25)]
# Cached Pn grid tuple; it carries three extra members (latitudes, depths,
# travel times) compared with the grid loaded above.
with open("../../../../data/detection/TiSSNet_Pn_raw_OBS-fixed/MAHY0/cache/grids_1000_100000_-13.4_-12.4_45.25_46.25_100_100_1_0.1.pkl", "rb") as f:
    Pn_GRID_TO_COORDS, Pn_TDoA, Pn_MAX_TDoA, Pn_TDoA_UNCERTAINTIES, Pn_LATS, Pn_DEPTHS, Pn_TRAVEL_TIMES = pickle.load(f)
# Converted to an array so it can be indexed with an array of grid indices.
Pn_GRID_TO_COORDS = np.array(Pn_GRID_TO_COORDS)


# Travel-time helpers used when back-propagating detection dates to the source.
#   acoustic_to_s : station -> (bathy - depth) / 1520, i.e. the time spent in the
#                   water column between the seafloor and the hydrophone.
#                   NOTE(review): 1520 is presumably a sound speed in m/s - confirm.
#   interp_seismic: depth parsed from the file name -> interpolator over the
#                   (depths, distances) table stored in that .npz file.
#   s_to_interp   : station -> interpolator whose file depth is closest to the
#                   station's bathymetry.
seismic_paths = glob2.glob("../../../../data/MAHY/loc_3D/*.npz")
if not seismic_paths:
    # Without this check the failure only shows up below as an argmin error on
    # an empty array, which does not point at the missing files.
    raise FileNotFoundError("no seismic propagation table (*.npz) found in ../../../../data/MAHY/loc_3D")

acoustic_to_s = {}
for s in STATIONS:
    depth, bathy = s.other_kwargs["depth"], s.other_kwargs["bathy"]
    under_hydro = bathy - depth
    acoustic_to_s[s] = under_hydro / 1520

interp_seismic = {}
for seismic_propa_path in seismic_paths:
    # The file name is expected to end with "_<depth>m...".
    depth = float(seismic_propa_path.split("_")[-1].split("m")[0])
    # np.load returns an NpzFile that keeps the archive open; the context manager
    # closes it once the arrays have been read into memory.
    with np.load(seismic_propa_path) as data:
        seismic_propagations = data["values"]
        # NOTE(review): the factor 1_000 presumably converts km to m - confirm.
        seismic_depths = data["depths"] * 1_000
        seismic_distances = data["distances"] * 1_000
    # Values outside [1e-6, 1e6] are treated as invalid and masked with NaN.
    seismic_propagations[(seismic_propagations < 10 ** -6) | (seismic_propagations > 10 ** 6)] = np.nan
    interp_seismic[depth] = RegularGridInterpolator(
        (seismic_depths, seismic_distances),
        seismic_propagations,
        bounds_error=False,  # allow extrapolation
        fill_value=None)

available_depths = np.array(list(interp_seismic.keys()))
s_to_interp = {s: interp_seismic[available_depths[np.argmin(abs(available_depths - s.other_kwargs["bathy"]))]]
               for s in STATIONS}

# Per-station clock-correction coefficients, indexed by the first CSV column ("s").
# The file has no header row, so the column names are supplied here.
clock_corrections = pd.read_csv(
    "../../../../data/detection/TiSSNet_Pn_raw_repicked/corrections_fixed-intercept.csv",
    header=None,
    names=["s", "intercept", "slope", "u"],
).set_index("s")

In [None]:
# Load the cached Pn detections of every dataset and give each detection a
# dataset-wide integer id, appended as the last element of its row.
#   Pn_new_stations[dataset] : list of station objects (keys of the detection dict);
#                              each station also receives its position as `s.idx`.
#   Pn_idx_to_det[dataset]   : detection id -> detection row (with the id appended).
MIN_ASSOCIATION_SIZE = 3

# STATIONS is rebound to a dict {dataset: [stations]} at the end of this cell.
# Accept both forms so the cell can be executed again without re-running the
# cell that builds the station catalog.
if isinstance(STATIONS, dict):
    datasets = set(STATIONS)
else:
    datasets = {s.dataset for s in STATIONS}

Pn_new_stations = {}
Pn_idx_to_det = {}
for dataset in datasets:
    detection_files = glob2.glob(f"{Pn_DETECTIONS_DIR}/{dataset}/cache/detections*.pkl")
    if not detection_files:
        raise FileNotFoundError(f"no detections*.pkl cache found in {Pn_DETECTIONS_DIR}/{dataset}/cache")
    dets = detection_files[0]
    with open(dets, "rb") as f:
        Pn_DETECTIONS = pickle.load(f)

    Pn_idx_det = 0  # ids restart at 0 for every dataset
    Pn_idx_to_det_local = {}
    for idx, s in enumerate(Pn_DETECTIONS.keys()):
        s.idx = idx  # indexes to store efficiently the associations
        indexed_detections = []
        for detection in Pn_DETECTIONS[s]:
            detection = np.concatenate((detection, [Pn_idx_det]))
            Pn_idx_to_det_local[Pn_idx_det] = detection
            indexed_detections.append(detection)
            Pn_idx_det += 1
        Pn_DETECTIONS[s] = np.array(indexed_detections)

    Pn_new_stations[dataset] = list(Pn_DETECTIONS.keys())
    Pn_idx_to_det[dataset] = Pn_idx_to_det_local

# From here on STATIONS maps a dataset name to its list of stations.
STATIONS = Pn_new_stations

In [None]:
# Build the event catalog from the cached associations.
# For every association:
#   * the source position is the NaN-aware mean of its valid grid points,
#     read as (depth, latitude, longitude);
#   * every detection is back-propagated to an origin time using the seismic
#     travel-time table plus the in-water acoustic leg;
#   * a received level is measured on a DELTA-long window that starts at the
#     clock-corrected detection date.

# Length of the signal window used to measure the received level.
DELTA = datetime.timedelta(seconds=30)


catalog = []
for dataset in datasets:
    association_files = glob2.glob(f"{Pn_DETECTIONS_DIR}/{dataset}/cache/associations_{1}_{MIN_ASSOCIATION_SIZE}_*.pkl")
    for file in association_files:
        with open(file, "rb") as f:
            associations = pickle.load(f)
        for detections, valid_points in tqdm(associations):
            valid_coords = Pn_GRID_TO_COORDS[valid_points[:,0].astype(np.int32)]
            pos = np.nanmean(valid_coords, axis=0)
            if np.any(np.isnan(pos)):
                continue
            # The record is appended first and completed after the station loop;
            # key order fixes the column order of the DataFrame built from `catalog`.
            event = {"depth":pos[0],"latitude":pos[1], "longitude":pos[2]}
            catalog.append(event)

            dates, dates_reception = [], []
            d_h, d_v_hydro, d_v_seismo, RL = [], [], [], []
            for si, di in detections:
                s = STATIONS[dataset][si]
                if "43" in s.name:
                    # Stations whose name contains "43" are read from this fixed location.
                    s.path = "/media/plerolland/akoustik/MAHY/MAHY4_fixed/MAHY43"

                s.other_kwargs["raw"] = True
                date = Pn_idx_to_det[dataset][di][0]

                hydro_dep, local_bathy = s.other_kwargs["depth"], s.other_kwargs["bathy"]
                d_h.append(distance_point_point([pos[1],pos[2]], s.get_pos()))
                d_v_hydro.append(local_bathy-hydro_dep)
                d_v_seismo.append(pos[0] - local_bathy)

                dates_reception.append(date)
                # NOTE(review): this lookup relies on the unpickled station comparing/hashing
                # equal to the catalog station used as key - confirm.
                # NOTE(review): the travel-time tables contain NaN for masked cells; a NaN
                # here makes datetime.timedelta raise ValueError.
                seismic_travel_path = s_to_interp[s]([[pos[0], d_h[-1]]])[0]
                dates.append(date - datetime.timedelta(seconds=(seismic_travel_path + acoustic_to_s[s])))

                # Clock correction: intercept + slope * elapsed seconds since the station
                # start * 1e-6. The row is looked up once per detection.
                correction = clock_corrections.loc[s.name]
                c = correction["intercept"] + correction["slope"] * (date-s.date_start).total_seconds() * 10**-6
                c = datetime.timedelta(seconds=c)
                data = s.get_manager().get_segment(date+c,date+c+DELTA)
                # Square in float64: if the segment holds integer samples, squaring in the
                # native dtype can overflow silently before the mean is taken.
                samples = np.asarray(data, dtype=np.float64)
                RL.append(np.log10(np.mean(samples**2)))

            # Origin time of the event: mean of the per-station origin-time estimates.
            date = dates[0] + np.mean(np.array(dates) - dates[0])
            event["d_v_seismo"] = d_v_seismo
            event["d_v_hydro"] = d_v_hydro
            event["d_h"] = d_h
            event["RL"] = RL
            event["dates"] = dates
            event["dates_orig"] = dates_reception
            event["date"] = date
            event["n_stations"] = len(detections)

            #catalog[-1]["mb"] = -141.3/14.8 + SL / 14.8

In [None]:
# Save the unfiltered catalog, then drop every association whose set of
# reception dates is contained in the set of a 4-detection association.
whole_df = pd.DataFrame(catalog)
whole_df.to_pickle("save_with-subsets.pkl")
print(len(whole_df))

# remove associations that are subsets of other associations
sets_dates = whole_df["dates_orig"].apply(set).tolist()
sizes = np.array([len(s) for s in sets_dates])
keep = np.ones(len(whole_df), dtype=bool)
idx_4 = np.where(sizes == 4)[0]
for i in tqdm(idx_4):
    if not keep[i]:
        # A 4-detection association can only have been discarded because it is
        # identical to an earlier one. Skipping it keeps that earlier copy;
        # otherwise two identical associations would discard each other and the
        # event would vanish from the catalog.
        continue
    big_set = sets_dates[i]
    for j in range(len(whole_df)):
        if i == j or not keep[j]:
            continue
        if sets_dates[j].issubset(big_set):
            keep[j] = False
clean_df = whole_df[keep].reset_index(drop=True)


clean_df.to_pickle("save.pkl")
print(len(clean_df))

In [None]:
# Restrict the catalog to the study box, then discard every association that
# shares at least one reception date with another entry.
df = pd.read_pickle("save.pkl")
inside_box = (
    (df["latitude"] > -13)
    & (df["latitude"] < -12.6)
    & (df["longitude"] > 45)
    & (df["longitude"] < 45.7)
)
df = df[inside_box].sort_values("date")

# Count how many times each reception date appears over the whole catalog.
all_dates = []
for reception_dates in df["dates_orig"]:
    all_dates.extend(reception_dates)
date_counts = Counter(all_dates)
duplicated_dates = {date for date, count in date_counts.items() if count > 1}

# Rows containing any date seen more than once are removed.
to_drop = df[df["dates_orig"].apply(lambda lst: any(date in duplicated_dates for date in lst))].index
df_clean = df.drop(index=to_drop).reset_index(drop=True)

df_clean.to_pickle("filtered_decimated.pkl")
print(len(df), len(df_clean))

In [None]:
# Derive per-event source level (SL) and magnitude (mb) from the per-station
# received levels, then export the catalog as CSV.
df = pd.read_pickle("filtered_decimated.pkl")


def _source_station_distance(event):
    """Per-station straight-line distance from the vertical and horizontal offsets."""
    vertical = np.array(event["d_v_seismo"])
    horizontal = np.array(event["d_h"])
    return np.sqrt(vertical**2 + horizontal**2)


def _distance_corrected_level(event):
    """Per-station level: 10 * RL plus a log-distance term.

    NOTE(review): 17.201 looks like an empirical spreading coefficient - document its source.
    """
    received = np.array(event["RL"])
    distances = np.array(event["d_seismic"])
    return 10 * received + 17.201 * np.log10(distances)


df["d_seismic"] = df.apply(_source_station_distance, axis=1)
df["RL_corrected"] = df.apply(_distance_corrected_level, axis=1)

# Average the station levels in the linear domain (ignoring NaN), then go back to dB.
linear_levels = 10**(df["RL_corrected"]/10)
df["SL"] = 10*(linear_levels.apply(np.nanmean).apply(np.log10))
# NOTE(review): -178 and 14.2 look like an empirical SL-to-mb calibration - document its source.
df["mb"] = -178/14.2 + df["SL"] / 14.2
# NOTE(review): presumably converts depth from m to km - confirm.
df["depth"] /= 1000

full_columns = ["date","latitude","longitude","depth","mb","n_stations"]
df.to_csv("../../../../data/MAHY/loc_3D/P_association_catalog_full.csv", index=False, columns=full_columns)

print(len(df))

# NOTE(review): the same bounding box is applied before "filtered_decimated.pkl"
# is written, so this filter may not remove anything - confirm what the "full"
# export is meant to contain.
inside_box = (
    (df["latitude"] > -13)
    & (df["latitude"] < -12.6)
    & (df["longitude"] > 45)
    & (df["longitude"] < 45.7)
)
df = df[inside_box].sort_values("date")
#df['mb'] = df['mb'].map(lambda x: f"{x:.1f}")
box_columns = ["date","latitude","longitude","depth","mb","SL"]
df.to_csv("../../../../data/MAHY/loc_3D/P_association_catalog.csv", index=False, columns=box_columns, float_format="%.4f")
print(len(df))

In [None]:
# Quick inspection: the per-station distances of the row labelled 0 (label-based
# lookup on the Series index), then the magnitude column as the cell output.
print(df["d_seismic"][0])
df["mb"]

In [None]:
# Quick inspection: per-event source levels as the cell output.
df["SL"]

In [None]:
# Cumulative magnitude distribution: events are sorted by increasing mb, so the
# k-th event (0-based) is paired with log10(n - k), i.e. the log-count of events
# whose magnitude is at least its own.
df = df.sort_values("mb")
nb = np.log10(np.arange(1,1+len(df)))[::-1]
plt.scatter(nb,df["mb"])
# Axis labels so the figure can be read on its own (text is rendered through LaTeX).
plt.xlabel(r"$\log_{10} N$ (events with magnitude $\geq m_b$)")
plt.ylabel(r"$m_b$")
plt.show()