In [None]:
import numpy as np
import glob2
import datetime
from pathlib import Path
from tqdm.notebook import tqdm
import pickle
from matplotlib import pyplot as plt
from utils.detection.association_geodesic import squarize
import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.ticker as mticker

# Global figure styling: matplotlib's "classic" theme, text rendered through
# LaTeX, serif typeface, 10 pt body text and 8 pt tick/legend labels.
plt.style.use('classic')
_LATEX_TEXT_SETTINGS = {
    "pgf.texsystem": "pdflatex",
    "text.usetex": True,
    "font.family": "serif",
}
_FONT_SIZE_SETTINGS = {
    "font.size": 10,
    "axes.titlesize": 10,
    "axes.labelsize": 10,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "legend.fontsize": 8,
}
plt.rcParams.update({**_LATEX_TEXT_SETTINGS, **_FONT_SIZE_SETTINGS})
from matplotlib import rc
# Select Computer Modern as the serif face and re-assert LaTeX rendering.
rc('font', family='serif', serif=['Computer Modern'])
rc('text', usetex=True)
import math
from numpy.linalg import LinAlgError
import pandas as pd
import cartopy.feature as cfeature
import matplotlib.patches as mpatches
import cartopy.crs as ccrs
from collections import Counter
from utils.physics.geodesic.distance import distance_point_point
from utils.data_reading.sound_data.station import StationsCatalog
from utils.physics.sound_model.spherical_sound_model import GridSphericalSoundModel as GridSoundModel, MonthlyHomogeneousSphericalSoundModel as HomogeneousSoundModel
from utils.detection.association_geodesic import compute_candidates, update_valid_grid, update_results, load_detections, compute_grids

In [None]:
from scipy.interpolate import RegularGridInterpolator

# paths
CATALOG_PATH = "/media/plerolland/akoustik/MAHY"
i_DETECTIONS_DIR = "../../../../data/detection/i_TiSSNet_raw_OBS-fixed"

# sound model definition
# Station catalog restricted by the two filters below (presumably: stations
# lacking dates or a position are dropped — confirm against StationsCatalog).
STATIONS = StationsCatalog(CATALOG_PATH).filter_out_undated().filter_out_unlocated()

LAT_BOUNDS = [-13.4, -12.4]
LON_BOUNDS = [45.25, 46.25]
# Cached association grid. The path is derived from i_DETECTIONS_DIR and the
# bounds above so that it cannot silently drift away from them; it expands to
# exactly the file name that used to be hard-coded here.
# NOTE(review): the trailing "150_1_0.25_0.25" presumably encodes the grid
# parameters used when the cache was computed — confirm before changing them.
GRID_CACHE_PATH = (
    f"{i_DETECTIONS_DIR}/MAHY0/cache/"
    f"grids_{LAT_BOUNDS[0]}_{LAT_BOUNDS[1]}_{LON_BOUNDS[0]}_{LON_BOUNDS[1]}_150_1_0.25_0.25.pkl"
)
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were produced locally by this project.
with open(GRID_CACHE_PATH, "rb") as f:
    GRID_TO_COORDS, TDoA, MAX_TDoA, TDoA_UNCERTAINTIES = pickle.load(f)
# Array form so that grid indices can be mapped to coordinates by fancy indexing.
i_GRID_TO_COORDS = np.array(GRID_TO_COORDS)


seismic_paths = glob2.glob("../../../../data/MAHY/loc_3D/*.npz")

# Per station: (bathy - depth) / 1520, i.e. the distance between the
# instrument and the seafloor divided by 1520 (presumably a sound speed in
# m/s, giving a travel time in seconds — confirm units).
acoustic_to_s = {}
for s in STATIONS:
    station_info = s.other_kwargs
    acoustic_to_s[s] = (station_info["bathy"] - station_info["depth"]) / 1520

# One interpolator per .npz table, keyed by the number found between the last
# "_" of the path and the following "m" (e.g. "..._1500m.npz" -> 1500.0).
interp_seismic = {}
for npz_path in seismic_paths:
    last_token = npz_path.split("_")[-1]
    table_depth = float(last_token.split("m")[0])

    table = np.load(npz_path)
    propagation = table["values"]
    # Values outside [1e-6, 1e6] are treated as invalid and blanked out.
    implausible = (propagation < 10 ** -6) | (propagation > 10 ** 6)
    propagation[implausible] = np.nan

    # Both axes are multiplied by 1000 (presumably km -> m — confirm).
    grid_axes = (table["depths"] * 1_000, table["distances"] * 1_000)
    interp_seismic[table_depth] = RegularGridInterpolator(
        grid_axes,
        propagation,
        bounds_error=False,  # allow extrapolation
        fill_value=None)

available_depths = np.array(list(interp_seismic.keys()))

# Give every station the table whose depth key is closest to its bathymetry.
s_to_interp = {}
for s in STATIONS:
    closest = np.argmin(abs(available_depths - s.other_kwargs["bathy"]))
    s_to_interp[s] = interp_seismic[available_depths[closest]]

# Header-less CSV with columns (s, intercept, slope, u), indexed by "s" so a
# row can be looked up with clock_corrections.loc[<value of s>].
clock_corrections = (
    pd.read_csv(
        "../../../../data/detection/TiSSNet_Pn_raw_repicked/corrections_fixed-intercept.csv",
        header=None,
        names=["s", "intercept", "slope", "u"],
    )
    .set_index('s')
)

In [None]:
MIN_ASSOCIATION_SIZE = 3
# NOTE: STATIONS is rebound to a dict at the end of this cell, so the cell
# that builds the station catalog must be re-run before re-running this one.
datasets = {s.dataset for s in STATIONS}
i_new_stations = {}  # dataset -> list of stations, positions matching s.idx
i_idx_to_det = {}    # dataset -> {detection id -> detection row}
# sorted(): set iteration order for strings changes between kernels, which
# would make the insertion order of the two dicts above non-reproducible.
for dataset in sorted(datasets):
    detection_files = sorted(glob2.glob(f"{i_DETECTIONS_DIR}/{dataset}/cache/detections*.pkl"))
    if not detection_files:
        raise FileNotFoundError(
            f"no detections*.pkl cache found in {i_DETECTIONS_DIR}/{dataset}/cache")
    # When several caches match, always take the first in lexicographic order
    # instead of whichever one the filesystem happens to list first.
    dets = detection_files[0]
    # NOTE: pickle.load can execute arbitrary code; only load trusted caches.
    with open(dets, "rb") as f:
        i_DETECTIONS = pickle.load(f)
    i_idx_det = 0  # running detection id, unique within the dataset
    i_idx_to_det_local = {}
    for idx, s in enumerate(i_DETECTIONS.keys()):
        s.idx = idx  # indexes to store efficiently the associations
        i_DETECTIONS[s] = list(i_DETECTIONS[s])
        for i in range(len(i_DETECTIONS[s])):
            # Append the detection id as the last element of the row and
            # remember the row under that id.
            i_DETECTIONS[s][i] = np.concatenate((i_DETECTIONS[s][i], [i_idx_det]))
            i_idx_to_det_local[i_idx_det] = i_DETECTIONS[s][i]
            i_idx_det += 1
        i_DETECTIONS[s] = np.array(i_DETECTIONS[s])
    i_new_stations[dataset] = list(i_DETECTIONS.keys())
    i_idx_to_det[dataset] = i_idx_to_det_local

# From here on STATIONS maps dataset -> list of the stations found in that
# dataset's detection cache (indexable by s.idx).
STATIONS = i_new_stations

In [None]:
# Display the dataset names collected from the stations' `dataset` attribute.
datasets

In [None]:
# Half-width of the waveform window read around each clock-corrected pick.
DELTA = datetime.timedelta(seconds=5)

# One dict per association: latitude, longitude, date, dates_orig,
# n_stations, SL (in that key order).
catalog = []
# sorted(): neither set iteration nor glob results have a stable order, so
# without it the row order of the catalog differs from one run to the next.
for dataset in sorted(datasets):
    association_files = sorted(glob2.glob(f"{i_DETECTIONS_DIR}/{dataset}/cache/associations_3_*.pkl"))
    for file in association_files:
        # NOTE: pickle.load can execute arbitrary code; only load trusted caches.
        with open(file, "rb") as f:
            associations = pickle.load(f)
        for detections, valid_points in tqdm(associations):
            # Source position = mean of the coordinates of the grid cells kept
            # for this association; skipped when that mean is NaN.
            valid_coords = i_GRID_TO_COORDS[valid_points.astype(np.int32)]
            pos = np.nanmean(valid_coords, axis=0)
            if np.any(np.isnan(pos)):
                continue

            dates_reception = []  # pick times as stored in the detections
            dates = []            # pick times minus dist / 1500
            rms = []              # log10(dist * max(signal**2)) per station
            for si, di in detections:
                s = STATIONS[dataset][si]
                if "43" in s.name:
                    # Data path override for stations whose name contains "43".
                    s.path = "/media/plerolland/akoustik/MAHY/MAHY4_fixed/MAHY43"

                s.other_kwargs["raw"] = True
                date = i_idx_to_det[dataset][di][0]
                dates_reception.append(date)

                dist = distance_point_point([pos[0],pos[1]], s.get_pos())

                # 1500 is presumably the sound speed in m/s, dist being in
                # metres — confirm against distance_point_point.
                dates.append(date - datetime.timedelta(seconds=(dist / 1500)))

                # Clock correction: intercept + slope * seconds elapsed since
                # the station's date_start * 1e-6, applied as seconds.
                correction = clock_corrections.loc[s.name]
                c = correction["intercept"] + correction["slope"] * (date-s.date_start).total_seconds() * 10**-6
                c = datetime.timedelta(seconds=c)
                data = s.get_manager().get_segment(date+c-DELTA,date+c+DELTA)
                rms.append(np.log10(dist) + np.log10(np.max(data**2)))

            # Origin time = mean of the per-station back-propagated times.
            date = dates[0] + np.mean(np.array(dates) - dates[0])
            SL = 10*np.log10(np.nanmean(10**np.array(rms)))

            # Appended only once every field is known, so an error or an
            # interruption above can never leave a half-filled row behind.
            catalog.append({
                "latitude": pos[0],
                "longitude": pos[1],
                "date": date,
                "dates_orig": dates_reception,
                "n_stations": len(detections),
                "SL": SL,
            })

In [None]:
# Raw catalog, saved before any filtering.
whole_df = pd.DataFrame(catalog)
whole_df.to_pickle("save_with-subsets.pkl")
print(len(whole_df))

# remove associations that are subsets of other associations
# Each association is reduced to the set of its original pick times; every
# association with exactly 4 picks then discards any other association whose
# picks are all contained in its own.
sets_dates = whole_df["dates_orig"].apply(set).tolist()
sizes = np.array([len(s) for s in sets_dates])
keep = np.ones(len(whole_df), dtype=bool)
idx_4 = np.where(sizes == 4)[0]
for i in tqdm(idx_4):
    if not keep[i]:
        # Already discarded as a copy of an earlier 4-pick association. If it
        # were still used as a reference it would in turn discard that earlier
        # copy (equal sets are subsets of each other) and both would be lost.
        continue
    big_set = sets_dates[i]
    for j in range(len(whole_df)):
        if i == j or not keep[j]:
            continue
        small_set = sets_dates[j]
        if small_set.issubset(big_set):
            keep[j] = False
clean_df = whole_df[keep].reset_index(drop=True)

print(len(clean_df))

clean_df.to_pickle("../../../../data/MAHY/loc_3D/i_association_catalog_clean_df.pkl")

# Decimated catalog: chronological order, without any association that
# shares at least one original pick time with another association.
df = clean_df.sort_values("date")

all_dates = []
for picks in df["dates_orig"]:
    all_dates.extend(picks)
date_counts = Counter(all_dates)
duplicated_dates = set()
for pick, occurrences in date_counts.items():
    if occurrences > 1:
        duplicated_dates.add(pick)

# True for rows having at least one pick that occurs more than once overall.
uses_shared_pick = df["dates_orig"].apply(
    lambda lst: not duplicated_dates.isdisjoint(lst))
to_drop = df[uses_shared_pick].index
clean_df = df.drop(index=to_drop).reset_index(drop=True)

clean_df.to_pickle("../../../../data/MAHY/loc_3D/i_association_catalog_clean_df_decimated.pkl")
print(len(clean_df))

In [None]:
def _rows_in_box(frame, lat_min, lat_max, lon_min, lon_max):
    """Return the rows of ``frame`` strictly inside the given latitude /
    longitude box (bounds excluded), sorted by their "date" column."""
    lat, lon = frame["latitude"], frame["longitude"]
    inside = (lat > lat_min) & (lat < lat_max) & (lon > lon_min) & (lon < lon_max)
    return frame[inside].sort_values("date")


# Full catalog (subset associations already removed), all exported columns.
df = pd.read_pickle("../../../../data/MAHY/loc_3D/i_association_catalog_clean_df.pkl")
df.to_csv(
    "../../../../data/MAHY/loc_3D/i_association_catalog_full.csv",
    index=False,
    columns=["date","latitude","longitude","SL","n_stations", "dates_orig"],
)

# Regional extracts: same four columns, floats written with 4 decimals.
_REGION_COLUMNS = ["date","latitude","longitude","SL"]

df_N = _rows_in_box(df, -12.62, -12.49, 45.49, 45.6)
df_N.to_csv(
    "../../../../data/MAHY/loc_3D/i_association_catalog_North.csv",
    index=False,
    columns=_REGION_COLUMNS,
    float_format="%.4f",
)

df_C = _rows_in_box(df, -12.9, -12.84, 45.62, 45.72)
df_C.to_csv(
    "../../../../data/MAHY/loc_3D/i_association_catalog_Center.csv",
    index=False,
    columns=_REGION_COLUMNS,
    float_format="%.4f",
)

In [None]:
# Rank plot: rows are sorted by ascending "mb" and plotted against log10 of
# their descending rank (len(df) for the smallest value, 1 for the largest),
# i.e. log10 of the number of rows whose "mb" is at least that value.
# NOTE(review): the catalog rows built in this notebook only carry the keys
# latitude, longitude, date, dates_orig, n_stations and SL, so a frame read
# from i_association_catalog_clean_df.pkl presumably has no "mb" column and
# this cell would raise KeyError on a fresh run — confirm which frame/column
# was intended here.
df = df.sort_values("mb")
nb = np.log10(np.arange(1,1+len(df)))[::-1]
plt.scatter(nb,df["mb"])