In [None]:
import os
import sys
from types import SimpleNamespace

import geopandas as gpd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from geoalchemy2 import WKTElement
import requests
import movingpandas as mpd
import shapely
from shapely import frechet_distance
import math
from tqdm import tqdm
from IPython.display import clear_output

# Make a local checkout of cerulean-cloud importable and load environment
# variables from a local .env file (presumably including ASA_DOWNLOAD_PATH,
# which the download helpers below read via os.getenv -- TODO confirm).
# NOTE(review): both paths are absolute and machine-specific; they must be
# adjusted before this notebook can run on another machine.
sys.path.append(r"C:\Users\ebeva\SkyTruth\git\cerulean-cloud")
load_dotenv(r"C:\Users\ebeva\.env")

In [None]:
from cerulean_cloud.cloud_function_asa.utils.analyzer import AISAnalyzer

from cerulean_cloud.cloud_function_asa.utils.scoring import (
    compute_distance_score,
    compute_overlap_score,
    compute_temporal_score,
    compute_total_score,
)

In [None]:
def get_s1_scene(scene_id, download_path=os.getenv("ASA_DOWNLOAD_PATH")):
    """
    Load the Sentinel-1 scene record for ``scene_id``, downloading it on first use.

    The record is fetched as GeoJSON from the Cerulean API and cached as
    ``<download_path>/<scene_id>.geojson``; later calls reuse the cached file.

    Parameters:
    - scene_id (str): Sentinel-1 scene identifier used in the API query.
    - download_path (str): Directory holding the cached GeoJSON files. When it
      is None, the ASA_DOWNLOAD_PATH environment variable is read again at
      call time.

    Returns:
    - SimpleNamespace with ``scene_id``, ``scihub_ingestion_time``,
      ``start_time``, ``end_time`` (taken from the first feature of the file)
      and ``geometry`` (a WKTElement built from the first feature's geometry).

    Raises:
    - ValueError: if no download directory can be determined.
    - requests.HTTPError: if the API answers with an error status.
    """
    if download_path is None:
        # The default argument is captured when this cell is executed; look
        # again in case the environment was populated afterwards.
        download_path = os.getenv("ASA_DOWNLOAD_PATH")
    if download_path is None:
        raise ValueError(
            "download_path was not given and ASA_DOWNLOAD_PATH is not set."
        )

    url = f"https://api.cerulean.skytruth.org/collections/public.sentinel1_grd/items?scene_id={scene_id}&f=geojson"
    geojson_file_path = os.path.join(download_path, f"{scene_id}.geojson")
    if not os.path.exists(geojson_file_path):
        print(f"Downloading GeoJSON file for Scene {scene_id}...")
        # Use requests instead of shelling out to curl: no shell quoting
        # issues, and HTTP failures surface instead of being cached.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        # Write to a temporary name first so an interrupted download never
        # leaves a truncated file that later runs would treat as cached.
        partial_path = geojson_file_path + ".part"
        with open(partial_path, "wb") as partial_file:
            partial_file.write(response.content)
        os.replace(partial_path, geojson_file_path)
        print(f"Downloaded GeoJSON to {geojson_file_path}")
    else:
        print(f"GeoJSON file already exists at {geojson_file_path}. Skipping download.")

    s1_gdf = gpd.read_file(geojson_file_path)
    s1_scene = SimpleNamespace(
        scene_id=scene_id,
        scihub_ingestion_time=s1_gdf.scihub_ingestion_time.iloc[0],
        start_time=s1_gdf.start_time.iloc[0],
        end_time=s1_gdf.end_time.iloc[0],
        geometry=WKTElement(str(s1_gdf.geometry.iloc[0])),
    )
    return s1_scene

In [None]:
def download_geojson(id, download_path=os.getenv("ASA_DOWNLOAD_PATH")):
    """
    Downloads a slick GeoJSON file from the Cerulean API if it hasn't been downloaded already.

    Parameters:
    - id (int): The unique identifier for the GeoJSON item.
    - download_path (str): The directory path where the GeoJSON will be saved.
      When it is None, the ASA_DOWNLOAD_PATH environment variable is read
      again at call time.

    Returns:
    - geojson_file_path (str): The file path to the downloaded GeoJSON.

    Raises:
    - ValueError: if no download directory can be determined.
    - requests.HTTPError: if the API answers with an error status.
    """
    if download_path is None:
        # The default argument is captured when this cell is executed; look
        # again in case the environment was populated afterwards.
        download_path = os.getenv("ASA_DOWNLOAD_PATH")
    if download_path is None:
        raise ValueError(
            "download_path was not given and ASA_DOWNLOAD_PATH is not set."
        )

    url = f"https://api.cerulean.skytruth.org/collections/public.slick/items?id={id}&f=geojson"
    geojson_file_path = os.path.join(download_path, f"{id}.geojson")

    if not os.path.exists(geojson_file_path):
        print(f"Downloading GeoJSON file for ID {id}...")
        # Use requests instead of shelling out to curl: no shell quoting
        # issues, and HTTP failures surface instead of being cached.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        # Write to a temporary name first so an interrupted download never
        # leaves a truncated file that later runs would treat as cached.
        partial_path = geojson_file_path + ".part"
        with open(partial_path, "wb") as partial_file:
            partial_file.write(response.content)
        os.replace(partial_path, geojson_file_path)
        print(f"Downloaded GeoJSON to {geojson_file_path}")
    else:
        print(f"GeoJSON file already exists at {geojson_file_path}. Skipping download.")

    return geojson_file_path

In [None]:
def retrieve_scene_id_from_inactive_slick(slick_id):
    """
    Resolve the Sentinel-1 scene id that a slick was detected on.

    Follows the chain slick -> orchestrator_run -> sentinel1_grd through the
    Cerulean API: the slick GeoJSON (downloaded/cached via ``download_geojson``)
    provides the orchestrator run id, the orchestrator run provides the
    sentinel1_grd id, and that record provides the scene id.

    Parameters:
    - slick_id (int): Identifier of the slick to look up.

    Returns:
    - The ``scene_id`` value of the first matching sentinel1_grd record.

    Raises:
    - requests.HTTPError: if either API request answers with an error status.
    """

    def _fetch_items(url):
        # Fetch one API collection query and return its features as a GeoDataFrame.
        # Fail loudly on HTTP errors and never hang indefinitely on a stalled connection.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return gpd.GeoDataFrame.from_features(response.json()["features"])

    geojson_file_path = download_geojson(slick_id)
    slick_gdf = gpd.read_file(geojson_file_path)
    orch_id = slick_gdf["orchestrator_run"].values[0]

    orch_data = _fetch_items(
        f"https://api.cerulean.skytruth.org/collections/public.orchestrator_run/items?id={orch_id}"
    )
    s1_id = orch_data["sentinel1_grd"].values[0]

    s1_data = _fetch_items(
        f"https://api.cerulean.skytruth.org/collections/public.sentinel1_grd/items?id={s1_id}"
    )
    return s1_data["scene_id"].values[0]

In [None]:
def vess_infra_true_ranks(df, coll_column="collated_score"):
    """
    Rank the verified source of each slick among that slick's candidate sources.

    For every slick (rows grouped by the ``slick`` column) the candidates are
    ordered by ``coll_column`` from highest to lowest and numbered from 1. If
    exactly one candidate is flagged in ``hitl_verification``, its rank is
    recorded; slicks with zero or several flagged candidates are skipped.

    Parameters:
    - df (pd.DataFrame): Candidate sources with the columns ``slick``,
      ``hitl_verification`` (boolean), ``type`` and ``coll_column``.
    - coll_column (str): Name of the score column used for ranking.

    Returns:
    - (vessel_true_rank, infra_true_rank): two lists of ranks. A rank goes to
      the first list when the verified source has ``type == 1`` and to the
      second list otherwise.
    """
    vessel_true_rank = []
    infra_true_rank = []
    for _, group in df.groupby("slick"):
        # Build a new ranked frame instead of mutating the groupby slice in
        # place. A stable sort keeps tied scores in their original order, so
        # the resulting ranks are reproducible.
        ranked = group.sort_values(
            by=coll_column, ascending=False, kind="stable"
        ).assign(calculated_rank=list(range(1, len(group) + 1)))

        true_source = ranked[ranked["hitl_verification"]]
        if len(true_source) != 1:
            # Only slicks with exactly one verified source are evaluated.
            continue

        calc_rank = true_source["calculated_rank"].iloc[0]
        if true_source["type"].iloc[0] == 1:
            vessel_true_rank.append(calc_rank)
        else:
            infra_true_rank.append(calc_rank)
    return vessel_true_rank, infra_true_rank

In [None]:
def compute_weighted_distance_score(
    traj: mpd.Trajectory,
    curves: gpd.GeoDataFrame,
    crs_meters: str,
    ais_ref_dist: float,
):
    """
    Compute a length-weighted distance score between an AIS trajectory and slick curves.

    For every curve, each curve vertex is matched to its nearest trajectory
    point, the Frechet distance between the matched trajectory polyline and the
    curve is converted to ``exp(-distance / ais_ref_dist)``, and that value is
    weighted by the curve's ``length``. The result is the sum of the weighted
    values divided by the sum of all curve lengths.

    Args:
        traj (mpd.Trajectory): AIS trajectory; its points are treated as EPSG:4326
        curves (gpd.GeoDataFrame): oil slick curves with ``geometry`` and ``length`` columns
        crs_meters (str): projected CRS in which distances are measured
        ais_ref_dist (float): reference distance that scales the exponential decay

    Returns:
        float: length-weighted average of the per-curve scores. The division is
        not guarded, so an empty ``curves`` frame or a zero total length does
        not produce a finite score.
    """
    # The projected trajectory does not depend on the curve, so build it once.
    # Points are ordered in descending time.
    traj_gdf = (
        traj.to_point_gdf()
        .sort_values(by="timestamp", ascending=False)
        .set_crs("4326")
        .to_crs(crs_meters)
    )
    traj_line = shapely.geometry.LineString(traj_gdf.geometry)
    traj_coords = list(traj_line.coords)
    traj_xy = np.asarray(traj_coords, dtype=float)[:, :2]
    traj_start = shapely.geometry.Point(traj_coords[0])

    frechet_scores = []
    for _, slick_curve in curves.to_crs(crs_meters).iterrows():
        curve = slick_curve["geometry"]
        curve_length = slick_curve["length"]
        curve_coords = list(curve.coords)

        # Orient the curve so that it starts at the end closest to the first
        # trajectory point; the Frechet distance depends on direction.
        first_dist = shapely.geometry.Point(curve_coords[0]).distance(traj_start)
        last_dist = shapely.geometry.Point(curve_coords[-1]).distance(traj_start)
        if last_dist < first_dist:
            curve_coords = curve_coords[::-1]
            curve = shapely.geometry.LineString(curve_coords)

        # For every curve vertex find the index of the closest trajectory
        # point (planar distance, first minimum wins) in one vectorized step.
        curve_xy = np.asarray(curve_coords, dtype=float)[:, :2]
        offsets = curve_xy[:, np.newaxis, :] - traj_xy[np.newaxis, :, :]
        nearest_idx = np.hypot(offsets[..., 0], offsets[..., 1]).argmin(axis=1)

        # Frechet distance between the sampled trajectory polyline and the curve
        traj_line_clip = shapely.geometry.LineString(
            [traj_coords[k] for k in nearest_idx]
        )
        dist = frechet_distance(traj_line_clip, curve)

        frechet_scores.append(curve_length * math.exp(-dist / ais_ref_dist))

    return np.sum(frechet_scores) / np.sum(curves["length"].values)

In [None]:
class MultiSpineASA(AISAnalyzer):
    """
    AISAnalyzer variant whose distance metric can be swapped per call.

    The scoring pipeline reuses state and helpers inherited from AISAnalyzer
    (``ais_gdf``, ``ais_trajectories``, ``ais_buffered``, ``ais_weighted``,
    ``slick_curves``, ``crs_meters``, the ``w_*`` weights, ``coinc_mean`` /
    ``coinc_std`` and the ``retrieve_ais_data`` / ``slick_to_curves`` /
    ``build_trajectories`` / ``buffer_trajectories`` methods).
    NOTE(review): those members are defined outside this notebook; their exact
    semantics should be confirmed against the parent class.
    """

    def __init__(self, s1_scene, **kwargs):
        """
        Initialize the custom AISAnalyzer.

        Args:
            s1_scene: scene description forwarded unchanged to AISAnalyzer
            **kwargs: configuration forwarded unchanged to AISAnalyzer
        """
        super().__init__(s1_scene, **kwargs)

    def compute_coincidence_scores(
        self,
        slick_gdf: gpd.GeoDataFrame,
        compute_custom_distance_score=None,
        focus_st_list=None,
    ):
        """
        Associates AIS trajectories with slicks.

        Args:
            slick_gdf (gpd.GeoDataFrame): slick geometries to score against
            compute_custom_distance_score (callable, optional): distance metric
                called as ``f(trajectory, slick_curves, crs_meters, ais_ref_dist)``.
                When None, ``compute_distance_score`` is used.
            focus_st_list: currently unused; kept so existing calls keep working.

        Returns:
            An empty ``pd.DataFrame`` when no AIS data is available, otherwise
            the GeoDataFrame produced by ``score_trajectories`` with an added
            ``collated_score`` column,
            ``(coincidence_score - coinc_mean) / coinc_std``.
        """
        self.results = gpd.GeoDataFrame()

        self.slick_curves = None
        self.slick_gdf = slick_gdf

        if self.ais_gdf is None:
            self.retrieve_ais_data()
        if self.ais_gdf.empty:
            return pd.DataFrame()

        self.slick_to_curves()
        # Trajectories and buffers are only built once per analyzer instance
        # and then reused for every slick scored with it.
        if self.ais_trajectories is None:
            self.build_trajectories()
        if self.ais_buffered is None:
            self.buffer_trajectories()

        # score_trajectories resolves a None metric to the default one.
        self.score_trajectories(compute_custom_distance_score)
        self.results["collated_score"] = (
            self.results["coincidence_score"] - self.coinc_mean
        ) / self.coinc_std
        return self.results

    def score_trajectories(self, compute_custom_distance_score=None, focus_st=None):
        """
        Measure association by computing multiple metrics between AIS trajectories and slicks

        Args:
            compute_custom_distance_score (callable, optional): distance metric
                called as ``f(trajectory, slick_curves, crs_meters, ais_ref_dist)``.
                When None, ``compute_distance_score`` is used.
            focus_st: when given, only the trajectory with this id is scored.

        Returns:
            GeoDataFrame of slick associations
        """
        if compute_custom_distance_score is None:
            # Calling None would raise a TypeError; fall back to the stock metric.
            compute_custom_distance_score = compute_distance_score

        columns = [
            "st_name",
            "ext_id",
            "geometry",
            "coincidence_score",
            "type",
            "ext_name",
            "ext_shiptype",
            "flag",
            "overlap_score",
            "distance_score",
            "temporal_score",
        ]

        # Create a GeoDataFrame of buffered trajectories
        buffered_trajectories_gdf = self.ais_buffered.copy()
        buffered_trajectories_gdf["id"] = [t.id for t in self.ais_trajectories]
        buffered_trajectories_gdf.set_index("id", inplace=True)

        # Perform a spatial join between buffered trajectories and slick geometries
        matches = gpd.sjoin(
            buffered_trajectories_gdf,
            self.slick_gdf,
            how="inner",
            predicate="intersects",
        )

        if matches.empty:
            print("No trajectories intersect the slicks.")
            self.results = gpd.GeoDataFrame(columns=columns, crs="4326")
            return self.results

        # Get unique trajectory IDs that intersect slicks
        intersecting_traj_ids = matches.index.unique()

        # Filter trajectories and weights based on intersecting IDs
        ais_filt = [t for t in self.ais_trajectories if t.id in intersecting_traj_ids]
        weighted_filt = [
            self.ais_weighted[idx]
            for idx, t in enumerate(self.ais_trajectories)
            if t.id in intersecting_traj_ids
        ]
        buffered_filt = [buffered_trajectories_gdf.loc[[t.id]] for t in ais_filt]

        entries = []
        # Skip the loop if weighted_filt is empty
        if weighted_filt:
            # Create a trajectory collection from filtered trajectories
            ais_filt = mpd.TrajectoryCollection(ais_filt)

            # Iterate over filtered trajectories
            for t, w, b in zip(ais_filt, weighted_filt, buffered_filt):
                if focus_st is not None and t.id != focus_st:
                    continue
                print("COMPUTING SCORES FOR", t.id)
                # Compute temporal score
                temporal_score = compute_temporal_score(w, self.slick_gdf)

                # Compute overlap score
                overlap_score = compute_overlap_score(
                    b, self.slick_gdf, self.crs_meters
                )

                # Compute distance score between trajectory and slick curve
                distance_score = compute_custom_distance_score(
                    t, self.slick_curves, self.crs_meters, self.ais_ref_dist
                )

                # Compute total score from these three metrics
                coincidence_score = compute_total_score(
                    temporal_score,
                    overlap_score,
                    distance_score,
                    self.w_temporal,
                    self.w_overlap,
                    self.w_distance,
                )

                print(
                    f"st_name {t.id}: coincidence_score ({round(coincidence_score, 2)}) = "
                    f"({self.w_overlap} * overlap_score ({round(overlap_score, 2)}) + "
                    f"{self.w_temporal} * temporal_score ({round(temporal_score, 2)}) + "
                    f"{self.w_distance} * distance_score ({round(distance_score, 2)})) / "
                    f"({self.w_overlap + self.w_temporal + self.w_distance})"
                )

                entry = {
                    "st_name": t.id,
                    "ext_id": str(t.id),
                    "geometry": shapely.geometry.LineString(
                        [p.coords[0] for p in t.df["geometry"]]
                    ),
                    "coincidence_score": coincidence_score,
                    "type": 1,  # Vessel
                    "ext_name": t.ext_name,
                    "ext_shiptype": t.ext_shiptype,
                    "flag": t.flag,
                    "distance_score": distance_score,
                    "overlap_score": overlap_score,
                    "temporal_score": temporal_score,
                }
                entries.append(entry)

        sources = gpd.GeoDataFrame(entries, columns=columns, crs="4326")
        # Keep only non-negative scores. Copy so that columns added later
        # (e.g. collated_score) are written to an independent frame rather
        # than to a slice of `sources`.
        self.results = sources[sources["coincidence_score"] >= 0].copy()
        return self.results

In [None]:
# Load the two slick_to_source CSV dumps used by the cells below:
# `true_slick_df` from the regular dump and `false_slick_df` from the
# "False Positive" dump.
# NOTE(review): both paths are absolute and machine-specific, and `csv_path`
# is reused, so after this cell it points at the false-positive dump.
csv_path = r"C:\Users\ebeva\SkyTruth\cv3\slick_to_source dump 2024-12-31.csv"
true_slick_df = pd.read_csv(csv_path)
csv_path = r"C:\Users\ebeva\SkyTruth\cv3\asa_analysis\slick_to_source False Positive dump 2025-01-3.csv"
false_slick_df = pd.read_csv(csv_path)

In [None]:
# Number of distinct scenes present in each dump (missing scene ids are not counted).
print(true_slick_df["scene_id"].nunique())
print(false_slick_df["scene_id"].nunique())

In [None]:
# columns = [
#     'scene_id', 'slick_id', 'source_id', 'is_fp_slick', 'hitl_verification',
#     'type', 'coincidence_score', 'weighted_coin_score', 'overlap_score',
#     'temporal_score', 'weighted_dist_score', 'distance_score'
# ]

# # Create an empty DataFrame with these columns
# rerun_df = pd.DataFrame(columns=columns)

# # Display the DataFrame
# print(rerun_df)

# for (scene_id, true_group) in tqdm(true_slick_df.groupby('scene_id')):
#     if not np.any(true_group['hitl_verification'].values):
#         continue

#     print(scene_id)
#     s1_scene = get_s1_scene(scene_id)
#     asa = MultiSpineASA(s1_scene, hours_before = 8)

#     false_group = false_slick_df[false_slick_df['scene_id']==scene_id]

#     print("processing ",len(np.unique(true_group['slick'].values)), "true slicks...")
#     print("processing ",len(np.unique(false_group['slick'].values)), "false slicks...")


#     for slick_id,true_slick in true_group.groupby('slick'):
#         geojson_file_path = download_geojson(slick_id)
#         slick_gdf = gpd.read_file(geojson_file_path)
#         res = asa.compute_coincidence_scores(slick_gdf,compute_distance_score)
#         res_weighted = asa.compute_coincidence_scores(slick_gdf,compute_weighted_distance_score)

#         for i,row in true_slick.iterrows():

#             if row['type']==1:
#                 # print(row.values)
#                 sel_res = res[res['st_name']==str(row['st_name'])].iloc[0]
#                 sel_res_weighted = res_weighted[res_weighted['st_name']==str(row['st_name'])].iloc[0]
#                 coincidence_score, overlap_score, temporal_score, distance_score = sel_res['coincidence_score'], sel_res['overlap_score'], sel_res['temporal_score'], sel_res['distance_score']
#                 weighted_dist_score, weighted_coin_score = sel_res_weighted['distance_score'], sel_res_weighted['coincidence_score']
#             # else:
#             #     coincidence_score, overlap_score, temporal_score, distance_score = sel_res['coincidence_score'],0,0,0
#             #     weighted_dist_score, weighted_coin_score = 0, sel_res_weighted['coincidence_score']

#                 insert_row = {
#                     'scene_id':scene_id, 'slick_id':slick_id, 'source_id':row['source'], 'is_fp_slick':False, 'hitl_verification':row['hitl_verification'],
#                     'type':row['type'], 'coincidence_score':coincidence_score, 'weighted_coin_score':weighted_coin_score, 'overlap_score':overlap_score,
#                     'temporal_score':temporal_score, 'weighted_dist_score':weighted_dist_score, 'distance_score':distance_score
#                 }
#             # rerun_df.append(insert_row)
#             new_row = pd.DataFrame([insert_row])
#             rerun_df = pd.concat([rerun_df, new_row], ignore_index=True)
#         pass
#         # down_true = download_geojson(slick_id)
#         # slick_gdf = gpd.read_file(down_true)
#         # res = asa.compute_coincidence_scores(compute_distance_score)
#         # res_weighted = asa.compute_coincidence_scores(compute_weighted_distance_score)

#     # for slick_id,false_slick in false_group.groupby('slick'):
#     #     pass
#     #     down_false = download_geojson(slick_id)
#     #     slick_gdf = gpd.read_file(down_false)
#     #     res = asa.compute_coincidence_scores(slick_gdf,compute_distance_score)
#     #     res_weighted = asa.compute_coincidence_scores(slick_gdf,compute_weighted_distance_score)
#     #     for i,row in false_slick.iterrows():
#     #         if row['type']==1:
#     #             # print(row.values)
#     #             sel_res = res[res['st_name']==str(row['st_name'])].iloc[0]
#     #             sel_res_weighted = res_weighted[res_weighted['st_name']==str(row['st_name'])].iloc[0]
#     #             coincidence_score, overlap_score, temporal_score, distance_score = sel_res['coincidence_score'], sel_res['overlap_score'], sel_res['temporal_score'], sel_res['distance_score']
#     #             weighted_dist_score, weighted_coin_score = sel_res_weighted['distance_score'], sel_res_weighted['coincidence_score']
#     #         else:
#     #             coincidence_score, overlap_score, temporal_score, distance_score = sel_res['coincidence_score'],0,0,0
#     #             weighted_dist_score, weighted_coin_score = 0, sel_res_weighted['coincidence_score']

#     #         insert_row = {
#     #             'scene_id':scene_id, 'slick_id':slick_id, 'source_id':row['source'], 'is_fp_slick':True, 'hitl_verification':row['hitl_verification'],
#     #             'type':row['type'], 'coincidence_score':coincidence_score, 'weighted_coin_score':weighted_coin_score, 'overlap_score':overlap_score,
#     #             'temporal_score':temporal_score, 'weighted_dist_score':weighted_dist_score, 'distance_score':distance_score
#     #         }
#     #         # rerun_df.append(insert_row)
#     #         new_row = pd.DataFrame([insert_row])
#     #         rerun_df = pd.concat([rerun_df, new_row], ignore_index=True)

#     clear_output()
#     # break

In [None]:
# Time-window parameters. NUM_TIMESTEPS is the number of samples covering the
# HOURS_BEFORE window at TIMESTEPS_PER_HOUR samples per hour.
# NOTE(review): HOURS_AFTER is defined but not referenced in the visible cells.
HOURS_BEFORE = 8
HOURS_AFTER = 2
TIMESTEPS_PER_HOUR = 6
NUM_TIMESTEPS = HOURS_BEFORE * TIMESTEPS_PER_HOUR

# BUFFERING PARAMETERS FOR AIS
# BUF_VEC holds NUM_TIMESTEPS evenly spaced buffer sizes growing from
# BUF_START to BUF_END (presumably meters, matching SPREAD_RATE -- TODO confirm).
# NOTE(review): AIS_PROJECT_ID and AIS_BUFFER are defined but not referenced
# in the visible cells.
AIS_PROJECT_ID = "world-fishing-827"
AIS_BUFFER = 20000  # buffer around GRD envelope to capture AIS
SPREAD_RATE = 1000  # meters/hour perpendicular to vessel track
BUF_START = 100
BUF_END = BUF_START + SPREAD_RATE * HOURS_BEFORE
BUF_VEC = np.linspace(BUF_START, BUF_END, NUM_TIMESTEPS)

# WEIGHTING PARAMETERS FOR AIS
# WEIGHT_VEC falls linearly from WEIGHT_START to WEIGHT_END over the
# NUM_TIMESTEPS samples and is divided by NUM_TIMESTEPS.
WEIGHT_START = 2.0
WEIGHT_END = 0.0
WEIGHT_VEC = np.linspace(WEIGHT_START, WEIGHT_END, NUM_TIMESTEPS) / NUM_TIMESTEPS

# Relative weights of the three component scores, passed to the analyzer
# as w_temporal / w_overlap / w_distance.
# NOTE(review): AIS_REF_DIST is defined here but is not passed to the analyzer
# in the visible cells -- confirm whether that is intended.
W_TEMPORAL = 1.0
W_OVERLAP = 1.0
W_DISTANCE = 2.0
AIS_REF_DIST = 4000.0

In [None]:
# Re-score every scene that has at least one verified source and collect one
# row per (scene, slick, candidate trajectory) in `rerun_df`.
columns = [
    "scene_id",
    "slick_id",
    "st_name",
    "coincidence_score",
    "overlap_score",
    "temporal_score",
    "distance_score",
    "coll_score",
]

# Collect plain records and build the DataFrame once after the loop; growing
# a DataFrame with pd.concat on every row is quadratic in the number of rows.
rerun_records = []

for scene_id, true_group in tqdm(true_slick_df.groupby("scene_id")):
    # Skip scenes without any verified source
    if not np.any(true_group["hitl_verification"].values):
        continue
    print(scene_id)
    s1_scene = get_s1_scene(scene_id)
    # NOTE(review): AIS_REF_DIST is not passed here, so the analyzer's own
    # ais_ref_dist is used -- confirm whether that is intended.
    asa = MultiSpineASA(
        s1_scene,
        hours_before=HOURS_BEFORE,
        num_timesteps=NUM_TIMESTEPS,
        buf_end=BUF_END,
        buf_vec=BUF_VEC,
        weight_vec=WEIGHT_VEC,
        w_temporal=W_TEMPORAL,
        w_overlap=W_OVERLAP,
        w_distance=W_DISTANCE,
    )

    print("processing ", len(np.unique(true_group["slick"].values)), "true slicks...")

    for slick_id, _ in true_group.groupby("slick"):
        geojson_file_path = download_geojson(slick_id)
        slick_gdf = gpd.read_file(geojson_file_path)
        res = asa.compute_coincidence_scores(slick_gdf, compute_distance_score)

        # `res` has no rows when the scene has no AIS data or nothing intersects
        for _, sel_res in res.iterrows():
            rerun_records.append(
                {
                    "scene_id": scene_id,
                    "slick_id": slick_id,
                    "st_name": sel_res["st_name"],
                    "coincidence_score": sel_res["coincidence_score"],
                    "overlap_score": sel_res["overlap_score"],
                    "temporal_score": sel_res["temporal_score"],
                    "distance_score": sel_res["distance_score"],
                    "coll_score": sel_res["collated_score"],
                }
            )

    # Drop the per-scene log output so the notebook stays readable
    clear_output()
    # break

rerun_df = pd.DataFrame(rerun_records, columns=columns)

In [None]:
# Display the scores of the last slick processed by the scoring loop.
# NOTE(review): `res` is a variable left over from that loop; it is undefined
# if no scene was scored.
res

In [None]:
# Preview the first rows of the collected per-trajectory scores.
rerun_df.head()