Generate ASA outputs locally, collect the results in a dataframe, and save them to a CSV file. Use this notebook to test how ASA behaves after adjusting values in `constants.py` or changing `analyzer.py`.

In [2]:
import os
import sys
from types import SimpleNamespace
from datetime import datetime

import geopandas as gpd
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from geoalchemy2 import WKTElement
from tqdm import tqdm
from IPython.display import clear_output


load_dotenv()

True

In [3]:
# Repository locations come from the environment (populated by load_dotenv()).
git_path = os.getenv("GIT_FOLDER")
cv3_path = os.getenv("CV3_FOLDER")

# Only add real, not-yet-present entries: an unset variable would otherwise put
# None on sys.path, and re-running this cell would keep appending duplicates.
for _repo_path in (git_path, cv3_path):
    if _repo_path and _repo_path not in sys.path:
        sys.path.append(_repo_path)

In [17]:
from cerulean_cloud.cloud_function_asa.utils.analyzer import (  # noqa: E402
    ASA_MAPPING,
    InfrastructureAnalyzer,
    AISAnalyzer,
    DarkAnalyzer,
    SourceAnalyzer,
)

In [18]:
def get_s1_scene(scene_id, download_path=os.getenv("ASA_DOWNLOAD_PATH")):
    """
    Downloads a S1 scene GeoJSON file from the specified URL if it hasn't been downloaded already,
    then wraps the first feature's metadata in a lightweight scene object.

    Parameters:
    - scene_id (str): The Sentinel-1 scene identifier used in the API query and the cache file name.
    - download_path (str): The directory path where the GeoJSON will be saved.

    Returns:
    - s1_scene (SimpleNamespace): Object with `scene_id`, `scihub_ingestion_time`,
      `start_time`, `end_time` and `geometry` (a WKTElement built from the feature geometry).

    Raises:
    - ValueError: If `download_path` is None, or the GeoJSON contains no features.
    - subprocess.CalledProcessError: If curl exits with a non-zero status.
    """
    import subprocess  # local import: not part of the notebook's import cell

    if download_path is None:
        raise ValueError(
            "download_path is not set; pass it explicitly or define ASA_DOWNLOAD_PATH"
        )

    url = f"https://api.cerulean.skytruth.org/collections/public.sentinel1_grd/items?scene_id={scene_id}&f=geojson"
    geojson_file_path = os.path.join(download_path, f"{scene_id}.geojson")
    if not os.path.exists(geojson_file_path):
        print(f"Downloading GeoJSON file for Scene {scene_id}...")
        # Download to a temporary name and only move it into place on success, so a
        # failed or interrupted transfer is never mistaken for a cached file later.
        partial_path = geojson_file_path + ".part"
        try:
            # Argument list (no shell) avoids quoting problems; --fail makes curl
            # return a non-zero status on HTTP errors, which check=True raises on.
            subprocess.run(["curl", "--fail", url, "-o", partial_path], check=True)
            os.replace(partial_path, geojson_file_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print(f"Downloaded GeoJSON to {geojson_file_path}")
    else:
        print(f"GeoJSON file already exists at {geojson_file_path}. Skipping download.")

    s1_gdf = gpd.read_file(geojson_file_path)
    if s1_gdf.empty:
        raise ValueError(f"No features found in {geojson_file_path} for scene {scene_id}")

    # Only the first feature is used to describe the scene.
    s1_scene = SimpleNamespace(
        scene_id=scene_id,
        scihub_ingestion_time=s1_gdf.scihub_ingestion_time.iloc[0],
        start_time=s1_gdf.start_time.iloc[0],
        end_time=s1_gdf.end_time.iloc[0],
        geometry=WKTElement(str(s1_gdf.geometry.iloc[0])),
    )
    return s1_scene

In [19]:
def download_geojson(id, download_path=os.getenv("ASA_DOWNLOAD_PATH")):
    """
    Downloads a GeoJSON file from the specified URL if it hasn't been downloaded already.

    Parameters:
    - id (int): The unique identifier for the GeoJSON item.
    - download_path (str): The directory path where the GeoJSON will be saved.

    Returns:
    - geojson_file_path (str): The file path to the downloaded GeoJSON.

    Raises:
    - ValueError: If `download_path` is None.
    - subprocess.CalledProcessError: If curl exits with a non-zero status.
    """
    import subprocess  # local import: not part of the notebook's import cell

    if download_path is None:
        raise ValueError(
            "download_path is not set; pass it explicitly or define ASA_DOWNLOAD_PATH"
        )

    url = f"https://api.cerulean.skytruth.org/collections/public.slick/items?id={id}&f=geojson"
    geojson_file_path = os.path.join(download_path, f"{id}.geojson")

    if os.path.exists(geojson_file_path):
        print(f"GeoJSON file already exists at {geojson_file_path}. Skipping download.")
        return geojson_file_path

    print(f"Downloading GeoJSON file for ID {id}...")
    # Download to a temporary name and only move it into place on success, so a
    # failed or interrupted transfer is never mistaken for a cached file later.
    partial_path = geojson_file_path + ".part"
    try:
        # Argument list (no shell) avoids quoting problems; --fail makes curl
        # return a non-zero status on HTTP errors, which check=True raises on.
        subprocess.run(["curl", "--fail", url, "-o", partial_path], check=True)
        os.replace(partial_path, geojson_file_path)
    finally:
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded GeoJSON to {geojson_file_path}")

    return geojson_file_path

Load a dataframe of Cerulean examples to rerun ASA on. Define the column names used for accessing the S1 Scene ID and the Cerulean Slick ID. In our case, the loaded dataset uses the columns `scene_id` and `slick`.

In [None]:
# Build the path with os.path.join so the separator is correct on every OS
# (a hard-coded backslash only works on Windows).
csv_path = os.path.join(cv3_path, 'slick_to_source dump 2024-12-31.csv')
scene_slick_df = pd.read_csv(csv_path)
# Test case with only a single scene and slick; remove this line for a full run.
scene_slick_df = scene_slick_df.iloc[[0]]

# Names of the columns in the loaded CSV that hold the S1 scene ID and the slick ID.
scene_column = 'scene_id'
slick_column = 'slick'

Select an analyzer.

In [35]:
# Exactly one (analyzer class, asa_type label) pair is active at a time;
# asa_type is reused later as the prefix of the output file name.
analyzer = AISAnalyzer
asa_type = 'asa_ais'
# analyzer = InfrastructureAnalyzer; asa_type = 'asa_infra'
# analyzer = DarkAnalyzer; asa_type = 'asa_dark'

Define the columns from the analyzer outputs that will be saved in the rerun dataframe.

In [22]:
# Output schema of the rerun dataframe: the two identifiers first, followed by
# the fields copied from each analyzer result row.
columns = [
    'scene_id',
    'slick_id',
    'st_name',
    'coincidence_score',
    'overlap_score',
    'temporal_score',
    'distance_score',
    'collated_score',
]

Run ASA on the Cerulean examples loaded above

In [24]:

# Collect one plain dict per analyzer result row and build the DataFrame once
# at the end. Growing a DataFrame with pd.concat inside the loop copies all
# previous rows on every iteration (quadratic), and concatenating onto an empty
# frame emits a FutureWarning in recent pandas versions.
# If the loop is interrupted, the rows gathered so far remain in `records`.
score_columns = [col for col in columns if col not in ('scene_id', 'slick_id')]
records = []

for scene_id, scene_group in tqdm(scene_slick_df.groupby(scene_column)):
    print(scene_id)
    s1_scene = get_s1_scene(scene_id)
    # One analyzer instance per scene, reused for every slick in that scene.
    asa = analyzer(s1_scene)
    print("processing ",len(np.unique(scene_group[slick_column].values)), "true slicks...")

    for slick_id, _ in scene_group.groupby(slick_column):
        geojson_file_path = download_geojson(slick_id)
        slick_gdf = gpd.read_file(geojson_file_path)
        res = asa.compute_coincidence_scores(slick_gdf)

        for _, sel_res in res.iterrows():
            # Identifiers come from the loop variables; every other column is
            # read from the result row (None when the analyzer did not emit it).
            record = {'scene_id': scene_id, 'slick_id': slick_id}
            record.update({col: sel_res.get(col) for col in score_columns})
            records.append(record)

    # Drop the per-scene log output so the notebook does not fill up.
    clear_output()

# Passing `columns` fixes the column order and keeps the schema when no rows were produced.
rerun_df = pd.DataFrame(records, columns=columns)

100%|██████████| 1/1 [03:11<00:00, 191.23s/it]


Save resulting ASA outputs

In [25]:
# Today's local date as an ISO "YYYY-MM-DD" string; used to stamp the output file name.
current_date = datetime.now().date().isoformat()
print(current_date)

2025-02-13


In [None]:
# Results are written next to the input data.
save_folder = cv3_path
# os.path.join replaces the hard-coded "\" separator: "\{" inside a non-raw
# f-string is an invalid escape sequence (a warning today, an error in future
# Python versions) and a backslash separator only works on Windows.
save_file_name = os.path.join(save_folder, f"{asa_type}_local_run_{current_date}.csv")

In [None]:
# Persist the rerun results; with to_csv's defaults the DataFrame index is
# written as the first (unnamed) column.
rerun_df.to_csv(path_or_buf=save_file_name)