In [None]:
import os
import pandas as pd

from datetime import timedelta
import geopandas as gpd
import pandas as pd
from shapely.geometry import shape, Point
from pystac_client import Client

In [None]:


# Build one combined AIS table from every raw parquet file in `input_dir`
# and persist it to `output_path`, which is the file the next cell reloads.
input_dir = "data/gulf_jan"
output_path = "data/gulf_jan/combined_gulf_jan.parquet"

# Derived files that may sit next to the raw inputs and must never be
# re-ingested: the combined table itself (it is written into `input_dir`, so a
# re-run would otherwise fold the previous result back in and duplicate every
# row) and the AIS/Sentinel-2 intersection export.
derived_names = {os.path.basename(output_path), "jan_gulf_intersections.parquet"}

# Get list of raw parquet files. `os.listdir` returns entries in arbitrary
# order, so sort them to keep the combined row order reproducible.
files = sorted(
    f for f in os.listdir(input_dir)
    if f.endswith(".parquet") and f not in derived_names
)
if not files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No raw parquet files found in {input_dir}")

# Load and concatenate
dfs = []
for f in files:
    full_path = os.path.join(input_dir, f)
    print(f"Loading {full_path}")
    dfs.append(pd.read_parquet(full_path))

combined_df = pd.concat(dfs, ignore_index=True)
print(f"Combined dataframe shape: {combined_df.shape}")

# Write the result so the notebook survives Restart & Run All: the following
# cell reads this exact path.
combined_df.to_parquet(output_path, index=False)


In [None]:
# Read the combined AIS parquet file from disk into `combined_df`.
combined_parquet_path = 'data/gulf_jan/combined_gulf_jan.parquet'
combined_df = pd.read_parquet(combined_parquet_path)

In [None]:


# Sentinel-2 metadata query (Jan 2023 over Gulf): L2A scenes intersecting the
# bounding box whose reported cloud cover is below 80.
api = Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")
results = api.search(
    collections=["sentinel-2-l2a"],
    bbox=[-97, 21, -81, 31],
    datetime="2023-01-01/2023-01-31",
    query={"eo:cloud_cover": {"lt": 80}}
)

# One (footprint polygon, acquisition time) pair per scene.
# `items()` is the supported replacement for the deprecated `get_all_items()`.
# Scenes lacking a footprint or a single timestamp cannot be matched below, so
# they are skipped rather than crashing `shape(None)` / the time arithmetic.
sentinel_passes = [
    (shape(item.geometry), item.datetime)
    for item in results.items()
    if item.geometry is not None and item.datetime is not None
]
sentinel_gdf = gpd.GeoDataFrame(
    {'datetime': [dt for _, dt in sentinel_passes]},
    geometry=[poly for poly, _ in sentinel_passes],
    crs="EPSG:4326"
)

# Convert combined AIS dataframe: parse timestamps as timezone-aware UTC so
# they compare cleanly with the scene datetimes, and build point geometries
# from the LON/LAT columns.
combined_df["BaseDateTime"] = pd.to_datetime(combined_df["BaseDateTime"], utc=True)
ais_gdf = gpd.GeoDataFrame(
    combined_df,
    geometry=gpd.points_from_xy(combined_df["LON"], combined_df["LAT"]),
    crs="EPSG:4326"
)

# Match AIS ↔ S2 by time and space: an AIS row matches a scene when it was
# reported within ±TIME_WINDOW of the acquisition and its point intersects
# the scene footprint.
TIME_WINDOW = timedelta(minutes=30)
matching_rows = []

for poly, s2_time in zip(sentinel_gdf.geometry, sentinel_gdf["datetime"]):
    # `between` is inclusive on both ends, same as the explicit >= / <= pair.
    time_mask = ais_gdf["BaseDateTime"].between(
        s2_time - TIME_WINDOW, s2_time + TIME_WINDOW
    )
    candidate_ais = ais_gdf[time_mask]
    if candidate_ais.empty:
        # Nothing reported in this window; skip the geometry test entirely.
        continue

    intersecting = candidate_ais[candidate_ais.geometry.intersects(poly)]
    if not intersecting.empty:
        matching_rows.append(intersecting)

# Final output
if matching_rows:
    all_matches = pd.concat(matching_rows)
    # Scene footprints overlap, so one AIS row can match several scenes. No
    # scene identifier is kept, which makes those repeats pure duplicates that
    # would inflate the sample count; keep the first occurrence only. The
    # index-based de-duplication is valid only when AIS index labels are
    # unique, hence the guard.
    if ais_gdf.index.is_unique:
        all_matches = all_matches[~all_matches.index.duplicated(keep="first")]
    intersected_ais_df = all_matches.reset_index(drop=True)
else:
    # pd.concat([]) raises ValueError; return an empty frame with the same
    # columns so downstream cells still work when nothing matched.
    intersected_ais_df = ais_gdf.iloc[0:0].copy()
print(f"Intersected AIS samples: {len(intersected_ais_df)}")


In [None]:
# Persist the matched AIS rows as Parquet. The file must carry a `.parquet`
# extension: writing Parquet bytes to an `.ipynb` path produces a file that
# notebook tooling cannot open and could overwrite a notebook of that name.
# NOTE: this directory is also where raw inputs are listed with a `.parquet`
# filter, so keep this file out of that listing when rebuilding the combined
# table.
intersected_ais_df.to_parquet('data/gulf_jan/jan_gulf_intersections.parquet', index=False)