In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm


In [None]:
tomtom = pd.read_parquet("tomtom_data.parquet")
vessels = pd.read_parquet("vessels_data.parquet")
sensors_location = pd.read_csv("sensor-location.xlsx - Sheet1.csv", )
sensors = pd.read_csv("sensordata_SAIL2025.csv", parse_dates=["timestamp"])

sensors_location["Effectieve breedte"] = (
    sensors_location["Effectieve breedte"]
    .astype(str)
    .str.replace(",", ".")
    .astype(float)
)

sensors_location[["lat", "lon"]] = (
    sensors_location["Lat/Long"]
    .str.replace(" ", "")   # remove spaces
    .str.split(",", expand=True)
    .astype(float)
)

sensors_location = sensors_location.rename(columns={"Objectummer": "sensor_id"})

In [17]:
weather = pd.read_csv("SAIL_Amsterdam_10min_Weather_2025-08-20_to_2025-08-24.csv")

# Drop rows with invalid "24:00" timestamps
weather = weather[~weather["DateTime"].str.contains("24:00", na=False)]

# Convert DateTime to datetime objects
weather["DateTime"] = pd.to_datetime(weather["DateTime"], format="%Y%m%d %H:%M")

# Round timestamps to the nearest 3 minutes
weather["RoundedTime"] = weather["DateTime"].dt.round("3min")

# Drop duplicates, keeping the first entry for each 3-minute slot
three_min_weather_data = (
    weather.sort_values("DateTime")
           .drop_duplicates(subset="RoundedTime", keep="first")
           .set_index("RoundedTime")
           .drop(columns=["DateTime"])
)

# Reset index so RoundedTime becomes a column again
three_min_weather_data = three_min_weather_data.reset_index()
print(three_min_weather_data.head())

ValueError: time data "20250820 24:10" doesn't match format "%Y%m%d %H:%M", at position 132. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
#Find Flow per sensor per meter
#Divide by 3 minutes and effective width

width = sensors_location.set_index("sensor_id")["Effectieve breedte"].to_dict()

# Divide each matching visitor column by its sensor’s width
for col in sensors.columns:
    if "_" in col:  # e.g. CMSA-GAKH-01_0
        sensor_id = col.split("_")[0]
        if sensor_id in width:
            sensors[col] = sensors[col] / (3*width[sensor_id])


In [None]:
#Combine vessel data into 3 min bins, taking the average position of the vessel.
vessels["timestamp"] = pd.to_datetime(vessels["timestamp"], utc=True, errors="coerce")
vessels["timestamp"] = vessels["timestamp"].dt.floor("3min")

vessels = (
    vessels.groupby(["timestamp", "imo-number"], as_index=False)
    .agg({
        "lat": "mean",
        "lon": "mean",
        "length": "first"
    })
)

In [None]:
#Combining vessel and sensor data.
vessels["timestamp"] = pd.to_datetime(vessels["timestamp"], utc=True)
sensors["timestamp"] = pd.to_datetime(sensors["timestamp"], utc=True)

combined = sensors.merge(vessels, on="timestamp", how="inner")

combined.head(100)


In [None]:
# Merge sensor coordinates into the combined dataframe
lat_lookup = sensors_location.set_index("sensor_id")["lat"].to_dict()
lon_lookup = sensors_location.set_index("sensor_id")["lon"].to_dict()

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Earth radius (meters)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))


# --- Compute distance for every vessel at every timestamp to every sensor ---
for _, sensor in sensors_location.iterrows():
    sensor_id = sensor["sensor_id"]
    s_lat = sensor["lat"]
    s_lon = sensor["lon"]
    
    dist_col = f"dist_{sensor_id}"

    # compute distances for all vessel positions (each timestamp)
    combined[dist_col] = haversine(s_lat, s_lon, combined["lat"], combined["lon"])

    # replace ≥1000 m with infinity (in place!)
    combined[dist_col] = combined[dist_col].where(combined[dist_col] < 1000, np.inf)

In [None]:
combined.info()
combined.tail()