In [54]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import numpy as np
from tqdm import tqdm



In [28]:
tomtom = pd.read_parquet("tomtom_data.parquet")
vessels = pd.read_parquet("vessels_data.parquet")
sensors_location = pd.read_csv("sensor-location.xlsx - Sheet1.csv", )
sensors = pd.read_csv("sensordata_SAIL2025.csv", parse_dates=["timestamp"])

sensors_location["Effectieve breedte"] = (
    sensors_location["Effectieve breedte"]
    .astype(str)
    .str.replace(",", ".")
    .astype(float)
)

sensors_location[["lat", "lon"]] = (
    sensors_location["Lat/Long"]
    .str.replace(" ", "")   # remove spaces
    .str.split(",", expand=True)
    .astype(float)
)

sensors_location = sensors_location.rename(columns={"Objectummer": "sensor_id"})

In [29]:
weather = pd.read_csv("SAIL_Amsterdam_10min_Weather_2025-08-20_to_2025-08-24.csv")

# Read the original CSV
weather = pd.read_csv("SAIL_Amsterdam_10min_Weather_2025-08-20_to_2025-08-24.csv")

# Fix " 24:" timestamps (replace with " 00:")
weather["DateTime"] = weather["DateTime"].str.replace(" 24:", " 00:", regex=False)

# Convert to datetime
weather["DateTime"] = pd.to_datetime(weather["DateTime"], format="%Y%m%d %H:%M")

# Set as index for resampling
weather = weather.set_index("DateTime")

# Resample to every 3 minutes — using the *nearest existing record*
weather_3min = weather.resample("3min").nearest()

# Reset index for clean CSV output
weather_3min = weather_3min.reset_index()

print(weather_3min.head(20))

              DateTime  Temperature_°C  Humidity_%  Rain_mm
0  2025-08-20 00:00:00            16.0        78.8      0.0
1  2025-08-20 00:03:00            16.0        78.8      0.0
2  2025-08-20 00:06:00            15.9        79.9      0.0
3  2025-08-20 00:09:00            15.9        79.9      0.0
4  2025-08-20 00:12:00            15.9        79.9      0.0
5  2025-08-20 00:15:00            15.9        76.0      0.0
6  2025-08-20 00:18:00            15.9        76.0      0.0
7  2025-08-20 00:21:00            15.9        76.0      0.0
8  2025-08-20 00:24:00            15.9        76.0      0.0
9  2025-08-20 00:27:00            16.0        77.2      0.0
10 2025-08-20 00:30:00            16.0        77.2      0.0
11 2025-08-20 00:33:00            16.0        77.2      0.0
12 2025-08-20 00:36:00            15.9        79.8      0.0
13 2025-08-20 00:39:00            15.9        79.8      0.0
14 2025-08-20 00:42:00            15.9        79.8      0.0
15 2025-08-20 00:45:00            15.6  

In [30]:
#Find Flow per sensor per meter
#Divide by 3 minutes and effective width

width = sensors_location.set_index("sensor_id")["Effectieve breedte"].to_dict()

# Divide each matching visitor column by its sensor’s width
for col in sensors.columns:
    if "_" in col:  # e.g. CMSA-GAKH-01_0
        sensor_id = col.split("_")[0]
        if sensor_id in width:
            sensors[col] = sensors[col] / (3*width[sensor_id])


In [50]:
#Combine vessel data into 3 min bins, taking the average position of the vessel.
vessels["timestamp"] = pd.to_datetime(vessels["timestamp"], utc=True, errors="coerce")
vessels["timestamp"] = vessels["timestamp"].dt.floor("3min")

vessels = (
    vessels.groupby(["timestamp", "imo-number"], as_index=False)
    .agg({
        "lat": "mean",
        "lon": "mean",
        "length": "first"
    })
)

vessels.head()

Unnamed: 0,timestamp,imo-number,lat,lon,length
0,2025-08-20 06:27:00+00:00,4becccd38142efcac823c94e99e4c9f918dc47de027d83...,52.059721,3.616344,14000.0
1,2025-08-20 06:27:00+00:00,64a15227c0878ef130cc4e6c27fdc668b69e528a557b63...,53.661851,5.537259,19900.0
2,2025-08-20 06:27:00+00:00,a867f56d1cd3b9b6812798549281aa82c60f5cebcf5f61...,52.086418,3.423325,19900.0
3,2025-08-20 06:27:00+00:00,b2eb0407183a1fabe106857e32added9ec78ac9c079149...,51.947955,4.05242,33600.0
4,2025-08-20 06:30:00+00:00,2d18f769b33844885ab5d48e9b4c0764308170355a4245...,53.410521,4.5894,8900.0


In [None]:
#Combining vessel and sensor data.
vessels["timestamp"] = pd.to_datetime(vessels["timestamp"], utc=True)
sensors["timestamp"] = pd.to_datetime(sensors["timestamp"], utc=True)
weather_3min["DateTime"] = pd.to_datetime(weather_3min["DateTime"], utc=True) 
combined = sensors.merge(vessels, on="timestamp", how="inner")
combined = combined.merge(
    weather_3min.rename(columns={"DateTime": "timestamp"})[
        ["timestamp", "Temperature_°C", "Humidity_%", "Rain_mm"]
    ],
    on="timestamp",
    how="left"   # keep all rows from sensors+vessels even if weather missing
)

combined.head(100)


Unnamed: 0,timestamp,CMSA-GAKH-01_0,CMSA-GAKH-01_180,CMSA-GAWW-11_120,CMSA-GAWW-11_300,CMSA-GAWW-12_115,CMSA-GAWW-12_295,CMSA-GAWW-13_120,CMSA-GAWW-13_300,CMSA-GAWW-14_40,...,month,weekday,is_weekend,imo-number,lat,lon,length,Temperature_°C,Humidity_%,Rain_mm
0,2025-08-20 06:27:00+00:00,0.000000,0.099502,0.392157,0.588235,0.641026,0.25641,0.151515,0.000000,0.185185,...,8,2,0,4becccd38142efcac823c94e99e4c9f918dc47de027d83...,52.059721,3.616344,14000.0,16.6,73.4,0.0
1,2025-08-20 06:27:00+00:00,0.000000,0.099502,0.392157,0.588235,0.641026,0.25641,0.151515,0.000000,0.185185,...,8,2,0,64a15227c0878ef130cc4e6c27fdc668b69e528a557b63...,53.661851,5.537259,19900.0,16.6,73.4,0.0
2,2025-08-20 06:27:00+00:00,0.000000,0.099502,0.392157,0.588235,0.641026,0.25641,0.151515,0.000000,0.185185,...,8,2,0,a867f56d1cd3b9b6812798549281aa82c60f5cebcf5f61...,52.086418,3.423325,19900.0,16.6,73.4,0.0
3,2025-08-20 06:27:00+00:00,0.000000,0.099502,0.392157,0.588235,0.641026,0.25641,0.151515,0.000000,0.185185,...,8,2,0,b2eb0407183a1fabe106857e32added9ec78ac9c079149...,51.947955,4.052420,33600.0,16.6,73.4,0.0
4,2025-08-20 06:30:00+00:00,0.199005,0.149254,0.588235,0.392157,0.769231,0.25641,0.151515,0.151515,0.555556,...,8,2,0,2d18f769b33844885ab5d48e9b4c0764308170355a4245...,53.410521,4.589400,8900.0,16.6,73.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2025-08-20 06:33:00+00:00,0.149254,0.199005,0.784314,0.490196,0.000000,0.25641,0.000000,0.151515,0.648148,...,8,2,0,508607cb85db6e98b5293f54b5002389f6e18c082303f0...,52.406696,4.816781,5900.0,16.6,73.4,0.0
96,2025-08-20 06:33:00+00:00,0.149254,0.199005,0.784314,0.490196,0.000000,0.25641,0.000000,0.151515,0.648148,...,8,2,0,51642004ecb740656103b2b0c3d4ce286db2253a1d930d...,52.406776,4.882029,8800.0,16.6,73.4,0.0
97,2025-08-20 06:33:00+00:00,0.149254,0.199005,0.784314,0.490196,0.000000,0.25641,0.000000,0.151515,0.648148,...,8,2,0,51de12cb55ef030d7e33cd76f8c5badcae3330621021a1...,52.100666,4.267667,5100.0,16.6,73.4,0.0
98,2025-08-20 06:33:00+00:00,0.149254,0.199005,0.784314,0.490196,0.000000,0.25641,0.000000,0.151515,0.648148,...,8,2,0,52bddda6694714952ff0993167c638d56677f44f639d87...,52.405651,4.818190,4100.0,16.6,73.4,0.0


In [33]:
# Merge sensor coordinates into the combined dataframe
lat_lookup = sensors_location.set_index("sensor_id")["lat"].to_dict()
lon_lookup = sensors_location.set_index("sensor_id")["lon"].to_dict()

In [40]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Earth radius (meters)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))


# --- Compute distance for every vessel at every timestamp to every sensor ---
for _, sensor in sensors_location.iterrows():
    sensor_id = sensor["sensor_id"]
    s_lat = sensor["lat"]
    s_lon = sensor["lon"]
    
    dist_col = f"dist_{sensor_id}"

    # compute distances for all vessel positions (each timestamp)
    combined[dist_col] = haversine(s_lat, s_lon, combined["lat"], combined["lon"])

    # replace ≥1000 m with infinity (in place!)
    combined[dist_col] = combined[dist_col].clip(upper=10000)

In [39]:
combined.info()
combined.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41844 entries, 0 to 41843
Columns: 124 entries, timestamp to dist_GASA-05-W
dtypes: datetime64[ns, UTC](1), float64(114), int64(8), object(1)
memory usage: 39.6+ MB


Unnamed: 0,timestamp,CMSA-GAKH-01_0,CMSA-GAKH-01_180,CMSA-GAWW-11_120,CMSA-GAWW-11_300,CMSA-GAWW-12_115,CMSA-GAWW-12_295,CMSA-GAWW-13_120,CMSA-GAWW-13_300,CMSA-GAWW-14_40,...,dist_GASA-01-A1,dist_GASA-01-A2,dist_GASA-01-B,dist_GASA-01-C,dist_GASA-02-01,dist_GASA-02-02,dist_GASA-03,dist_GASA-04,dist_GASA-05-O,dist_GASA-05-W
41839,2025-08-20 14:27:00+00:00,8.00995,6.169154,4.509804,4.509804,4.230769,4.358974,1.515152,3.030303,2.685185,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
41840,2025-08-20 14:27:00+00:00,8.00995,6.169154,4.509804,4.509804,4.230769,4.358974,1.515152,3.030303,2.685185,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
41841,2025-08-20 14:27:00+00:00,8.00995,6.169154,4.509804,4.509804,4.230769,4.358974,1.515152,3.030303,2.685185,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
41842,2025-08-20 14:27:00+00:00,8.00995,6.169154,4.509804,4.509804,4.230769,4.358974,1.515152,3.030303,2.685185,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
41843,2025-08-20 14:27:00+00:00,8.00995,6.169154,4.509804,4.509804,4.230769,4.358974,1.515152,3.030303,2.685185,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf


In [None]:
##Deep learning model (Using Torch)


# --- Split train/test by time ---
train_data = combined[combined["timestamp"].dt.hour < 13]
test_data  = combined[combined["timestamp"].dt.hour >= 13]

feature_cols = (
    ["Temperature_°C", "Humidity_%", "Rain_mm", "length"]
    + [c for c in combined.columns if c.startswith("dist_")]
)

target_cols = [c for c in combined.columns if c.startswith("CMSA-")]



# --- Clean distances ---
for col in feature_cols:
    if col.startswith("dist_"):
        combined[col] = combined[col].clip(0, 10000)  # cap at 10 km

# --- Normalize input features using z-score scaling ---
scaler = StandardScaler()
train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
test_data[feature_cols] = scaler.transform(test_data[feature_cols])

# --- Prepare tensors ---
X_train = torch.tensor(train_data[feature_cols].values, dtype=torch.float32)
y_train = torch.tensor(train_data[target_cols].values, dtype=torch.float32)
X_test  = torch.tensor(test_data[feature_cols].values, dtype=torch.float32)
y_test  = torch.tensor(test_data[target_cols].values, dtype=torch.float32)

# --- Define regression model ---
class VesselSensorRegressor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )
    def forward(self, x):
        return self.model(x)

model = VesselSensorRegressor(X_train.shape[1], y_train.shape[1])
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Train ---
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)

for epoch in range(40):
    model.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1:02d} | Train MSE: {loss.item():.6f}")

# --- Evaluate and predict ---
def evaluate_model(model, X, y_true):
    model.eval()
    with torch.no_grad():
        preds = model(X)
        mse = loss_fn(preds, y_true).item()
        rmse = torch.sqrt(torch.mean((preds - y_true)**2)).item()
        mae = torch.mean(torch.abs(preds - y_true)).item()
        mape = torch.mean(torch.abs((preds - y_true) / (y_true + 1e-6))) * 100
        ss_res = torch.sum((y_true - preds)**2)
        ss_tot = torch.sum((y_true - torch.mean(y_true))**2)
        r2 = (1 - ss_res / ss_tot).item()
    return preds, mse, rmse, mae, mape.item(), r2

preds, test_mse, test_rmse, test_mae, test_mape, test_r2 = evaluate_model(model, X_test, y_test)

print(f"\n✅ Test Results:")
print(f"   MSE:  {test_mse:.4f}")
print(f"   RMSE: {test_rmse:.2f} people")
print(f"   MAE:  {test_mae:.2f} people")
print(f"   MAPE: {test_mape:.2f}%")
print(f"   R²:   {test_r2:.3f}")






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[feature_cols] = scaler.transform(test_data[feature_cols])


Epoch 01 | Train MSE: 0.619472
Epoch 02 | Train MSE: 0.691306
Epoch 03 | Train MSE: 0.667591
Epoch 04 | Train MSE: 0.624946
Epoch 05 | Train MSE: 0.472489
Epoch 06 | Train MSE: 0.581212
Epoch 07 | Train MSE: 0.523332
Epoch 08 | Train MSE: 0.568187
Epoch 09 | Train MSE: 0.462310
Epoch 10 | Train MSE: 0.445386
Epoch 11 | Train MSE: 0.536515
Epoch 12 | Train MSE: 0.441635
Epoch 13 | Train MSE: 0.413566
Epoch 14 | Train MSE: 0.463893
Epoch 15 | Train MSE: 0.507132
Epoch 16 | Train MSE: 0.502255
Epoch 17 | Train MSE: 0.482904
Epoch 18 | Train MSE: 0.473859
Epoch 19 | Train MSE: 0.450758
Epoch 20 | Train MSE: 0.439978
Epoch 21 | Train MSE: 0.493323
Epoch 22 | Train MSE: 0.517293
Epoch 23 | Train MSE: 0.407946
Epoch 24 | Train MSE: 0.367376
Epoch 25 | Train MSE: 0.480460
Epoch 26 | Train MSE: 0.448328
Epoch 27 | Train MSE: 0.455027
Epoch 28 | Train MSE: 0.453085
Epoch 29 | Train MSE: 0.439903
Epoch 30 | Train MSE: 0.386487
Epoch 31 | Train MSE: 0.436166
Epoch 32 | Train MSE: 0.378558
Epoch 33

In [53]:

# --- Save predictions ---
pred_df = pd.DataFrame(preds.numpy(), columns=[f"pred_{col}" for col in target_cols])
pred_df["timestamp"] = test_data["timestamp"].values

# Resample to 3-minute intervals
pred_df["timestamp"] = pred_df["timestamp"].dt.round("3min")
pred_df = pred_df.set_index("timestamp").resample("3min").mean().reset_index()

# --- Save to CSV ---
pred_df.to_csv("predicted_sensor_values_3min.csv", index=False)