In [3]:
import numpy as np
import pandas as pd

import geopandas as gpd

from pyproj import Transformer

from scipy.linalg import block_diag

import matplotlib.pyplot as plt
import scienceplots
plt.style.use('science')

import seaborn as sns

from kalman import KalmanFilter

ModuleNotFoundError: No module named 'scipy'

Note: `extrapolate=False` is used in most cases.

# Functions

In [None]:
def interpolate_voyage_vars(voyage_data: pd.DataFrame, quay_point):
    """Fill the gaps of one voyage from its linearly interpolated position.

    The projected position (``x``, ``y``, EPSG:3857) is interpolated linearly
    along the rows. Longitude and latitude are always recomputed from the
    interpolated position; ``SOG``, ``COG`` and ``distanceToPort`` keep their
    reported values and only their missing entries are filled with values
    derived from the interpolated track.

    Parameters
    ----------
    voyage_data : pd.DataFrame
        Rows of a single voyage with at least the columns ``x``, ``y``,
        ``SOG``, ``COG`` and ``distanceToPort``. Rows are assumed to be
        120 s apart (the derived speed divides by that constant).
    quay_point : sequence of two floats
        ``(x, y)`` of the quay in the same projected CRS as ``x``/``y``.

    Returns
    -------
    pd.DataFrame
        Columns ``LON``, ``LAT``, ``SOG``, ``COG``, ``distanceToPort`` on the
        index of ``voyage_data``. The derived speed and course of the first
        row are NaN because there is no previous position to difference with.
    """
    positions = voyage_data[["x", "y"]].interpolate(method="linear", axis=0)

    # convert position (EPSG:3857 -> EPSG:4326; the transformer returns lat first)
    transformer_meterstodeg = Transformer.from_crs(crs_from=3857, crs_to=4326)
    lat, lon = transformer_meterstodeg.transform(positions["x"].to_numpy(),
                                                 positions["y"].to_numpy())

    # displacement with respect to the previous row
    dx = positions["x"].diff()
    dy = positions["y"].diff()

    # speed over ground (euclidean distance / sampling interval), m/s to knots
    sog = np.sqrt(dx**2 + dy**2) / 120 / 0.5144444

    # course over ground: angle in relation to the true North, radians to degrees
    cog = np.arctan2(dx, dy) % (2 * np.pi) * 180 / np.pi

    # distance to port, meters to nautical miles
    dist = np.sqrt((positions["x"] - quay_point[0])**2
                   + (positions["y"] - quay_point[1])**2) * 0.000539956803

    # ``fillna`` results are assigned explicitly: calling
    # ``df["col"].fillna(..., inplace=True)`` acts on a temporary under pandas
    # copy-on-write and would silently leave the gaps unfilled.
    interpolated_data = positions.assign(
        LON=lon,
        LAT=lat,
        SOG=voyage_data["SOG"].fillna(sog),
        COG=voyage_data["COG"].fillna(cog),
        distanceToPort=voyage_data["distanceToPort"].fillna(dist),
    )

    # remove positions in meters
    return interpolated_data.drop(columns=["x", "y"])

In [None]:
def cv_model_2d(dt, accel, meas_noise):
    """Build the matrices of a 2-D constant-velocity Kalman model.

    The state layout is ``[pos_a, vel_a, pos_b, vel_b]``; only the two
    positions are observed.

    Parameters
    ----------
    dt : float
        Time step between two consecutive states.
    accel : float
        Scale of the process noise (``Q`` is multiplied by ``accel**2``).
    meas_noise : float
        Scale of the measurement noise (``R`` is ``meas_noise**2`` on the diagonal).

    Returns
    -------
    tuple of np.ndarray
        ``(F, H, Q, R, P)``: transition (4x4), observation (2x4), process
        noise (4x4), measurement noise (2x2) and initial covariance (4x4).
    """
    # Per-axis blocks, repeated on the diagonal for the two axes.
    axis_transition = np.array([[1.0, dt],
                                [0.0, 1.0]])
    axis_noise = np.array([[dt**4/4, dt**3/2],
                           [dt**3/2, dt**2]])

    F = block_diag(axis_transition, axis_transition)
    Q = (accel**2) * block_diag(axis_noise, axis_noise)

    # Positions sit at state indices 0 and 2.
    H = np.array([[1.0, 0.0, 0.0, 0.0],
                  [0.0, 0.0, 1.0, 0.0]])

    R = (meas_noise ** 2) * np.eye(2)
    P = 0.01 * np.eye(4)

    return F, H, Q, R, P

def extrapolate_voyage_vars(voyage_data, quay_point):
    """Extrapolate one step past each run of AIS messages with a Kalman filter.

    The filter follows the voyage row by row. While rows carry a position the
    filter is corrected with it; on the first row without a position after a
    measured one, the predicted state is stored as an extrapolated sample.
    Further rows of the same gap are left as NaN, and the filter is
    re-initialised from the next available row.

    Parameters
    ----------
    voyage_data : pd.DataFrame
        Rows of a single voyage with the columns ``x``, ``vx``, ``y``, ``vy``
        (projected position/velocity) and ``LON``, ``LAT``, ``SOG``, ``COG``,
        ``distanceToPort``. The first row is used as the initial state.
    quay_point : sequence of two floats
        ``(x, y)`` of the quay in the same projected CRS as ``x``/``y``.

    Returns
    -------
    pd.DataFrame
        Columns ``LON``, ``LAT``, ``SOG``, ``COG``, ``distanceToPort`` with a
        default integer index: reported values where ``LON`` is available,
        the one-step extrapolation right after them, NaN elsewhere.
    """
    dt = 120  # seconds between consecutive rows; also used for the derived speed

    # Get KF measures and AIS messages in different arrays
    kf_measures = voyage_data[["x", "vx", "y", "vy"]].values
    voyage_vars = voyage_data[["LON", "LAT", "SOG", "COG", "distanceToPort"]].values

    # Initialize Kalman Filter
    # NOTE(review): the third constructor argument is passed as None, as in
    # the original code; its meaning depends on the `kalman` module.
    F, H, Q, R, P = cv_model_2d(dt, 0.001, 10)
    x0 = kf_measures[0, :, np.newaxis]
    kf = KalmanFilter(x0, F, None, H, Q, R, P)

    # Save a private copy of P for resets, so it cannot alias the matrix the
    # filter keeps updating.
    initial_P = np.array(kf.P, copy=True)

    # Variables
    reset = False
    # NaN-initialised: rows that are never extrapolated must stay missing
    # (a zero row would turn into LON=0 / LAT=0 after the CRS conversion).
    extrapolated_voyage_vars = np.full((voyage_data.shape[0], 5), np.nan)

    # filter the voyage data
    for i in range(1, kf_measures.shape[0]):
        has_measure = not np.isnan(kf_measures[i, 0])

        if reset:
            if not has_measure:  # still inside the gap: nothing to do
                continue
            # first row with info after a gap: restart the filter from it
            kf.x = kf_measures[i, :, np.newaxis]
            kf.P = initial_P.copy()
            # Clear the flag, otherwise the filter would be re-initialised on
            # every later row and no further gap would ever be extrapolated.
            reset = False
            # NOTE(review): the predict/update below still runs on this row,
            # exactly as in the original flow; confirm whether it should be
            # skipped after a re-initialisation.

        kf.predict()

        # current measure available for correction
        if has_measure:
            kf.update(kf_measures[i, [0, 2], np.newaxis])
            continue

        # No measure: keep the prediction. Speed and course are taken from the
        # displacement with respect to the previous row's reported position.
        x, y = kf.x[0, 0], kf.x[2, 0]
        dx = x - kf_measures[i-1, 0]
        dy = y - kf_measures[i-1, 2]
        vel = np.sqrt(dx ** 2 + dy ** 2) / dt
        cog = np.arctan2(dx, dy) % (2 * np.pi)
        dist = np.sqrt((x - quay_point[0]) ** 2 + (y - quay_point[1]) ** 2)
        extrapolated_voyage_vars[i, :] = np.array([x, y, vel, cog, dist])
        reset = True

    # Convert only the rows that hold a prediction, so NaN coordinates are
    # never handed to the CRS transformer.
    predicted = ~np.isnan(extrapolated_voyage_vars[:, 0])
    if predicted.any():
        transformer = Transformer.from_crs(3857, 4326)
        lat, lon = transformer.transform(extrapolated_voyage_vars[predicted, 0],
                                         extrapolated_voyage_vars[predicted, 1])
        extrapolated_voyage_vars[predicted, 0] = lon
        extrapolated_voyage_vars[predicted, 1] = lat
    extrapolated_voyage_vars[:, 2] /= 0.5144444       # m/s to knots
    extrapolated_voyage_vars[:, 3] *= 180 / np.pi     # radians to degrees
    extrapolated_voyage_vars[:, 4] *= 0.000539956803  # meters to nautical miles

    # add existing AIS messages to extrapolated voyage
    observed = ~np.isnan(voyage_vars[:, 0])
    extrapolated_voyage_vars[observed] = voyage_vars[observed]

    # add variables to voyage data
    extrapolated_voyage_vars = pd.DataFrame(data=extrapolated_voyage_vars,
                                            columns=["LON", "LAT", "SOG", "COG", "distanceToPort"])

    return extrapolated_voyage_vars

In [None]:
def get_lagged_voyage_vars(voyage_vars, delays):
    """Append lagged copies of every column of a voyage.

    For each ``delay`` the frame is shifted down by ``delay`` rows and its
    columns are renamed ``<name>_lag<delay>``; the shifted frames are placed
    to the right of the original columns, in the order of ``delays``.

    Parameters
    ----------
    voyage_vars : pd.DataFrame
        Variables of a single voyage, with string column names.
    delays : iterable of int
        Number of rows to shift by for each lagged copy.

    Returns
    -------
    pd.DataFrame
        A new frame with the original columns followed by one group of
        lagged columns per delay.
    """
    pieces = [voyage_vars.copy()]

    for delay in delays:
        suffix = "_lag" + str(delay)
        shifted = voyage_vars.shift(delay)
        pieces.append(shifted.set_axis(voyage_vars.columns + suffix, axis=1))

    # A single concat instead of growing the frame once per delay.
    return pd.concat(pieces, axis=1)

In [None]:
def complete_vars(arr, n_feats, total_cols):
    """Patch one stacked row of lagged features with its fallback values.

    ``arr`` holds three consecutive segments of ``total_cols`` values each:
    the raw lagged features, their interpolated counterpart, and a third
    fallback segment. When the raw segment starts with missing values, every
    position from its first available value onwards is replaced by the
    interpolated segment, and the ``n_feats`` positions just before it are
    replaced by the third segment.

    Parameters
    ----------
    arr : np.ndarray
        1-D float array with at least ``3 * total_cols`` entries. The first
        segment is modified in place.
    n_feats : int
        Number of values taken from the third segment.
    total_cols : int
        Length of each segment.

    Returns
    -------
    np.ndarray
        The first segment of ``arr`` (a view), patched as described. It is
        returned untouched when it has no available value at all or when its
        very first value is available.
    """
    raw, interpolated, fallback = (
        arr[k * total_cols:(k + 1) * total_cols] for k in range(3)
    )

    available = np.flatnonzero(~np.isnan(raw))
    if available.size == 0 or available[0] == 0:
        return raw

    first_available = available[0]
    just_before = slice(first_available - n_feats, first_available)

    raw[first_available:] = interpolated[first_available:]
    raw[just_before] = fallback[just_before]

    return raw

def get_stacked_voyage_vars_lagged(voyage_vars, delays, extrapolate, quay_point):
    """Stack three lagged views of one voyage side by side.

    The result concatenates, column-wise, the lagged values of (1) the
    reported variables, (2) the interpolated variables and (3) a fallback:
    the Kalman extrapolation when ``extrapolate`` is true, otherwise the
    reported variables forward-filled by at most one row.

    Parameters
    ----------
    voyage_vars : pd.DataFrame
        Rows of a single voyage, including the columns needed by
        ``interpolate_voyage_vars`` (and by ``extrapolate_voyage_vars`` when
        ``extrapolate`` is true).
    delays : iterable of int
        Lags handed to ``get_lagged_voyage_vars``.
    extrapolate : bool
        Selects the third view, as described above.
    quay_point : sequence of two floats
        Projected ``(x, y)`` of the quay.

    Returns
    -------
    np.ndarray
        2-D array with one row per voyage row and three equally wide column
        segments.
    """
    feature_cols = ["LON", "LAT", "SOG", "COG", "distanceToPort"]

    reported_lagged = get_lagged_voyage_vars(voyage_vars[feature_cols], delays=delays)

    voyage_interp = interpolate_voyage_vars(voyage_vars, quay_point)
    interp_lagged = get_lagged_voyage_vars(voyage_interp, delays=delays)

    if extrapolate:
        fallback = extrapolate_voyage_vars(voyage_vars, quay_point)
    else:
        # reuse the last reported sample for at most one missing row
        fallback = voyage_vars.loc[:, feature_cols].ffill(limit=1)
    fallback_lagged = get_lagged_voyage_vars(fallback, delays=delays)

    return np.hstack((reported_lagged.values,
                      interp_lagged.values,
                      fallback_lagged.values))

In [None]:
def get_lagged_cols(columns: np.array, delays):
    """Return the column names followed by their ``_lag<delay>`` variants.

    Parameters
    ----------
    columns : array-like of str
        Base column names.
    delays : iterable of int
        Lags; one group of suffixed names is appended per delay, in order.

    Returns
    -------
    np.ndarray
        1-D string array: the base names, then ``<name>_lag<delay>`` for
        every delay.
    """
    base = np.array(columns)
    groups = [base] + [np.char.add(base, f"_lag{delay}") for delay in delays]
    return np.hstack(groups)

def get_lagged_vars(data: pd.DataFrame, delays, online, fill, extrapolate, quay_point):
    """Build the lagged feature table for every voyage in ``data``.

    Voyages are the groups of index level 0 and are processed independently.

    * ``fill`` false: the columns of ``data`` are lagged as they are.
    * ``fill`` true, ``online`` false: each voyage is interpolated with
      ``interpolate_voyage_vars`` before being lagged.
    * ``fill`` true, ``online`` true: the reported, interpolated and fallback
      views are stacked per voyage and merged row by row with
      ``complete_vars``.

    Parameters
    ----------
    data : pd.DataFrame
        Voyage rows; index level 0 identifies the voyage.
    delays : sequence of int
        Lags to generate.
    online, fill, extrapolate : bool
        Select the processing mode (see above); ``extrapolate`` is forwarded
        to ``get_stacked_voyage_vars_lagged``.
    quay_point : sequence of two floats or None
        Projected ``(x, y)`` of the quay; only used when ``fill`` is true.

    Returns
    -------
    pd.DataFrame
        Lagged features. In the first two modes the result comes straight
        from ``groupby(...).apply``. In the online mode it is rebuilt from a
        NumPy array and therefore has a default integer index.
    """
    voyages = data.groupby(level=0, sort=False, group_keys=False)

    if not fill:
        return voyages.apply(lambda x: get_lagged_voyage_vars(x, delays))

    if not online:
        return voyages.apply(
            lambda x: get_lagged_voyage_vars(interpolate_voyage_vars(x, quay_point), delays))

    stacked_voyage_vars_lagged = voyages.apply(
        lambda x: get_stacked_voyage_vars_lagged(x, delays, extrapolate, quay_point))

    stacked_voyage_vars_lagged = np.vstack(stacked_voyage_vars_lagged.values)

    # Each stacked row holds 3 segments, and each segment holds one group of
    # features for the current sample plus one per delay. The group count is
    # len(delays) + 1; deriving it from delays[-1] is only right when the
    # delays happen to be exactly 1..N.
    total_cols = stacked_voyage_vars_lagged.shape[1] // 3
    n_feats = total_cols // (len(delays) + 1)
    voyage_vars_lagged_filled = np.apply_along_axis(lambda x: complete_vars(x,
                                                                            n_feats,
                                                                            total_cols),
                                                    axis=1,
                                                    arr=stacked_voyage_vars_lagged)

    voyage_vars_lagged_filled = pd.DataFrame(data=voyage_vars_lagged_filled,
                                             columns=get_lagged_cols(["LON", "LAT",
                                                                      "SOG", "COG",
                                                                      "distanceToPort"], delays))

    return voyage_vars_lagged_filled

In [None]:
def get_lagged_io(lagged_vars: pd.DataFrame, target_var: pd.Series, n_feats, max_delay):
    """Pair the first lagged feature groups with the interpolated target.

    Parameters
    ----------
    lagged_vars : pd.DataFrame
        Lagged features; the first ``n_feats * (max_delay + 1)`` columns are
        kept as inputs.
    target_var : pd.Series
        Named target series on the same index as ``lagged_vars``. It is not
        modified.
    n_feats : int
        Number of features per lag group.
    max_delay : int
        Number of lag groups to keep in addition to the current sample.

    Returns
    -------
    pd.DataFrame
        The selected input columns followed by the linearly interpolated
        target. The target is NaN on every row where an input is missing, so
        a later ``dropna()`` discards those rows.
    """
    # Interpolate a copy: doing it in place would rewrite the caller's column.
    target = target_var.interpolate(method="linear")

    lagged_io = pd.concat((lagged_vars.iloc[:, :n_feats*(max_delay+1)], target), axis=1)
    incomplete = lagged_io.iloc[:, :-1].isna().any(axis=1)
    lagged_io.loc[incomplete, target.name] = np.nan

    return lagged_io

# Miami Data

### Training Data

In [None]:
# Train set -> Linear Interpolation
# Load the Miami training messages, indexed by (voyage_id, BaseDateTime).
train = (
    pd.read_csv("./data/3_miami_train_to_fill.csv")
    .assign(BaseDateTime=lambda df: pd.to_datetime(df["BaseDateTime"]))
    .set_index(["voyage_id", "BaseDateTime"], drop=True)
)

In [None]:
# Preview the first five rows of the training set.
display(train.head(n=5))

In [None]:
# Resample each voyage to 2-minute intervals by taking the mean every 2 minutes.
# "2min" replaces the deprecated "2T" offset alias.
train_resampled = train.groupby(level=0, sort=False)\
            .resample("2min", level=1).mean()

In [None]:
# Preview the first five rows of the resampled training set.
display(train_resampled.head(n=5))

In [None]:
# Project the Miami quay position from EPSG:4326 to EPSG:3857,
# the CRS pair that the interpolation helper converts back from.
transformer = Transformer.from_crs(crs_from=4326, crs_to=3857)
quay_point = transformer.transform(25.77, -80.16)

In [None]:
# Lags 1..15, counted in resampled rows (2 minutes each).
delays = np.arange(start=1, stop=16)

In [None]:
# Report the NumPy version in use; `np` is already imported at the top of the notebook.
print(np.__version__)


In [None]:
# Unfilled training features: the reported variables are lagged as they are.
train_vars_lagged = get_lagged_vars(
    train_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort"]],
    delays=delays,
    online=False, fill=False, quay_point=None, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
train_nohistory, train_10min, train_20min, train_30min = (
    get_lagged_io(train_vars_lagged,
                  train_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Filled training features: each voyage is interpolated before being lagged
# (x and y are required by the interpolation).
train_vars_lagged_filled = get_lagged_vars(
    train_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort", "x", "y"]],
    delays=delays,
    online=False, fill=True, quay_point=quay_point, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
(train_filled_nohistory,
 train_filled_10min,
 train_filled_20min,
 train_filled_30min) = (
    get_lagged_io(train_vars_lagged_filled,
                  train_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

Create the output folders if they don't exist.

In [None]:
import os

# Define the paths for each dataset
paths = [
    "./data/final_miami_datasets/no_history/",
    "./data/final_miami_datasets/10min_history/",
    "./data/final_miami_datasets/20min_history/",
    "./data/final_miami_datasets/30min_history/"
]

# Create directories if they don't exist
for path in paths:
    os.makedirs(path, exist_ok=True)

# Save each dataset to its respective folder, dropping incomplete rows
# and keeping the index.
for path, frame in zip(paths, (train_nohistory, train_10min, train_20min, train_30min)):
    frame.dropna().to_csv(path + "train.csv", index=True)



In [None]:
# Output folders of the filled datasets, one per history length
paths = [
    "./data/final_miami_datasets/no_history_filled/",
    "./data/final_miami_datasets/10min_history_filled/",
    "./data/final_miami_datasets/20min_history_filled/",
    "./data/final_miami_datasets/30min_history_filled/"
]

# Create directories if they don't exist
for path in paths:
    os.makedirs(path, exist_ok=True)

# Save each filled dataset to its folder, dropping incomplete rows
# and keeping the index.
for path, frame in zip(paths, (train_filled_nohistory, train_filled_10min,
                               train_filled_20min, train_filled_30min)):
    frame.dropna().to_csv(path + "train.csv", index=True)

### Validation and Test sets

In [None]:
# Load the Miami validation and test messages, indexed by (voyage_id, BaseDateTime).
val = pd.read_csv("./data/3_miami_val_to_fill.csv")
test = pd.read_csv("./data/3_miami_test_to_fill.csv")

val["BaseDateTime"] = pd.to_datetime(val["BaseDateTime"])
test["BaseDateTime"] = pd.to_datetime(test["BaseDateTime"])
val.set_index(["voyage_id", "BaseDateTime"], drop=True, inplace=True)
test.set_index(["voyage_id", "BaseDateTime"], drop=True, inplace=True)

# Velocity components in m/s (SOG is converted from knots). COG is stored in
# degrees in this notebook (missing values are filled with degree values by
# the helper functions), so it is converted to radians before sin/cos.
# The angle is measured from North: sin gives the x part, cos the y part.
val_vx = val["SOG"] * 0.5144444 * np.sin(np.deg2rad(val["COG"]))
val_vy = val["SOG"] * 0.5144444 * np.cos(np.deg2rad(val["COG"]))
val = val.assign(vx=val_vx, vy=val_vy)

test_vx = test["SOG"] * 0.5144444 * np.sin(np.deg2rad(test["COG"]))
test_vy = test["SOG"] * 0.5144444 * np.cos(np.deg2rad(test["COG"]))
test = test.assign(vx=test_vx, vy=test_vy)

In [None]:
# Preview the first two rows of each set.
display(val.head(2))
display(test.head(2))

In [None]:
# Resample each voyage to 2-minute intervals by taking the mean every 2 minutes.
# "2min" replaces the deprecated "2T" offset alias.
val_resampled = val.groupby(level=0, sort=False) \
    .resample("2min", level=1).mean()
test_resampled = test.groupby(level=0, sort=False) \
    .resample("2min", level=1).mean()

In [None]:
# Percentage of resampled rows with a missing value in the first column.
# .iloc[0] selects by position; plain [0] on a string-labelled Series is
# deprecated positional access.
print(val_resampled.isna().sum().iloc[0] / val_resampled.shape[0] * 100)
print(test_resampled.isna().sum().iloc[0] / test_resampled.shape[0] * 100)

In [None]:
# Unfilled validation features: the reported variables are lagged as they are.
val_vars_lagged = get_lagged_vars(
    val_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort"]],
    delays=delays,
    online=True, fill=False, extrapolate=False, quay_point=None,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
val_nohistory, val_10min, val_20min, val_30min = (
    get_lagged_io(val_vars_lagged,
                  val_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Online-filled validation features without extrapolation. This rebinds
# `val_vars_lagged`, which previously held the unfilled features.
val_vars_lagged = get_lagged_vars(
    val_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort", "x", "y", "vx", "vy"]],
    delays=delays, online=True, fill=True, extrapolate=False, quay_point=quay_point,
)
# The online path returns a default integer index; restore the voyage/time index.
val_vars_lagged.index = val_resampled.index

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
(val_nohistory_with_outdated_samples,
 val_10min_with_outdated_samples,
 val_20min_with_outdated_samples,
 val_30min_with_outdated_samples) = (
    get_lagged_io(val_vars_lagged,
                  val_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Unfilled test features: the reported variables are lagged as they are.
test_vars_lagged = get_lagged_vars(
    test_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort"]],
    delays=delays,
    online=True, fill=False, quay_point=None, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
test_nohistory, test_10min, test_20min, test_30min = (
    get_lagged_io(test_vars_lagged,
                  test_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Online-filled validation features (no extrapolation).
val_vars_lagged_filled = get_lagged_vars(
    val_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort", "x", "y", "vx", "vy"]],
    delays=delays,
    online=True, fill=True, quay_point=quay_point, extrapolate=False,
)
# The online path returns a default integer index; restore the voyage/time index.
val_vars_lagged_filled.index = val_resampled.index

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
val_nohistory_filled, val_10min_filled, val_20min_filled, val_30min_filled = (
    get_lagged_io(val_vars_lagged_filled,
                  val_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Online-filled test features (no extrapolation).
test_vars_lagged_filled = get_lagged_vars(
    test_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort", "x", "y", "vx", "vy"]],
    delays=delays,
    online=True, fill=True, quay_point=quay_point, extrapolate=False,
)
# The online path returns a default integer index; restore the voyage/time index.
test_vars_lagged_filled.index = test_resampled.index

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
test_nohistory_filled, test_10min_filled, test_20min_filled, test_30min_filled = (
    get_lagged_io(test_vars_lagged_filled,
                  test_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Save the unfilled validation sets (complete rows only, index not written).
for csv_path, frame in (
    ("./data/final_miami_datasets/no_history/val.csv", val_nohistory),
    ("./data/final_miami_datasets/10min_history/val.csv", val_10min),
    ("./data/final_miami_datasets/20min_history/val.csv", val_20min),
    ("./data/final_miami_datasets/30min_history/val.csv", val_30min),
):
    frame.dropna().to_csv(csv_path, index=False)

In [None]:
# Save the validation sets built with outdated samples (complete rows only, index not written).
for csv_path, frame in (
    ("./data/final_miami_datasets/no_history/val_with_outdated_samples.csv", val_nohistory_with_outdated_samples),
    ("./data/final_miami_datasets/10min_history/val_with_outdated_samples.csv", val_10min_with_outdated_samples),
    ("./data/final_miami_datasets/20min_history/val_with_outdated_samples.csv", val_20min_with_outdated_samples),
    ("./data/final_miami_datasets/30min_history/val_with_outdated_samples.csv", val_30min_with_outdated_samples),
):
    frame.dropna().to_csv(csv_path, index=False)

In [None]:
# Save the filled validation sets (complete rows only, index not written).
for csv_path, frame in (
    ("./data/final_miami_datasets/no_history_filled/val.csv", val_nohistory_filled),
    ("./data/final_miami_datasets/10min_history_filled/val.csv", val_10min_filled),
    ("./data/final_miami_datasets/20min_history_filled/val.csv", val_20min_filled),
    ("./data/final_miami_datasets/30min_history_filled/val.csv", val_30min_filled),
):
    frame.dropna().to_csv(csv_path, index=False)

In [None]:
# Save the unfilled test sets (complete rows only, index not written).
for csv_path, frame in (
    ("./data/final_miami_datasets/no_history/test.csv", test_nohistory),
    ("./data/final_miami_datasets/10min_history/test.csv", test_10min),
    ("./data/final_miami_datasets/20min_history/test.csv", test_20min),
    ("./data/final_miami_datasets/30min_history/test.csv", test_30min),
):
    frame.dropna().to_csv(csv_path, index=False)

In [None]:
# Save the filled test sets (complete rows only, index not written).
for csv_path, frame in (
    ("./data/final_miami_datasets/no_history_filled/test.csv", test_nohistory_filled),
    ("./data/final_miami_datasets/10min_history_filled/test.csv", test_10min_filled),
    ("./data/final_miami_datasets/20min_history_filled/test.csv", test_20min_filled),
    ("./data/final_miami_datasets/30min_history_filled/test.csv", test_30min_filled),
):
    frame.dropna().to_csv(csv_path, index=False)

# France Data

### Train Data

In [None]:
# Train set -> Linear Interpolation
# Load the France training messages, indexed by (voyage_id, BaseDateTime).
train = (
    pd.read_csv("./data/3_france_train_to_fill.csv")
    .assign(BaseDateTime=lambda df: pd.to_datetime(df["BaseDateTime"]))
    .set_index(["voyage_id", "BaseDateTime"], drop=True)
)

In [None]:
# Resample each voyage to 2-minute intervals by taking the mean every 2 minutes.
# "2min" replaces the deprecated "2T" offset alias.
train_resampled = train.groupby(level=0, sort=False)\
            .resample("2min", level=1).mean()

In [None]:
# Projection from EPSG:4326 to EPSG:3857.
transformer = Transformer.from_crs(crs_from=4326, crs_to=3857)

# Take the first coordinate of the first geometry of the "Brest" port record.
ports = gpd.read_file("./data/france_data/Ports_Brittany/port.shp")
brest_port = ports[ports["libelle_po"] == "Brest"]
lon, lat = (axis_values[0] for axis_values in brest_port.iloc[0]["geometry"].geoms[0].xy)

# The reference point is shifted by 0.01 degrees of longitude before projecting.
berth_port_coords = [lat, lon-0.01]
quay_point = transformer.transform(*berth_port_coords)

In [None]:
# Lags 1..15, counted in resampled rows (2 minutes each).
delays = np.arange(start=1, stop=16)

In [None]:
# Unfilled training features: the reported variables are lagged as they are.
train_vars_lagged = get_lagged_vars(
    train_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort"]],
    delays=delays,
    online=False, fill=False, quay_point=None, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
train_nohistory, train_10min, train_20min, train_30min = (
    get_lagged_io(train_vars_lagged,
                  train_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Filled training features: each voyage is interpolated before being lagged
# (x and y are required by the interpolation).
train_vars_lagged_filled = get_lagged_vars(
    train_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort", "x", "y"]],
    delays=delays,
    online=False, fill=True, quay_point=quay_point, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
(train_filled_nohistory,
 train_filled_10min,
 train_filled_20min,
 train_filled_30min) = (
    get_lagged_io(train_vars_lagged_filled,
                  train_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Define the paths for each dataset
paths = [
    "./data/final_france_datasets/no_history/",
    "./data/final_france_datasets/10min_history/",
    "./data/final_france_datasets/20min_history/",
    "./data/final_france_datasets/30min_history/"
]

# Create directories if they don't exist
for path in paths:
    os.makedirs(path, exist_ok=True)

# Save each dataset to its respective folder, dropping incomplete rows
# and keeping the index.
for path, frame in zip(paths, (train_nohistory, train_10min, train_20min, train_30min)):
    frame.dropna().to_csv(path + "train.csv", index=True)

In [None]:
# Output folders of the filled datasets, one per history length
paths = [
    "./data/final_france_datasets/no_history_filled/",
    "./data/final_france_datasets/10min_history_filled/",
    "./data/final_france_datasets/20min_history_filled/",
    "./data/final_france_datasets/30min_history_filled/"
]

# Create directories if they don't exist
for path in paths:
    os.makedirs(path, exist_ok=True)

# Save each filled dataset to its folder, dropping incomplete rows
# and keeping the index.
for path, frame in zip(paths, (train_filled_nohistory, train_filled_10min,
                               train_filled_20min, train_filled_30min)):
    frame.dropna().to_csv(path + "train.csv", index=True)

### Validation and Test Data

In [None]:
# Load the France validation and test messages, indexed by (voyage_id, BaseDateTime).
val = pd.read_csv("./data/3_france_val_to_fill.csv")
test = pd.read_csv("./data/3_france_test_to_fill.csv")

val["BaseDateTime"] = pd.to_datetime(val["BaseDateTime"])
test["BaseDateTime"] = pd.to_datetime(test["BaseDateTime"])
val.set_index(["voyage_id", "BaseDateTime"], drop=True, inplace=True)
test.set_index(["voyage_id", "BaseDateTime"], drop=True, inplace=True)

# Velocity components in m/s (SOG is converted from knots). COG is stored in
# degrees in this notebook (missing values are filled with degree values by
# the helper functions), so it is converted to radians before sin/cos.
# The angle is measured from North: sin gives the x part, cos the y part.
val_vx = val["SOG"] * 0.5144444 * np.sin(np.deg2rad(val["COG"]))
val_vy = val["SOG"] * 0.5144444 * np.cos(np.deg2rad(val["COG"]))
val = val.assign(vx=val_vx, vy=val_vy)

test_vx = test["SOG"] * 0.5144444 * np.sin(np.deg2rad(test["COG"]))
test_vy = test["SOG"] * 0.5144444 * np.cos(np.deg2rad(test["COG"]))
test = test.assign(vx=test_vx, vy=test_vy)

In [None]:
# Resample each voyage to 2-minute intervals by taking the mean every 2 minutes.
# "2min" replaces the deprecated "2T" offset alias.
val_resampled = val.groupby(level=0, sort=False) \
    .resample("2min", level=1).mean()
test_resampled = test.groupby(level=0, sort=False) \
    .resample("2min", level=1).mean()

In [None]:
# Percentage of resampled rows with a missing value in the first column.
# .iloc[0] selects by position; plain [0] on a string-labelled Series is
# deprecated positional access.
print(val_resampled.isna().sum().iloc[0] / val_resampled.shape[0] * 100)
print(test_resampled.isna().sum().iloc[0] / test_resampled.shape[0] * 100)

In [None]:
# Unfilled validation features: the reported variables are lagged as they are.
val_vars_lagged = get_lagged_vars(
    val_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort"]],
    delays=delays,
    online=True, fill=False, quay_point=None, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
val_nohistory, val_10min, val_20min, val_30min = (
    get_lagged_io(val_vars_lagged,
                  val_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Unfilled test features: the reported variables are lagged as they are.
test_vars_lagged = get_lagged_vars(
    test_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort"]],
    delays=delays,
    online=True, fill=False, quay_point=None, extrapolate=False,
)

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
test_nohistory, test_10min, test_20min, test_30min = (
    get_lagged_io(test_vars_lagged,
                  test_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Online-filled validation features (no extrapolation).
val_vars_lagged_filled = get_lagged_vars(
    val_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort", "x", "y", "vx", "vy"]],
    delays=delays,
    online=True, fill=True, quay_point=quay_point, extrapolate=False,
)
# The online path returns a default integer index; restore the voyage/time index.
val_vars_lagged_filled.index = val_resampled.index

# One dataset per history length; max_delay counts 2-minute rows (5 -> 10 min, ...).
val_nohistory_filled, val_10min_filled, val_20min_filled, val_30min_filled = (
    get_lagged_io(val_vars_lagged_filled,
                  val_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=max_delay)
    for max_delay in (0, 5, 10, 15)
)

In [None]:
# Lagged test-set variables with gap filling enabled (fill=True); the
# projected position/velocity columns and the quay point are passed along too.
test_vars_lagged_filled = get_lagged_vars(
    test_resampled[["LON", "LAT", "SOG", "COG", "distanceToPort",
                    "x", "y", "vx", "vy"]],
    delays=delays,
    online=True, fill=True, quay_point=quay_point, extrapolate=False,
)
# Re-attach the resampled frame's index to the filled result.
test_vars_lagged_filled.index = test_resampled.index

# One input/target table per history length: max_delay values of 0, 5, 10 and 15
# produce the no-history, 10min, 20min and 30min variants, in that order.
(test_nohistory_filled,
 test_10min_filled,
 test_20min_filled,
 test_30min_filled) = [
    get_lagged_io(test_vars_lagged_filled,
                  test_resampled["remainingVoyageTime"],
                  n_feats=5, max_delay=n_lags)
    for n_lags in (0, 5, 10, 15)
]

In [None]:
# Write the unfilled validation sets to disk, one CSV per history length.
# Rows containing any NaN are dropped first; the index is not written.
(val_nohistory
    .dropna()
    .to_csv("./data/final_france_datasets/no_history/val.csv", index=False))
(val_10min
    .dropna()
    .to_csv("./data/final_france_datasets/10min_history/val.csv", index=False))
(val_20min
    .dropna()
    .to_csv("./data/final_france_datasets/20min_history/val.csv", index=False))
(val_30min
    .dropna()
    .to_csv("./data/final_france_datasets/30min_history/val.csv", index=False))

In [None]:
# Write the gap-filled validation sets to disk, one CSV per history length.
# Rows containing any NaN are dropped first; the index is not written.
(val_nohistory_filled
    .dropna()
    .to_csv("./data/final_france_datasets/no_history_filled/val.csv", index=False))
(val_10min_filled
    .dropna()
    .to_csv("./data/final_france_datasets/10min_history_filled/val.csv", index=False))
(val_20min_filled
    .dropna()
    .to_csv("./data/final_france_datasets/20min_history_filled/val.csv", index=False))
(val_30min_filled
    .dropna()
    .to_csv("./data/final_france_datasets/30min_history_filled/val.csv", index=False))

In [None]:
# Write the unfilled test sets to disk, one CSV per history length.
# Rows containing any NaN are dropped first; the index is not written.
(test_nohistory
    .dropna()
    .to_csv("./data/final_france_datasets/no_history/test.csv", index=False))
(test_10min
    .dropna()
    .to_csv("./data/final_france_datasets/10min_history/test.csv", index=False))
(test_20min
    .dropna()
    .to_csv("./data/final_france_datasets/20min_history/test.csv", index=False))
(test_30min
    .dropna()
    .to_csv("./data/final_france_datasets/30min_history/test.csv", index=False))

In [None]:
# Write the gap-filled test sets to disk, one CSV per history length.
# Rows containing any NaN are dropped first; the index is not written.
(test_nohistory_filled
    .dropna()
    .to_csv("./data/final_france_datasets/no_history_filled/test.csv", index=False))
(test_10min_filled
    .dropna()
    .to_csv("./data/final_france_datasets/10min_history_filled/test.csv", index=False))
(test_20min_filled
    .dropna()
    .to_csv("./data/final_france_datasets/20min_history_filled/test.csv", index=False))
(test_30min_filled
    .dropna()
    .to_csv("./data/final_france_datasets/30min_history_filled/test.csv", index=False))

# Plots and Tables of Availability

In [None]:
# Load the saved no-history train/val/test splits for the Miami and France
# datasets, both without and with gap filling.

# Miami, unfilled.
train_miami_nohist, val_miami_nohist, test_miami_nohist = map(pd.read_csv, (
    "./data/final_miami_datasets/no_history/train.csv",
    "./data/final_miami_datasets/no_history/val.csv",
    "./data/final_miami_datasets/no_history/test.csv",
))

# Miami, filled.
train_miami_nohist_filled, val_miami_nohist_filled, test_miami_nohist_filled = map(pd.read_csv, (
    "./data/final_miami_datasets/no_history_filled/train.csv",
    "./data/final_miami_datasets/no_history_filled/val.csv",
    "./data/final_miami_datasets/no_history_filled/test.csv",
))

# France, unfilled.
train_france_nohist, val_france_nohist, test_france_nohist = map(pd.read_csv, (
    "./data/final_france_datasets/no_history/train.csv",
    "./data/final_france_datasets/no_history/val.csv",
    "./data/final_france_datasets/no_history/test.csv",
))

# France, filled.
train_france_nohist_filled, val_france_nohist_filled, test_france_nohist_filled = map(pd.read_csv, (
    "./data/final_france_datasets/no_history_filled/train.csv",
    "./data/final_france_datasets/no_history_filled/val.csv",
    "./data/final_france_datasets/no_history_filled/test.csv",
))

In [None]:
# Build bins over the remaining voyage time: edges every 8 units from 0 to 48,
# scaled by 60 (labels read as hours, so the column is presumably in minutes
# -- TODO confirm).
# NOTE(review): pd.cut intervals are right-closed by default, so a value of
# exactly 0 falls outside the first bin and is not counted; values above the
# last edge are likewise left out.
bins = 60 * np.array([0.0, 8.0, 16.0, 24.0, 32.0, 40.0, 48.0])
labels = ["0-8", "8-16", "16-24", "24-32", "32-40", "40-48"]

# Per-bin sample counts -- Miami, unfilled.
train_miami_nohist_binned, val_miami_nohist_binned, test_miami_nohist_binned = [
    pd.cut(x=split["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for split in (train_miami_nohist, val_miami_nohist, test_miami_nohist)
]

# Per-bin sample counts -- Miami, filled.
(train_miami_nohist_filled_binned,
 val_miami_nohist_filled_binned,
 test_miami_nohist_filled_binned) = [
    pd.cut(x=split["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for split in (train_miami_nohist_filled, val_miami_nohist_filled, test_miami_nohist_filled)
]

# Per-bin sample counts -- France, unfilled.
train_france_nohist_binned, val_france_nohist_binned, test_france_nohist_binned = [
    pd.cut(x=split["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for split in (train_france_nohist, val_france_nohist, test_france_nohist)
]

# Per-bin sample counts -- France, filled.
(train_france_nohist_filled_binned,
 val_france_nohist_filled_binned,
 test_france_nohist_filled_binned) = [
    pd.cut(x=split["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for split in (train_france_nohist_filled, val_france_nohist_filled, test_france_nohist_filled)
]

In [None]:
# Pair each unfilled count series with its filled counterpart, put them side by
# side under the "No Fill"/"Filled" keys, then melt to long format (one row per
# bin and FillType, with its Count) so seaborn can use FillType as the hue.
(train_miami_nohist_toplot,
 val_miami_nohist_toplot,
 test_miami_nohist_toplot,
 train_france_nohist_toplot,
 val_france_nohist_toplot,
 test_france_nohist_toplot) = [
    pd.concat((unfilled_counts, filled_counts), axis=1, keys=["No Fill", "Filled"])
      .melt(var_name="FillType", value_name="Count", ignore_index=False)
      .reset_index()
    for unfilled_counts, filled_counts in (
        (train_miami_nohist_binned, train_miami_nohist_filled_binned),
        (val_miami_nohist_binned, val_miami_nohist_filled_binned),
        (test_miami_nohist_binned, test_miami_nohist_filled_binned),
        (train_france_nohist_binned, train_france_nohist_filled_binned),
        (val_france_nohist_binned, val_france_nohist_filled_binned),
        (test_france_nohist_binned, test_france_nohist_filled_binned),
    )
]

In [None]:
# Bar plots of the no-history train, validation and test sets (Miami),
# comparing sample counts per remaining-time bin with and without filling.
_, axs = plt.subplots(figsize=(12, 4), nrows=1, ncols=3)
sns.barplot(x="remainingVoyageTime", y="Count", hue="FillType",
            data=train_miami_nohist_toplot, ax=axs[0])
sns.barplot(x="remainingVoyageTime", y="Count", hue="FillType",
            data=val_miami_nohist_toplot, ax=axs[1])
sns.barplot(x="remainingVoyageTime", y="Count", hue="FillType",
            data=test_miami_nohist_toplot, ax=axs[2])

# Only the left-most panel carries axis labels; the other two are blanked.
axs[0].set(title="Train Set",
           ylabel="Number of Samples",
           xlabel="Remaining Travel Time (h)")
axs[1].set(title="Validation Set", ylabel="", xlabel="")
axs[2].set(title="Test Set", ylabel="", xlabel="")

plt.savefig("./figures/fill_miami_nohist_bar")
plt.show()

In [None]:
# Bar plots of the no-history train, validation and test sets (France),
# comparing sample counts per remaining-time bin with and without filling.
_, axs = plt.subplots(figsize=(12, 4), nrows=1, ncols=3)
sns.barplot(x="remainingVoyageTime", y="Count", hue="FillType",
            data=train_france_nohist_toplot, ax=axs[0])
sns.barplot(x="remainingVoyageTime", y="Count", hue="FillType",
            data=val_france_nohist_toplot, ax=axs[1])
sns.barplot(x="remainingVoyageTime", y="Count", hue="FillType",
            data=test_france_nohist_toplot, ax=axs[2])

# Only the left-most panel carries axis labels; the other two are blanked.
axs[0].set(title="Train Set",
           ylabel="Number of Samples",
           xlabel="Remaining Travel Time (h)")
axs[1].set(title="Validation Set", ylabel="", xlabel="")
axs[2].set(title="Test Set", ylabel="", xlabel="")

plt.savefig("./figures/fill_france_nohist_bar")
plt.show()

In [None]:
# Validation-set plots for the three history lengths in both datasets:
# load the saved validation CSVs, without and with gap filling.

# Miami, unfilled.
val_miami_10min, val_miami_20min, val_miami_30min = map(pd.read_csv, (
    "./data/final_miami_datasets/10min_history/val.csv",
    "./data/final_miami_datasets/20min_history/val.csv",
    "./data/final_miami_datasets/30min_history/val.csv",
))

# Miami, filled.
val_miami_10min_filled, val_miami_20min_filled, val_miami_30min_filled = map(pd.read_csv, (
    "./data/final_miami_datasets/10min_history_filled/val.csv",
    "./data/final_miami_datasets/20min_history_filled/val.csv",
    "./data/final_miami_datasets/30min_history_filled/val.csv",
))

# France, unfilled.
val_france_10min, val_france_20min, val_france_30min = map(pd.read_csv, (
    "./data/final_france_datasets/10min_history/val.csv",
    "./data/final_france_datasets/20min_history/val.csv",
    "./data/final_france_datasets/30min_history/val.csv",
))

# France, filled.
val_france_10min_filled, val_france_20min_filled, val_france_30min_filled = map(pd.read_csv, (
    "./data/final_france_datasets/10min_history_filled/val.csv",
    "./data/final_france_datasets/20min_history_filled/val.csv",
    "./data/final_france_datasets/30min_history_filled/val.csv",
))

In [None]:
# Build bins over the remaining voyage time: edges every 8 units from 0 to 48,
# scaled by 60 (labels read as hours, so the column is presumably in minutes
# -- TODO confirm).
# NOTE(review): pd.cut intervals are right-closed by default, so a value of
# exactly 0 falls outside the first bin and is not counted; values above the
# last edge are likewise left out.
bins = 60 * np.array([0.0, 8.0, 16.0, 24.0, 32.0, 40.0, 48.0])
labels = ["0-8", "8-16", "16-24", "24-32", "32-40", "40-48"]

# Per-bin sample counts -- Miami validation sets, unfilled.
val_miami_10min_binned, val_miami_20min_binned, val_miami_30min_binned = [
    pd.cut(x=val_set["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for val_set in (val_miami_10min, val_miami_20min, val_miami_30min)
]

# Per-bin sample counts -- Miami validation sets, filled.
(val_miami_10min_filled_binned,
 val_miami_20min_filled_binned,
 val_miami_30min_filled_binned) = [
    pd.cut(x=val_set["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for val_set in (val_miami_10min_filled, val_miami_20min_filled, val_miami_30min_filled)
]

# Per-bin sample counts -- France validation sets, unfilled.
val_france_10min_binned, val_france_20min_binned, val_france_30min_binned = [
    pd.cut(x=val_set["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for val_set in (val_france_10min, val_france_20min, val_france_30min)
]

# Per-bin sample counts -- France validation sets, filled.
(val_france_10min_filled_binned,
 val_france_20min_filled_binned,
 val_france_30min_filled_binned) = [
    pd.cut(x=val_set["remainingVoyageTime"], bins=bins, labels=labels).value_counts()
    for val_set in (val_france_10min_filled, val_france_20min_filled, val_france_30min_filled)
]

In [None]:
# Pair each unfilled count series with its filled counterpart, put them side by
# side under the "No Fill"/"Filled" keys, then melt to long format (one row per
# bin and FillType, with its Count) so seaborn can use FillType as the hue.
(val_miami_10min_toplot,
 val_miami_20min_toplot,
 val_miami_30min_toplot,
 val_france_10min_toplot,
 val_france_20min_toplot,
 val_france_30min_toplot) = [
    pd.concat((unfilled_counts, filled_counts), axis=1, keys=["No Fill", "Filled"])
      .melt(var_name="FillType", value_name="Count", ignore_index=False)
      .reset_index()
    for unfilled_counts, filled_counts in (
        (val_miami_10min_binned, val_miami_10min_filled_binned),
        (val_miami_20min_binned, val_miami_20min_filled_binned),
        (val_miami_30min_binned, val_miami_30min_filled_binned),
        (val_france_10min_binned, val_france_10min_filled_binned),
        (val_france_20min_binned, val_france_20min_filled_binned),
        (val_france_30min_binned, val_france_30min_filled_binned),
    )
]

In [None]:
# Bar plots of the Miami validation set for the 10-, 20- and 30-minute
# history variants, comparing sample counts per remaining-time bin with and
# without filling. (The previous header comment was copied from the
# no-history train/val/test cell and did not describe this figure.)
_, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
sns.barplot(data=val_miami_10min_toplot, x="remainingVoyageTime", y="Count", hue="FillType", ax=axs[0])
sns.barplot(data=val_miami_20min_toplot, x="remainingVoyageTime", y="Count", hue="FillType", ax=axs[1])
sns.barplot(data=val_miami_30min_toplot, x="remainingVoyageTime", y="Count", hue="FillType", ax=axs[2])

# Panel titles name the history length (typo "Minnutes" fixed).
axs[0].set_title("10 Minutes")
axs[1].set_title("20 Minutes")
axs[2].set_title("30 Minutes")

# Only the left-most panel carries axis labels; the other two are blanked.
axs[0].set_ylabel("Number of Samples")
axs[0].set_xlabel("Remaining Travel Time (h)")

axs[1].set_ylabel("")
axs[2].set_ylabel("")
axs[1].set_xlabel("")
axs[2].set_xlabel("")


plt.savefig("./figures/fill_miami_val_hist_bar")
plt.show()

In [None]:
# Bar plots of the France validation set for the 10-, 20- and 30-minute
# history variants, comparing sample counts per remaining-time bin with and
# without filling. (The previous header comment was copied from the
# no-history train/val/test cell and did not describe this figure.)
_, axs = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
sns.barplot(data=val_france_10min_toplot, x="remainingVoyageTime", y="Count", hue="FillType", ax=axs[0])
sns.barplot(data=val_france_20min_toplot, x="remainingVoyageTime", y="Count", hue="FillType", ax=axs[1])
sns.barplot(data=val_france_30min_toplot, x="remainingVoyageTime", y="Count", hue="FillType", ax=axs[2])

# Panel titles name the history length (typo "Minnutes" fixed).
axs[0].set_title("10 Minutes")
axs[1].set_title("20 Minutes")
axs[2].set_title("30 Minutes")

# Only the left-most panel carries axis labels; the other two are blanked.
axs[0].set_ylabel("Number of Samples")
axs[0].set_xlabel("Remaining Travel Time (h)")

axs[1].set_ylabel("")
axs[2].set_ylabel("")
axs[1].set_xlabel("")
axs[2].set_xlabel("")


plt.savefig("./figures/fill_france_val_hist_bar")
plt.show()