In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IMPUTERS
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_parquet("../data/processed/ais_filtered.parquet")
df.head()

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselType,Status,Length,Width,Draft
5,367606520,2024-12-21T00:00:00,29.11332,-90.1961,0.0,304.0,511.0,70.0,0.0,49.0,12.0,3.0
6,636093045,2024-12-21T00:00:00,29.34722,-89.48623,11.5,295.8,295.0,80.0,0.0,184.0,32.0,12.0
12,303923000,2024-12-21T00:00:00,30.02172,-94.00101,0.0,137.8,14.0,70.0,5.0,193.0,28.0,6.5
26,367605680,2024-12-21T00:00:00,29.08294,-91.89141,0.1,279.3,140.0,70.0,0.0,39.0,8.0,2.6
46,367342960,2024-12-21T00:00:01,30.1758,-93.31875,0.0,184.6,191.0,80.0,0.0,183.0,32.0,10.0


# Duplicates

In [3]:
print(f"There are {df.duplicated().sum()} doublons. Manually confirmed for some of them")
df = df.drop_duplicates()

df = df.drop(columns="VesselType") #shouldn't be useful now.


There are 186 doublons. Manually confirmed for some of them


# Missing values

In [4]:
# % missing values per feature
df.isna().sum()/len(df) * 100
# not a lot, worst is 1.21% in Width. Length Width and Draft arent the core features either. Status is a bit more problematic


MMSI            0.000000
BaseDateTime    0.000000
LAT             0.000000
LON             0.000000
SOG             0.000000
COG             0.000000
Heading         0.000000
Status          0.258068
Length          0.107907
Width           1.210661
Draft           0.258068
dtype: float64

In [5]:
# check if missing values are concentrated in some boats.

#calculating number of na per feature and per vessel
na_per_vessel_feature = df[["Status", "Length", "Width", "Draft", "MMSI"]].groupby("MMSI").agg(lambda x: x.isna().sum())
# number of lines (ping) per vessel
ping_per_vessel = df.groupby("MMSI").count()["BaseDateTime"]

# % of pings with na, per feature et per vessel
na_pc = na_per_vessel_feature.divide(ping_per_vessel, axis= 0)*100
#filtering on vessels with na
na_pc.loc[na_pc.any(axis= 1) > 0,:]


Unnamed: 0_level_0,Status,Length,Width,Draft
MMSI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
205700000,0.0,0.0,100.0,0.0
249579000,0.0,0.0,100.0,0.0
319093800,0.0,0.0,100.0,0.0
368001940,100.0,0.0,0.0,100.0
368203510,100.0,0.0,100.0,100.0
368287410,100.0,0.0,0.0,100.0
368305620,100.0,0.0,0.0,100.0
477669500,0.0,0.0,100.0,0.0
538005805,0.0,0.0,100.0,0.0
538005807,0.0,0.0,100.0,0.0


- There are 15 boats with na
- When values are missing they are missing for the entire sequence. It's not a issue of pings, but an issue of boat
- The status feature is crucial for route prediction, so we'll drop boats with that missing info (not a lot of boat)
- dimension features should be less crucials and fixed for one boat. will impoute using median. 

In [6]:
na_values = {"Length": df["Length"].median(),
             "Width": df["Width"].median(),
             "Draft": df["Draft"].median()}

df = df.fillna(value= na_values)


In [7]:
df = df.dropna(subset=["Status"])

# Dataset Preparation: Feature Engineering

**Objective:** Transform raw AIS data into ML-ready format for 30-minute position prediction

**Steps:**
1. Convert BaseDateTime to datetime and sort by (MMSI, BaseDateTime)
2. Resample to 5-minute intervals per vessel (homogeneous time steps)
3. Create lagged features (5, 15, 30 min):
   - LAT, LON (trajectory history)
   - SOG, COG, Heading (motion history)
4. Create target variables: LAT_target_30min, LON_target_30min (shift forward by 6 rows)
5. Drop rows with NaN (insufficient history or no future data)
6. Select final feature set

**Features for baseline model:**
- Current: LAT, LON, SOG, COG, Heading, Status
- Lagged positions: LAT_lag_5, LAT_lag_15, LAT_lag_30, LON_lag_5, LON_lag_15, LON_lag_30
- Lagged motion: SOG_lag_5, SOG_lag_15, COG_lag_5, COG_lag_15

**Total: ~14-16 features**




In [8]:
#convert to date time
df["BaseDateTime"] = pd.to_datetime(df["BaseDateTime"])
# sort by time and vessesl
df = df.sort_values(by=["MMSI", "BaseDateTime"], ascending= True)

# resampling every 5 min per vessels
df = df.set_index("BaseDateTime")

#resample in 5min bin per boats
df_resampled = df.groupby("MMSI").resample("5min").last()


# #interpolate if this creates NA

df_resampled = df_resampled.interpolate("linear")

#get MMSI and BaseDateTime back to features
df_resampled = df_resampled.drop(columns= "MMSI")
df = df_resampled.reset_index()

  df_resampled = df.groupby("MMSI").resample("5min").last()


In [9]:
def create_time_series_features(df, prediction_horizon_min=30):
    """
    Create lagged features and target proportional to prediction horizon.

    Parameters:
    -----------
    df : DataFrame
        Must be resampled at 5-min intervals, sorted by MMSI and BaseDateTime
    prediction_horizon_min : int
        Prediction horizon in minutes (30, 60, 720, etc.)

    Returns:
    --------
    DataFrame with lagged features and targets
    """
    df_features = df.copy()

    # Calculate lag intervals (in number of 5-min steps)
    # Proportional: 1/6, 1/2, and full horizon
    horizon_steps = prediction_horizon_min // 5  # Convert minutes to 5-min steps

    lag_1 = max(1, horizon_steps // 6)    # ~1/6 of horizon
    lag_2 = max(2, horizon_steps // 2)    # ~1/2 of horizon
    lag_3 = horizon_steps                 # Full horizon (for reference)

    print(f"Prediction horizon: {prediction_horizon_min} min ({horizon_steps} steps)")
    print(f"Lag intervals: {lag_1*5}min, {lag_2*5}min, {lag_3*5}min")

    # Create lagged features
    for var in ['LAT', 'LON', 'SOG', 'COG']:
        df_features[f'{var}_lag_{lag_1*5}min'] = df_features.groupby('MMSI')[var].shift(lag_1)
        df_features[f'{var}_lag_{lag_2*5}min'] = df_features.groupby('MMSI')[var].shift(lag_2)
        df_features[f'{var}_lag_{lag_3*5}min'] = df_features.groupby('MMSI')[var].shift(lag_3)

    # Create targets
    df_features['target_LAT'] = df_features.groupby('MMSI')['LAT'].shift(-horizon_steps)
    df_features['target_LON'] = df_features.groupby('MMSI')['LON'].shift(-horizon_steps)

    # Drop NaN
    df_features = df_features.dropna().reset_index(drop=True)

    return df_features


# ============================================================================
# USAGE EXAMPLES
# ============================================================================

# 30 minutes ahead (current)
df_30min = create_time_series_features(df, prediction_horizon_min=30)

# 1 hour ahead
df_60min = create_time_series_features(df, prediction_horizon_min=60)

# 12 hours ahead
df_12h = create_time_series_features(df, prediction_horizon_min=720)

Prediction horizon: 30 min (6 steps)
Lag intervals: 5min, 15min, 30min
Prediction horizon: 60 min (12 steps)
Lag intervals: 10min, 30min, 60min
Prediction horizon: 720 min (144 steps)
Lag intervals: 120min, 360min, 720min


In [10]:
#create lag LAT and LON features with 5, 15, 30min lag
df["LAT_lag_5"] = df.groupby("MMSI")["LAT"].shift(1)
df["LON_lag_5"] = df.groupby("MMSI")["LON"].shift(1)
df["LAT_lag_15"] = df.groupby("MMSI")["LAT"].shift(3)
df["LON_lag_15"] = df.groupby("MMSI")["LON"].shift(3)
df["LAT_lag_30"] = df.groupby("MMSI")["LAT"].shift(6)
df["LON_lag_30"] = df.groupby("MMSI")["LON"].shift(6)

#create lag SOG, COG with 5, 15, 30min lag
df["SOG_lag_5"] = df.groupby("MMSI")["SOG"].shift(1)
df["COG_lag_5"] = df.groupby("MMSI")["COG"].shift(1)
df["SOG_lag_15"] = df.groupby("MMSI")["SOG"].shift(3)
df["COG_lag_15"] = df.groupby("MMSI")["COG"].shift(3)
df["SOG_lag_30"] = df.groupby("MMSI")["SOG"].shift(6)
df["COG_lag_30"] = df.groupby("MMSI")["COG"].shift(6)


In [11]:
#create target: LAT and LON shifted 6 ahead
df["target_LAT"] = df.groupby("MMSI")["LAT"].shift(-6)
df["target_LON"] = df.groupby("MMSI")["LON"].shift(-6)


In [12]:
df = df.dropna().reset_index(drop= True)
df.head(10)

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,Status,Length,Width,...,LAT_lag_30,LON_lag_30,SOG_lag_5,COG_lag_5,SOG_lag_15,COG_lag_15,SOG_lag_30,COG_lag_30,target_LAT,target_LON
0,205685000,2024-12-23 14:25:00,28.70223,-94.3907,4.9,22.2,37.0,0.0,180.0,30.0,...,28.58447,-94.40653,10.5,7.1,13.5,7.1,15.3,6.4,28.74377,-94.37807
1,205685000,2024-12-23 14:30:00,28.70778,-94.388,3.1,26.3,47.0,0.0,180.0,30.0,...,28.611775,-94.402755,4.9,22.2,12.0,7.1,15.15,6.75,28.76378,-94.3832
2,205685000,2024-12-23 14:35:00,28.70888,-94.3874,2.8,24.7,48.0,0.0,180.0,30.0,...,28.63908,-94.39898,3.1,26.3,10.5,7.1,15.0,7.1,28.78305,-94.38802
3,205685000,2024-12-23 14:40:00,28.71272,-94.38537,2.7,36.4,63.0,0.0,180.0,30.0,...,28.65476,-94.39692,2.8,24.7,4.9,22.2,13.5,7.1,28.80278,-94.393
4,205685000,2024-12-23 14:45:00,28.7173,-94.37955,6.2,66.0,62.0,0.0,180.0,30.0,...,28.67044,-94.39486,2.7,36.4,3.1,26.3,12.0,7.1,28.82285,-94.39825
5,205685000,2024-12-23 14:50:00,28.72068,-94.37627,8.3,17.8,20.0,0.0,180.0,30.0,...,28.68612,-94.3928,6.2,66.0,2.8,24.7,10.5,7.1,28.8399,-94.40268
6,205685000,2024-12-23 14:55:00,28.74377,-94.37807,13.0,347.5,350.0,0.0,180.0,30.0,...,28.70223,-94.3907,8.3,17.8,2.7,36.4,4.9,22.2,28.8606,-94.40593
7,205685000,2024-12-23 15:00:00,28.76378,-94.3832,13.7,348.3,350.0,0.0,180.0,30.0,...,28.70778,-94.388,13.0,347.5,6.2,66.0,3.1,26.3,28.87753,-94.40775
8,205685000,2024-12-23 15:05:00,28.78305,-94.38802,14.0,346.1,350.0,0.0,180.0,30.0,...,28.70888,-94.3874,13.7,348.3,8.3,17.8,2.8,24.7,28.89563,-94.40863
9,205685000,2024-12-23 15:10:00,28.80278,-94.393,14.0,347.4,349.0,0.0,180.0,30.0,...,28.71272,-94.38537,14.0,346.1,13.0,347.5,2.7,36.4,28.91747,-94.4097


In [13]:
# # Conceptually:
# df['LAT_lag_5min'] = df.groupby('MMSI')['LAT'].shift(1)  # 1 row back (5min)
# df['LAT_lag_15min'] = df.groupby('MMSI')['LAT'].shift(3)  # 3 rows back (15min)
# # Conceptually:
# df['LAT_target_30min'] = df.groupby('MMSI')['LAT'].shift(-6)  # 6 rows forward
# # Conceptually:
# df['SOG_mean_30min'] = df.groupby('MMSI')['SOG'].rolling(window=6).mean()

# Model Training & Evaluation Pipeline

**Approach:** Predict LAT and LON directly using MultiOutputRegressor with XGBoost

**Pipeline steps:**
1. Train/test split (temporal or by vessel)
2. Setup MultiOutputRegressor(XGBRegressor()) - trains 2 models (one for LAT, one for LON)
3. Train on features ‚Üí predict [LAT_target_30min, LON_target_30min]
4. Evaluate using Haversine distance MAE (km, not degrees)

**Baseline comparison:**
- Calculate naive baseline: extrapolate position using current SOG and COG
- Compare XGBoost MAE vs baseline MAE

**Optional enhancements (if time permits):**
- GridSearchCV with Haversine MAE as custom scorer
- Add rolling features (mean/std of SOG, COG over 30min)
- Add distance traveled feature (requires GeoDataFrame)
- Feature importance analysis


## Definition of necessary functions for metrics and estimations

In [14]:
from sklearn.model_selection import GroupShuffleSplit, GroupKFold, GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor


# ============================================================================
# METRIC FUNCTIONS
# ============================================================================

def haversine_distance(LAT_true, LON_true, LAT_pred, LON_pred):
    """
    Calculate great-circle distance between two points on Earth using Haversine formula.

    Parameters:
    -----------
    LAT_true, LON_true : array-like
        True latitude and longitude coordinates in degrees
    LAT_pred, LON_pred : array-like
        Predicted latitude and longitude coordinates in degrees

    Returns:
    --------
    float
        Great-circle distance in kilometers
    """
    earth_radius = 6371  # Earth mean radius in kilometers

    # Convert degrees to radians
    LAT_true_rad = np.radians(LAT_true)
    LON_true_rad = np.radians(LON_true)
    LAT_pred_rad = np.radians(LAT_pred)
    LON_pred_rad = np.radians(LON_pred)

    # Calculate differences
    d_LAT = LAT_pred_rad - LAT_true_rad
    d_LON = LON_pred_rad - LON_true_rad

    # Haversine formula
    a = (np.sin(d_LAT / 2.0)**2 + np.cos(LAT_true_rad)*np.cos(LAT_pred_rad)*np.sin(d_LON/2.0)**2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))  # Angular distance in radians
    d = c * earth_radius  # Convert to kilometers
    return d


def haversine_mae(y_true, y_pred):
    """
    Calculate Mean Absolute Error using Haversine distance between positions.

    This metric evaluates prediction quality in km rather than degrees.

    Parameters:
    -----------
    y_true : array, shape (n_samples, 2)
        True positions where column 0 = LAT, column 1 = LON (in degrees)
    y_pred : array, shape (n_samples, 2)
        Predicted positions where column 0 = LAT, column 1 = LON (in degrees)

    Returns:
    --------
    float
        Mean Absolute Error in kilometers
    """

    #make sure we're converting in numpy
    y_true= np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Calculate haversine distance for each prediction
    mae = np.mean(abs(haversine_distance(y_true[:,0],
                                         y_true[:,1],
                                         y_pred[:,0],
                                         y_pred[:,1]) ))
    return mae


# Create sklearn-compatible scorer (negated for GridSearchCV minimization)
haversine_scorer = make_scorer(haversine_mae, greater_is_better= False)


# ============================================================================
# BASELINE MODEL (NAIVE PREDICTOR)
# ============================================================================

def position_extrapolation(df: pd.DataFrame):
    """
    Naive baseline: extrapolate position based on linear displacement over last 30 minutes.

    Assumes constant velocity: future_displacement = past_displacement
    Mathematically: position(t+30) = position(t) + [position(t) - position(t-30)]

    Parameters:
    -----------
    df : DataFrame
        Must contain columns: LAT, LON, LAT_lag_30, LON_lag_30

    Returns:
    --------
    LAT_pred, LON_pred : Series
        Predicted latitude and longitude 30 minutes ahead
    """
    # Calculate displacement over the last 30 minutes
    dLAT = df["LAT"] - df["LAT_lag_30"]
    dLON = df["LON"] - df["LON_lag_30"]

    # Extrapolate: assume same displacement for next 30 minutes
    LAT_pred = df["LAT"] + dLAT
    LON_pred = df["LON"] + dLON

    return LAT_pred, LON_pred


## Train / Test split keeping boats in same group

In [15]:
#define features df, targets (2 targets) and the group to guide split
X = df.drop(columns=[ "target_LAT", "target_LON"])
y = df[["target_LAT", "target_LON"]]
groups = df["MMSI"]
X.head(10)

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,Status,Length,Width,...,LAT_lag_15,LON_lag_15,LAT_lag_30,LON_lag_30,SOG_lag_5,COG_lag_5,SOG_lag_15,COG_lag_15,SOG_lag_30,COG_lag_30
0,205685000,2024-12-23 14:25:00,28.70223,-94.3907,4.9,22.2,37.0,0.0,180.0,30.0,...,28.65476,-94.39692,28.58447,-94.40653,10.5,7.1,13.5,7.1,15.3,6.4
1,205685000,2024-12-23 14:30:00,28.70778,-94.388,3.1,26.3,47.0,0.0,180.0,30.0,...,28.67044,-94.39486,28.611775,-94.402755,4.9,22.2,12.0,7.1,15.15,6.75
2,205685000,2024-12-23 14:35:00,28.70888,-94.3874,2.8,24.7,48.0,0.0,180.0,30.0,...,28.68612,-94.3928,28.63908,-94.39898,3.1,26.3,10.5,7.1,15.0,7.1
3,205685000,2024-12-23 14:40:00,28.71272,-94.38537,2.7,36.4,63.0,0.0,180.0,30.0,...,28.70223,-94.3907,28.65476,-94.39692,2.8,24.7,4.9,22.2,13.5,7.1
4,205685000,2024-12-23 14:45:00,28.7173,-94.37955,6.2,66.0,62.0,0.0,180.0,30.0,...,28.70778,-94.388,28.67044,-94.39486,2.7,36.4,3.1,26.3,12.0,7.1
5,205685000,2024-12-23 14:50:00,28.72068,-94.37627,8.3,17.8,20.0,0.0,180.0,30.0,...,28.70888,-94.3874,28.68612,-94.3928,6.2,66.0,2.8,24.7,10.5,7.1
6,205685000,2024-12-23 14:55:00,28.74377,-94.37807,13.0,347.5,350.0,0.0,180.0,30.0,...,28.71272,-94.38537,28.70223,-94.3907,8.3,17.8,2.7,36.4,4.9,22.2
7,205685000,2024-12-23 15:00:00,28.76378,-94.3832,13.7,348.3,350.0,0.0,180.0,30.0,...,28.7173,-94.37955,28.70778,-94.388,13.0,347.5,6.2,66.0,3.1,26.3
8,205685000,2024-12-23 15:05:00,28.78305,-94.38802,14.0,346.1,350.0,0.0,180.0,30.0,...,28.72068,-94.37627,28.70888,-94.3874,13.7,348.3,8.3,17.8,2.8,24.7
9,205685000,2024-12-23 15:10:00,28.80278,-94.393,14.0,347.4,349.0,0.0,180.0,30.0,...,28.74377,-94.37807,28.71272,-94.38537,14.0,346.1,13.0,347.5,2.7,36.4


In [16]:



#define splitter making sure that each boat is part of only train or test
gss = GroupShuffleSplit(n_splits=1, test_size= 0.2)

# obtain the indexse from the generator returned by .split()
for train_idx, test_idx in gss.split(X,y,groups):
    X_train, X_test = X.iloc[train_idx,:], X.iloc[test_idx,:]
    y_train, y_test = y.iloc[train_idx,:], y.iloc[test_idx,:]


In [17]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(881163, 23)
(881163, 2)
(217682, 23)
(217682, 2)


## Baseline Score

In [18]:

# Generate baseline predictions using linear extrapolation
LAT_pred, LON_pred = position_extrapolation(X_test)

# Prepare arrays for metric calculation
y_true = y_test.values  # shape (n, 2): [LAT, LON]
y_pred_baseline = np.column_stack([LAT_pred, LON_pred])  # shape (n, 2)

# Calculate Haversine MAE
baseline_mae = haversine_mae(y_true, y_pred_baseline)

# Display results

print("BASELINE MODEL PERFORMANCE")
print(f"Model: Linear extrapolation")
print(f"Prediction horizon: 30 minutes")
print(f"Test set size: {len(y_test):,} samples \n {X_test["MMSI"].nunique()} boats")
print(f"\nMean Absolute Error: {baseline_mae:.3f} km")
print(f"                    = {baseline_mae*1000:.1f} meters")


BASELINE MODEL PERFORMANCE
Model: Linear extrapolation
Prediction horizon: 30 minutes
Test set size: 217,682 samples 
 195 boats

Mean Absolute Error: 0.929 km
                    = 928.5 meters


## XGBOOST

In [19]:
groups_crossval = X_train["MMSI"]
X_train = X_train.drop(columns=["MMSI", "BaseDateTime"])

In [20]:
#For this approach we need to drop the boat ID as feature


model = MultiOutputRegressor(XGBRegressor(n_estimators=200,
        max_depth=6,
        learning_rate=0.1,     # Learning rate standard
        subsample=0.8,         # Bagging
        colsample_bytree=0.8,  # Feature sampling
        random_state=42))
gkf = GroupKFold(n_splits= 5)

scores = cross_val_score(model,
                         X_train,
                         y_train,
                         cv= gkf,
                         groups= groups_crossval,
                         scoring=haversine_scorer,
                         n_jobs= -1)

print(f"crossval scores: {-scores}")
print(f"average haversine MAE: {-scores.mean():.2f}")


crossval scores: [2.49911599 2.3946223  2.59274721 2.17475136 3.04657688]
average haversine MAE: 2.54


In [21]:
X_train

Unnamed: 0,LAT,LON,SOG,COG,Heading,Status,Length,Width,Draft,LAT_lag_5,...,LAT_lag_15,LON_lag_15,LAT_lag_30,LON_lag_30,SOG_lag_5,COG_lag_5,SOG_lag_15,COG_lag_15,SOG_lag_30,COG_lag_30
0,28.70223,-94.39070,4.9,22.2,37.0,0.0,180.0,30.0,9.4,28.68612,...,28.65476,-94.39692,28.584470,-94.406530,10.5,7.1,13.5,7.1,15.30,6.40
1,28.70778,-94.38800,3.1,26.3,47.0,0.0,180.0,30.0,7.7,28.70223,...,28.67044,-94.39486,28.611775,-94.402755,4.9,22.2,12.0,7.1,15.15,6.75
2,28.70888,-94.38740,2.8,24.7,48.0,0.0,180.0,30.0,7.7,28.70778,...,28.68612,-94.39280,28.639080,-94.398980,3.1,26.3,10.5,7.1,15.00,7.10
3,28.71272,-94.38537,2.7,36.4,63.0,0.0,180.0,30.0,7.7,28.70888,...,28.70223,-94.39070,28.654760,-94.396920,2.8,24.7,4.9,22.2,13.50,7.10
4,28.71730,-94.37955,6.2,66.0,62.0,0.0,180.0,30.0,7.7,28.71272,...,28.70778,-94.38800,28.670440,-94.394860,2.7,36.4,3.1,26.3,12.00,7.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098840,27.58549,-96.25555,12.8,266.0,260.0,0.0,183.0,32.0,9.0,27.58692,...,27.58913,-96.19766,27.594210,-96.127650,12.8,266.9,12.7,266.5,12.80,264.50
1098841,27.58388,-96.27577,12.9,264.2,259.0,0.0,183.0,32.0,9.0,27.58549,...,27.58801,-96.21555,27.591970,-96.156390,12.8,266.0,12.6,266.7,12.80,265.30
1098842,27.58239,-96.29264,12.9,263.2,260.0,0.0,183.0,32.0,9.0,27.58388,...,27.58692,-96.23419,27.590580,-96.176330,12.9,264.2,12.8,266.9,12.70,265.70
1098843,27.58032,-96.31562,13.0,265.1,261.0,0.0,183.0,32.0,9.0,27.58239,...,27.58549,-96.25555,27.589130,-96.197660,12.9,263.2,12.8,266.0,12.70,266.50


## LINREG

In [22]:
from sklearn.linear_model import LinearRegression

# LinearRegression avec cross-validation
model_lr = MultiOutputRegressor(LinearRegression())

# S'assurer que les indices correspondent
scores_lr = cross_val_score(
    model_lr,
    X_train,
    y_train,
    cv=gkf,
    groups=groups_crossval,
    scoring=haversine_scorer,
    n_jobs=-1
)

print(f"LinearRegression cross-validation scores: {-scores_lr}")
print(f"LinearRegression MAE: {-scores_lr.mean():.2f} km ¬± {scores_lr.std():.2f}")
print(f"Baseline MAE: 0.75 km")
print(f"Ratio: {-scores_lr.mean() / 0.75:.2f}x")

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


LinearRegression cross-validation scores: [2.06386398 2.09011286 1.79658993 1.88600113 1.9056542 ]
LinearRegression MAE: 1.95 km ¬± 0.11
Baseline MAE: 0.75 km
Ratio: 2.60x


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [23]:
# ============================================================================
# TEST WITH 1-HOUR PREDICTION HORIZON
# ============================================================================

print("\n" + "="*70)
print("TESTING 1-HOUR PREDICTION HORIZON")
print("="*70)

# 1. Recr√©er les features avec horizon 60 min
df_60min = create_time_series_features(df, prediction_horizon_min=60)

# 2. Pr√©parer X, y, groups
X_60 = df_60min.drop(columns=["MMSI", "BaseDateTime", "target_LAT", "target_LON"])
y_60 = df_60min[["target_LAT", "target_LON"]]
groups_60 = df_60min["MMSI"]

# 3. Train/Test split
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in gss.split(X_60, y_60, groups_60):
    X_train_60, X_test_60 = X_60.iloc[train_idx], X_60.iloc[test_idx]
    y_train_60, y_test_60 = y_60.iloc[train_idx], y_60.iloc[test_idx]

# 4. Baseline
print("\n1. BASELINE (60min horizon)")
# Note: besoin d'adapter position_extrapolation pour 60min
# Version simple: utiliser le lag √† 60min
LAT_pred_base = X_test_60['LAT'] + (X_test_60['LAT'] - X_test_60['LAT_lag_60min'])
LON_pred_base = X_test_60['LON'] + (X_test_60['LON'] - X_test_60['LON_lag_60min'])
y_pred_base = np.column_stack([LAT_pred_base, LON_pred_base])
baseline_60_mae = haversine_mae(y_test_60.values, y_pred_base)
print(f"   Baseline MAE: {baseline_60_mae:.3f} km")

# 5. LinearRegression
print("\n2. LINEAR REGRESSION (60min horizon)")
model_lr_60 = MultiOutputRegressor(LinearRegression())
model_lr_60.fit(X_train_60, y_train_60)
y_pred_lr_60 = model_lr_60.predict(X_test_60)
lr_60_mae = haversine_mae(y_test_60.values, y_pred_lr_60)
print(f"   LinReg MAE: {lr_60_mae:.3f} km")
print(f"   Improvement: {((baseline_60_mae - lr_60_mae) / baseline_60_mae * 100):.1f}%")

# 6. XGBoost
print("\n3. XGBOOST (60min horizon)")
model_xgb_60 = MultiOutputRegressor(XGBRegressor(n_estimators=200, max_depth=6, learning_rate=0.1))
model_xgb_60.fit(X_train_60, y_train_60)
y_pred_xgb_60 = model_xgb_60.predict(X_test_60)
xgb_60_mae = haversine_mae(y_test_60.values, y_pred_xgb_60)
print(f"   XGBoost MAE: {xgb_60_mae:.3f} km")
print(f"   Improvement: {((baseline_60_mae - xgb_60_mae) / baseline_60_mae * 100):.1f}%")

print("\n" + "="*70)
print(f"30min horizon - Baseline: 0.754 km")
print(f"60min horizon - Baseline: {baseline_60_mae:.3f} km (should be ~1.5 km)")
print("="*70)


TESTING 1-HOUR PREDICTION HORIZON
Prediction horizon: 60 min (12 steps)
Lag intervals: 10min, 30min, 60min

1. BASELINE (60min horizon)
   Baseline MAE: 1.899 km

2. LINEAR REGRESSION (60min horizon)


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


   LinReg MAE: 2.941 km
   Improvement: -54.9%

3. XGBOOST (60min horizon)
   XGBoost MAE: 3.607 km
   Improvement: -89.9%

30min horizon - Baseline: 0.754 km
60min horizon - Baseline: 1.899 km (should be ~1.5 km)


In [None]:
# ============================================================================
# MULTI-HORIZON PREDICTION COMPARISON
# ============================================================================

def test_prediction_horizon(df_base, horizon_min, test_size=0.2):
    """
    Test all models for a given prediction horizon.

    Returns: dict with results
    """
    print(f"\n{'='*70}")
    print(f"PREDICTION HORIZON: {horizon_min} minutes ({horizon_min/60:.1f} hours)")
    print(f"{'='*70}")

    # 1. Create features
    df_h = create_time_series_features(df_base, prediction_horizon_min=horizon_min)

    # 2. Prepare data
    X_h = df_h.drop(columns=["MMSI", "BaseDateTime", "target_LAT", "target_LON"])
    y_h = df_h[["target_LAT", "target_LON"]]
    groups_h = df_h["MMSI"]

    print(f"Dataset size: {len(df_h):,} samples, {groups_h.nunique()} vessels")

    # 3. Split
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    for train_idx, test_idx in gss.split(X_h, y_h, groups_h):
        X_train_h, X_test_h = X_h.iloc[train_idx], X_h.iloc[test_idx]
        y_train_h, y_test_h = y_h.iloc[train_idx], y_h.iloc[test_idx]

    # 4. BASELINE
    lag_col = f'LAT_lag_{horizon_min}min'
    LAT_pred_base = X_test_h['LAT'] + (X_test_h['LAT'] - X_test_h[lag_col])
    LON_pred_base = X_test_h['LON'] + (X_test_h['LON'] - X_test_h[f'LON_lag_{horizon_min}min'])
    y_pred_base = np.column_stack([LAT_pred_base, LON_pred_base])
    baseline_mae = haversine_mae(y_test_h.values, y_pred_base)

    # 5. LINEAR REGRESSION
    model_lr = MultiOutputRegressor(LinearRegression())
    model_lr.fit(X_train_h, y_train_h)
    y_pred_lr = model_lr.predict(X_test_h)
    lr_mae = haversine_mae(y_test_h.values, y_pred_lr)

    # 6. XGBOOST
    model_xgb = MultiOutputRegressor(
        XGBRegressor(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1
        )
    )
    model_xgb.fit(X_train_h, y_train_h)
    y_pred_xgb = model_xgb.predict(X_test_h)
    xgb_mae = haversine_mae(y_test_h.values, y_pred_xgb)

    # 7. RESULTS
    print(f"\n{'Model':<20} {'MAE (km)':<12} {'vs Baseline':<15} {'Status'}")
    print(f"{'-'*70}")
    print(f"{'Baseline':<20} {baseline_mae:>8.3f} km  {'-':>12}   {'reference'}")

    lr_improvement = ((baseline_mae - lr_mae) / baseline_mae * 100)
    lr_status = "‚úì Better" if lr_mae < baseline_mae else "‚úó Worse"
    print(f"{'Linear Regression':<20} {lr_mae:>8.3f} km  {lr_improvement:>+11.1f}%   {lr_status}")

    xgb_improvement = ((baseline_mae - xgb_mae) / baseline_mae * 100)
    xgb_status = "‚úì Better" if xgb_mae < baseline_mae else "‚úó Worse"
    print(f"{'XGBoost':<20} {xgb_mae:>8.3f} km  {xgb_improvement:>+11.1f}%   {xgb_status}")

    # Winner
    best_mae = min(baseline_mae, lr_mae, xgb_mae)
    if best_mae == baseline_mae:
        winner = "Baseline"
    elif best_mae == lr_mae:
        winner = "Linear Regression"
    else:
        winner = "XGBoost"

    print(f"\nüèÜ Winner: {winner} ({best_mae:.3f} km)")

    return {
        'horizon_min': horizon_min,
        'horizon_hours': horizon_min / 60,
        'baseline_mae': baseline_mae,
        'lr_mae': lr_mae,
        'xgb_mae': xgb_mae,
        'winner': winner,
        'best_mae': best_mae
    }


# ============================================================================
# TEST MULTIPLE HORIZONS
# ============================================================================

print("\n" + "="*70)
print("MULTI-HORIZON PREDICTION ANALYSIS")
print("="*70)

horizons = [30, 60, 360, 720, 1440]  # 30min, 1h, 6h, 12h, 24h
results = []

for horizon in horizons:
    try:
        result = test_prediction_horizon(df, horizon_min=horizon, test_size=0.2)
        results.append(result)
    except Exception as e:
        print(f"\n‚ùå Error for {horizon}min horizon: {e}")
        continue

# ============================================================================
# SUMMARY TABLE
# ============================================================================

print("\n" + "="*70)
print("SUMMARY: ALL PREDICTION HORIZONS")
print("="*70)

import pandas as pd
results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

# Find best horizon for ML models
print("\n" + "="*70)
print("KEY INSIGHTS:")
print("="*70)

for i, row in results_df.iterrows():
    horizon_h = row['horizon_hours']
    if row['lr_mae'] < row['baseline_mae'] or row['xgb_mae'] < row['baseline_mae']:
        improvement = min(row['lr_mae'], row['xgb_mae'])
        pct = ((row['baseline_mae'] - improvement) / row['baseline_mae'] * 100)
        print(f"‚úì At {horizon_h:.1f}h: ML models beat baseline by {pct:.1f}%")
    else:
        print(f"‚úó At {horizon_h:.1f}h: Baseline still best ({row['baseline_mae']:.2f} km)")


MULTI-HORIZON PREDICTION ANALYSIS

PREDICTION HORIZON: 30 minutes (0.5 hours)
Prediction horizon: 30 min (6 steps)
Lag intervals: 5min, 15min, 30min
Dataset size: 1,087,194 samples, 968 vessels


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



Model                MAE (km)     vs Baseline     Status
----------------------------------------------------------------------
Baseline                0.877 km             -   reference
Linear Regression       2.047 km       -133.5%   ‚úó Worse
XGBoost                 2.403 km       -174.0%   ‚úó Worse

üèÜ Winner: Baseline (0.877 km)

PREDICTION HORIZON: 60 minutes (1.0 hours)
Prediction horizon: 60 min (12 steps)
Lag intervals: 10min, 30min, 60min
Dataset size: 1,075,592 samples, 966 vessels


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



Model                MAE (km)     vs Baseline     Status
----------------------------------------------------------------------
Baseline                1.899 km             -   reference
Linear Regression       2.941 km        -54.9%   ‚úó Worse
XGBoost                 3.360 km        -76.9%   ‚úó Worse

üèÜ Winner: Baseline (1.899 km)

PREDICTION HORIZON: 360 minutes (6.0 hours)
Prediction horizon: 360 min (72 steps)
Lag intervals: 60min, 180min, 360min
Dataset size: 961,988 samples, 925 vessels


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



Model                MAE (km)     vs Baseline     Status
----------------------------------------------------------------------
Baseline               16.959 km             -   reference
Linear Regression      16.481 km         +2.8%   ‚úì Better
XGBoost                14.816 km        +12.6%   ‚úì Better

üèÜ Winner: XGBoost (14.816 km)

PREDICTION HORIZON: 720 minutes (12.0 hours)
Prediction horizon: 720 min (144 steps)
Lag intervals: 120min, 360min, 720min
Dataset size: 833,345 samples, 858 vessels


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



Model                MAE (km)     vs Baseline     Status
----------------------------------------------------------------------
Baseline               40.911 km             -   reference
Linear Regression      30.604 km        +25.2%   ‚úì Better
XGBoost                27.215 km        +33.5%   ‚úì Better

üèÜ Winner: XGBoost (27.215 km)

PREDICTION HORIZON: 1440 minutes (24.0 hours)
Prediction horizon: 1440 min (288 steps)
Lag intervals: 240min, 720min, 1440min
Dataset size: 601,424 samples, 743 vessels


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



Model                MAE (km)     vs Baseline     Status
----------------------------------------------------------------------
Baseline               78.370 km             -   reference
Linear Regression      53.488 km        +31.7%   ‚úì Better
XGBoost                56.610 km        +27.8%   ‚úì Better

üèÜ Winner: Linear Regression (53.488 km)

SUMMARY: ALL PREDICTION HORIZONS
 horizon_min  horizon_hours  baseline_mae    lr_mae   xgb_mae            winner  best_mae
          30            0.5      0.876717  2.046817  2.402623          Baseline  0.876717
          60            1.0      1.899435  2.941337  3.359846          Baseline  1.899435
         360            6.0     16.959407 16.481246 14.816118           XGBoost 14.816118
         720           12.0     40.910950 30.604194 27.214719           XGBoost 27.214719
        1440           24.0     78.369843 53.487798 56.610266 Linear Regression 53.487798

KEY INSIGHTS:
‚úó At 0.5h: Baseline still best (0.88 km)
‚úó At 1.0h: Bas