In [3]:
import sys
from pathlib import Path
# Put the parent of the current working directory at the front of sys.path.
# Presumably that directory is the project root holding the `ml_logic` package
# imported further down — this assumes the kernel is started from a direct
# subfolder of the project (TODO confirm).
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from ml_logic.data_preprocessing import clean_data, resample_pings
from ml_logic.model_evaluation import evaluate_horizon
from ml_logic.feature_engineering import create_time_series_features

import warnings
# Silence every RuntimeWarning and FutureWarning for the rest of the kernel
# session. NOTE(review): RuntimeWarning also covers numerical warnings
# (e.g. divide-by-zero / invalid value) — confirm hiding them is intended.
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Feature Engineering Analysis

## Objective

This notebook tests the impact of different feature engineering strategies on model performance:
1. **Rolling features**: Sliding-window statistics (mean, std) on speed over ground (SOG) and course over ground (COG)
2. **Advanced features**: Derived features (geometric ratios, meandering index, rate of change)

**Hypothesis**: These additional features should capture non-linear patterns that simple lag features cannot.

In [4]:


## 1. Data Loading and Preparation
# Read the filtered AIS pings, clean them (per the original note: removes
# missing values), then resample onto a uniform 5-minute grid.
df = resample_pings(
    clean_data(pd.read_parquet("../data/processed/ais_filtered.parquet")),
    interval='5min',
)


## 3. Test effect of rolling features 

The rolling features are the rolling mean and standard deviation of SOG and COG.

In [None]:
# Models under comparison (same configuration as in notebook 3).
estimators = dict(
    Ridge_scaled=Pipeline(steps=[
        ("scaler", RobustScaler()),
        ("model", Ridge(alpha=1.0)),
    ]),
    LightGBM=LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=273,
        n_jobs=-1,
        verbose=-1,
    ),
)

results_no_rolling, results_with_rolling = [], []

# One entry per configuration: (console label, value passed as `rolling`,
# list that collects that configuration's result rows). The baseline comes
# first so that, for every horizon, it is evaluated before the rolling variant.
rolling_configs = (
    ("Without rolling features:", False, results_no_rolling),
    ("With rolling features:", True, results_with_rolling),
)

# Prediction horizons, in minutes.
horizons = [60, 120, 180, 240, 360, 480, 540, 720, 1080, 1440, 2880]
for horizon in horizons:
    print(f"\nHorizon: {horizon} min")

    for label, use_rolling, rows in rolling_configs:
        print(label)
        # Same split settings (test_size, random_state) for both configurations.
        nb_pings, nb_vessels, mae_scores = evaluate_horizon(
            df,
            horizon,
            estimators,
            test_size=0.2,
            random_state=273,
            rolling=use_rolling,
        )
        # One row per (horizon, configuration); the entries of `mae_scores`
        # (presumably one MAE per estimator — confirm) become extra columns.
        rows.append({
            'horizon_min': horizon,
            'nb_pings': nb_pings,
            'nb_vessels': nb_vessels,
            **mae_scores,
        })

# Collect the rows into one table per configuration.
df_no_rolling = pd.DataFrame(results_no_rolling)
df_with_rolling = pd.DataFrame(results_with_rolling)

# Print both tables one after the other for comparison.
for heading, table in (
    ("\n=== WITHOUT ROLLING ===", df_no_rolling),
    ("\n=== WITH ROLLING ===", df_with_rolling),
):
    print(heading)
    print(table)


Conclusion: adding rolling features has no noticeable effect on prediction performance.

## Impact of advanced features

The advanced features are derived features such as the boat's aspect ratio, its meandering index, and its acceleration.

In [None]:
# Reload the data so this cell can run on its own: read the filtered AIS
# pings, clean them (per the original note: removes missing values), then
# resample onto a uniform 5-minute grid.
df = resample_pings(
    clean_data(pd.read_parquet("../data/processed/ais_filtered.parquet")),
    interval='5min',
)

# Models under comparison (same configuration as in notebook 3).
estimators = dict(
    Ridge_scaled=Pipeline(steps=[
        ("scaler", RobustScaler()),
        ("model", Ridge(alpha=1.0)),
    ]),
    LightGBM=LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=273,
        n_jobs=-1,
        verbose=-1,
    ),
)

results_no_advanced, results_with_advanced = [], []

# One entry per configuration: (console label, value passed as
# `advanced_features`, list that collects that configuration's result rows).
# The baseline comes first so it is evaluated first for every horizon.
advanced_configs = (
    ("Without advanced features:", False, results_no_advanced),
    ("With advanced features:", True, results_with_advanced),
)

# Prediction horizons, in minutes.
horizons = [60, 120, 180, 300, 420, 540, 660, 840, 1080, 1440, 2160]
for horizon in horizons:
    print(f"\nHorizon: {horizon} min")

    for label, use_advanced, rows in advanced_configs:
        print(label)
        # Rolling features stay disabled in both runs so that only the
        # advanced-feature flag differs between the two configurations.
        nb_pings, nb_vessels, mae_scores = evaluate_horizon(
            df,
            horizon,
            estimators,
            test_size=0.2,
            random_state=273,
            rolling=False,
            advanced_features=use_advanced,
        )
        # One row per (horizon, configuration); the entries of `mae_scores`
        # (presumably one MAE per estimator — confirm) become extra columns.
        rows.append({
            'horizon_min': horizon,
            'nb_pings': nb_pings,
            'nb_vessels': nb_vessels,
            **mae_scores,
        })

# Convert the collected rows to DataFrames.
df_no_advanced = pd.DataFrame(results_no_advanced)
df_with_advanced = pd.DataFrame(results_with_advanced)

# Print both tables one after the other for comparison.
for heading, table in (
    ("\n=== WITHOUT ADVANCED FEATURES ===", df_no_advanced),
    ("\n=== WITH ADVANCED FEATURES ===", df_with_advanced),
):
    print(heading)
    print(table)


  print(f"\Horizon: {horizon} min")


\Horizon: 60 min
Without advanced features:
Target prediction horizon: 60 min. Number of steps: 12
Defining lag windows of 10min, 30min, 60min
Testing model Ridge_scaled
fitting...
predicting...
Testing model LightGBM
fitting...
predicting...
With advanced features:
Target prediction horizon: 60 min. Number of steps: 12
Defining lag windows of 10min, 30min, 60min
Adding advanced engineered features
Testing model Ridge_scaled
fitting...
predicting...
Testing model LightGBM
fitting...
predicting...
\Horizon: 120 min
Without advanced features:
Target prediction horizon: 120 min. Number of steps: 24
Defining lag windows of 20min, 60min, 120min
Testing model Ridge_scaled
fitting...
predicting...
Testing model LightGBM
fitting...
predicting...
With advanced features:
Target prediction horizon: 120 min. Number of steps: 24
Defining lag windows of 20min, 60min, 120min
Adding advanced engineered features
Testing model Ridge_scaled
fitting...
predicting...
Testing model LightGBM
fitting...
predi

Conclusion: adding the advanced features has no noticeable effect on prediction performance.