In [None]:
import sys
from pathlib import Path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GroupKFold
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from ml_logic.data_preprocessing import clean_data, resample_pings
from ml_logic.feature_engineering import create_time_series_features
from ml_logic.metric import position_extrapolation, haversine_mae
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Feature Selection Analysis

## Objective

After feature engineering showed no improvement, we now test whether **feature selection** can:
1. Identify redundant or harmful features
2. Improve model performance by removing noise
3. Reduce model complexity without losing predictive power

**Method**: Permutation importance with cross-validation.

**Hypothesis**: Some lag features (especially long-term COG/SOG lags) might be redundant or noisy, and removing them could improve generalization.

We use **480 minutes (8 hours)** as the target horizon because:
- This is where ML models start to significantly outperform the baseline (see notebook 3)
- It represents a good balance between short-term (where baseline dominates) and long-term predictions
- Feature importance patterns are most relevant at this horizon where ML provides value

In [None]:
## 1. Data Preparation
# Load the pre-filtered AIS pings, run the project cleaning step (per the
# original note: drops missing values), then resample the pings onto a
# regular 5-minute interval.
df = resample_pings(
    clean_data(pd.read_parquet("../data/processed/ais_filtered.parquet")),
    interval='5min',
)

# Build the time-series feature table for a 480-minute target horizon, the
# horizon chosen because it is where the ML approach starts to improve on the
# baseline. Rolling and advanced features are disabled.
df_lag = create_time_series_features(
    df,
    target_horizon=480,
    rolling=False,
    advanced_features=False,
)

Target prediction horizon: 480 min. Number of steps: 96
Defining lag windows of 80min, 240min, 480min


In [3]:
# Targets are the two target_* position columns; MMSI (vessel identifier) is
# kept aside as the grouping key used by the group-aware splitters.
y = df_lag[["target_LAT", "target_LON"]]
groups = df_lag["MMSI"]

# Predictors: everything except the vessel id, the timestamp and the targets.
X = df_lag.drop(columns=["MMSI", "BaseDateTime", *y.columns])

In [4]:
# Hold out 20% of the vessels rather than 20% of the rows: GroupShuffleSplit
# keeps every ping of a given MMSI on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=273)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Vessel ids aligned with each subset; the training ones are needed for the
# grouped cross-validation run on the training set.
groups_train = groups.iloc[train_idx]
groups_test = groups.iloc[test_idx]

print(f"Train: {len(X_train)} samples, {groups_train.nunique()} vessels")
print(f"Test: {len(X_test)} samples, {groups_test.nunique()} vessels\n")

Train: 748405 samples, 729 vessels
Test: 180459 samples, 183 vessels



In [5]:
# Candidate models for the permutation-importance study: one linear model
# (Ridge behind a RobustScaler) and one tree-based model (LightGBM).
# Both are single-output regressors; the cells that use them wrap them in
# MultiOutputRegressor to predict latitude and longitude together.
estimators = {
    "Ridge_scaled": Pipeline([
        ('scaler', RobustScaler()),
        ('model', Ridge(alpha=1.0))
    ]),
    "LightGBM": LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        # NOTE(review): LightGBM only applies row subsampling when
        # subsample_freq > 0 (its default is 0), so this setting is probably
        # inert as written — confirm before relying on it.
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=273,
        n_jobs=-1,
        verbose=-1
    )
}

# The former `importance_df = pd.DataFrame(index=X_train.columns)` placeholder
# was dropped: it was never read and was rebuilt from scratch by the
# permutation-importance cell.

# Scorer around the project's haversine MAE. greater_is_better=False makes
# scikit-learn negate the metric so that "higher score is better" still holds.
haversine_scorer = make_scorer(haversine_mae, greater_is_better=False)

## 2. Permutation Importance Analysis

### 2.1 LightGBM: Cross-validation + Permutation Importance

We use **GroupKFold** (5 folds) to ensure no data leakage: each vessel's data stays together in train or validation sets.

In [None]:
# Permutation importance of every feature for LightGBM, estimated on the
# validation part of each grouped CV fold and then aggregated across folds.
fold_importance = []  # one array of per-feature mean importances per fold
gkf = GroupKFold(n_splits= 5)  # grouped CV: a vessel (MMSI) never spans train and validation

# Generate folds without cutting through a single vessel's ping sequence.
for fold_idx, (train_fold_idx, val_fold_idx) in enumerate(gkf.split(X_train, y_train, groups= groups_train)):
    # The total is read from the splitter so the message stays correct if
    # n_splits is changed above.
    print(f"  Fold {fold_idx + 1}/{gkf.get_n_splits()}")

    # Training and validation subsets for this fold.
    X_train_fold = X_train.iloc[train_fold_idx]
    X_val_fold = X_train.iloc[val_fold_idx]
    y_train_fold = y_train.iloc[train_fold_idx]
    y_val_fold = y_train.iloc[val_fold_idx]

    # MultiOutputRegressor fits one clone of the estimator per target column.
    model_fold = MultiOutputRegressor(estimators["LightGBM"])
    model_fold.fit(X_train_fold, y_train_fold)

    # Permutation importance on the validation fold. The scorer is the negated
    # haversine MAE, so a positive importance means the error increased when
    # the feature was shuffled.
    importance = permutation_importance(
        model_fold, X_val_fold, y_val_fold,
        n_repeats=6,
        random_state=273,
        scoring=haversine_scorer,
        n_jobs= -1)

    # importances_mean: for each feature, the mean over the 6 shuffles.
    fold_importance.append(importance.importances_mean)

fold_importance = np.array(fold_importance)  # 2D array, shape (n_folds, n_features)

# Aggregate across folds: mean importance and fold-to-fold spread per feature.
importance_df = pd.DataFrame({"feature": X_train.columns,
                              "importance_mean": fold_importance.mean(axis= 0),
                              "importance_std": fold_importance.std(axis= 0)
                              }).sort_values(by="importance_mean",  ascending= False)

importance_df

  Fold 1/5
  Fold 2/5
  Fold 3/5
  Fold 4/5
  Fold 5/5


Unnamed: 0,feature,importance_mean,importance_std
1,LON,266.245483,20.253742
0,LAT,55.458805,4.009232
12,LON_lag_80min,46.695315,1.972898
9,LAT_lag_80min,15.278227,2.084895
14,LON_lag_480min,10.714924,2.770576
10,LAT_lag_240min,8.653096,1.900738
13,LON_lag_240min,8.270339,1.455316
2,SOG,4.842158,0.464616
11,LAT_lag_480min,4.662294,0.34437
6,Length,2.558887,0.922419


**Key observations**:
- Position features (LAT, LON) and their lags dominate importance
- Vessel dimensions (Length, Width, Draft) have moderate importance
- Some COG/SOG lags show very low or negative importance (harmful features)
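- Negative importance means that shuffling the feature *lowers* the validation error: the model does at least as well when the feature's values are replaced by noise, so the feature carries no usable signal and may even be harmful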

**Next step**: Compare with Ridge to find consensus on which features to remove.


### 2.2 Ridge: Cross-validation + Permutation Importance

Same methodology as LightGBM to compare feature importance patterns between linear and tree-based models.

In [7]:
# Same procedure as for LightGBM, applied to the scaled Ridge pipeline:
# permutation importance on the validation part of each grouped CV fold.
fold_importance = []  # one array of per-feature mean importances per fold
gkf = GroupKFold(n_splits= 5)  # grouped CV: a vessel (MMSI) never spans train and validation
for fold_idx, (train_fold_idx, val_fold_idx) in enumerate(gkf.split(X_train, y_train, groups= groups_train)):
    # The total is read from the splitter so the message stays correct if
    # n_splits is changed above.
    print(f"  Fold {fold_idx + 1}/{gkf.get_n_splits()}")
    X_train_fold = X_train.iloc[train_fold_idx]
    X_val_fold = X_train.iloc[val_fold_idx]
    y_train_fold = y_train.iloc[train_fold_idx]
    y_val_fold = y_train.iloc[val_fold_idx]

    # MultiOutputRegressor fits one clone of the pipeline per target column,
    # so the scaler is fitted on the training fold only.
    model_fold = MultiOutputRegressor(estimators["Ridge_scaled"])
    model_fold.fit(X_train_fold, y_train_fold)

    # Permutation importance on the validation fold. The scorer is the negated
    # haversine MAE, so a positive importance means the error increased when
    # the feature was shuffled.
    importance = permutation_importance(
        model_fold, X_val_fold, y_val_fold,
        n_repeats=6,
        random_state=273,
        scoring=haversine_scorer,
        n_jobs= -1)

    # importances_mean: for each feature, the mean over the 6 shuffles.
    fold_importance.append(importance.importances_mean)

fold_importance = np.array(fold_importance)  # 2D array, shape (n_folds, n_features)

# Aggregate across folds: mean importance and fold-to-fold spread per feature.
importance_df_ridge = pd.DataFrame({"feature": X_train.columns,
                              "importance_mean": fold_importance.mean(axis= 0),
                              "importance_std": fold_importance.std(axis= 0)
                              }).sort_values(by="importance_mean",  ascending= False)

importance_df_ridge

  Fold 1/5
  Fold 2/5
  Fold 3/5
  Fold 4/5
  Fold 5/5


Unnamed: 0,feature,importance_mean,importance_std
1,LON,921.961766,86.917275
12,LON_lag_80min,440.530894,37.958254
0,LAT,227.217915,8.030887
13,LON_lag_240min,130.169964,22.573591
9,LAT_lag_80min,111.434943,7.517471
10,LAT_lag_240min,36.356315,4.368273
14,LON_lag_480min,25.372544,2.655244
11,LAT_lag_480min,6.252056,0.778376
15,SOG_lag_80min,1.466589,0.809965
6,Length,1.352245,0.320931


### Interpretation: Ridge Permutation Importance

**Comparison with LightGBM**:
- Ridge shows similar patterns but with different magnitudes (linear model vs tree-based)
- Both models agree on which features are least important
- Consensus features to remove: `COG_lag_240min`, `COG_lag_480min`, `SOG_lag_240min`

**Why these features?**
- Long-term COG lags (240min, 480min) are less predictive than position lags
- SOG at 240min is redundant with other SOG lags
- These features may introduce noise without adding signal

**Decision**: Remove the 3 consensus low-importance features and test impact via cross-validation.


## 3. Feature Selection Validation

We remove the 3 consensus low-importance features identified by both models and validate the impact via cross-validation.


#### Test with selected features with LGBM model (crossval)

In [8]:
# Features with negative or very low permutation importance for both models.
features_to_remove = ['COG_lag_240min', 'COG_lag_480min','SOG_lag_240min']

# Fail fast on a misspelled or absent name: the filter below would otherwise
# skip it silently while the message still reports it as removed.
missing_features = [f for f in features_to_remove if f not in X_train.columns]
if missing_features:
    raise KeyError(f"Features to remove not found in X_train: {missing_features}")

# Keep the remaining columns in their original order.
features_to_keep = [f for f in X_train.columns if f not in features_to_remove]

print(f"Removing {len(features_to_remove)} features: {features_to_remove}")

# Train and test sets restricted to the selected features.
X_train_selected = X_train[features_to_keep]
X_test_selected = X_test[features_to_keep]


Removing 3 features: ['COG_lag_240min', 'COG_lag_480min', 'SOG_lag_240min']


In [9]:
# Grouped 5-fold cross-validation on the training set, comparing the full
# feature set against the reduced one for LightGBM.
from sklearn.model_selection import cross_val_score

gkf = GroupKFold(n_splits=5)

# LightGBM hyper-parameters gathered in one dict so the same settings can be
# reused for the CV runs and for a final model.
lgbm_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=273,
    n_jobs=-1,
    verbose=-1,
)

print("LightGBM")

# Baseline run: every feature, with its own estimator instance.
print("  With all features")
lgbm_cv_all = LGBMRegressor(**lgbm_params)
scores_all = cross_val_score(
    estimator=MultiOutputRegressor(lgbm_cv_all),
    X=X_train,
    y=y_train,
    groups=groups_train,
    scoring=haversine_scorer,
    cv=gkf,
    n_jobs=-1,
)

# Reduced run: same settings, selected features only, separate instance.
print("  With selected features...")
lgbm_cv_selected = LGBMRegressor(**lgbm_params)
scores_selected = cross_val_score(
    estimator=MultiOutputRegressor(lgbm_cv_selected),
    X=X_train_selected,
    y=y_train,
    groups=groups_train,
    scoring=haversine_scorer,
    cv=gkf,
    n_jobs=-1,
)

# The scorer returns the negated MAE, so flip the sign back.
mae_all, mae_selected = -scores_all.mean(), -scores_selected.mean()
# Relative MAE reduction, in percent of the all-features MAE.
improvement = ((mae_all - mae_selected) / mae_all) * 100

print(f"\n  MAE with all features: {mae_all:.3f} ± {scores_all.std():.3f} km")
print(f"  MAE with selected features: {mae_selected:.3f} ± {scores_selected.std():.3f} km")
print(f"  Improvement: {improvement:.2f}%")


LightGBM
  With all features
  With selected features...

  MAE with all features: 20.314 ± 1.655 km
  MAE with selected features: 20.047 ± 2.076 km
  Improvement: 1.32%


#### Test with selected features on Ridge model (crossval)

In [10]:
# Same all-features vs. selected-features comparison for the scaled Ridge
# model, reusing the grouped splitter `gkf` defined for the LightGBM run.
print("\nRidge (scaled):")
ridge_pipeline = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("model", Ridge(alpha=1.0)),
])

# Baseline run: every feature.
print("  With all features...")
scores_ridge_all = cross_val_score(
    estimator=MultiOutputRegressor(ridge_pipeline),
    X=X_train,
    y=y_train,
    groups=groups_train,
    scoring=haversine_scorer,
    cv=gkf,
    n_jobs=-1,
)

# Reduced run: selected features only.
print("  With selected features...")
scores_ridge_selected = cross_val_score(
    estimator=MultiOutputRegressor(ridge_pipeline),
    X=X_train_selected,
    y=y_train,
    groups=groups_train,
    scoring=haversine_scorer,
    cv=gkf,
    n_jobs=-1,
)

# The scorer returns the negated MAE, so flip the sign back.
mae_ridge_all, mae_ridge_selected = -scores_ridge_all.mean(), -scores_ridge_selected.mean()
# Relative MAE reduction, in percent of the all-features MAE.
improvement_ridge = ((mae_ridge_all - mae_ridge_selected) / mae_ridge_all) * 100

print(f"\n  MAE with all features: {mae_ridge_all:.3f} ± {scores_ridge_all.std():.3f} km")
print(f"  MAE with selected features: {mae_ridge_selected:.3f} ± {scores_ridge_selected.std():.3f} km")
print(f"  Improvement: {improvement_ridge:.2f}%")



Ridge (scaled):
  With all features...
  With selected features...

  MAE with all features: 20.191 ± 1.221 km
  MAE with selected features: 20.184 ± 1.224 km
  Improvement: 0.03%


### Interpretation: Feature Selection Impact

**Results**:
- **LightGBM**: Small improvement (+1.32%) with selected features
- **Ridge**: Negligible improvement (+0.03%) with selected features

**Analysis**:
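- Removing the 3 low-importance features slightly lowers the LightGBM MAE (20.31 → 20.05 km), which is consistent with reduced overfitting
- Ridge is less sensitive to feature removal (linear model, less prone to overfitting)
- The improvement is minimal but consistent with our hypothesis. Note that the LightGBM gain (~0.27 km) is well below the fold-to-fold standard deviation (~1.7–2.1 km), so it is better read as "no degradation" than as a statistically established improvement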

**Conclusion**: Feature selection provides marginal benefit. The removed features were indeed redundant, but the base feature set was already well-optimized. We proceed with selected features for the final model.


---

## Overall Feature Selection Conclusion


- **Feature selection works** but provides marginal gains (~1%)
- The base feature set was already well-designed (simple lag features are optimal)
- Removing low-importance features helps tree-based models (LightGBM) more than linear models (Ridge)

### Next Steps

***Proceed to hyperparameter tuning (notebook 5) with the selected feature set to further optimize model performance.***