In [5]:
import sys
import warnings
from pathlib import Path

# Make the project root importable before loading the local ml_logic package.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

from ml_logic.data_preprocessing import clean_data, resample_pings, vessel_train_test_split
from ml_logic.feature_engineering import create_time_series_features
from ml_logic.metric import position_extrapolation, haversine_mae

warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Feature Selection Analysis

## Objective

After feature engineering showed no improvement, we now test whether **feature selection** can:
1. Identify redundant or harmful features
2. Improve model performance by removing noise
3. Reduce model complexity without losing predictive power

**Method**: Permutation importance with cross-validation.

**Hypothesis**: Some lag features (especially long-term COG/SOG lags) might be redundant or noisy, and removing them could improve generalization.

We use **480 minutes (8 hours)** as the target horizon because:
- This is where ML models start to significantly outperform the baseline (see notebook 3)
- It represents a good balance between short-term (where baseline dominates) and long-term predictions
- Feature importance patterns are most relevant at this horizon where ML provides value

In [6]:
# Load the raw merged AIS parquet file into the starting DataFrame.
# NOTE(review): the file name suggests a lon/lat bounding box and a November 2024
# date range — confirm against the data itself.
raw_ais_path = "../data/raw/AIS_merged_lon-95.0to-77.2_lat22.5to29.5_20241101to20241130.parquet"
df = pd.read_parquet(raw_ais_path)

In [7]:

# Clean the raw pings (removes missing values), then resample them with a
# 10-minute interval so that pings are uniformly spaced.
df = resample_pings(clean_data(df), interval='10min')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[dimension_cols] = df[dimension_cols].replace(0, np.nan)


In [8]:
# Split the data into train / validation / test frames; the helper also returns
# the group labels that go with each split.
# NOTE(review): presumably the split is done per vessel (MMSI) — confirm in
# ml_logic.data_preprocessing.
(df_train, df_val, df_test,
 groups_train, groups_val, groups_test) = vessel_train_test_split(
    df,
    test_size=0.2,
    val_size=0.15,
    random_state=273,
)

In [9]:

# 480 min is used as the prediction horizon: it is the horizon at which the ML
# approach provides an improvement.
lag_feature_options = dict(
    target_horizon=480,
    time_step=10,
    rolling=False,
    advanced_features=False,
)

# Apply the exact same feature settings to each of the three splits.
df_train_lag = create_time_series_features(df_train, **lag_feature_options)
df_val_lag = create_time_series_features(df_val, **lag_feature_options)
df_test_lag = create_time_series_features(df_test, **lag_feature_options)

Target prediction horizon: 480 min. Number of steps: 48
Defining lag windows of 80min, 240min, 480min
Target prediction horizon: 480 min. Number of steps: 48
Defining lag windows of 80min, 240min, 480min
Target prediction horizon: 480 min. Number of steps: 48
Defining lag windows of 80min, 240min, 480min


In [10]:
# Split each lagged frame into model inputs (X), the two position targets (y)
# and the vessel identifier used as the cross-validation group.
target_cols = ["target_LAT", "target_LON"]
non_feature_cols = ["MMSI", "BaseDateTime"] + target_cols

# groups_* are re-derived from the lagged frames here, replacing the group
# labels returned by the split, so they stay row-aligned with X and y.
X_train, y_train, groups_train = (
    df_train_lag.drop(columns=non_feature_cols),
    df_train_lag[target_cols],
    df_train_lag["MMSI"],
)
X_val, y_val, groups_val = (
    df_val_lag.drop(columns=non_feature_cols),
    df_val_lag[target_cols],
    df_val_lag["MMSI"],
)
X_test, y_test, groups_test = (
    df_test_lag.drop(columns=non_feature_cols),
    df_test_lag[target_cols],
    df_test_lag["MMSI"],
)

# Sanity report: number of rows and number of distinct vessels per split.
print(f"Train: {len(X_train)} samples, {groups_train.nunique()} vessels")
print(f"Val: {len(X_val)} samples, {groups_val.nunique()} vessels")
print(f"Test: {len(X_test)} samples, {groups_test.nunique()} vessels")


Train: 2754056 samples, 1541 vessels
Val: 489972 samples, 274 vessels
Test: 821272 samples, 455 vessels


In [None]:
# NOTE(review): this whole cell is disabled (every line is commented out) and has
# no execution count; the split is performed by vessel_train_test_split in an
# earlier cell. The sample counts printed under this cell differ from the live
# run and appear to come from an older execution — consider deleting the cell.
# # Train/test split respecting MMSI groups
# gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=273)
# for train_idx, test_idx in gss.split(X, y, groups):
#     X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#     y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
#     #need to isolate a group on the train set for future crossval
#     groups_train = groups.iloc[train_idx]
#     groups_test = groups.iloc[test_idx]

# print(f"Train: {len(X_train)} samples, {groups_train.nunique()} vessels")
# print(f"Test: {len(X_test)} samples, {groups_test.nunique()} vessels\n")

Train: 748405 samples, 729 vessels
Test: 180459 samples, 183 vessels



In [11]:
# Candidate estimators for the permutation-importance study, keyed by display name.
# Each entry is a plain regressor; the cells that use them wrap the chosen entry
# in MultiOutputRegressor to predict the two targets (target_LAT, target_LON).
estimators = {
    # Linear model: the scaler lives inside the pipeline, so it is fitted only on
    # the data passed to fit().
    "Ridge_scaled": Pipeline([
        ('scaler', RobustScaler()),
        ('model', Ridge(alpha=1.0))
    ]),
    # Gradient-boosted trees with a fixed seed for reproducibility.
    "LightGBM": LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=273,
        n_jobs=-1,
        verbose=-1
    )
}

# Scorer built on the project's haversine_mae metric. greater_is_better=False makes
# scikit-learn negate the metric, so reported scores are negative errors
# (closer to zero is better).
haversine_scorer = make_scorer(haversine_mae, greater_is_better=False)

## 2. Permutation Importance Analysis

### 2.1 LightGBM: Cross-validation + Permutation Importance

We use **GroupKFold** (5 folds) to ensure no data leakage: each vessel's data stays together in train or validation sets.

In [12]:
# Grouped cross-validation: folds are built from the MMSI groups so that the
# ping sequence of a single vessel is never cut across the fit and evaluation parts.
n_splits = 5  # single source of truth for the splitter and the progress message
gkf = GroupKFold(n_splits=n_splits)

fold_importance = []  # one vector of per-feature mean importances per fold

for fold_idx, (train_fold_idx, val_fold_idx) in enumerate(
        gkf.split(X_train, y_train, groups=groups_train), start=1):
    print(f"  Fold {fold_idx}/{n_splits}")

    # Positional row selection for this fold's fit / evaluation parts
    X_train_fold = X_train.iloc[train_fold_idx]
    X_val_fold = X_train.iloc[val_fold_idx]
    y_train_fold = y_train.iloc[train_fold_idx]
    y_val_fold = y_train.iloc[val_fold_idx]

    # MultiOutputRegressor fits one clone of the estimator per target column,
    # so the shared entry in `estimators` is never fitted itself.
    model_fold = MultiOutputRegressor(estimators["LightGBM"])
    model_fold.fit(X_train_fold, y_train_fold)

    # Permutation importance on the held-out part of the fold. Because the scorer
    # is the negated haversine MAE, a positive importance is the increase in
    # haversine MAE observed when the feature is shuffled.
    importance = permutation_importance(
        model_fold, X_val_fold, y_val_fold,
        n_repeats=6,
        random_state=273,
        scoring=haversine_scorer,
        n_jobs=-1)

    # Keep, for each feature, the mean over the 6 shuffles of this fold
    fold_importance.append(importance.importances_mean)

fold_importance = np.array(fold_importance)  # 2D array, shape (n_splits, n_features)

# Aggregate across folds: mean importance and its fold-to-fold spread per feature,
# most important feature first.
importance_df = pd.DataFrame({"feature": X_train.columns,
                              "importance_mean": fold_importance.mean(axis=0),
                              "importance_std": fold_importance.std(axis=0)
                              }).sort_values(by="importance_mean", ascending=False)

importance_df

  Fold 1/5
  Fold 2/5
  Fold 3/5
  Fold 4/5
  Fold 5/5


Unnamed: 0,feature,importance_mean,importance_std
1,LON,570.964678,28.16414
0,LAT,189.160154,4.747449
12,LON_lag_80min,85.595247,11.280529
14,LON_lag_480min,69.252351,10.048061
9,LAT_lag_80min,41.367002,4.488713
13,LON_lag_240min,30.540199,4.48031
10,LAT_lag_240min,25.762773,3.555138
11,LAT_lag_480min,24.036635,2.164564
3,COG,9.376591,0.282964
2,SOG,5.866807,0.564383


### Interpretation: LightGBM Permutation Importance

**Most important features**:
- **Position features (LAT, LON) and their lags dominate importance** - these are the most predictive features for trajectory prediction
- **Vessel dimensions (Length, Width, Draft) have moderate importance** - provide useful context about vessel behavior and movement patterns

**Least important features** (candidates for removal):
- **`COG_lag_480min`**
- **`COG_lag_240min`**
- **`SOG_lag_240min`**
- **`Heading`**
- **`COG_lag_80min`**

**Why these features are less important?**
- Long-term COG lags (240min, 480min) are less predictive than position lags at the same horizons
- SOG at 240min may be redundant with other SOG lags that are more informative
- Heading and short-term COG lags show minimal individual contribution compared to position features

**Important caveat**: Low **individual** importance does not necessarily mean these features are useless. In tree-based models like LightGBM, features can contribute through **interactions** with other features, even if they appear unimportant in isolation. We will validate this via cross-validation.


### 2.2 Ridge: Cross-validation + Permutation Importance

Same methodology as LightGBM to compare feature importance patterns between linear and tree-based models.

In [14]:
# Same grouped cross-validation + permutation-importance procedure as for
# LightGBM, applied to the scaled Ridge pipeline.
n_splits = 5  # single source of truth for the splitter and the progress message
gkf = GroupKFold(n_splits=n_splits)

fold_importance = []  # one vector of per-feature mean importances per fold

for fold_idx, (train_fold_idx, val_fold_idx) in enumerate(
        gkf.split(X_train, y_train, groups=groups_train), start=1):
    print(f"  Fold {fold_idx}/{n_splits}")

    # Positional row selection for this fold's fit / evaluation parts
    X_train_fold = X_train.iloc[train_fold_idx]
    X_val_fold = X_train.iloc[val_fold_idx]
    y_train_fold = y_train.iloc[train_fold_idx]
    y_val_fold = y_train.iloc[val_fold_idx]

    # MultiOutputRegressor fits one clone of the pipeline per target column, so
    # the scaler is re-fitted on each fold's fit part only.
    model_fold = MultiOutputRegressor(estimators["Ridge_scaled"])
    model_fold.fit(X_train_fold, y_train_fold)

    # Permutation importance on the held-out part of the fold. Because the scorer
    # is the negated haversine MAE, a positive importance is the increase in
    # haversine MAE observed when the feature is shuffled.
    importance = permutation_importance(
        model_fold, X_val_fold, y_val_fold,
        n_repeats=6,
        random_state=273,
        scoring=haversine_scorer,
        n_jobs=-1)

    # Keep, for each feature, the mean over the 6 shuffles of this fold
    fold_importance.append(importance.importances_mean)

fold_importance = np.array(fold_importance)  # 2D array, shape (n_splits, n_features)

# Aggregate across folds: mean importance and its fold-to-fold spread per feature,
# most important feature first.
importance_df_ridge = pd.DataFrame({"feature": X_train.columns,
                              "importance_mean": fold_importance.mean(axis=0),
                              "importance_std": fold_importance.std(axis=0)
                              }).sort_values(by="importance_mean", ascending=False)

importance_df_ridge

  Fold 1/5
  Fold 2/5
  Fold 3/5
  Fold 4/5
  Fold 5/5


Unnamed: 0,feature,importance_mean,importance_std
1,LON,1835.712268,80.316385
0,LAT,838.816096,57.714071
12,LON_lag_80min,827.9757,64.717416
9,LAT_lag_80min,439.476716,62.476026
13,LON_lag_240min,257.62596,10.265817
10,LAT_lag_240min,160.181284,9.458565
14,LON_lag_480min,158.222069,8.383689
11,LAT_lag_480min,1.967266,1.187352
15,SOG_lag_80min,0.569919,0.121975
3,COG,0.434924,0.044662


### Interpretation: Ridge Permutation Importance

**Most important features**:
- **Position features (LAT, LON) and their lags** - similar to LightGBM, these dominate importance
- **Vessel dimensions** - moderate importance, consistent with LightGBM

**Least important features** (candidates for removal):
- **`COG_lag_480min`**: Very low importance - consistent with LightGBM findings
- **`COG_lag_240min`**: Low importance - same pattern as LightGBM
- **`SOG_lag_240min`**: Low importance - redundant with other SOG features
- **`Heading`**: Low importance - minimal contribution
- **`COG_lag_80min`**: Low importance - consistent with LightGBM

**Comparison with LightGBM**:
- Ridge shows **similar patterns** but with different magnitudes (linear model vs tree-based)
- Both models agree on which features are **least important individually**
- **Consensus low-importance features**: `COG_lag_240min`, `COG_lag_480min`, `SOG_lag_240min`, `Heading`, `COG_lag_80min`

**Key difference**:
- **Linear models (Ridge)** cannot capture feature interactions, so low individual importance is more meaningful
- **Tree-based models (LightGBM)** can capture interactions, so features with low individual importance might still contribute through combinations with other features


### Decision: Features to Remove

**Selected features for removal**:
1. **`COG_lag_480min`** - Very low importance in both models
2. **`COG_lag_240min`** - Low importance in both models
3. **`SOG_lag_240min`** - Low importance, redundant with other SOG lags
4. **`Heading`** - Low importance, minimal contribution
5. **`COG_lag_80min`** - Low/negative importance in both models

**Rationale**:
- **Consensus between models**: Both LightGBM and Ridge agree these features have low individual importance
- **Long-term lags less predictive**: COG lags at 240min and 480min are less useful than position lags at the same horizons
- **Model simplicity**: Removing low-importance features reduces complexity

**Validation approach**:
We will test the impact via cross-validation to determine:
- If removal improves performance 
- If these features contribute through interactions in tree-based models (LightGBM)
- If the trade-off between simplicity and performance is acceptable

---

## 3. Feature Selection Validation


#### Test with selected features on LightGBM model (crossval)

In [15]:
# Features with negative or very low permutation importance on both models.
features_to_remove = [
    "COG_lag_480min",
    "SOG_lag_240min",
    "COG_lag_240min",
    "Heading",
    "COG_lag_80min",
]

# Fail fast on names that are not actual columns: a misspelled name would
# otherwise be ignored silently while the message below still reports it as removed.
missing_features = [f for f in features_to_remove if f not in X_train.columns]
assert not missing_features, f"features_to_remove contains unknown columns: {missing_features}"

# Preserve the original column order of X_train for the retained features.
features_to_keep = [f for f in X_train.columns if f not in features_to_remove]

print(f"Removing {len(features_to_remove)} features: {features_to_remove}")

# Create datasets with selected features
X_train_selected = X_train[features_to_keep]
X_test_selected = X_test[features_to_keep]


Removing 5 features: ['COG_lag_480min', 'SOG_lag_240min', 'COG_lag_240min', 'Heading', 'COG_lag_80min']


In [16]:
# Cross-validation comparison: all features vs selected features.
# Both runs use the same grouped 5-fold splitter, scorer and vessel groups, so
# the only difference between them is the feature set.
gkf = GroupKFold(n_splits=5)
cv_options = dict(cv=gkf, groups=groups_train, scoring=haversine_scorer, n_jobs=-1)

# Define LightGBM parameters (reusable for CV and final model)
lgbm_params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 273,
    'n_jobs': -1,
    'verbose': -1
}

# Test LightGBM
print("LightGBM")

# With all features (separate estimator instance for this CV run)
print("  With all features")
lgbm_cv_all = LGBMRegressor(**lgbm_params)
scores_all = cross_val_score(
    MultiOutputRegressor(lgbm_cv_all),
    X_train, y_train,
    **cv_options
)

# With selected features (separate estimator instance for this CV run)
print("  With selected features...")
lgbm_cv_selected = LGBMRegressor(**lgbm_params)
scores_selected = cross_val_score(
    MultiOutputRegressor(lgbm_cv_selected),
    X_train_selected, y_train,
    **cv_options
)

# The scorer returns the negated haversine MAE, so flip the sign to get the error.
mae_all = -scores_all.mean()
mae_selected = -scores_selected.mean()
# Relative change in MAE: positive means the selected feature set has lower error.
improvement = ((mae_all - mae_selected) / mae_all) * 100

print(f"\n  MAE with all features: {mae_all:.3f} ± {scores_all.std():.3f} km")
print(f"  MAE with selected features: {mae_selected:.3f} ± {scores_selected.std():.3f} km")
print(f"  Improvement: {improvement:.2f}%")


LightGBM
  With all features
  With selected features...

  MAE with all features: 22.841 ± 0.614 km
  MAE with selected features: 23.109 ± 0.840 km
  Improvement: -1.17%


#### Test with selected features on Ridge model (crossval)

In [17]:
# Ridge (with robust scaling): grouped cross-validation on all features vs the
# selected features, reusing the `gkf` splitter defined for the LightGBM comparison.
print("\nRidge (scaled):")
ridge_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('model', Ridge(alpha=1.0))
])

# Settings shared by both runs, so only the feature set differs between them.
ridge_cv_options = dict(
    cv=gkf,
    groups=groups_train,
    scoring=haversine_scorer,
    n_jobs=-1,
)

# Run 1: every feature
print("  With all features...")
scores_ridge_all = cross_val_score(
    MultiOutputRegressor(ridge_pipeline), X_train, y_train, **ridge_cv_options
)

# Run 2: reduced feature set
print("  With selected features...")
scores_ridge_selected = cross_val_score(
    MultiOutputRegressor(ridge_pipeline), X_train_selected, y_train, **ridge_cv_options
)

# Scores are negated errors; flip the sign to recover the haversine MAE.
mae_ridge_all = -scores_ridge_all.mean()
mae_ridge_selected = -scores_ridge_selected.mean()
# Positive value = lower error with the selected features.
improvement_ridge = ((mae_ridge_all - mae_ridge_selected) / mae_ridge_all) * 100

print(f"\n  MAE with all features: {mae_ridge_all:.3f} ± {scores_ridge_all.std():.3f} km")
print(f"  MAE with selected features: {mae_ridge_selected:.3f} ± {scores_ridge_selected.std():.3f} km")
print(f"  Improvement: {improvement_ridge:.2f}%")



Ridge (scaled):
  With all features...
  With selected features...

  MAE with all features: 27.135 ± 0.415 km
  MAE with selected features: 27.106 ± 0.434 km
  Improvement: 0.11%


---

## Overall Feature Selection Conclusion

**Key Findings**:
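- **Feature selection does not improve performance** for LightGBM: MAE rises from 22.841 km to 23.109 km, a 1.17% deterioration
- **Minimal benefit for Ridge**: MAE falls from 27.135 km to 27.106 km, a 0.11% improvement
- Both differences are smaller than the fold-to-fold standard deviation (roughly 0.4–0.8 km), so neither is clearly distinguishable from noise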
- The base feature set was already well-designed - removing features reduces available information
- Tree-based models (LightGBM) benefit from feature interactions, making even low-importance features potentially useful

**Decision**:
Despite the small performance trade-off for LightGBM, we proceed with the **selected features** for two reasons:
1. **Model simplicity**: Fewer features reduce complexity and training time
2. **Marginal impact**: The 1.17% deterioration is small compared to potential benefits of a cleaner feature set

**Removed features**: `COG_lag_480min`, `SOG_lag_240min`, `COG_lag_240min`, `Heading`, `COG_lag_80min`

### Next Steps

***Proceed to hyperparameter tuning (notebook 5) with the selected feature set to further optimize model performance.***