Package imports

In [6]:
# Core Libraries
import gc
import glob
import math
import os
import time
from datetime import datetime, timedelta

# Data Handling
import netCDF4 as nc
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt

# Geospatial Calculations
from geopy import Point
from geopy.distance import great_circle
from haversine import haversine
from scipy.spatial import cKDTree

# Machine Learning Models
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    VotingRegressor
)
from sklearn.linear_model import (
    BayesianRidge,
    ElasticNet,
    Lasso,
    LinearRegression,
    Ridge
)
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

# Model Evaluation and Optimization
import optuna
from optuna import create_study
from scipy.stats import randint, uniform
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.model_selection import (
    GroupKFold,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split
)

Function to pre-process spatial data

In [7]:
# Precompute the KDTree and valid_time differences
def precompute_kdtree_and_time_diffs(uwnd_nc_file_path):
    try:
        print("Precomputing KDTree and time differences...")
        # Load the NetCDF file
        ds = nc.Dataset(uwnd_nc_file_path)

        # Extract the valid_time, latitudes, and longitudes from the NetCDF file
        valid_time = ds.variables['valid_time'][:]  # Assuming 'valid_time' is the variable name for time
        latitudes = ds.variables['latitude'][:]
        longitudes = ds.variables['longitude'][:]

        # Convert valid_time from seconds since 1970-01-01 to datetime
        base_time = datetime(1970, 1, 1)
        valid_time_dt = np.array([base_time + timedelta(seconds=int(ts)) for ts in valid_time], dtype='datetime64[ns]')

        # Create a KDTree for fast spatial lookup
        lat_lon_pairs = np.array([(lat, lon) for lat in latitudes for lon in longitudes])
        tree = cKDTree(lat_lon_pairs)

        print("KDTree and time differences precomputed successfully.")
        return tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs
    except Exception as e:
        print(f"Error precomputing KDTree and time differences: {e}")
        raise

uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'
try:
    tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs = precompute_kdtree_and_time_diffs(uwnd_nc_file_path)
except Exception as e:
    print(f"Error precomputing KDTree and time differences: {e}")
    raise

Precomputing KDTree and time differences...
KDTree and time differences precomputed successfully.


Function to extract wind components at a given lat/lon (preloads reanalysis netCDFs also)

In [8]:
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'

uwnd_ds = nc.Dataset(uwnd_nc_file_path)
vwnd_ds = nc.Dataset(vwnd_nc_file_path)

uwnd_array = uwnd_ds.variables['u'][:, 0, :, :]  # Assuming 'u' is the variable name for u-component wind and removing the pressure dimension
vwnd_array = vwnd_ds.variables['v'][:, 0, :, :]  # Assuming 'v' is the variable name for v-component wind and removing the pressure dimension

# Function to extract wind components
def extract_wind_components(lat, lon, dt, tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs):
    try:
        # Convert the given datetime to a numpy datetime64 object
        row_datetime = np.datetime64(dt)

        # Find the value in the valid_time dimension closest in time to the datetime in the dataframe
        time_diffs = np.abs(valid_time_dt - row_datetime)
        closest_time_index = np.argmin(time_diffs)

        # Check if the calculated index is within the bounds of the uwnd_array
        if closest_time_index < 0 or closest_time_index >= uwnd_array.shape[0]:
            raise ValueError("The given datetime is out of bounds for the NetCDF data")

        # Select the corresponding netCDF slices
        uwnd_slice = uwnd_array[closest_time_index, :, :]
        vwnd_slice = vwnd_array[closest_time_index, :, :]

        # Find the grid cell of the netCDF slice closest to the given Latitude and Longitude position
        lat_lon = (lat, lon)
        _, closest_point_index = tree.query(lat_lon)
        closest_lat, closest_lon = lat_lon_pairs[closest_point_index]

        # Find the index of the closest latitude/longitude pair in the arrays
        lat_index = np.where(latitudes == closest_lat)[0][0]
        lon_index = np.where(longitudes == closest_lon)[0][0]

        # Extract the u and v wind components
        u_wind = uwnd_slice[lat_index, lon_index]
        v_wind = vwnd_slice[lat_index, lon_index]

        # Round wind components to 4 decimal places
        u_wind = round(u_wind, 4)
        v_wind = round(v_wind, 4)

        return u_wind, v_wind
    except Exception as e:
        print(f"Error extracting wind components: {e}")
        raise

In [25]:
# Example: Predict one step using the function
initial_coords = (83.0, -120.0)  # Starting latitude and longitude
displacement_km = 5.0  # Displacement in kilometers
heading_degrees = 45.0  # Heading in degrees from north

# Calculate new position
new_lat, new_lon = calculate_new_position(initial_coords, displacement_km, heading_degrees)
print(f"New Coordinates: Latitude = {new_lat}, Longitude = {new_lon}")

New Coordinates: Latitude = 83.03172364000761, Longitude = -119.73791635149415


Function to calculate new position from current position, displacement, and heading

In [29]:
# Import the math module
import math

# Redefine the calculate_new_position function with wrapping logic
def calculate_new_position(current_position, displacement, heading):
    R = 6371000  # Earth's radius in meters
    
    # Convert inputs to radians
    lat1 = math.radians(current_position[0])
    lon1 = math.radians(current_position[1])
    heading_rad = math.radians(heading)
    
    # Compute new latitude
    lat2 = math.asin(math.sin(lat1) * math.cos(displacement / R) +
                     math.cos(lat1) * math.sin(displacement / R) * math.cos(heading_rad))
    
    # Compute new longitude
    lon2 = lon1 + math.atan2(math.sin(heading_rad) * math.sin(displacement / R) * math.cos(lat1),
                             math.cos(displacement / R) - math.sin(lat1) * math.sin(lat2))
    
    # Convert back to degrees
    new_lat = math.degrees(lat2)
    new_lon = math.degrees(lon2)
    
    # Wrap longitude to [-180, 180]
    if new_lon > 180:
        new_lon -= 360
    elif new_lon < -180:
        new_lon += 360
    
    return new_lat, new_lon


## Model Optimization and Evaluation Functions

These functions handle model optimization with AutoML and evaluation of predictions. (implementation ongoing)

In [10]:
def evaluate_predictions(true_data, predicted_file):
    """
    Evaluate the accuracy of predictions against true data.
    """
    # Read predicted data
    pred_data = pd.read_csv(predicted_file)
    pred_data['Datetime'] = pd.to_datetime(pred_data['Datetime'])
    
    # Merge true and predicted data on datetime
    merged_data = pd.merge(
        true_data,
        pred_data,
        left_on=['datetime'],
        right_on=['Datetime'],
        suffixes=('_true', '_pred')
    )
    
    # Calculate position errors
    position_errors = []
    for _, row in merged_data.iterrows():
        true_pos = (row['Latitude_true'], row['Longitude_true'])
        pred_pos = (row['Latitude'], row['Longitude'])
        error_km = haversine(true_pos, pred_pos)
        position_errors.append(error_km)
    
    merged_data['position_error_km'] = position_errors
    
    # Calculate metrics
    metrics = {
        'mean_position_error_km': np.mean(position_errors),
        'median_position_error_km': np.median(position_errors),
        'max_position_error_km': np.max(position_errors),
        'std_position_error_km': np.std(position_errors),
        'rmse_lat': np.sqrt(mean_squared_error(merged_data['Latitude_true'], merged_data['Latitude'])),
        'rmse_lon': np.sqrt(mean_squared_error(merged_data['Longitude_true'], merged_data['Longitude'])),
        'mae_lat': mean_absolute_error(merged_data['Latitude_true'], merged_data['Latitude']),
        'mae_lon': mean_absolute_error(merged_data['Longitude_true'], merged_data['Longitude'])
    }
    
    return metrics, merged_data

def plot_trajectory_comparison(merged_data, buoy_id, model_name):
    """
    Plot true vs predicted trajectories
    """
    plt.figure(figsize=(12, 8))
    plt.plot(merged_data['Longitude_true'], merged_data['Latitude_true'], 
             'b-', label='True Trajectory')
    plt.plot(merged_data['Longitude'], merged_data['Latitude'], 
             'r--', label='Predicted Trajectory')
    plt.title(f'True vs Predicted Trajectory - Buoy {buoy_id}\nModel: {model_name}')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()
    plt.grid(True)
    
    # Save plot
    plt.savefig(f'../data/processed/predictions/trajectory_comparison_{buoy_id}_{model_name}.png')
    plt.close()

def evaluate_all_predictions(val_data, predictions_dir):
    """
    Evaluate all prediction files in the specified directory
    """
    results = []
    prediction_files = glob.glob(f"{predictions_dir}/predicted_*.csv")
    
    for pred_file in prediction_files:
        # Extract buoy_id and model_name from filename
        filename = pred_file.split('/')[-1]
        buoy_id = filename.split('_')[1]
        model_name = filename.split('_')[2].replace('.csv', '')
        
        # Get true data for this buoy
        true_data_buoy = val_data[val_data['BuoyID'] == int(buoy_id)]
        
        # Calculate metrics
        metrics, merged_data = evaluate_predictions(true_data_buoy, pred_file)
        metrics['buoy_id'] = buoy_id
        metrics['model_name'] = model_name
        
        # Plot trajectory comparison
        plot_trajectory_comparison(merged_data, buoy_id, model_name)
        
        results.append(metrics)
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    # Save results
    results_df.to_csv(f'{predictions_dir}/evaluation_results.csv', index=False)
    
    return results_df

Iterative predictor function

In [30]:
def iterative_prediction(val_data, model, tree, valid_times, latitudes, longitudes, lat_lon_pairs):

    # Initialize an empty list to store predictions for all buoys
    all_predictions = []

    # Iterate over each unique BuoyID
    unique_buoy_ids = val_data['BuoyID'].unique()
    for buoy_id in unique_buoy_ids:
        buoy_data = val_data[val_data['BuoyID'] == buoy_id]

        # Initialize an empty list to store predictions for the current buoy
        predictions = []

        # Extract initial conditions for the current buoy
        current_lat, current_lon = buoy_data.iloc[0][['Latitude', 'Longitude']]
        current_uwnd, current_vwnd = buoy_data.iloc[0][['era5_uwnd', 'era5_vwnd']]

        # Add the initial condition as the first prediction
        predictions.append([current_lat, current_lon, buoy_data.iloc[0]['datetime']])

        for i in range(1, len(buoy_data)):
            next_row = buoy_data.iloc[i]

            # Prepare input data
            input_data = pd.DataFrame({
                'Latitude': [current_lat],
                'Longitude': [current_lon],
                'era5_uwnd': [current_uwnd],
                'era5_vwnd': [current_vwnd]
            })

            # Make prediction for displacement and heading
            predicted_displacement, predicted_heading = model.predict(input_data)[0]
            predicted_lat, predicted_lon = calculate_new_position(
                (current_lat, current_lon),
                predicted_displacement,
                predicted_heading
            )

            # Extract wind components at the predicted position and time
            predicted_wind_u, predicted_wind_v = extract_wind_components(
                predicted_lat, 
                predicted_lon, 
                next_row['datetime'],
                tree,
                valid_times,
                latitudes,
                longitudes,
                lat_lon_pairs
            )

            # Append the prediction for the current buoy
            predictions.append([predicted_lat, predicted_lon, next_row['datetime']])

            # Update current state for the next iteration
            current_lat, current_lon = predicted_lat, predicted_lon
            current_uwnd, current_vwnd = predicted_wind_u, predicted_wind_v

        # Append predictions of the current buoy to all_predictions
        all_predictions.extend(predictions)

    # Convert all predictions to a NumPy array before returning
    all_predictions_array = np.array(all_predictions, dtype=object)
    return all_predictions_array


Model selection, training, and validation (includes computation timing calculation)

In [31]:
# LightGBM verbosity suppression
lgb_params = {'verbose': -1}

# Load the data from the spreadsheet
buoy_data = pd.read_csv('../combined_buoy_data.csv')

# Drop unused columns
columns_to_keep = ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'era5_uwnd', 'era5_vwnd', 'displacement', 'heading']
buoy_data = buoy_data[columns_to_keep].copy()
buoy_data['datetime'] = pd.to_datetime(buoy_data['datetime'])

# Define features and targets
X = buoy_data[['Latitude', 'Longitude', 'era5_uwnd', 'era5_vwnd', 'BuoyID', 'datetime']]
y = buoy_data[['displacement', 'heading']]
groups = buoy_data['BuoyID']

# Models to evaluate
model_configs = [
    ('ElasticNet', MultiOutputRegressor(ElasticNet(alpha=1.0, l1_ratio=0.5))),
    ('GradientBoosting', MultiOutputRegressor(GradientBoostingRegressor(n_estimators=100, max_depth=5))),
    ('RandomForest', RandomForestRegressor(n_estimators=100, max_depth=10)),
    ('XGBoost', MultiOutputRegressor(XGBRegressor(n_estimators=100, max_depth=6, objective='reg:squarederror'))),
    ('LightGBM', MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100, max_depth=6, **lgb_params)))
]

# GroupKFold for cross-validation
cv_folds = 5
group_kf = GroupKFold(n_splits=cv_folds)

# Ensure the predictions directory exists
predictions_dir = '../data/processed/predictions'
os.makedirs(predictions_dir, exist_ok=True)

# Initialize DataFrame to store results
results = []

# Cross-validation
for model_name, model in model_configs:
    print(f"\nTesting model: {model_name}")
    model_scores = []  # To store RMSE for each fold
    fold_times = []  # To store time taken for each fold

    for fold_num, (train_index, val_index) in enumerate(group_kf.split(X, y, groups=groups)):
        print(f"\nFold {fold_num + 1}")
        start_time = time.time()

        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Retain 'BuoyID' in X_val for iteration step
        X_val_with_buoyid = X_val.copy()
        X_train = X_train.drop(columns=['BuoyID', 'datetime'])
        X_val = X_val.drop(columns=['BuoyID', 'datetime'])

        # Train the model
        model.fit(X_train, y_train)

        # Predict iteratively
        y_pred = iterative_prediction(
            val_data=X_val_with_buoyid,
            model=model,
            tree=tree,
            valid_times=valid_time_dt,
            latitudes=latitudes,
            longitudes=longitudes,
            lat_lon_pairs=lat_lon_pairs
        )

        # Convert predictions to a DataFrame for easier handling
        y_pred = pd.DataFrame(y_pred, columns=['Latitude', 'Longitude', 'datetime'])

        # Exclude the datetime column for RMSE calculation and ensure numeric dtype
        y_pred_numeric = np.array(y_pred[['Latitude', 'Longitude']].to_numpy(), dtype=np.float64)

        # Ensure y_val is in the same format
        y_val_numeric = y_val.to_numpy()

        # Calculate RMSE
        try:
            rmse = np.sqrt(mean_squared_error(y_val_numeric, y_pred_numeric))
            model_scores.append(rmse)
            print(f"Fold {fold_num + 1} RMSE: {rmse:.3f}")
        except ValueError as e:
            print(f"Error calculating RMSE: {e}")
            continue

        # Record time taken for the fold
        fold_time = time.time() - start_time
        fold_times.append(fold_time)
        print(f"Fold {fold_num + 1} time: {fold_time:.2f} seconds")

        # Save predictions and true values to CSV
        predictions_df = pd.DataFrame({
            'BuoyID': X_val_with_buoyid['BuoyID'].values,  # Add BuoyID to the output
            'True Latitude': X_val_with_buoyid['Latitude'].values,  # Use latitude from X_val_with_buoyid
            'True Longitude': X_val_with_buoyid['Longitude'].values,  # Use longitude from X_val_with_buoyid
            'Predicted Latitude': np.round(y_pred_numeric[:, 0], 4),  # Predicted latitude rounded to 4 decimal places
            'Predicted Longitude': np.round(y_pred_numeric[:, 1], 4)  # Predicted longitude rounded to 4 decimal places
        })
        predictions_file = os.path.join(predictions_dir, f"{model_name}_fold{fold_num + 1}_predictions.csv")
        predictions_df.to_csv(predictions_file, index=False)

    # Store results for this model
    mean_rmse = np.mean(model_scores)
    std_rmse = np.std(model_scores)
    total_time = sum(fold_times)

    results.append({
        'Model': model_name,
        'Mean RMSE': mean_rmse,
        'RMSE StdDev': std_rmse,
        'Total Time (s)': total_time,
        'Mean Time per Fold (s)': np.mean(fold_times)
    })

    print(f"\nCompleted cross-validation for {model_name}. "
          f"Mean RMSE: {mean_rmse:.3f}, Std. Dev: {std_rmse:.3f}, Total Time: {total_time:.2f} seconds")

# Convert results to a DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('model_comparison_results.csv', index=False)

# Identify the best model based on mean RMSE
best_model_row = results_df.loc[results_df['Mean RMSE'].idxmin()]
print(f"\n=== Best model selected: {best_model_row['Model']} ===")
print(f"Mean RMSE: {best_model_row['Mean RMSE']:.3f}, Total Time: {best_model_row['Total Time (s)']:.2f} seconds")

# Store the best model
best_model = model_configs[results_df['Mean RMSE'].idxmin()][1]

print(f"Best model: {best_model_row['Model']}")
print(f"Mean RMSE: {best_model_row['Mean RMSE']:.3f}")
print(f"Total Time: {best_model_row['Total Time (s)']:.2f} seconds")


Testing model: ElasticNet

Fold 1
Fold 1 RMSE: 827.064
Fold 1 time: 380.83 seconds

Fold 2
Fold 2 RMSE: 767.791
Fold 2 time: 408.17 seconds

Fold 3
Fold 3 RMSE: 825.619
Fold 3 time: 413.79 seconds

Fold 4
Fold 4 RMSE: 813.975
Fold 4 time: 409.12 seconds

Fold 5
Fold 5 RMSE: 1829.535
Fold 5 time: 414.53 seconds

Completed cross-validation for ElasticNet. Mean RMSE: 1012.797, Std. Dev: 408.938, Total Time: 2026.44 seconds

Testing model: GradientBoosting

Fold 1
Fold 1 RMSE: 824.468
Fold 1 time: 984.29 seconds

Fold 2
Fold 2 RMSE: 768.219
Fold 2 time: 949.54 seconds

Fold 3
Fold 3 RMSE: 821.637
Fold 3 time: 927.72 seconds

Fold 4
Fold 4 RMSE: 818.564
Fold 4 time: 928.62 seconds

Fold 5
Fold 5 RMSE: 1830.465
Fold 5 time: 930.42 seconds

Completed cross-validation for GradientBoosting. Mean RMSE: 1012.671, Std. Dev: 409.423, Total Time: 4720.60 seconds

Testing model: RandomForest

Fold 1
Fold 1 RMSE: 824.421
Fold 1 time: 1031.53 seconds

Fold 2
Fold 2 RMSE: 766.104
Fold 2 time: 1012.37 s

Hyperparameter tuning on the best model with Optuna

In [None]:
from joblib import Parallel, delayed
import optuna

# Define the objective function for hyperparameter tuning
def objective(trial):
    if best_model_row['Model'] == 'ElasticNet':
        alpha = trial.suggest_float('alpha', 0.1, 10.0, log=True)
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
        model = MultiOutputRegressor(ElasticNet(alpha=alpha, l1_ratio=l1_ratio))
    elif best_model_row['Model'] == 'GradientBoosting':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = MultiOutputRegressor(GradientBoostingRegressor(n_estimators=n_estimators, max_depth=max_depth))
    elif best_model_row['Model'] == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 5, 15)
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    elif best_model_row['Model'] == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        model = MultiOutputRegressor(XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate, objective='reg:squarederror'))
    elif best_model_row['Model'] == 'LightGBM':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        model = MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=n_estimators, max_depth=max_depth, learning_rate=learning_rate))

    # Function to process one fold (to parallelize)
    def process_fold(train_index, val_index):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Convert to numpy arrays and reduce precision
        X_train = X_train.drop(columns=['BuoyID', 'datetime']).to_numpy(dtype='float32')
        X_val = X_val.drop(columns=['BuoyID', 'datetime']).to_numpy(dtype='float32')
        y_train = y_train.to_numpy(dtype='float32')
        y_val = y_val.to_numpy(dtype='float32')

        # Train the model
        model.fit(X_train, y_train)

        # Iterative prediction
        y_pred = iterative_prediction(
            val_data=X.iloc[val_index],
            model=model,
            tree=tree,
            valid_times=valid_time_dt,
            latitudes=latitudes,
            longitudes=longitudes,
            lat_lon_pairs=lat_lon_pairs
        )

        # Filter out datetime column from predictions
        y_pred_filtered = y_pred[:, :2]  # Keep only Longitude and Latitude

        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_val, y_pred_filtered))

        # Free memory
        del X_train, X_val, y_train, y_val, y_pred, y_pred_filtered
        gc.collect()

        return rmse

    # Limit number of folds to 3
    folds = list(group_kf.split(X, y, groups=groups))[:3]

    # Parallel cross-validation using joblib
    model_scores = Parallel(n_jobs=-1)(delayed(process_fold)(train_idx, val_idx) for train_idx, val_idx in folds)

    # Pruning - Stop unpromising trials early
    intermediate_value = np.mean(model_scores)
    trial.report(intermediate_value, step=0)  # Only one step per trial here
    if trial.should_prune():
        raise optuna.TrialPruned()

    return intermediate_value


# Create an Optuna study with pruning
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0)
)

# Optimize with reduced trials
study.optimize(objective, n_trials=20)  # Reduced to 20 trials

# Retrieve the best parameters
best_params = study.best_params
print(f"Best parameters: {best_params}")

[I 2024-11-28 09:29:31,329] A new study created in memory with name: no-name-3b7c8661-3f3a-41d7-9542-5d88cb88f0b4


Making predictions with the best tuned model and saving the results for evaluation

In [None]:
# Split the data while ensuring group integrity
train_idx, eval_idx = next(GroupKFold(n_splits=5).split(X, y, groups=groups))
X_train, X_eval = X.iloc[train_idx], X.iloc[eval_idx]
y_train, y_eval = y.iloc[train_idx], y.iloc[eval_idx]

# Drop 'BuoyID' and 'datetime' for training the model
X_train_clean = X_train.drop(columns=['BuoyID', 'datetime'])
X_eval_clean = X_eval.drop(columns=['BuoyID', 'datetime'])

# Instantiate the model using best_params
if best_model_row['Model'] == 'ElasticNet':
    best_model = MultiOutputRegressor(ElasticNet(**best_params))
elif best_model_row['Model'] == 'GradientBoosting':
    best_model = MultiOutputRegressor(GradientBoostingRegressor(**best_params))
elif best_model_row['Model'] == 'RandomForest':
    best_model = RandomForestRegressor(**best_params)
elif best_model_row['Model'] == 'XGBoost':
    best_model = MultiOutputRegressor(XGBRegressor(**best_params, objective='reg:squarederror'))
elif best_model_row['Model'] == 'LightGBM':
    best_model = MultiOutputRegressor(lgb.LGBMRegressor(**best_params))

# Fit the tuned model on the training data
best_model.fit(X_train_clean, y_train)

# Prepare for iterative predictions
X_eval_with_buoyid = X_eval.copy()

# Use the iterative_prediction function for evaluation
y_pred = iterative_prediction(
    val_data=X_eval_with_buoyid,
    model=best_model,
    tree=tree,
    valid_times=valid_time_dt,
    latitudes=latitudes,
    longitudes=longitudes,
    lat_lon_pairs=lat_lon_pairs
)

# Convert predictions to a DataFrame for comparison
y_pred_df = pd.DataFrame(y_pred, columns=['Latitude', 'Longitude'])

# Calculate evaluation metrics
y_pred_numeric = y_pred_df.to_numpy()
y_eval_numeric = y_eval.to_numpy()

# RMSE
eval_rmse = np.sqrt(mean_squared_error(y_eval_numeric, y_pred_numeric))
print(f"Evaluation RMSE: {eval_rmse:.3f}")

# Save predictions for analysis
predictions_df = pd.DataFrame({
    'True Latitude': y_eval['Latitude'].values,
    'True Longitude': y_eval['Longitude'].values,
    'Predicted Latitude': y_pred_df['Latitude'].values,
    'Predicted Longitude': y_pred_df['Longitude'].values
})

# Save predictions to a CSV file
predictions_file = '../data/processed/predictions/tuned_model_predictions.csv'
predictions_df.to_csv(predictions_file, index=False)

print(f"Predictions saved to: {predictions_file}")

## Prediction Evaluation

In [None]:
# Evaluate all predictions
print("Evaluating predictions...")
results_df = evaluate_all_predictions(val_data, '../data/processed/predictions/')

# Display summary statistics
print("\nSummary Statistics:")
print("\nMean metrics across all buoys:")
print(results_df.mean(numeric_only=True))
print("\nMetrics by model:")
print(results_df.groupby('model_name').mean(numeric_only=True))

# Plot overall error distribution
plt.figure(figsize=(10, 6))
plt.hist(results_df['mean_position_error_km'], bins=20)
plt.title('Distribution of Mean Position Errors')
plt.xlabel('Mean Position Error (km)')
plt.ylabel('Frequency')
plt.savefig('../data/processed/predictions/error_distribution.png')
plt.close()