Package imports

In [None]:
import gc
import math
import os
import time
from datetime import datetime, timedelta

import netCDF4 as nc
import numpy as np
import pandas as pd
from geopy import Point
from geopy.distance import great_circle
from scipy.spatial import cKDTree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso, BayesianRidge
from xgboost import XGBRegressor
import lightgbm as lgb

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import os

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
import optuna
from scipy.stats import randint, uniform
import glob
from haversine import haversine
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

ModuleNotFoundError: No module named 'netCDF4'

Function to pre-process spatial data

In [15]:
# Precompute the KDTree and valid_time differences
def precompute_kdtree_and_time_diffs(uwnd_nc_file_path):
    try:
        print("Precomputing KDTree and time differences...")
        # Load the NetCDF file
        ds = nc.Dataset(uwnd_nc_file_path)

        # Extract the valid_time, latitudes, and longitudes from the NetCDF file
        valid_time = ds.variables['valid_time'][:]  # Assuming 'valid_time' is the variable name for time
        latitudes = ds.variables['latitude'][:]
        longitudes = ds.variables['longitude'][:]

        # Convert valid_time from seconds since 1970-01-01 to datetime
        base_time = datetime(1970, 1, 1)
        valid_time_dt = np.array([base_time + timedelta(seconds=int(ts)) for ts in valid_time], dtype='datetime64[ns]')

        # Create a KDTree for fast spatial lookup
        lat_lon_pairs = np.array([(lat, lon) for lat in latitudes for lon in longitudes])
        tree = cKDTree(lat_lon_pairs)

        print("KDTree and time differences precomputed successfully.")
        return tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs
    except Exception as e:
        print(f"Error precomputing KDTree and time differences: {e}")
        raise

uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'
try:
    tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs = precompute_kdtree_and_time_diffs(uwnd_nc_file_path)
except Exception as e:
    print(f"Error precomputing KDTree and time differences: {e}")
    raise


Precomputing KDTree and time differences...
KDTree and time differences precomputed successfully.


Function to extract wind components at a given lat/lon (preloads reanalysis netCDFs also)

In [16]:
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'

uwnd_ds = nc.Dataset(uwnd_nc_file_path)
vwnd_ds = nc.Dataset(vwnd_nc_file_path)

uwnd_array = uwnd_ds.variables['u'][:, 0, :, :]  # Assuming 'u' is the variable name for u-component wind and removing the pressure dimension
vwnd_array = vwnd_ds.variables['v'][:, 0, :, :]  # Assuming 'v' is the variable name for v-component wind and removing the pressure dimension

# Function to extract wind components
def extract_wind_components(lat, lon, dt, tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs):
    try:
        # Convert the given datetime to a numpy datetime64 object
        row_datetime = np.datetime64(dt)

        # Find the value in the valid_time dimension closest in time to the datetime in the dataframe
        time_diffs = np.abs(valid_time_dt - row_datetime)
        closest_time_index = np.argmin(time_diffs)

        # Check if the calculated index is within the bounds of the uwnd_array
        if closest_time_index < 0 or closest_time_index >= uwnd_array.shape[0]:
            raise ValueError("The given datetime is out of bounds for the NetCDF data")

        # Select the corresponding netCDF slices
        uwnd_slice = uwnd_array[closest_time_index, :, :]
        vwnd_slice = vwnd_array[closest_time_index, :, :]

        # Find the grid cell of the netCDF slice closest to the given Latitude and Longitude position
        lat_lon = (lat, lon)
        _, closest_point_index = tree.query(lat_lon)
        closest_lat, closest_lon = lat_lon_pairs[closest_point_index]

        # Find the index of the closest latitude/longitude pair in the arrays
        lat_index = np.where(latitudes == closest_lat)[0][0]
        lon_index = np.where(longitudes == closest_lon)[0][0]

        # Extract the u and v wind components
        u_wind = uwnd_slice[lat_index, lon_index]
        v_wind = vwnd_slice[lat_index, lon_index]

        # Round wind components to 4 decimal places
        u_wind = round(u_wind, 4)
        v_wind = round(v_wind, 4)

        return u_wind, v_wind
    except Exception as e:
        print(f"Error extracting wind components: {e}")
        raise

Function to calculate new position from current position, displacement, and heading

In [17]:
def calculate_new_position(current_position, displacement, heading):
    R = 6371000  # Earth's radius in meters
    
    lat1 = math.radians(current_position[0])
    lon1 = math.radians(current_position[1])
    heading_rad = math.radians(heading)
    
    lat2 = math.asin(math.sin(lat1) * math.cos(displacement / R) +
                     math.cos(lat1) * math.sin(displacement / R) * math.cos(heading_rad))
    
    lon2 = lon1 + math.atan2(math.sin(heading_rad) * math.sin(displacement / R) * math.cos(lat1),
                             math.cos(displacement / R) - math.sin(lat1) * math.sin(lat2))
    
    return math.degrees(lat2), math.degrees(lon2)

## Model Optimization and Evaluation Functions

These functions handle model optimization with AutoML and evaluation of predictions.

In [None]:
def evaluate_predictions(true_data, predicted_file):
    """
    Evaluate the accuracy of predictions against true data.
    """
    # Read predicted data
    pred_data = pd.read_csv(predicted_file)
    pred_data['Datetime'] = pd.to_datetime(pred_data['Datetime'])
    
    # Merge true and predicted data on datetime
    merged_data = pd.merge(
        true_data,
        pred_data,
        left_on=['datetime'],
        right_on=['Datetime'],
        suffixes=('_true', '_pred')
    )
    
    # Calculate position errors
    position_errors = []
    for _, row in merged_data.iterrows():
        true_pos = (row['Latitude_true'], row['Longitude_true'])
        pred_pos = (row['Latitude'], row['Longitude'])
        error_km = haversine(true_pos, pred_pos)
        position_errors.append(error_km)
    
    merged_data['position_error_km'] = position_errors
    
    # Calculate metrics
    metrics = {
        'mean_position_error_km': np.mean(position_errors),
        'median_position_error_km': np.median(position_errors),
        'max_position_error_km': np.max(position_errors),
        'std_position_error_km': np.std(position_errors),
        'rmse_lat': np.sqrt(mean_squared_error(merged_data['Latitude_true'], merged_data['Latitude'])),
        'rmse_lon': np.sqrt(mean_squared_error(merged_data['Longitude_true'], merged_data['Longitude'])),
        'mae_lat': mean_absolute_error(merged_data['Latitude_true'], merged_data['Latitude']),
        'mae_lon': mean_absolute_error(merged_data['Longitude_true'], merged_data['Longitude'])
    }
    
    return metrics, merged_data

def plot_trajectory_comparison(merged_data, buoy_id, model_name):
    """
    Plot true vs predicted trajectories
    """
    plt.figure(figsize=(12, 8))
    plt.plot(merged_data['Longitude_true'], merged_data['Latitude_true'], 
             'b-', label='True Trajectory')
    plt.plot(merged_data['Longitude'], merged_data['Latitude'], 
             'r--', label='Predicted Trajectory')
    plt.title(f'True vs Predicted Trajectory - Buoy {buoy_id}\nModel: {model_name}')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()
    plt.grid(True)
    
    # Save plot
    plt.savefig(f'../data/processed/predictions/trajectory_comparison_{buoy_id}_{model_name}.png')
    plt.close()

def objective(trial, X_train, y_train):
    """
    Optuna objective function for hyperparameter optimization
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    
    model = RandomForestRegressor(**params, n_jobs=-1)
    scores = cross_val_score(
        model, X_train, y_train, 
        cv=3, scoring='neg_mean_squared_error'
    )
    
    return -np.mean(scores)

def train_optimized_model(X_train, y_train, n_trials=100):
    """
    Train model with optimized hyperparameters using Optuna
    """
    study = optuna.create_study(direction='minimize')
    study.optimize(
        lambda trial: objective(trial, X_train, y_train),
        n_trials=n_trials
    )
    
    best_params = study.best_params
    print("Best parameters:", best_params)
    
    best_model = RandomForestRegressor(**best_params, n_jobs=-1)
    best_model.fit(X_train, y_train)
    
    return best_model, best_params

def evaluate_all_predictions(val_data, predictions_dir):
    """
    Evaluate all prediction files in the specified directory
    """
    results = []
    prediction_files = glob.glob(f"{predictions_dir}/predicted_*.csv")
    
    for pred_file in prediction_files:
        # Extract buoy_id and model_name from filename
        filename = pred_file.split('/')[-1]
        buoy_id = filename.split('_')[1]
        model_name = filename.split('_')[2].replace('.csv', '')
        
        # Get true data for this buoy
        true_data_buoy = val_data[val_data['BuoyID'] == int(buoy_id)]
        
        # Calculate metrics
        metrics, merged_data = evaluate_predictions(true_data_buoy, pred_file)
        metrics['buoy_id'] = buoy_id
        metrics['model_name'] = model_name
        
        # Plot trajectory comparison
        plot_trajectory_comparison(merged_data, buoy_id, model_name)
        
        results.append(metrics)
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    # Save results
    results_df.to_csv(f'{predictions_dir}/evaluation_results.csv', index=False)
    
    return results_df

Iterative predictor function

In [None]:
def iterative_prediction(val_data, model, tree, valid_times, latitudes, longitudes, lat_lon_pairs, output_file_path):
    start_time = time.time()
    max_duration = 4 * 60 * 60  # Maximum runtime in seconds

    # Check if the output file exists and write the header only if it doesn't
    if not os.path.exists(output_file_path):
        with open(output_file_path, 'w') as file:
            file.write("Predicted_Latitude,Predicted_Longitude,Datetime,BuoyID\n")
    
    with open(output_file_path, 'a') as file:
        current_lat, current_lon = val_subset.iloc[0][['Latitude', 'Longitude']]
        current_uwnd, current_vwnd = val_subset.iloc[0][['era5_uwnd', 'era5_vwnd']]
        buoy_id = val_data.iloc[0]['BuoyID']
        
        print("\nInitial conditions:")
        print(f"Latitude: {current_lat:.2f}, Longitude: {current_lon:.2f}, Datetime: {val_data.iloc[0]['datetime']}, BuoyID: {buoy_id}")

        for i in range(1, len(val_data)):
            # Check if the maximum duration has been exceeded
            elapsed_time = time.time() - start_time
            if elapsed_time > max_duration:
                print("Maximum duration exceeded. Stopping the script.")
                break

            next_row = val_data.iloc[i]
            
            input_data = pd.DataFrame({
                'Latitude': [current_lat],
                'Longitude': [current_lon],
                'era5_uwnd': [current_uwnd],
                'era5_vwnd': [current_vwnd]
            })
            
            prediction_start_time = time.time()
            predicted_displacement, predicted_heading = model.predict(input_data)[0]
            predicted_lat, predicted_lon = calculate_new_position(
                (current_lat, current_lon),
                predicted_displacement,
                predicted_heading
            )
            prediction_end_time = time.time()
            print(f"Prediction step {i} took {prediction_end_time - prediction_start_time:.2f} seconds.")
            
            wind_extraction_start_time = time.time()
            predicted_wind_u, predicted_wind_v = extract_wind_components(
                predicted_lat, 
                predicted_lon, 
                next_row['datetime'],
                tree,
                valid_times,
                latitudes,
                longitudes,
                lat_lon_pairs
            )
            wind_extraction_end_time = time.time()
            print(f"Wind extraction step {i} took {wind_extraction_end_time - wind_extraction_start_time:.2f} seconds.")

            print(f"\nPrediction for row {i}:")
            print(f"Predicted Latitude: {predicted_lat:.2f}, Predicted Longitude: {predicted_lon:.2f}")
            print(f"Predicted Displacement: {predicted_displacement:.2f}, Predicted Heading: {predicted_heading:.2f}")
            print(f"Updated Wind U: {predicted_wind_u:.2f}, Wind V: {predicted_wind_v:.2f}")
            print(f"Target Datetime: {next_row['datetime']}, BuoyID: {buoy_id}")

            # Write results to the file
            file.write(f"{round(predicted_lat, 3)},{round(predicted_lon, 3)},{next_row['datetime']},{buoy_id}\n")

            # Stop the script if the file size exceeds 2 GB
            file_size = file.tell()
            if file_size > 2 * 1024 * 1024 * 1024:
                print("File size exceeded 2 GB. Stopping the script.")
                break

            # Update current state for next iteration
            current_lat, current_lon = predicted_lat, predicted_lon
            current_uwnd, current_vwnd = predicted_wind_u, predicted_wind_v

            gc.collect()
            if i % 10 == 0:
                print(f"Processed {i} predictions...")

    print(f"\nPrediction complete. Results saved to {output_file_path}")


    


Model selection, training, and validation

In [19]:
# Load the data from the spreadsheet
buoy_data = pd.read_csv('../processed_buoy_data.csv')

# Drop unused columns
columns_to_keep = ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'era5_uwnd', 'era5_vwnd', 'displacement', 'heading']
print("Dropping unused columns and retaining only:", columns_to_keep)
buoy_data = buoy_data[columns_to_keep].copy()
buoy_data['datetime'] = pd.to_datetime(buoy_data['datetime'])
print("Datetime column successfully converted to datetime format.")

# Split data by BuoyID into training and validation sets
print("Splitting data into training and validation sets by unique Buoy IDs.")
buoy_ids = buoy_data['BuoyID'].unique()
train_ids = np.random.choice(buoy_ids, size=int(len(buoy_ids) - 5), replace=False)
val_ids = np.setdiff1d(buoy_ids, train_ids)
print(f"Training IDs count: {len(train_ids)}, Validation IDs count: {len(val_ids)}")

train_data = buoy_data[buoy_data['BuoyID'].isin(train_ids)].copy()
val_data = buoy_data[buoy_data['BuoyID'].isin(val_ids)].copy()
print(f"Training data shape: {train_data.shape}, Validation data shape: {val_data.shape}")

# Set up training and validation data
print("Preparing training features and target labels.")
X_train = train_data[['Latitude', 'Longitude', 'era5_uwnd', 'era5_vwnd', 'BuoyID']].copy()
y_train = train_data[['displacement', 'heading']]

print(f"Feature set shape: {X_train.shape}, Target set shape: {y_train.shape}")

# Define the models to evaluate
models = [
    ('RandomForest', RandomForestRegressor()),
    ('GradientBoosting', GradientBoostingRegressor()),
    ('SVR', SVR()),
    ('XGBoost', XGBRegressor()),
    ('LightGBM', lgb.LGBMRegressor()),
    ('ElasticNet', ElasticNet())
]

# Define the number of cross-validation folds
cv_folds = 5

# Dictionary to store cross-validation results
cv_results = {}

# Function for iterative prediction
def iterative_prediction(model, val_data, n_steps=10):
    """
    Perform iterative prediction of buoy positions over a set number of steps.
    Returns predicted positions at each step.
    """
    predicted_positions = []
    for index, row in val_data.iterrows():
        lat = row['Latitude']
        lon = row['Longitude']
        uwnd = row['era5_uwnd']
        vwnd = row['era5_vwnd']
        
        # Store initial position
        positions = [(lat, lon)]
        
        # Iteratively predict future positions
        for step in range(n_steps):
            # Get displacement and heading predictions
            prediction = model.predict([[lat, lon, uwnd, vwnd]])
            displacement, heading = prediction[0]
            
            # Calculate new predicted position (based on displacement and heading)
            dx = displacement * np.cos(np.radians(heading))
            dy = displacement * np.sin(np.radians(heading))
            
            lat += dy
            lon += dx
            positions.append((lat, lon))
        
        predicted_positions.append(positions)
    
    return np.array(predicted_positions)

# Ensure 'BuoyID' is in the dataset
if 'BuoyID' not in X_train.columns:
    raise KeyError("'BuoyID' column is missing from the dataset")

# Exclude 'BuoyID' from the features
X_train_features = X_train.drop(columns=['BuoyID'])

# Iterate over each model
for model_name, model in models:
    print(f"Testing model: {model_name}")
    
    # Initialize KFold
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    
    model_scores = []
    
    # Perform cross-validation
    for fold_num, (train_index, val_index) in enumerate(kf.split(X_train_features)):
        X_train_fold, X_val_fold = X_train_features.iloc[train_index], X_train_features.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Ensure that BuoyID is preserved in both train and validation sets
        X_train_fold = X_train_features.iloc[train_index].copy()
        X_train_fold['BuoyID'] = X_train.iloc[train_index]['BuoyID'].values

        X_val_fold = X_train_features.iloc[val_index].copy()
        X_val_fold['BuoyID'] = X_train.iloc[val_index]['BuoyID'].values

        # Train the model on the training fold
        print(f"Training model on fold {fold_num+1}...")
        model.fit(X_train_fold.drop(columns=['BuoyID']), y_train_fold)
        print(f"Model {model_name} trained on Fold {fold_num+1}")
        
        val_data_with_steps = []
        total_predictions = 0
        
        # Perform iterative predictions for each unique BuoyID in the validation fold
        for buoy_id in np.unique(X_val_fold['BuoyID']):
            # Get the validation data for the current BuoyID
            buoy_data = X_val_fold[X_val_fold['BuoyID'] == buoy_id]
            true_positions = y_val_fold[y_val_fold.index.isin(buoy_data.index)]
            
            # Run iterative predictions on the validation subset with the calculated steps
            print(f"Predicting for BuoyID {buoy_id} with 10 steps...")
            iter_predictions = iterative_prediction(model, buoy_data, n_steps=10)
            
            # Store the predictions
            val_data_with_steps.append(iter_predictions)
            total_predictions += len(iter_predictions)
        
        print(f"Total predictions made in Fold {fold_num+1}: {total_predictions}")
        
        # Calculate error between predicted and actual positions for each step
        error = 0
        for pred, true in zip(val_data_with_steps, true_positions['displacement']):
            if isinstance(pred, (list, np.ndarray)):
                error += np.mean(np.sqrt((pred[:, 0] - true['Latitude'])**2 + (pred[:, 1] - true['Longitude'])**2))
            else:
                error += np.sqrt((pred[0] - true['Latitude'])**2 + (pred[1] - true['Longitude'])**2)
        
        model_scores.append(error)
        print(f"Fold {fold_num+1} error: {error}")
    
    # Store the results
    cv_results[model_name] = model_scores
    print(f"\nCross-validation completed for {model_name}. Mean error: {np.mean(model_scores):.3f}, Standard deviation: {np.std(model_scores):.3f}\n")

# Select the best model based on mean cross-validation score
best_model_name = min(cv_results, key=lambda k: np.mean(cv_results[k]))
best_model = dict(models)[best_model_name]
print(f"\n=== Best model selected: {best_model_name} with mean error: {np.mean(cv_results[best_model_name]):.3f} ===")

Dropping unused columns and retaining only: ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'era5_uwnd', 'era5_vwnd', 'displacement', 'heading']
Datetime column successfully converted to datetime format.
Splitting data into training and validation sets by unique Buoy IDs.
Training IDs count: 281, Validation IDs count: 5
Training data shape: (1754893, 8), Validation data shape: (23807, 8)
Preparing training features and target labels.
Feature set shape: (1754893, 5), Target set shape: (1754893, 2)
Testing model: RandomForest
Training model on fold 1...
Model RandomForest trained on Fold 1
Predicting for BuoyID 900115 with 10 steps...




KeyboardInterrupt: 

In [None]:
X_val_fold.head()

Once the best model is selected, run the full predictions

## Model Optimization with AutoML

In [None]:
# Train optimized model
print("Training optimized model with Optuna...")
best_model, best_params = train_optimized_model(X_train, y_train, n_trials=100)
print("\nBest parameters found:", best_params)

In [None]:

# Train the best model on the entire training dataset
print("Training best model...")
best_model.fit(X_train, y_train)
print("Model training complete.")

# Ensure the predictions directory exists
predictions_dir = '../data/predictions'
if not os.path.exists(predictions_dir):
    os.makedirs(predictions_dir)
    print(f"Directory {predictions_dir} created.")
else:
    print(f"Directory {predictions_dir} already exists.")

# Get unique BuoyIDs
unique_buoy_ids = val_data['BuoyID'].unique()

# Iterate over each BuoyID in the validation data and make predictions
for buoy_id in unique_buoy_ids:
    # Set output file path
    output_file_path = f'../data/predictions/predictions_{buoy_id}.csv'
    
    # Subset the data for the current BuoyID
    buoy_data = val_data[val_data['BuoyID'] == buoy_id]

    # Run iterative predictions on validation subset
    print("\nStarting iterative predictions on validation subset...")
    iterative_prediction(
        val_data=buoy_data,
        model=best_model,
        tree=tree,
        valid_times=valid_time_dt,
        latitudes=latitudes,
        longitudes=longitudes,
        lat_lon_pairs=lat_lon_pairs,
        output_file_path=output_file_path
    )







## Prediction Evaluation

In [None]:
# Evaluate all predictions
print("Evaluating predictions...")
results_df = evaluate_all_predictions(val_data, '../data/processed/predictions/')

# Display summary statistics
print("\nSummary Statistics:")
print("\nMean metrics across all buoys:")
print(results_df.mean(numeric_only=True))
print("\nMetrics by model:")
print(results_df.groupby('model_name').mean(numeric_only=True))

# Plot overall error distribution
plt.figure(figsize=(10, 6))
plt.hist(results_df['mean_position_error_km'], bins=20)
plt.title('Distribution of Mean Position Errors')
plt.xlabel('Mean Position Error (km)')
plt.ylabel('Frequency')
plt.savefig('../data/processed/predictions/error_distribution.png')
plt.close()

