Package imports

In [3]:
import gc
import os
from datetime import datetime, timedelta
import netCDF4 as nc
import numpy as np
import pandas as pd
from geopy import Point
from geopy.distance import great_circle
from scipy.spatial import cKDTree
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso, BayesianRidge, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
import lightgbm as lgb
import time
import math

Function to pre-process spatial data

In [4]:
# Precompute the KDTree and valid_time differences
def precompute_kdtree_and_time_diffs(uwnd_nc_file_path):
    try:
        print("Precomputing KDTree and time differences...")
        # Load the NetCDF file
        ds = nc.Dataset(uwnd_nc_file_path)

        # Extract the valid_time, latitudes, and longitudes from the NetCDF file
        valid_time = ds.variables['valid_time'][:]  # Assuming 'valid_time' is the variable name for time
        latitudes = ds.variables['latitude'][:]
        longitudes = ds.variables['longitude'][:]

        # Convert valid_time from seconds since 1970-01-01 to datetime
        base_time = datetime(1970, 1, 1)
        valid_time_dt = np.array([base_time + timedelta(seconds=int(ts)) for ts in valid_time], dtype='datetime64[ns]')

        # Create a KDTree for fast spatial lookup
        lat_lon_pairs = np.array([(lat, lon) for lat in latitudes for lon in longitudes])
        tree = cKDTree(lat_lon_pairs)

        print("KDTree and time differences precomputed successfully.")
        return tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs
    except Exception as e:
        print(f"Error precomputing KDTree and time differences: {e}")
        raise

uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'
try:
    tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs = precompute_kdtree_and_time_diffs(uwnd_nc_file_path)
except Exception as e:
    print(f"Error precomputing KDTree and time differences: {e}")
    raise


Precomputing KDTree and time differences...
KDTree and time differences precomputed successfully.


Function to extract wind components at a given lat/lon (preloads reanalysis netCDFs also)

In [5]:
uwnd_ds = nc.Dataset(uwnd_nc_file_path)
vwnd_ds = nc.Dataset(vwnd_nc_file_path)

uwnd_array = uwnd_ds.variables['u'][:, 0, :, :]  # Assuming 'u' is the variable name for u-component wind and removing the pressure dimension
vwnd_array = vwnd_ds.variables['v'][:, 0, :, :]  # Assuming 'v' is the variable name for v-component wind and removing the pressure dimension

# Function to extract wind components
def extract_wind_components(lat, lon, dt, tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs):
    try:
        # Convert the given datetime to a numpy datetime64 object
        row_datetime = np.datetime64(dt)

        # Find the value in the valid_time dimension closest in time to the datetime in the dataframe
        time_diffs = np.abs(valid_time_dt - row_datetime)
        closest_time_index = np.argmin(time_diffs)

        # Check if the calculated index is within the bounds of the uwnd_array
        if closest_time_index < 0 or closest_time_index >= uwnd_array.shape[0]:
            raise ValueError("The given datetime is out of bounds for the NetCDF data")

        # Select the corresponding netCDF slices
        uwnd_slice = uwnd_array[closest_time_index, :, :]
        vwnd_slice = vwnd_array[closest_time_index, :, :]

        # Find the grid cell of the netCDF slice closest to the given Latitude and Longitude position
        lat_lon = (lat, lon)
        _, closest_point_index = tree.query(lat_lon)
        closest_lat, closest_lon = lat_lon_pairs[closest_point_index]

        # Find the index of the closest latitude/longitude pair in the arrays
        lat_index = np.where(latitudes == closest_lat)[0][0]
        lon_index = np.where(longitudes == closest_lon)[0][0]

        # Extract the u and v wind components
        u_wind = uwnd_slice[lat_index, lon_index]
        v_wind = vwnd_slice[lat_index, lon_index]

        # Round wind components to 4 decimal places
        u_wind = round(u_wind, 4)
        v_wind = round(v_wind, 4)

        return u_wind, v_wind
    except Exception as e:
        print(f"Error extracting wind components: {e}")
        raise

Function to calculate new position from current position, displacement, and heading

In [6]:
def calculate_new_position(current_position, displacement, heading):
    R = 6371000  # Earth's radius in meters
    
    lat1 = math.radians(current_position[0])
    lon1 = math.radians(current_position[1])
    heading_rad = math.radians(heading)
    
    lat2 = math.asin(math.sin(lat1) * math.cos(displacement / R) +
                     math.cos(lat1) * math.sin(displacement / R) * math.cos(heading_rad))
    
    lon2 = lon1 + math.atan2(math.sin(heading_rad) * math.sin(displacement / R) * math.cos(lat1),
                             math.cos(displacement / R) - math.sin(lat1) * math.sin(lat2))
    
    return math.degrees(lat2), math.degrees(lon2)

Iterative predictor function

In [7]:
def iterative_prediction(val_data, model, tree, valid_times, latitudes, longitudes, lat_lon_pairs, output_file_path, max_predicts=None):

    # Check if the output file exists and write the header only if it doesn't
    if not os.path.exists(output_file_path):
        with open(output_file_path, 'w') as file:
            file.write("Predicted_Latitude,Predicted_Longitude,Datetime,BuoyID,Wind_U,Wind_V\n")

    # Print the input data length
    print(f"Number of rows in input validation data: {len(val_data)}")

    with open(output_file_path, 'a') as file:
        current_lat, current_lon = val_data.iloc[0][['Latitude', 'Longitude']]
        current_uwnd, current_vwnd = val_data.iloc[0][['era5_uwnd', 'era5_vwnd']]
        buoy_id = val_data.iloc[0]['BuoyID']
        
        print("\nInitial conditions:")
        print(f"Latitude: {current_lat:.2f}, Longitude: {current_lon:.2f}, Datetime: {val_data.iloc[0]['datetime']}, BuoyID: {buoy_id}")

        # Initialize output count for tracking
        output_count = 0

        for i in range(len(val_data)):

            # Check if the maximum number of predictions has been reached
            if max_predicts is not None and i >= max_predicts:
                print(f"Maximum number of predictions ({max_predicts}) reached. Stopping the script.")
                break

            next_row = val_data.iloc[i]
            
            input_data = pd.DataFrame({
                'Latitude': [current_lat],
                'Longitude': [current_lon],
                'era5_uwnd': [current_uwnd],
                'era5_vwnd': [current_vwnd],
                'BuoyID': [buoy_id]
            })
            
            predicted_displacement, predicted_heading = model.predict(input_data)[0]
            predicted_lat, predicted_lon = calculate_new_position(
                (current_lat, current_lon),
                predicted_displacement,
                predicted_heading
            )
            
            predicted_wind_u, predicted_wind_v = extract_wind_components(
                predicted_lat, 
                predicted_lon, 
                next_row['datetime'],
                tree,
                valid_times,
                latitudes,
                longitudes,
                lat_lon_pairs
            )
            
            current_lat, current_lon = predicted_lat, predicted_lon
            current_uwnd, current_vwnd = predicted_wind_u, predicted_wind_v

            # Write the prediction to the output file, including wind components
            file.write(f"{current_lat},{current_lon},{next_row['datetime']},{buoy_id},{predicted_wind_u},{predicted_wind_v}\n")
            output_count += 1  # Increment output count for each successful prediction

        # Print the output data length after predictions
        print(f"Number of rows written to output file: {output_count}")



Splitting the data into training and validation sets

In [9]:
# Parameters for track handling
temporal_window_size = 5  # Define the size of the window in time steps

# Load and prepare data
buoy_data = pd.read_csv('../processed_buoy_data.csv')

# Keep relevant columns and convert datetime
columns_to_keep = ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'wind_magnitude', 'wind_angle', 'displacement', 'heading']
print("Dropping unused columns and retaining only:", columns_to_keep)
buoy_data = buoy_data[columns_to_keep].copy()
buoy_data['datetime'] = pd.to_datetime(buoy_data['datetime'])
print("Datetime column successfully converted to datetime format.")

# Split data by BuoyID into training and validation sets
print("Splitting data into training and validation sets by unique Buoy IDs.")
buoy_ids = buoy_data['BuoyID'].unique()
train_ids = np.random.choice(buoy_ids, size=int(len(buoy_ids) - 5), replace=False)
val_ids = np.setdiff1d(buoy_ids, train_ids)
print(f"Training IDs count: {len(train_ids)}, Validation IDs count: {len(val_ids)}")

train_data = buoy_data[buoy_data['BuoyID'].isin(train_ids)].copy()
val_data = buoy_data[buoy_data['BuoyID'].isin(val_ids)].copy()
print(f"Training data shape: {train_data.shape}, Validation data shape: {val_data.shape}")

# Prepare temporal windows for training
X_train, y_train = [], []
for buoy_id in train_data['BuoyID'].unique():
    track_data = train_data[train_data['BuoyID'] == buoy_id].sort_values('datetime').reset_index(drop=True)
    
    for i in range(temporal_window_size, len(track_data)):
        # Select data within the temporal window
        window = track_data.iloc[i-temporal_window_size:i]
        
        # Aggregate window data (example: mean or last values)
        window_features = [
            window['Latitude'].values[-1],  # Current latitude
            window['Longitude'].values[-1], # Current longitude
            window['wind_magnitude'].values[-1], # Current wind magnitude
            window['wind_angle'].values[-1],     # Current wind angle
            buoy_id                               # Buoy ID (as-is or encoded if needed)
        ]
        
        # Append features and target for each window
        X_train.append(window_features)
        y_train.append(track_data.iloc[i][['displacement', 'heading']].values)

# Convert to DataFrame for model training
X_train = pd.DataFrame(X_train, columns=['Latitude', 'Longitude', 'wind_magnitude', 'wind_angle', 'BuoyID'])
y_train = pd.DataFrame(y_train, columns=['displacement', 'heading'])

print(f"Prepared training data with temporal windowing. Feature set shape: {X_train.shape}, Target set shape: {y_train.shape}")

# Train the model
print("Training model...")
model = RandomForestRegressor()
model.fit(X_train, y_train)
print("Model training complete.")


Dropping unused columns and retaining only: ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'wind_magnitude', 'wind_angle', 'displacement', 'heading']
Datetime column successfully converted to datetime format.
Splitting data into training and validation sets by unique Buoy IDs.
Training IDs count: 281, Validation IDs count: 5
Training data shape: (1742093, 8), Validation data shape: (36607, 8)
Prepared training data with temporal windowing. Feature set shape: (1740688, 5), Target set shape: (1740688, 2)
Training model...
Model training complete.


Predicting

In [10]:
print("\nStarting iterative predictions on validation subset...")

output_file_path = '../data/processed/predictions/'

# Create the output directory if it does not exist
if not os.path.exists(output_file_path):
    os.makedirs(output_file_path)

# Get unique buoyID values in the validation data
unique_buoy_ids = val_data['BuoyID'].unique()

# Loop through each buoyID and make predictions
for buoy_id in unique_buoy_ids:
    print(f"Processing BuoyID: {buoy_id}")
    
    # Filter validation data for the current buoyID
    val_data_buoy = val_data[val_data['BuoyID'] == buoy_id]
    
    # Create an output file path for the current buoyID
    output_file_path_buoy = f"{output_file_path}predicted_{buoy_id}.csv"
    
    # Print the output file path for the current buoyID
    print(f"Output file path for BuoyID {buoy_id}: {output_file_path_buoy}")

    # Make iterative predictions for the current buoyID
    iterative_prediction(
        val_data=val_data_buoy,
        model=model,
        tree=tree,
        valid_times=valid_time_dt,
        latitudes=latitudes,
        longitudes=longitudes,
        output_file_path=output_file_path_buoy,
        lat_lon_pairs=lat_lon_pairs,
        max_predicts=None
    )
    print(f"Predictions for BuoyID {buoy_id} completed and saved")


Starting iterative predictions on validation subset...
Processing BuoyID: 900126
Output file path for BuoyID 900126: ../data/processed/predictions/predicted_900126.csv
Number of rows in input validation data: 17433


KeyError: "None of [Index(['era5_uwnd', 'era5_vwnd'], dtype='object')] are in the [index]"