Package imports

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
from geopy import Point
import math
import netCDF4 as nc
from datetime import datetime
from datetime import timedelta
from scipy.spatial import cKDTree

Functions to pre-process spatial data and extract wind values from a lat/lon location

In [None]:
# Precompute the KDTree and valid_time differences
def precompute_kdtree_and_time_diffs(uwnd_nc_file_path):
    """
    Build the spatial index and the time axis used for wind lookups.

    Reads the 'valid_time', 'latitude' and 'longitude' variables from the
    NetCDF file, converts the time axis to datetime64[ns] and builds a KD-tree
    over every (latitude, longitude) pair of the grid.

    Parameters:
    - uwnd_nc_file_path: path to a NetCDF file that contains the three
      coordinate variables named above

    Returns:
    - tree: cKDTree built over lat_lon_pairs
    - valid_time_dt: numpy array of datetime64[ns] timestamps
    - latitudes, longitudes: coordinate arrays as read from the file
    - lat_lon_pairs: array of shape (n_lat * n_lon, 2) holding (lat, lon)
      rows in latitude-major order

    Raises:
    - Any exception hit while reading the file or building the tree is
      printed and then re-raised unchanged.
    """
    try:
        print("Precomputing KDTree and time differences...")
        # The context manager guarantees the file handle is released, even if
        # one of the variables is missing. The [:] reads copy the data into
        # memory, so the arrays stay valid after the file is closed.
        with nc.Dataset(uwnd_nc_file_path) as ds:
            valid_time = ds.variables['valid_time'][:]  # Assuming 'valid_time' is the variable name for time
            latitudes = ds.variables['latitude'][:]
            longitudes = ds.variables['longitude'][:]

        # Convert valid_time to datetime.
        # NOTE(review): assumes the values are seconds since 1970-01-01 -
        # confirm against the variable's 'units' attribute.
        base_time = datetime(1970, 1, 1)
        valid_time_dt = np.array(
            [base_time + timedelta(seconds=int(ts)) for ts in valid_time],
            dtype='datetime64[ns]'
        )

        # Build every (lat, lon) combination without a Python-level double
        # loop. indexing='ij' keeps the latitude-major row order, i.e. row
        # k corresponds to (latitudes[k // n_lon], longitudes[k % n_lon]).
        lat_grid, lon_grid = np.meshgrid(
            np.asarray(latitudes), np.asarray(longitudes), indexing='ij'
        )
        lat_lon_pairs = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))

        # KD-tree for fast nearest-grid-point lookup (planar distance in degrees)
        tree = cKDTree(lat_lon_pairs)

        print("KDTree and time differences precomputed successfully.")
        return tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs
    except Exception as e:
        print(f"Error precomputing KDTree and time differences: {e}")
        raise

# Locations of the ERA5 u- and v-component wind files
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'

# precompute_kdtree_and_time_diffs already prints the error and re-raises on
# failure, so wrapping the call in another try/except would only print the
# same message a second time.
tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs = precompute_kdtree_and_time_diffs(uwnd_nc_file_path)

# Function to extract wind components
def extract_wind_components(lat, lon, dt, uwnd_nc_file_path, vwnd_nc_file_path, tree, valid_time_dt, latitudes, longitudes, lat_lon_pairs):
    """
    Look up the u and v wind components nearest to a position and time.

    The time step is the entry of valid_time_dt closest to dt; the grid cell
    is the nearest neighbour of (lat, lon) in the KD-tree.

    Parameters:
    - lat, lon: query position in decimal degrees
    - dt: query time, anything accepted by np.datetime64
    - uwnd_nc_file_path: NetCDF file holding the 'u' variable
    - vwnd_nc_file_path: NetCDF file holding the 'v' variable
    - tree: KD-tree built over lat_lon_pairs
    - valid_time_dt: datetime64 array describing the time axis
    - latitudes, longitudes: coordinate arrays of the grid
    - lat_lon_pairs: (lat, lon) rows the tree was built from

    Returns:
    - u_wind, v_wind: wind components rounded to 4 decimal places

    Raises:
    - ValueError: if the selected time index lies outside the 'u' variable
    - Any other exception is printed and re-raised unchanged.
    """
    try:
        # Context managers close both files on every call; without them each
        # lookup would leave two file handles open.
        with nc.Dataset(uwnd_nc_file_path) as uwnd_ds, nc.Dataset(vwnd_nc_file_path) as vwnd_ds:
            # Variable handles only - no data is read from disk yet
            uwnd_var = uwnd_ds.variables['u']  # Assuming 'u' is the variable name for u-component wind
            vwnd_var = vwnd_ds.variables['v']  # Assuming 'v' is the variable name for v-component wind

            # Convert the given datetime to a numpy datetime64 object
            row_datetime = np.datetime64(dt)

            # Find the value in the valid_time dimension closest in time to the requested datetime
            time_diffs = np.abs(valid_time_dt - row_datetime)
            closest_time_index = int(np.argmin(time_diffs))

            # Check that the calculated index is within the bounds of the 'u' variable
            if closest_time_index < 0 or closest_time_index >= uwnd_var.shape[0]:
                raise ValueError("The given datetime is out of bounds for the NetCDF data")

            # Find the grid point closest to the given Latitude and Longitude position
            _, closest_point_index = tree.query((lat, lon))
            closest_lat, closest_lon = lat_lon_pairs[closest_point_index]

            # Translate the grid point back into indices along each coordinate axis
            lat_index = int(np.where(latitudes == closest_lat)[0][0])
            lon_index = int(np.where(longitudes == closest_lon)[0][0])

            # Read only the single required cell instead of the whole variable.
            # Index 0 on the second axis drops the pressure dimension; the
            # one-element slices return a (1, 1) array whose [0, 0] element
            # is the wanted value.
            u_wind = uwnd_var[closest_time_index, 0, lat_index:lat_index + 1, lon_index:lon_index + 1][0, 0]
            v_wind = vwnd_var[closest_time_index, 0, lat_index:lat_index + 1, lon_index:lon_index + 1][0, 0]

        # Round wind components to 4 decimal places
        u_wind = round(u_wind, 4)
        v_wind = round(v_wind, 4)

        return u_wind, v_wind
    except Exception as e:
        print(f"Error extracting wind components: {e}")
        raise

Model training and validation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import gc
import math
import os

# Load the data from the spreadsheet
buoy_data = pd.read_csv('../processed_buoy_data.csv')

# Display the first few rows of the dataframe. A bare `buoy_data.head()` is
# only rendered when it is the last expression of a notebook cell; anywhere
# else its result is discarded, so print it explicitly.
print(buoy_data.head())

def calculate_new_position(current_position, displacement, heading):
    """
    Calculate the new latitude and longitude given an initial position,
    displacement in meters, and heading in degrees.

    Uses the great-circle destination formula on a sphere of radius
    6,371,000 m.

    Parameters:
    - current_position: tuple (latitude, longitude) in decimal degrees
    - displacement: distance to move in meters
    - heading: angle in degrees from the north (0 degrees is north, 90 degrees is east)

    Returns:
    - new_latitude, new_longitude: updated position in decimal degrees;
      the longitude is wrapped into [-180, 180] when the move crosses the
      antimeridian
    """
    R = 6371000  # Earth's radius in meters

    lat1 = math.radians(current_position[0])
    lon1 = math.radians(current_position[1])
    heading_rad = math.radians(heading)
    angular_distance = displacement / R

    sin_lat2 = (math.sin(lat1) * math.cos(angular_distance) +
                math.cos(lat1) * math.sin(angular_distance) * math.cos(heading_rad))
    # Floating-point rounding can push the value a hair outside [-1, 1] near
    # the poles, which would make asin raise a domain error. Explicit
    # comparisons (instead of min/max) let NaN inputs propagate unchanged.
    if sin_lat2 > 1.0:
        sin_lat2 = 1.0
    elif sin_lat2 < -1.0:
        sin_lat2 = -1.0
    lat2 = math.asin(sin_lat2)

    lon2 = lon1 + math.atan2(math.sin(heading_rad) * math.sin(angular_distance) * math.cos(lat1),
                             math.cos(angular_distance) - math.sin(lat1) * math.sin(lat2))

    new_latitude = math.degrees(lat2)
    new_longitude = math.degrees(lon2)

    # Wrap only when necessary so in-range results are returned untouched
    if not -180.0 <= new_longitude <= 180.0:
        new_longitude = (new_longitude + 180.0) % 360.0 - 180.0

    return new_latitude, new_longitude

# Keep only the columns needed for training and iterative prediction
print("Dropping unused columns...")
columns_to_keep = ['Latitude', 'Longitude', 'BuoyID', 'datetime', 'era5_uwnd', 'era5_vwnd', 'displacement', 'heading']
buoy_data = buoy_data[columns_to_keep].copy()

buoy_data['datetime'] = pd.to_datetime(buoy_data['datetime'])
print("Datetime column converted.")

# Split by buoy (not by row) so no buoy contributes to both sets
print("Splitting data by BuoyID...")
buoy_ids = buoy_data['BuoyID'].unique()
train_ids = np.random.choice(buoy_ids, size=int(len(buoy_ids) * 0.8), replace=False)
val_ids = np.setdiff1d(buoy_ids, train_ids)

train_data = buoy_data[buoy_data['BuoyID'].isin(train_ids)].copy()
val_data = buoy_data[buoy_data['BuoyID'].isin(val_ids)].copy()

print("Subsetting training and validation data for specific BuoyIDs...")
#CHANGE SIZE TO SUBSET DATA FOR TRAINING
available_train_ids = train_data['BuoyID'].unique()
# Sampling without replacement raises ValueError when the requested size
# exceeds the population, so cap the request at the number of buoys available.
train_subset_size = min(105, len(available_train_ids))
train_buoy_ids = np.random.choice(available_train_ids, size=train_subset_size, replace=False)
train_subset = train_data[train_data['BuoyID'].isin(train_buoy_ids)].copy()
#CHANGE SIZE TO SUBSET DATA FOR VALIDATION
val_buoy_id = np.random.choice(val_data['BuoyID'].unique(), size=1)[0]
# The iterative prediction walks this track row by row, seeding from the
# first row, so make sure the rows are in chronological order.
val_subset = val_data[val_data['BuoyID'] == val_buoy_id].sort_values('datetime', kind='stable')

# Free the full-size intermediates before training
del train_data, val_data
gc.collect()
print("Memory cleaned up after subsetting data.")

# Features: position and wind; targets: displacement and heading
X_train = train_subset[['Latitude', 'Longitude', 'era5_uwnd', 'era5_vwnd']]
y_train = train_subset[['displacement', 'heading']]

print("Training model...")
model = RandomForestRegressor()
model.fit(X_train, y_train)
print("Model training complete.")

output_file_path = f'../predictions_{val_buoy_id}.csv'

# Check if the output file exists and delete it if it does
if os.path.exists(output_file_path):
    os.remove(output_file_path)
    print(f"Existing file {output_file_path} deleted.")
else:
    print(f"No existing file found at {output_file_path}.")

def iterative_prediction(val_subset, model, kdtree, uwnd_nc_file_path, vwnd_nc_file_path, valid_times, latitudes, longitudes, lat_lon_pairs, output_file_path):
    """
    Roll the model forward along a buoy track and append the predicted
    positions to a CSV file.

    The first row of val_subset seeds the position and wind. For every
    following row the model predicts a displacement and heading from the
    current state, the position is advanced with calculate_new_position, and
    the wind at the new position and the row's datetime is looked up with
    extract_wind_components. Each prediction feeds the next step.

    Parameters:
    - val_subset: DataFrame with 'Latitude', 'Longitude', 'era5_uwnd',
      'era5_vwnd', 'BuoyID' and 'datetime' columns; rows are processed in
      their existing order
    - model: fitted estimator whose predict() returns (displacement, heading)
    - kdtree, valid_times, latitudes, longitudes, lat_lon_pairs: lookup
      structures forwarded to extract_wind_components
    - uwnd_nc_file_path, vwnd_nc_file_path: NetCDF wind files forwarded to
      extract_wind_components
    - output_file_path: CSV file to append to; a header is written first if
      the file does not exist yet

    Raises:
    - ValueError: if val_subset has no rows
    """
    # Fail early with a clear message instead of an IndexError from iloc[0]
    # after the output file has already been created.
    if val_subset.empty:
        raise ValueError("val_subset is empty; at least one row is required to seed the prediction")

    # Check if the output file exists and write the header only if it doesn't
    if not os.path.exists(output_file_path):
        with open(output_file_path, 'w') as file:
            file.write("Predicted_Latitude,Predicted_Longitude,Datetime,BuoyID\n")

    with open(output_file_path, 'a') as file:
        first_row = val_subset.iloc[0]
        current_lat, current_lon = first_row[['Latitude', 'Longitude']]
        current_uwnd, current_vwnd = first_row[['era5_uwnd', 'era5_vwnd']]
        buoy_id = first_row['BuoyID']

        print("\nInitial conditions:")
        print(f"Latitude: {current_lat:.2f}, Longitude: {current_lon:.2f}, Datetime: {first_row['datetime']}, BuoyID: {buoy_id}")

        for i in range(1, len(val_subset)):
            # Only the timestamp of the target row is used; its observed
            # position and wind are deliberately not fed back in.
            target_datetime = val_subset.iloc[i]['datetime']

            # A DataFrame keeps the column names the model was fitted with
            input_data = pd.DataFrame({
                'Latitude': [current_lat],
                'Longitude': [current_lon],
                'era5_uwnd': [current_uwnd],
                'era5_vwnd': [current_vwnd]
            })

            predicted_displacement, predicted_heading = model.predict(input_data)[0]
            predicted_lat, predicted_lon = calculate_new_position(
                (current_lat, current_lon),
                predicted_displacement,
                predicted_heading
            )

            predicted_wind_u, predicted_wind_v = extract_wind_components(
                predicted_lat,
                predicted_lon,
                target_datetime,
                uwnd_nc_file_path,
                vwnd_nc_file_path,
                kdtree,
                valid_times,
                latitudes,
                longitudes,
                lat_lon_pairs
            )

            print(f"\nPrediction for row {i}:")
            print(f"Predicted Latitude: {predicted_lat:.2f}, Predicted Longitude: {predicted_lon:.2f}")
            print(f"Predicted Displacement: {predicted_displacement:.2f}, Predicted Heading: {predicted_heading:.2f}")
            print(f"Updated Wind U: {predicted_wind_u:.2f}, Wind V: {predicted_wind_v:.2f}")
            print(f"Target Datetime: {target_datetime}, BuoyID: {buoy_id}")

            # Round numerical values before writing to the file
            file.write(f"{round(predicted_lat, 2)},{round(predicted_lon, 2)},{target_datetime},{buoy_id}\n")

            # Safety valve against runaway output
            file_size = file.tell()
            if file_size > 2 * 1024 * 1024 * 1024:
                print("File size exceeded 2 GB. Stopping the script.")
                break

            # The prediction becomes the starting state of the next step
            current_lat, current_lon = predicted_lat, predicted_lon
            current_uwnd, current_vwnd = predicted_wind_u, predicted_wind_v

            if i % 10 == 0:
                # A full collection on every step is pure overhead; running
                # it at the progress interval keeps memory in check cheaply.
                gc.collect()
                print(f"Processed {i} predictions...")

    print(f"\nPrediction complete. Results saved to {output_file_path}")

# Run the rollout for the selected validation buoy. Keyword arguments are
# grouped by role and listed in the same order as the function signature.
print("\nStarting iterative predictions on validation subset...")
iterative_prediction(
    val_subset=val_subset, model=model, kdtree=tree,
    uwnd_nc_file_path=uwnd_nc_file_path, vwnd_nc_file_path=vwnd_nc_file_path,
    valid_times=valid_time_dt, latitudes=latitudes, longitudes=longitudes,
    lat_lon_pairs=lat_lon_pairs, output_file_path=output_file_path,
)