# Preparing past buoy data and reanalyses for use in model training

Concatenate all past buoy data into a single dataframe

This section collects all of the cleaned buoy data and combines them into a single dataframe. A column representing the day of year (DOY) as an integer is also added. These data will be used (along with weather reanalyses) as training data for the machine learning model. Buoys deployed outside of the Arctic (first recorded position south of 64 degrees N) are also removed.

In [None]:
# Concatenate the cleaned buoy CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Define the path to the folder containing the CSV files
folder_path = '../data/cleaned/buoydata/past'

# Buoys whose first (oldest) recorded position is south of this latitude are
# treated as deployed outside the Arctic and are removed entirely.
MIN_DEPLOYMENT_LATITUDE = 64

# Use glob to get all CSV files in the folder. The list is sorted because glob
# returns files in arbitrary order, which would make the row order of the
# combined DataFrame differ from run to run.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error when the folder is empty or missing
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in '{folder_path}'")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the lat and lon columns to Latitude and Longitude
combined_df = combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'})

# Pad Month, Day, Hour, Min, and Sec columns with leading zeros
# (these columns hold zero-padded strings from here on)
for column in ['Month', 'Day', 'Hour', 'Min', 'Sec']:
    combined_df[column] = combined_df[column].map('{:02}'.format)

# Create a new column called datetime by combining Year, Month, Day, Hour, Min, and Sec columns
combined_df['datetime'] = pd.to_datetime(
    combined_df['Year'].astype(str) + '-' +
    combined_df['Month'] + '-' +
    combined_df['Day'] + ' ' +
    combined_df['Hour'] + ':' +
    combined_df['Min'] + ':' +
    combined_df['Sec']
)

# Add a new column with the Day of Year (DOY) as an integer
combined_df['DOY'] = combined_df['datetime'].dt.dayofyear

# Latitude of the oldest record of every buoy, computed in one pass.
# `.iloc[0]` is used rather than `.first()` because `.first()` skips missing
# values; a buoy whose oldest latitude is missing is kept (NaN < 64 is False).
# The stable sort makes the choice deterministic when datetimes tie.
first_latitude = (
    combined_df.sort_values(by='datetime', kind='stable')
    .groupby('BuoyID')['Latitude']
    .agg(lambda latitudes: latitudes.iloc[0])
)

# Remove every record of the buoys that were first seen south of the threshold.
# The copy gives later cells an independent frame to add columns to.
non_arctic_buoys = first_latitude.index[first_latitude < MIN_DEPLOYMENT_LATITUDE]
combined_df = combined_df[~combined_df['BuoyID'].isin(non_arctic_buoys)].copy()

Interpolate ERA5 to buoy data (nearest time step and nearest grid cell)

In [None]:
import pandas as pd
import numpy as np
import netCDF4 as nc
from scipy.spatial import cKDTree
from datetime import datetime, timezone

# Load the NetCDF files
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'

# Read everything that is needed into memory, then close both files.
# The time and grid coordinates are taken from the u-wind file and are
# assumed to describe the v-wind file as well (checked below via the shapes).
with nc.Dataset(uwnd_nc_file_path) as uwnd_ds, nc.Dataset(vwnd_nc_file_path) as vwnd_ds:
    # NOTE(review): valid_time is compared directly with Unix timestamps, so it
    # is assumed to be in seconds since 1970-01-01 UTC - confirm against the
    # variable's `units` attribute.
    valid_time = np.asarray(uwnd_ds.variables['valid_time'][:], dtype=float)
    latitudes = np.asarray(uwnd_ds.variables['latitude'][:], dtype=float)
    longitudes = np.asarray(uwnd_ds.variables['longitude'][:], dtype=float)
    # Index 0 drops the pressure dimension; masked (missing) cells become NaN
    uwnd_array = np.ma.filled(uwnd_ds.variables['u'][:, 0, :, :], np.nan)
    vwnd_array = np.ma.filled(vwnd_ds.variables['v'][:, 0, :, :], np.nan)

# Check the shape of the uwnd_array
print(f"uwnd_array shape: {uwnd_array.shape}")
print(f"vwnd_array shape: {vwnd_array.shape}")

# Both arrays are indexed with positions derived from the u-wind coordinates,
# so they must share the (time, latitude, longitude) layout of that file.
expected_shape = (valid_time.size, latitudes.size, longitudes.size)
if valid_time.size == 0:
    raise ValueError("The ERA5 file contains no time steps")
if uwnd_array.shape != expected_shape or vwnd_array.shape != expected_shape:
    raise ValueError(
        f"ERA5 arrays do not match the coordinate sizes {expected_shape}: "
        f"u is {uwnd_array.shape}, v is {vwnd_array.shape}"
    )

# Unix timestamp (whole seconds) of every buoy record; naive datetimes are
# interpreted as UTC. Missing datetimes become NaN and are left unmatched.
unix_epoch = pd.Timestamp('1970-01-01')
timestamps = np.trunc(
    ((combined_df['datetime'] - unix_epoch) / pd.Timedelta(seconds=1)).to_numpy(dtype=float)
)

# Find, for all rows at once, the ERA5 time step closest to each timestamp.
# On a tie the earlier time step wins.
time_order = np.argsort(valid_time, kind='stable')
sorted_time = valid_time[time_order]
insert_position = np.searchsorted(sorted_time, timestamps)
upper = np.clip(insert_position, 0, sorted_time.size - 1)
lower = np.clip(insert_position - 1, 0, sorted_time.size - 1)
use_lower = np.abs(timestamps - sorted_time[lower]) <= np.abs(sorted_time[upper] - timestamps)
nearest_sorted = np.where(use_lower, lower, upper)
closest_time_index = time_order[nearest_sorted]
time_gap = np.abs(sorted_time[nearest_sorted] - timestamps)

# A "nearest" time step always exists, even for records far outside the period
# covered by the file, so an index bounds check can never reject anything.
# Instead, only accept matches that are no further away than the largest
# spacing between consecutive ERA5 time steps; other rows keep NaN winds.
max_time_gap = np.diff(sorted_time).max() if sorted_time.size > 1 else 0.0

# Express buoy longitudes in the same convention as the grid before matching.
# NOTE(review): the grid's convention is not visible here; a 0..360 grid is
# detected from its maximum longitude - confirm against the ERA5 download.
buoy_lat = combined_df['Latitude'].to_numpy(dtype=float)
buoy_lon = combined_df['Longitude'].to_numpy(dtype=float)
if longitudes.max() > 180:
    buoy_lon = np.mod(buoy_lon, 360)
else:
    buoy_lon = np.mod(buoy_lon + 180, 360) - 180

# Create a KDTree for fast spatial lookup. Pairs are ordered latitude-major,
# so a flat pair index unravels to (latitude index, longitude index).
lat_grid, lon_grid = np.meshgrid(latitudes, longitudes, indexing='ij')
lat_lon_pairs = np.column_stack([lat_grid.ravel(), lon_grid.ravel()])
tree = cKDTree(lat_lon_pairs)

# Rows that can be matched: finite position and a time step close enough
matched = np.isfinite(buoy_lat) & np.isfinite(buoy_lon) & (time_gap <= max_time_gap)

# Wind values per row, NaN where no ERA5 value could be assigned
era5_uwnd = np.full(len(combined_df), np.nan)
era5_vwnd = np.full(len(combined_df), np.nan)

if matched.any():
    # Find the grid cell closest to each Latitude/Longitude position
    _, closest_point_index = tree.query(np.column_stack([buoy_lat[matched], buoy_lon[matched]]))
    lat_index, lon_index = np.unravel_index(closest_point_index, (latitudes.size, longitudes.size))
    time_index = closest_time_index[matched]

    # Pick the u and v values at (time step, latitude, longitude) for every row
    era5_uwnd[matched] = uwnd_array[time_index, lat_index, lon_index]
    era5_vwnd[matched] = vwnd_array[time_index, lat_index, lon_index]

# Add new columns to combined_df for the u-component and v-component wind values
combined_df['era5_uwnd'] = era5_uwnd
combined_df['era5_vwnd'] = era5_vwnd

skipped_rows = int((~matched).sum())
if skipped_rows:
    print(f"Skipped {skipped_rows} rows with no usable position or no ERA5 time step within {max_time_gap} seconds")

# Print the dataframe head
print(combined_df.head())

# Print a message saying the script has completed
print("The ERA5 wind assignment script has completed.")

Interpolate IBCAO v5 bathymetry to buoy data

Save the combined DataFrame to a new CSV file

In [None]:
# Write the file to the location the following cell loads it from
# ('../combined_buoy_data.csv'), so the notebook runs top to bottom without
# moving the file by hand.
combined_csv_path = '../combined_buoy_data.csv'
combined_df.to_csv(combined_csv_path, index=False)
print(f"combined_df has been saved to '{combined_csv_path}'.")

Add more data to the spreadsheet (wind vector and displacement/heading columns)

In [2]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
from geopy import Point
import math

# Load and Preprocess Data

print("Loading buoy data from the spreadsheet...")

# Load the buoy data from the spreadsheet
buoy_data = pd.read_csv('../combined_buoy_data.csv')
print("Buoy data loaded successfully.")
print(buoy_data.head())

print("Extracting necessary columns...")

# Extract necessary columns. The copy makes buoy_data an independent frame, so
# the column assignments below do not write into a slice of the loaded data
# (which triggers SettingWithCopyWarning).
buoy_data = buoy_data[['Latitude', 'Longitude', 'BuoyID', 'datetime', 'era5_uwnd', 'era5_vwnd']].copy()
print("Columns extracted successfully.")
print(buoy_data.head())

print("Rounding wind columns to two decimal places...")

# Round wind columns to two decimal places
buoy_data['era5_uwnd'] = buoy_data['era5_uwnd'].round(2)
buoy_data['era5_vwnd'] = buoy_data['era5_vwnd'].round(2)
print("Wind columns rounded successfully.")

print("Calculating wind magnitude and wind angle...")

# Calculate wind magnitude and wind angle.
# wind_angle is atan2(v, u) in degrees, i.e. in the range [-180, 180] and
# measured from the u axis towards the v axis. This is a different convention
# from the 0-360 compass `heading` column added later in this cell.
buoy_data['wind_magnitude'] = np.sqrt(buoy_data['era5_uwnd']**2 + buoy_data['era5_vwnd']**2)
buoy_data['wind_angle'] = np.degrees(np.arctan2(buoy_data['era5_vwnd'], buoy_data['era5_uwnd']))

print("Wind magnitude and wind angle calculated successfully.")
print(buoy_data.head())

print("Displaying the first few rows of the preprocessed data:")

# Display the first few rows of the preprocessed data. A bare `buoy_data.head()`
# in the middle of a cell shows nothing, so it is printed explicitly.
print(buoy_data.head())

print("Calculating displacement and heading...")

# Initialize displacement and heading columns
buoy_data['displacement'] = 0.0
buoy_data['heading'] = 0.0

# Function to calculate displacement and heading for each group
def calculate_displacement_and_heading(group):
    """Add the distance and bearing from each record to the next for one buoy.

    The records are sorted by 'datetime' and re-indexed from 0. For every row
    after the first, 'displacement' is the great-circle distance in meters from
    the previous position and 'heading' is the initial compass bearing
    (degrees, 0 <= heading < 360, 0 = north, 90 = east) of that step.

    The whole track is computed with array operations instead of a per-row
    Python loop. The distance uses the same spherical formula and mean Earth
    radius (6371.009 km) as geopy's ``great_circle``.

    Parameters
    ----------
    group : pandas.DataFrame
        Records of a single buoy with 'Latitude', 'Longitude' (degrees) and
        'datetime' columns.

    Returns
    -------
    pandas.DataFrame
        A sorted, re-indexed copy of ``group``. The first row keeps whatever
        'displacement' / 'heading' value it already had (NaN if the column did
        not exist). Groups with fewer than two rows are returned without any
        column being added or changed. Steps that involve a missing coordinate
        get NaN for both values.

    Raises
    ------
    ValueError
        If a latitude lies outside [-90, 90].
    """
    group = group.sort_values(by='datetime').reset_index(drop=True)
    if len(group) < 2:
        return group

    latitude_deg = group['Latitude'].to_numpy(dtype=float)
    longitude_deg = group['Longitude'].to_numpy(dtype=float)

    # Out-of-range latitudes are rejected rather than silently wrapped
    if np.any(np.abs(latitude_deg[np.isfinite(latitude_deg)]) > 90):
        raise ValueError("Latitude must be in the [-90; 90] range.")

    latitude = np.radians(latitude_deg)
    longitude = np.radians(longitude_deg)

    # Previous (1) and current (2) position of every step along the track
    lat1, lat2 = latitude[:-1], latitude[1:]
    dlon = longitude[1:] - longitude[:-1]

    # x and y are the east and north components used for the initial bearing;
    # their length is also the numerator of the great-circle central angle.
    x = np.sin(dlon) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - (np.sin(lat1) * np.cos(lat2) * np.cos(dlon))

    # Calculate displacement
    earth_radius_m = 6371009.0
    central_angle = np.arctan2(
        np.hypot(x, y),
        np.sin(lat1) * np.sin(lat2) + np.cos(lat1) * np.cos(lat2) * np.cos(dlon),
    )
    displacement = earth_radius_m * central_angle

    # Calculate heading, mapped from (-180, 180] onto the compass range
    compass_heading = (np.degrees(np.arctan2(x, y)) + 360) % 360

    for column, step_values in (('displacement', displacement), ('heading', compass_heading)):
        # The first record has no previous position, so its value is not computed
        first_value = group[column].iloc[0] if column in group.columns else np.nan
        group[column] = np.concatenate(([first_value], step_values))
    return group

# Apply the function to each group.
# The groups are iterated explicitly instead of using groupby().apply():
# recent pandas versions deprecate (and newer ones stop) passing the grouping
# column to the applied function, which would drop BuoyID from the result.
# Iteration yields each buoy's full rows, in sorted BuoyID order, in every version.
processed_groups = [
    calculate_displacement_and_heading(group)
    for _, group in buoy_data.groupby('BuoyID')
]
if processed_groups:
    buoy_data = pd.concat(processed_groups, ignore_index=True)
else:
    # No buoy to process: keep the columns but no rows
    buoy_data = buoy_data.iloc[0:0]

print("Displacement and heading calculated successfully.")
print(buoy_data.head())

# Save the processed buoy_data back to the spreadsheet
output_csv_path = 'processed_buoy_data.csv'
buoy_data.to_csv(output_csv_path, index=False)
print(f"Processed buoy data saved to {output_csv_path}.")


Loading buoy data from the spreadsheet...
Buoy data loaded successfully.
            BuoyID  Year  Month  Day  Hour  Min  Sec  Latitude  Longitude  \
0  300025010734900  2023      8    7     0    7   32  77.33740 -138.15785   
1  300025010734900  2023      8    7     0   51    5  77.33538 -138.13705   
2  300025010734900  2023      8    7     1    1   40  77.33479 -138.13317   
3  300025010734900  2023      8    7     2    1   21  77.33148 -138.11950   
4  300025010734900  2023      8    7     3    1   11  77.32867 -138.12018   

   GPSdelay    BPT      BP     Ts     Ta     Th  Batt             datetime  \
0         0 -999.0  1016.5 -999.0 -999.0  12.96    13  2023-08-07 00:07:32   
1         0 -999.0  1016.5 -999.0 -999.0  13.86    13  2023-08-07 00:51:05   
2         0 -999.0  1016.6 -999.0 -999.0  13.90    13  2023-08-07 01:01:40   
3         0 -999.0  -999.0 -999.0 -999.0  12.74    13  2023-08-07 02:01:21   
4         0 -999.0  1017.4 -999.0 -999.0  11.92    13  2023-08-07 03:01:11

Cleaned buoy data geospatial bounds confirmation

This cell will analyze and display the minimum and maximum values of the latitude and longitude fields of the data. 

In [None]:
# Confirm the latitude and longitude ranges

# Geographic extent of the combined buoy records
latitude_column = combined_df['Latitude']
longitude_column = combined_df['Longitude']

min_latitude, max_latitude = latitude_column.min(), latitude_column.max()
min_longitude, max_longitude = longitude_column.min(), longitude_column.max()

# Report both ranges
print(f"Latitude: min = {min_latitude}, max = {max_latitude}")
print(f"Longitude: min = {min_longitude}, max = {max_longitude}")