# Preparing past buoy data and reanalyses for use in model training

## 1. Concatenate all past buoy data into a single dataframe
This section collects all of the cleaned buoy data files and combines them into a single dataframe. A column holding the day of year (DOY) as an integer is also added. These data are used (along with weather reanalyses) as training data for the machine learning model.

In [None]:
# Concatenate multiple cleaned buoy CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Build `combined_df`: one DataFrame holding every buoy CSV in `folder_path`,
# with Lat/Lon renamed, a parsed `datetime` column and an integer `DOY` column.

# Define the path to the folder containing the CSV files
folder_path = '../data/cleaned/buoydata/past'

# The last cell of this notebook writes its result
# ('IABP_2023_era5_interpolated.csv') into this same folder. Without this
# filter a second run would read that derived file back in as if it were
# buoy input, and the Lat/Lon rename below would then create duplicate
# 'Latitude'/'Longitude' columns.
derived_suffix = '_era5_interpolated.csv'

# Use glob to get all input CSV files in the folder. glob returns files in
# arbitrary order, so sort to make the row order of combined_df reproducible.
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(folder_path, '*.csv'))
    if not path.endswith(derived_suffix)
)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise FileNotFoundError(f"No buoy CSV files found in {folder_path}")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(path) for path in csv_files]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the lat and lon columns to Latitude and Longitude
combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'}, inplace=True)

# Pad Month, Day, Hour, Min, and Sec columns with leading zeros.
# NOTE: this replaces the columns with zero-padded strings (e.g. 3 -> '03').
for part in ('Month', 'Day', 'Hour', 'Min', 'Sec'):
    combined_df[part] = combined_df[part].apply(lambda value: f'{value:02}')

# Create a new column called datetime by combining the Year, Month, Day,
# Hour, Min, and Sec columns into 'YYYY-MM-DD HH:MM:SS' strings and parsing them
date_part = (combined_df['Year'].astype(str) + '-' +
             combined_df['Month'].astype(str) + '-' +
             combined_df['Day'].astype(str))
time_part = (combined_df['Hour'].astype(str) + ':' +
             combined_df['Min'].astype(str) + ':' +
             combined_df['Sec'].astype(str))
combined_df['datetime'] = pd.to_datetime(date_part + ' ' + time_part)

# Add a new column with the Day of Year (DOY) as an integer
combined_df['DOY'] = combined_df['datetime'].dt.dayofyear

# Display the combined DataFrame
combined_df.head()

## 2. Cleaned buoy data geospatial bounds confirmation
This cell will analyze and display the minimum and maximum values of the latitude and longitude fields of the data. 

In [None]:
# Report the geographic extent covered by the combined buoy data

lat_series = combined_df['Latitude']
lon_series = combined_df['Longitude']

# Smallest and largest coordinate values present in each column
min_latitude, max_latitude = lat_series.min(), lat_series.max()
min_longitude, max_longitude = lon_series.min(), lon_series.max()

print(
    f"Latitude: min = {min_latitude}, max = {max_latitude}",
    f"Longitude: min = {min_longitude}, max = {max_longitude}",
    sep="\n",
)

## 3. Interpolate ERA5 reanalysis data to past buoy locations (nearest grid cell and nearest time)

In [None]:
import pandas as pd
import numpy as np
import netCDF4 as nc
from scipy.spatial import cKDTree
from datetime import datetime, timezone

# Attach ERA5 u- and v-component wind to every buoy record in `combined_df`
# by taking the value at the nearest reanalysis time and nearest grid cell.
# Results go into the new columns 'era5_uwnd' and 'era5_vwnd'; records that
# cannot be matched are left as NaN.


def _nearest_index(axis_values, queries):
    """Return, for each query, the index into axis_values of the closest entry.

    axis_values is a 1-D coordinate axis in any order (ascending or
    descending); queries is a 1-D array of values to look up. The result is
    an integer array with the same length as queries.
    """
    axis_values = np.asarray(axis_values, dtype=float)
    queries = np.asarray(queries, dtype=float)
    if axis_values.size == 1:
        return np.zeros(queries.shape, dtype=int)
    # Work on a sorted copy and map the answer back to the original ordering
    order = np.argsort(axis_values, kind='stable')
    ordered = axis_values[order]
    # Bracket each query between two neighbouring axis entries
    right = np.clip(np.searchsorted(ordered, queries), 1, ordered.size - 1)
    left = right - 1
    pick_left = (queries - ordered[left]) <= (ordered[right] - queries)
    return order[np.where(pick_left, left, right)]


# Load the NetCDF files
uwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc'
vwnd_nc_file_path = '../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'

# Everything needed is read into memory inside the `with` block, so the file
# handles are closed as soon as the arrays have been extracted.
with nc.Dataset(uwnd_nc_file_path) as uwnd_ds, nc.Dataset(vwnd_nc_file_path) as vwnd_ds:
    # Assuming 'valid_time', 'latitude', 'longitude', 'u' and 'v' are the variable names
    valid_time = np.ma.getdata(uwnd_ds.variables['valid_time'][:]).astype(float)
    latitudes = np.ma.getdata(uwnd_ds.variables['latitude'][:]).astype(float)
    longitudes = np.ma.getdata(uwnd_ds.variables['longitude'][:]).astype(float)
    # Index 0 on the second axis removes the pressure dimension
    uwnd_array = uwnd_ds.variables['u'][:, 0, :, :]
    vwnd_array = vwnd_ds.variables['v'][:, 0, :, :]

# Check the shape of the wind arrays
print(f"uwnd_array shape: {uwnd_array.shape}")
print(f"vwnd_array shape: {vwnd_array.shape}")

# The time/lat/lon axes are read from the u file only and are then used to
# index both arrays, so the two files must share one grid.
if uwnd_array.shape != vwnd_array.shape:
    raise ValueError(
        f"u and v wind arrays differ in shape: {uwnd_array.shape} vs {vwnd_array.shape}"
    )

# Buoy observation times as whole seconds since 1970-01-01, treating the
# naive datetimes as UTC.
# NOTE(review): comparing these with valid_time assumes valid_time is also
# "seconds since 1970-01-01" - confirm against the variable's units attribute.
epoch = pd.Timestamp('1970-01-01')
timestamps = ((combined_df['datetime'] - epoch) // pd.Timedelta(seconds=1)).to_numpy(dtype=float)

# A nearest-time search always finds *some* index, so records outside the
# period covered by the file must be rejected explicitly; otherwise they
# would silently receive the first or last time slice. Allow half a time
# step of slack at either end.
if valid_time.size > 1:
    half_step = np.median(np.diff(np.sort(valid_time))) / 2
else:
    half_step = 0.0
in_time_range = ((timestamps >= valid_time.min() - half_step) &
                 (timestamps <= valid_time.max() + half_step))

buoy_lat = combined_df['Latitude'].to_numpy(dtype=float)
buoy_lon = combined_df['Longitude'].to_numpy(dtype=float)

# If the grid uses the 0..360 longitude convention, express the buoy
# longitudes the same way; otherwise every negative buoy longitude would
# snap to the grid's first column.
# NOTE: no wrap-around is applied at the longitude seam of the grid.
if longitudes.max() > 180:
    buoy_lon = np.mod(buoy_lon, 360.0)

# Records with a missing position or time cannot be matched
usable = in_time_range & np.isfinite(buoy_lat) & np.isfinite(buoy_lon)

# On a regular latitude/longitude grid the nearest (lat, lon) grid point is
# the nearest latitude combined with the nearest longitude, so each axis is
# searched independently.
time_idx = _nearest_index(valid_time, timestamps[usable])
lat_idx = _nearest_index(latitudes, buoy_lat[usable])
lon_idx = _nearest_index(longitudes, buoy_lon[usable])

# Add new columns to combined_df for the u-component and v-component wind values
combined_df['era5_uwnd'] = np.nan
combined_df['era5_vwnd'] = np.nan

# Pick one value per usable record; masked (missing) grid values become NaN
combined_df.loc[usable, 'era5_uwnd'] = np.ma.filled(
    uwnd_array[time_idx, lat_idx, lon_idx].astype(float), np.nan)
combined_df.loc[usable, 'era5_vwnd'] = np.ma.filled(
    vwnd_array[time_idx, lat_idx, lon_idx].astype(float), np.nan)

skipped_rows = int((~usable).sum())
if skipped_rows:
    print(f"Skipped {skipped_rows} row(s) outside the ERA5 time coverage "
          f"or with missing coordinates; their wind values are NaN")

# Print the dataframe head
print(combined_df.head())

# Print a message saying the script has completed
print("The script has completed.")

In [None]:
# Write combined_df (buoy records plus ERA5 wind columns) to disk as CSV,
# without the DataFrame index.
# NOTE: the target is inside the folder that section 1 reads with '*.csv',
# so a re-run of the notebook will pick this file up unless it is excluded there.
output_path = '../data/cleaned/buoydata/past/IABP_2023_era5_interpolated.csv'
combined_df.to_csv(path_or_buf=output_path, index=False)

print(f"DataFrame saved to {output_path}")