# Preparing past buoy data and reanalyses for use in model training

## 1. Concatenate all past buoy data into a single dataframe
This section will collect all of the cleaned buoy data and combine them into a single dataframe. A column to represent the day of year (DOY) as an integer is also added. These data will be used (along with weather reanalyses) as training data for the machine learning model.  

In [None]:
# Concatenate the cleaned past buoy CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Folder that holds the cleaned historical ("past") buoy CSV files
folder_path = '../data/cleaned/buoydata/past'

# Collect every CSV in the folder. glob returns paths in arbitrary,
# OS-dependent order, so sort them to make the combined row order reproducible.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail early with a clear message instead of pandas' generic
# "No objects to concatenate" error when the folder is empty or missing.
if not csv_files:
    raise ValueError(f"No CSV files found in '{folder_path}'.")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Stack all DataFrames into one, discarding the per-file row indices
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the Lat and Lon columns to Latitude and Longitude
combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'}, inplace=True)

# Zero-pad Month, Day, Hour, Min, and Sec to a width of two (e.g. 1 -> '01').
# NOTE: these columns hold strings from here on; the January subset cell
# further down compares Month against the padded string '01'.
for column in ['Month', 'Day', 'Hour', 'Min', 'Sec']:
    combined_df[column] = combined_df[column].apply(lambda x: f'{x:02}')

# Build a 'YYYY-MM-DD HH:MM:SS' string for every row and parse it into a
# new datetime column
timestamp_text = (
    combined_df['Year'].astype(str) + '-' +
    combined_df['Month'].astype(str) + '-' +
    combined_df['Day'].astype(str) + ' ' +
    combined_df['Hour'].astype(str) + ':' +
    combined_df['Min'].astype(str) + ':' +
    combined_df['Sec'].astype(str)
)
combined_df['datetime'] = pd.to_datetime(timestamp_text)

# Add a new column with the Day of Year (DOY) as an integer
combined_df['DOY'] = combined_df['datetime'].dt.dayofyear

# Display the combined DataFrame
combined_df.head()

## 2. Cleaned buoy data geospatial bounds confirmation
To confirm the resulting dataframe only contains buoy data within the area of interest (Arctic Ocean), this cell will analyze and display the minimum and maximum values of the latitude and longitude fields of the data. 

In [None]:
# Report the geographic extent of the combined buoy data so it can be
# checked against the area of interest.

latitudes = combined_df['Latitude']
longitudes = combined_df['Longitude']

# Smallest and largest value of each coordinate column
min_latitude, max_latitude = latitudes.min(), latitudes.max()
min_longitude, max_longitude = longitudes.min(), longitudes.max()

print(f"Latitude: min = {min_latitude}, max = {max_latitude}")
print(f"Longitude: min = {min_longitude}, max = {max_longitude}")

In [None]:
# Subset the data to only include rows where the month is January.
# 'Month' holds zero-padded strings at this point (it was formatted with
# f'{x:02}' when the DataFrame was built), hence the comparison against '01'.
# .copy() makes the subset an independent DataFrame so that adding or
# modifying columns on it later does not raise SettingWithCopyWarning.
combined_df = combined_df[combined_df['Month'] == '01'].copy()

# Preparing current buoy data and forecasts for use in model prediction

## 1. Concatenate all current buoy data into a single dataframe
This section collects all of the cleaned current buoy data and combines them into a single dataframe. A column representing the day of year (DOY) as an integer is also added. Finally, the buoy data are subset to only the most recent position for each BuoyID.

In [None]:
# Concatenate multiple current buoy CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Folder that holds the cleaned current buoy CSV files
folder_path = '../data/cleaned/buoydata/current'

# Collect every CSV in the folder. glob returns paths in arbitrary,
# OS-dependent order, so sort them to make the combined row order reproducible.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail early with a clear message instead of pandas' generic
# "No objects to concatenate" error when the folder is empty or missing.
if not csv_files:
    raise ValueError(f"No CSV files found in '{folder_path}'.")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Stack all DataFrames into one, discarding the per-file row indices
combined_df_current = pd.concat(dfs, ignore_index=True)

# Make a new column in the dataframe of DOY truncated to an integer
# (astype(int) drops the fractional part of the DOY value)
combined_df_current['DOY_int'] = combined_df_current['DOY'].astype(int)

# Rename the Lat and Lon columns to Latitude and Longitude
combined_df_current.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'}, inplace=True)

# For each unique BuoyID, keep only the row with the largest DOY value.
# NOTE(review): DOY restarts at the turn of the year, so if the current files
# ever span a year boundary the largest DOY is not the most recent position -
# confirm the input never crosses 31 Dec / 1 Jan, or sort on a full timestamp.
latest_row_labels = combined_df_current.groupby('BuoyID')['DOY'].idxmax()
combined_df_current = combined_df_current.loc[latest_row_labels]

# Display the combined DataFrame
combined_df_current.head()

## 2. Converting the netCDF GFS forecast data into arrays for interpolation with current buoy positions for use as initial conditions for prediction

This cell converts the GFS netCDF file into arrays for the `ugrd` and `vgrd` variables at a specific pressure level (currently set to the second item in the list, which corresponds to near-surface conditions). The script ensures that there is exactly one file in the specified directory and extracts the necessary variables from the netCDF dataset. The shapes of the resulting arrays are printed to verify the extraction process. This step is crucial for preparing the forecast data for interpolation with the current buoy positions, which will be used as initial conditions for prediction models.

In [None]:
# Convert the GFS netCDF file into arrays for the ugrd and vgrd variables at a specific pressure level (currently set to the second item in the list 
# which corresponds to near surface conditions)

import netCDF4 as nc
import os

# Directory expected to contain the single GFS forecast netCDF file
directory_path = '../data/raw/forecasts/gfs'

# List the regular, non-hidden files in the directory. Hidden entries
# (e.g. .DS_Store, .gitkeep) and sub-directories are ignored so that they do
# not trip the "exactly one file" check below.
files = sorted(
    entry for entry in os.listdir(directory_path)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(directory_path, entry))
)

# Ensure there is exactly one file in the directory
if len(files) != 1:
    raise ValueError("There should be exactly one file in the directory.")

# Full path to the forecast file
file_path = os.path.join(directory_path, files[0])

# Index of the level to extract along the second (pfull) axis: the second item.
# NOTE(review): the notebook text says this level is near the surface; that
# depends on how pfull is ordered in this file - confirm with the value
# printed below.
pfull_index = 1

# Open the dataset read-only. The with-block guarantees the file handle is
# closed even if one of the variable lookups below raises.
with nc.Dataset(file_path, 'r') as dataset:
    # Wind components at the first time step and the selected level
    ugrd_array = dataset.variables['ugrd'][0, pfull_index, :, :]
    vgrd_array = dataset.variables['vgrd'][0, pfull_index, :, :]

    # Latitude and longitude coordinate variables
    lat_array = dataset.variables['lat'][:]
    lon_array = dataset.variables['lon'][:]

    # Report which level was actually selected, when the file exposes it
    if 'pfull' in dataset.variables:
        print("Selected pfull value:", dataset.variables['pfull'][pfull_index])

# Print the shapes of the arrays to verify
print("ugrd_array shape:", ugrd_array.shape)
print("vgrd_array shape:", vgrd_array.shape)

# Print the shapes of the latitude and longitude arrays to verify
print("lat_array shape:", lat_array.shape)
print("lon_array shape:", lon_array.shape)

## 3. Interpolating forecast data with current buoy positions
This cell will use the GFS forecasts to assign wind forecast values to the current buoy positions for use as initial conditions for prediction.

In [None]:
import numpy as np
from scipy.interpolate import griddata

# Step 1: Get the buoy points from combined_df_current as rows of
# [Latitude, Longitude]
buoy_points = combined_df_current[['Latitude', 'Longitude']].values

# 1-D grid axes: latitude from the first column and longitude from the first
# row of the 2-D coordinate arrays (getdata strips any netCDF mask wrapper)
grid_lats = np.ma.getdata(lat_array[:, 0])
grid_lons = np.ma.getdata(lon_array[0])

# Align the buoy longitudes with the grid's longitude convention. If the grid
# uses only non-negative longitudes (0..360) while a buoy longitude is
# negative (-180..180 convention), or vice versa, the buoy would fall outside
# the grid: linear interpolation would return NaN and the nearest-column
# lookup would silently pick an edge column. When both already share a
# convention this leaves every value untouched.
query_points = np.array(buoy_points, dtype=float)
query_lons = query_points[:, 1]
if grid_lons.min() >= 0:
    query_points[:, 1] = np.where(query_lons < 0, query_lons + 360.0, query_lons)
else:
    query_points[:, 1] = np.where(query_lons > 180, query_lons - 360.0, query_lons)

# Step 2: Create a meshgrid of the lat and lon axes
lon_grid, lat_grid = np.meshgrid(grid_lons, grid_lats)

# Step 3: Flatten the meshgrid arrays into rows of [lat, lon] for interpolation
points = np.column_stack([lat_grid.ravel(), lon_grid.ravel()])

# Step 4: Interpolate the ugrd and vgrd values.
# Masked grid cells (if any) become NaN so that their fill values are never
# interpolated as if they were real wind speeds.
ugrd_filled = np.ma.filled(ugrd_array.astype(float), np.nan)
vgrd_filled = np.ma.filled(vgrd_array.astype(float), np.nan)
ugrd_values = ugrd_filled.ravel()
vgrd_values = vgrd_filled.ravel()

interpolated_ugrd = griddata(points, ugrd_values, query_points, method='linear')
interpolated_vgrd = griddata(points, vgrd_values, query_points, method='linear')

# Step 5: Assign the interpolated values to the DataFrame
combined_df_current['ugrd_gfs_interp'] = interpolated_ugrd
combined_df_current['vgrd_gfs_interp'] = interpolated_vgrd

# Step 6: Assign the discrete values (closest grid point) to the DataFrame.
# Broadcasting compares every grid coordinate with every buoy coordinate at
# once; argmin along axis 0 gives, per buoy, the index of the nearest grid
# row (latitude) and column (longitude).
lat_idx = np.abs(grid_lats[:, np.newaxis] - query_points[:, 0]).argmin(axis=0)
lon_idx = np.abs(grid_lons[:, np.newaxis] - query_points[:, 1]).argmin(axis=0)

assigned_ugrd = ugrd_filled[lat_idx, lon_idx]
assigned_vgrd = vgrd_filled[lat_idx, lon_idx]

combined_df_current['ugrd_gfs_discrete'] = assigned_ugrd
combined_df_current['vgrd_gfs_discrete'] = assigned_vgrd

# Ensure the new columns are of float type
for wind_column in ['ugrd_gfs_interp', 'vgrd_gfs_interp',
                    'ugrd_gfs_discrete', 'vgrd_gfs_discrete']:
    combined_df_current[wind_column] = combined_df_current[wind_column].astype(float)

# Display the updated DataFrame head
combined_df_current.head()