In [87]:
# Concatenate multiple CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Define the path to the folder containing the CSV files
folder_path = '../data/raw/buoydata/past'

# Use glob to get all CSV files in the folder
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Make a new column in the dataframe of DOY truncated to an integer
combined_df['DOY_int'] = combined_df['DOY'].astype(int)

# Display the combined DataFrame
combined_df.head()

Unnamed: 0,BuoyID,Year,Hour,Min,DOY,POS_DOY,Lat,Lon,BP,Ts,Ta,iIceC,iBP,iTs,iTa_2m,iWindE_0Layer,iWindN_0Layer,DOY_int
0,145803,2024,23,0,233.9583,233.9583,83.70087,-99.84067,1001,-1.47,-33.67,0.9,1000.24,-1.45,-1.33,-9.82,0.28,233
1,145803,2024,0,0,234.0,234.0,83.70457,-99.90634,1000,-1.48,-33.57,0.89,999.48,-1.47,-1.32,-10.08,0.24,234
2,145803,2024,1,0,234.0417,234.0417,83.70965,-99.9744,999,-1.49,-33.53,0.89,998.57,-1.56,-1.34,-10.38,0.15,234
3,145803,2024,2,0,234.0833,234.0833,83.71607,-100.03897,998,-1.54,-33.68,0.88,997.7,-1.64,-1.32,-10.66,0.14,234
4,145803,2024,3,0,234.125,234.125,83.72346,-100.0944,997,-1.57,-32.58,0.88,996.87,-1.71,-1.45,-10.88,0.05,234


In [81]:
# Convert the uwnd and vwnd NetCDF files to 3D numpy arrays and convert the time variable as day of year (DOY)

import netCDF4 as nc
import numpy as np
import datetime

# Open the NetCDF file
file_path = '..\\data\\raw\\reanalyses\\ncep\\uwnd.sfc.2024.nc'
dataset = nc.Dataset(file_path, 'r')

# Extract the uwnd variable
uwnd_var = dataset.variables['uwnd']

# Convert the uwnd variable to a 3D numpy array
uwnd_3d_array = uwnd_var[:]

# Open the vwnd NetCDF file
vwnd_file_path = '..\\data\\raw\\reanalyses\\ncep\\vwnd.sfc.2024.nc'
vwnd_dataset = nc.Dataset(vwnd_file_path, 'r')

# Extract the vwnd variable
vwnd_var = vwnd_dataset.variables['vwnd']

# Convert the vwnd variable to a 3D numpy array
vwnd_3d_array = vwnd_var[:]

# Convert the time variable to day of year (DOY)
time_var = dataset.variables['time']
reference_date_str = time_var.units.split('since')[1].strip().split('.')[0]
reference_date = datetime.datetime.strptime(reference_date_str, '%Y-%m-%d %H:%M:%S')
doy = [(reference_date + datetime.timedelta(days=t)).timetuple().tm_yday for t in time_var[:]]

# Ensure the DOY values are within the valid range
doy = np.array(doy)
valid_indices = (doy >= 1) & (doy <= 365)
doy = doy[valid_indices]
uwnd_3d_array = uwnd_3d_array[valid_indices]
vwnd_3d_array = vwnd_3d_array[valid_indices]

# Print the shape of the uwnd and vwnd arrays
print(uwnd_3d_array.shape)
print(vwnd_3d_array.shape)

# Close the NetCDF files
dataset.close()

# Print the minimum and maximum values of the lat and lon in each array
print(f"uwnd_3d_array lat min: {latitudes.min()}, lat max: {latitudes.max()}")
print(f"uwnd_3d_array lon min: {longitudes.min()}, lon max: {longitudes.max()}")

print(f"vwnd_3d_array lat min: {latitudes.min()}, lat max: {latitudes.max()}")
print(f"vwnd_3d_array lon min: {longitudes.min()}, lon max: {longitudes.max()}")


Converted DOY values: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 

In [97]:
combined_df = combined_df[combined_df['DOY_int'] == 100]

In [102]:
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
from scipy.interpolate import RegularGridInterpolator

# Assuming `time`, `latitudes`, `longitudes`, and `uwnd_3d_array` are already defined
reference_date_str = time_var.units.split('since')[1].strip().split('.')[0]
reference_date = datetime.strptime(reference_date_str, '%Y-%m-%d %H:%M:%S')

doy_values = [(reference_date + timedelta(days=t)).timetuple().tm_yday for t in time]

uwnd_interpolator = RegularGridInterpolator((doy_values, latitudes, longitudes), uwnd_3d_array)

# Function to extract wind components for each row
def extract_wind_components(row):
    point = row.geometry
    doy = row.DOY_int

    if doy not in doy_values:
        return pd.Series({'uwnd': np.nan})

    uwnd_value = uwnd_interpolator((doy, point.y, point.x))

    return pd.Series({'uwnd': uwnd_value})

# Get a list of all the unique DOYs in the buoy data
unique_doys = combined_df['DOY_int'].unique()

# Iterate through each unique DOY in combined_df
result_dfs = []

for doy in unique_doys:
    # Subset combined_df by DOY
    subset_df = combined_df[combined_df['DOY_int'] == doy].copy()

    # Convert the subset to a GeoDataFrame
    gdf = gpd.GeoDataFrame(subset_df, geometry=gpd.points_from_xy(subset_df.Lon, subset_df.Lat), crs="EPSG:4326")

    # Apply the function to extract wind components
    gdf['uwnd'] = gdf.apply(extract_wind_components, axis=1)

    # Append the result to the list
    result_dfs.append(gdf)

# Combine the subsets back into a single DataFrame
combined_result_df = pd.concat(result_dfs, ignore_index=True)

# Display the updated DataFrame
print(combined_result_df.head())

AttributeError: NetCDF: Not a valid ID