In [10]:
# Concatenate multiple CSV files into a single DataFrame and add a new column with the Day of Year (DOY) as an integer

import pandas as pd
import glob
import os

# Define the path to the folder containing the CSV files
folder_path = '../data/raw/buoydata/past'

# Collect every CSV file in the folder. glob returns matches in arbitrary,
# OS-dependent order, so sort them: with ignore_index=True the row labels of
# the concatenated frame depend on file order, and sorting keeps those labels
# identical from run to run.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail early with a readable message; pd.concat on an empty list raises an
# opaque "No objects to concatenate" error instead.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files found in {os.path.abspath(folder_path)}"
    )

# Initialize an empty list to store DataFrames
dfs = []

# Loop through the list of CSV files and read each one into a DataFrame
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Make a new column in the dataframe of DOY truncated to an integer.
# NOTE: astype(int) raises if the DOY column contains missing values.
combined_df['DOY_int'] = combined_df['DOY'].astype(int)

# Rename the lat and lon columns to Latitude and Longitude.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run and
# avoids mutating a frame that another name may still reference.
combined_df = combined_df.rename(columns={'Lat': 'Latitude', 'Lon': 'Longitude'})

# Display the combined DataFrame
combined_df.head()

Unnamed: 0,BuoyID,Year,Hour,Min,DOY,POS_DOY,Latitude,Longitude,BP,Ts,Ta,iIceC,iBP,iTs,iTa_2m,iWindE_0Layer,iWindN_0Layer,DOY_int
0,145803,2024,23,0,233.9583,233.9583,83.70087,260.15933,1001,-1.47,-33.67,0.9,1000.24,-1.45,-1.33,-9.82,0.28,233
1,145803,2024,0,0,234.0,234.0,83.70457,260.09366,1000,-1.48,-33.57,0.89,999.48,-1.47,-1.32,-10.08,0.24,234
2,145803,2024,1,0,234.0417,234.0417,83.70965,260.0256,999,-1.49,-33.53,0.89,998.57,-1.56,-1.34,-10.38,0.15,234
3,145803,2024,2,0,234.0833,234.0833,83.71607,259.96103,998,-1.54,-33.68,0.88,997.7,-1.64,-1.32,-10.66,0.14,234
4,145803,2024,3,0,234.125,234.125,83.72346,259.9056,997,-1.57,-32.58,0.88,996.87,-1.71,-1.45,-10.88,0.05,234


In [11]:
# Geographic extent of the buoy observations: pull each coordinate column
# once, then take its smallest and largest value.
lat_column = combined_df['Latitude']
lon_column = combined_df['Longitude']

min_latitude, max_latitude = lat_column.min(), lat_column.max()
min_longitude, max_longitude = lon_column.min(), lon_column.max()

# Report the bounding box, one line per coordinate
print(f"Latitude: min = {min_latitude}, max = {max_latitude}")
print(f"Longitude: min = {min_longitude}, max = {max_longitude}")

Latitude: min = 66.0, max = 119.715
Longitude: min = 0.0001, max = 359.9999


In [17]:
# Convert the uwnd and vwnd NetCDF files to 3D numpy arrays and convert the time variable as day of year (DOY)

import netCDF4 as nc
import numpy as np
import datetime

# Paths to the NCEP reanalysis surface-wind files. Forward slashes resolve on
# Windows, macOS and Linux alike; the previous backslash-separated literals
# only resolved on Windows.
file_path = '../data/raw/reanalyses/ncep/uwnd.sfc.2024.nc'
vwnd_file_path = '../data/raw/reanalyses/ncep/vwnd.sfc.2024.nc'

# Read inside `with` blocks so each file is closed even if a read raises.
with nc.Dataset(file_path, 'r') as dataset:
    # Extract the uwnd variable and load it as a 3D array
    uwnd_var = dataset.variables['uwnd']
    uwnd_3d_array = uwnd_var[:]

with nc.Dataset(vwnd_file_path, 'r') as vwnd_dataset:
    # Extract the vwnd variable and load it as a 3D array
    vwnd_var = vwnd_dataset.variables['vwnd']
    vwnd_3d_array = vwnd_var[:]

    # Extract the latitudes and longitudes (read from the vwnd file only)
    latitudes = vwnd_dataset.variables['lat'][:]
    longitudes = vwnd_dataset.variables['lon'][:]

    # Decode the time axis using the unit declared in the file. The previous
    # code added every raw value as *days* to the reference date regardless
    # of the unit named in `time_var.units` (e.g. "hours since ..."), which
    # yields wrong dates whenever the unit is not days.
    time_var = vwnd_dataset.variables['time']
    time_dates = nc.num2date(
        time_var[:],
        units=time_var.units,
        calendar=getattr(time_var, 'calendar', 'standard'),
    )

# Convert each decoded date to its day of year (1-based)
doy = np.array([d.timetuple().tm_yday for d in time_dates])

# The time axis decoded above comes from the vwnd file and is applied to both
# arrays, so they must have the same number of time steps.
if uwnd_3d_array.shape[0] != doy.size or vwnd_3d_array.shape[0] != doy.size:
    raise ValueError(
        "uwnd and vwnd files do not share the same number of time steps: "
        f"{uwnd_3d_array.shape[0]} vs {vwnd_3d_array.shape[0]}"
    )

# Ensure the DOY values are within the valid range. The upper bound is 366 so
# that the last day of a leap year is not discarded.
valid_indices = (doy >= 1) & (doy <= 366)
doy = doy[valid_indices]
uwnd_3d_array = uwnd_3d_array[valid_indices]
vwnd_3d_array = vwnd_3d_array[valid_indices]

# Print the shape of the uwnd and vwnd arrays
print(uwnd_3d_array.shape)
print(vwnd_3d_array.shape)

# Print the minimum and maximum values of the lat and lon in each array.
# NOTE(review): both pairs of lines report the coordinates read from the vwnd
# file; this presumes the uwnd file uses the same grid - confirm.
print(f"uwnd_3d_array lat min: {latitudes.min()}, lat max: {latitudes.max()}")
print(f"uwnd_3d_array lon min: {longitudes.min()}, lon max: {longitudes.max()}")

print(f"vwnd_3d_array lat min: {latitudes.min()}, lat max: {latitudes.max()}")
print(f"vwnd_3d_array lon min: {longitudes.min()}, lon max: {longitudes.max()}")


(297, 73, 144)
(297, 73, 144)
uwnd_3d_array lat min: -90.0, lat max: 90.0
uwnd_3d_array lon min: 0.0, lon max: 357.5
vwnd_3d_array lat min: -90.0, lat max: 90.0
vwnd_3d_array lon min: 0.0, lon max: 357.5


In [24]:
# Keep only the buoy records whose integer day of year is 100.
# .copy() yields an independent frame, so columns assigned to it later are not
# written into a slice of the original (which raises SettingWithCopyWarning).
# NOTE: this rebinds combined_df; the unfiltered data must be reloaded to
# process a different day.
combined_df = combined_df[combined_df['DOY_int'] == 100].copy()

# Save the filtered frame to a CSV file.
# Build the paths with os.path.join so they resolve on every OS (a literal
# '..\\data\\processed' is a single oddly named directory on Linux/macOS), and
# let makedirs tolerate an already existing directory instead of checking first.
processed_dir = os.path.join('..', 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)
combined_df.to_csv(os.path.join(processed_dir, 'combined_data.csv'), index=False)

In [26]:
# THIS CURRENTLY DOES NOT WORK...TRYING TO FIX

import pandas as pd
import numpy as np
from scipy.interpolate import griddata

# Assuming combined_df, uwnd_3d_array, vwnd_3d_array, latitudes, and longitudes are already defined

# Work on an independent copy so the .loc assignments below never write into a
# slice of another frame (avoids SettingWithCopyWarning / lost writes).
combined_df = combined_df.copy()

# Pre-create the result columns as float NaN so they exist, with the right
# dtype, even when a DOY has no matching reanalysis layer.
combined_df['uwnd_ncep'] = np.nan
combined_df['vwnd_ncep'] = np.nan

# --- Build the (lat, lon) coordinates of the reanalysis grid once ----------
# The grid is identical for every day, so it is hoisted out of the loop.
grid_lats = np.ma.filled(latitudes, np.nan).astype(float)
grid_lons = np.ma.filled(longitudes, np.nan).astype(float)

# A global grid ends one step short of 360 degrees (e.g. 0 ... 357.5), so buoys
# located between the last grid longitude and 360 would fall outside the
# interpolation hull and come back as NaN. When the grid is cyclic, append a
# copy of the first longitude column at +360 degrees to close the gap.
wrap_longitude = bool(
    grid_lons.size > 1
    and np.isclose(grid_lons[-1] + (grid_lons[1] - grid_lons[0]), grid_lons[0] + 360.0)
)
if wrap_longitude:
    grid_lons = np.append(grid_lons, grid_lons[0] + 360.0)

# Row-major (latitude outer, longitude inner) ordering matches the order in
# which a (lat, lon) layer is flattened below.
lat_mesh, lon_mesh = np.meshgrid(grid_lats, grid_lons, indexing='ij')
points = np.column_stack([lat_mesh.ravel(), lon_mesh.ravel()])


def _layer_values(layer):
    """Flatten one (lat, lon) layer, adding the cyclic longitude column if used."""
    # Masked cells become NaN so they are never interpolated as real values
    grid = np.ma.filled(np.ma.asarray(layer).astype(float), np.nan)
    if wrap_longitude:
        grid = np.concatenate([grid, grid[:, :1]], axis=1)
    return grid.ravel()


n_time_steps = uwnd_3d_array.shape[0]

# Step 1: Get a list of the unique DOY_int values in the data frame
unique_doy = combined_df['DOY_int'].unique()

# Step 2: Iterate through each unique value.
# The loop variable is deliberately not called `doy`, which would overwrite
# the day-of-year array built when the NetCDF files were loaded.
for doy_value in unique_doy:
    # Step 3: Select the corresponding layers of the 3D arrays for the same DOY.
    # Assumes layer 0 is DOY 1 and the time axis has no gaps.
    layer_index = int(doy_value) - 1
    if not 0 <= layer_index < n_time_steps:
        # Without this guard DOY 0 would silently pick the LAST layer
        # (index -1) and a DOY past the end would raise IndexError.
        print(f"DOY: {doy_value} has no matching reanalysis layer; leaving wind columns as NaN")
        continue
    uwnd_layer = uwnd_3d_array[layer_index]
    vwnd_layer = vwnd_3d_array[layer_index]

    # Debug: Print the shape of the selected layers
    print(f"DOY: {doy_value}, uwnd_layer shape: {uwnd_layer.shape}, vwnd_layer shape: {vwnd_layer.shape}")

    # Get the buoy data for the current DOY
    doy_mask = combined_df['DOY_int'] == doy_value
    buoy_data = combined_df[doy_mask]

    # Debug: Print the buoy data for the current DOY
    print(f"Buoy data for DOY {doy_value}:\n{buoy_data[['Latitude', 'Longitude']].head()}")

    # Step 4: Interpolate the buoy data with the corresponding uwnd and vwnd values.
    # Buoy positions outside the grid (e.g. an invalid latitude above 90)
    # stay NaN, which is how griddata marks points outside the hull.
    uwnd_values = _layer_values(uwnd_layer)
    vwnd_values = _layer_values(vwnd_layer)

    buoy_points = buoy_data[['Latitude', 'Longitude']].values
    interpolated_uwnd = griddata(points, uwnd_values, buoy_points, method='linear')
    interpolated_vwnd = griddata(points, vwnd_values, buoy_points, method='linear')

    # Debug: Print the interpolated values
    print(f"Interpolated uwnd for DOY {doy_value}:\n{interpolated_uwnd}")
    print(f"Interpolated vwnd for DOY {doy_value}:\n{interpolated_vwnd}")

    # Step 5: Append the results as new columns in the dataframe
    combined_df.loc[doy_mask, 'uwnd_ncep'] = interpolated_uwnd
    combined_df.loc[doy_mask, 'vwnd_ncep'] = interpolated_vwnd

# Ensure the new columns are of float type
combined_df['uwnd_ncep'] = combined_df['uwnd_ncep'].astype(float)
combined_df['vwnd_ncep'] = combined_df['vwnd_ncep'].astype(float)

# Debug: Print the updated dataframe
print("Updated combined_df with interpolated values:\n", combined_df.head())

# Check specific row for expected value. The label only exists for some
# inputs, so guard the lookup instead of letting .at raise KeyError.
check_row = 743
if check_row in combined_df.index:
    print(f"Value of uwnd_ncep at row {check_row}: {combined_df.at[check_row, 'uwnd_ncep']}")
else:
    print(f"Row {check_row} is not present in combined_df; skipping spot check")

DOY: 100, uwnd_layer shape: (73, 144), vwnd_layer shape: (73, 144)
Buoy data for DOY 100:
     Latitude  Longitude
743   73.1816   213.7466
744   73.1769   213.7523
745   73.1724   213.7577
746   73.1684   213.7623
747   73.1655   213.7669
Interpolated uwnd for DOY 100:
[4.42304822 4.42053302 4.41812463 ... 1.63346364 1.63158367 1.62974323]
Interpolated vwnd for DOY 100:
[-3.21146965 -3.21657406 -3.22145207 ...  0.79509005  0.79359839
  0.79209753]
Updated combined_df with interpolated values:
               BuoyID  Year  Hour  Min       DOY   POS_DOY  Latitude  \
743  300025010923700  2024     0    0  100.0000  100.0000   73.1816   
744  300025010923700  2024     1    0  100.0417  100.0417   73.1769   
745  300025010923700  2024     2    0  100.0833  100.0833   73.1724   
746  300025010923700  2024     3    0  100.1250  100.1250   73.1684   
747  300025010923700  2024     4    0  100.1667  100.1667   73.1655   

     Longitude    BP     Ts     Ta  iIceC      iBP    iTs  iTa_2m  \
743 