## 1. Raw buoy data cleaning
This cell will clean the buoy data to remove rows with NAs, ensure that the format of lat/lon pairs is standardized, remove buoy locations outside of the Arctic, and remove buoy tracks with fewer than 50 rows (these buoys are assumed to have been destroyed or to have suffered instrument failure).

The resulting cleaned data will be stored in the data/cleaned/buoydata/past folder.

In [None]:
import os
import pandas as pd
import rasterio
from pyproj import Transformer
import shutil

# Where the raw "past" buoy CSVs live, where cleaned copies are written,
# where the audit file of removed rows goes, and which raster is used
# to test point locations.
raster_path = '../data/raw/geospatial/arctic_land.tif'
input_directory = '../data/raw/buoydata/past'
output_directory = '../data/cleaned/buoydata/past'
removed_rows_output_path = '../data/cleaned/buoydata/removed_rows_past.csv'

# Start every run from an empty output directory: discard results of any
# previous run, then (re)create the directory.
if os.path.exists(output_directory):
    shutil.rmtree(output_directory)
os.makedirs(output_directory, exist_ok=True)

# Accumulates every row removed by the raster check, for later validation
removed_rows_df = pd.DataFrame()

# Transformer from EPSG:4326 to EPSG:3413; with always_xy=True it is
# called with longitude first, then latitude.
transformer = Transformer.from_crs(
    "epsg:4326",
    "epsg:3413",
    always_xy=True,
)

# Function to project coordinates from WGS 1984 to EPSG 3413
def project_to_epsg3413(lat, lon):
    """Project a latitude/longitude pair with the module-level transformer.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        A ``(x, y)`` tuple in the transformer's target CRS (EPSG:3413).
    """
    # The transformer is called longitude-first, the reverse of this
    # function's (lat, lon) argument order.
    easting, northing = transformer.transform(lon, lat)
    return (easting, northing)

# Open the raster file
with rasterio.open(raster_path) as src:
    # Read band 1 into memory once so each point lookup is plain array indexing
    raster_data = src.read(1)
    affine_transform = src.transform

    # Function to check if a point overlaps with raster cells with value=1
    def overlaps_raster_value_one(lat, lon):
        """Return True if the point falls on a raster cell whose value is 1.

        Args:
            lat: Latitude of the point.
            lon: Longitude of the point.

        Returns:
            True when the projected point maps to a cell inside the raster
            array and that cell equals 1. False when either coordinate is
            missing, when ``src.index`` raises ``ValueError``, or when the
            cell lies outside the array.
        """
        if pd.isna(lat) or pd.isna(lon):
            return False
        x, y = project_to_epsg3413(lat, lon)
        try:
            row, col = src.index(x, y)
        except ValueError:
            return False
        n_rows, n_cols = raster_data.shape
        # Explicit bounds check: negative indices would otherwise wrap
        # around and silently read a cell on the opposite edge.
        if 0 <= row < n_rows and 0 <= col < n_cols:
            return bool(raster_data[row, col] == 1)
        return False

    # Removed rows are collected per file and concatenated once at the end,
    # instead of re-copying the growing accumulator on every iteration.
    removed_rows_parts = []

    # Sorted so the processing order (and therefore the row order of the
    # removed-rows CSV) is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith('.csv'):
            continue
        input_file_path = os.path.join(input_directory, filename)

        # Read the CSV file
        df = pd.read_csv(input_file_path)

        # Remove rows with NAs; copy so the column assignments below act on
        # an independent frame.
        df_cleaned = df.dropna().copy()

        # Add projected coordinates. Built from plain lists so that a file
        # with no remaining rows yields empty columns instead of failing
        # when an empty zip is unpacked into two targets.
        projected = [
            project_to_epsg3413(lat, lon)
            for lat, lon in zip(df_cleaned['Lat'], df_cleaned['Lon'])
        ]
        df_cleaned['x'] = [x for x, _ in projected]
        df_cleaned['y'] = [y for _, y in projected]

        # Determine which rows overlap with raster cells with value=1.
        # dtype=bool keeps the column usable as a boolean mask even when
        # the frame is empty.
        df_cleaned['overlaps'] = pd.Series(
            [
                overlaps_raster_value_one(lat, lon)
                for lat, lon in zip(df_cleaned['Lat'], df_cleaned['Lon'])
            ],
            index=df_cleaned.index,
            dtype=bool,
        )
        removed_rows = df_cleaned[df_cleaned['overlaps']]
        df_cleaned = df_cleaned[~df_cleaned['overlaps']]

        # Keep removed rows (including the helper columns) for validation
        removed_rows_parts.append(removed_rows)

        # Drop the 'overlaps' and projected coordinate columns
        df_cleaned = df_cleaned.drop(columns=['overlaps', 'x', 'y'])

        # Save the cleaned data unless the file has less than 50 rows.
        # A track with exactly 50 rows is kept: only tracks with fewer
        # than 50 rows are discarded.
        if len(df_cleaned) >= 50:
            output_file_path = os.path.join(output_directory, filename)
            df_cleaned.to_csv(output_file_path, index=False)
        else:
            print(f'{filename} has been deleted for having less than 50 rows')

# Combine the removed rows from every file and save them to a separate CSV
# for validation
removed_rows_df = pd.concat([removed_rows_df, *removed_rows_parts])
removed_rows_df.to_csv(removed_rows_output_path, index=False)

# Print a message to indicate that the script has finished
print('All files have been cleaned and saved to the cleaned directory')

## 2. Current buoy data cleaning.
This cell will clean the buoy data to remove rows with NAs, ensure that the format of lat/lon pairs is standardized, and remove buoy locations outside of the Arctic. It is not practical to remove files for having fewer than 50 rows in this case, as collections of current buoy tracks necessarily have fewer transmissions in total.

The resulting cleaned data will be stored in the data/cleaned/buoydata/current folder.

In [None]:
import os
import pandas as pd
import rasterio
from pyproj import Transformer
import shutil

# Where the raw "current" buoy CSVs live, where cleaned copies are written,
# where the audit file of removed rows goes, and which raster is used
# to test point locations.
raster_path = '../data/raw/geospatial/arctic_land.tif'
input_directory = '../data/raw/buoydata/current'
output_directory = '../data/cleaned/buoydata/current'
removed_rows_output_path = '../data/cleaned/buoydata/removed_rows_current.csv'

# Start every run from an empty output directory: discard results of any
# previous run, then (re)create the directory.
if os.path.exists(output_directory):
    shutil.rmtree(output_directory)
os.makedirs(output_directory, exist_ok=True)

# Accumulates every row removed by the raster check, for later validation
removed_rows_df = pd.DataFrame()

# Transformer from EPSG:4326 to EPSG:3413; with always_xy=True it is
# called with longitude first, then latitude.
transformer = Transformer.from_crs(
    "epsg:4326",
    "epsg:3413",
    always_xy=True,
)

# Function to project coordinates from WGS 1984 to EPSG 3413
def project_to_epsg3413(lat, lon):
    """Project a latitude/longitude pair with the module-level transformer.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        A ``(x, y)`` tuple in the transformer's target CRS (EPSG:3413).
    """
    # The transformer is called longitude-first, the reverse of this
    # function's (lat, lon) argument order.
    easting, northing = transformer.transform(lon, lat)
    return (easting, northing)

# Open the raster file
with rasterio.open(raster_path) as src:
    # Read band 1 into memory once so each point lookup is plain array indexing
    raster_data = src.read(1)
    affine_transform = src.transform

    # Function to check if a point overlaps with raster cells with value=1
    def overlaps_raster_value_one(lat, lon):
        """Return True if the point falls on a raster cell whose value is 1.

        Args:
            lat: Latitude of the point.
            lon: Longitude of the point.

        Returns:
            True when the projected point maps to a cell inside the raster
            array and that cell equals 1. False when either coordinate is
            missing, when ``src.index`` raises ``ValueError``, or when the
            cell lies outside the array.
        """
        if pd.isna(lat) or pd.isna(lon):
            return False
        x, y = project_to_epsg3413(lat, lon)
        try:
            row, col = src.index(x, y)
        except ValueError:
            return False
        n_rows, n_cols = raster_data.shape
        # Explicit bounds check: negative indices would otherwise wrap
        # around and silently read a cell on the opposite edge.
        if 0 <= row < n_rows and 0 <= col < n_cols:
            return bool(raster_data[row, col] == 1)
        return False

    # Removed rows are collected per file and concatenated once at the end,
    # instead of re-copying the growing accumulator on every iteration.
    removed_rows_parts = []

    # Sorted so the processing order (and therefore the row order of the
    # removed-rows CSV) is reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith('.csv'):
            continue
        input_file_path = os.path.join(input_directory, filename)

        # Read the CSV file
        df = pd.read_csv(input_file_path)

        # Remove rows with NAs; copy so the column assignments below act on
        # an independent frame.
        df_cleaned = df.dropna().copy()

        # Add projected coordinates. Built from plain lists so that a file
        # with no remaining rows yields empty columns instead of failing
        # when an empty zip is unpacked into two targets.
        projected = [
            project_to_epsg3413(lat, lon)
            for lat, lon in zip(df_cleaned['Lat'], df_cleaned['Lon'])
        ]
        df_cleaned['x'] = [x for x, _ in projected]
        df_cleaned['y'] = [y for _, y in projected]

        # Determine which rows overlap with raster cells with value=1.
        # dtype=bool keeps the column usable as a boolean mask even when
        # the frame is empty.
        df_cleaned['overlaps'] = pd.Series(
            [
                overlaps_raster_value_one(lat, lon)
                for lat, lon in zip(df_cleaned['Lat'], df_cleaned['Lon'])
            ],
            index=df_cleaned.index,
            dtype=bool,
        )
        removed_rows = df_cleaned[df_cleaned['overlaps']]
        df_cleaned = df_cleaned[~df_cleaned['overlaps']]

        # Keep removed rows (including the helper columns) for validation
        removed_rows_parts.append(removed_rows)

        # Drop the 'overlaps' and projected coordinate columns
        df_cleaned = df_cleaned.drop(columns=['overlaps', 'x', 'y'])

        # Save the cleaned data to the output directory
        output_file_path = os.path.join(output_directory, filename)
        df_cleaned.to_csv(output_file_path, index=False)

# Combine the removed rows from every file and save them to a separate CSV
# for validation
removed_rows_df = pd.concat([removed_rows_df, *removed_rows_parts])
removed_rows_df.to_csv(removed_rows_output_path, index=False)

# Print a message to indicate that the script has finished
print('All files have been cleaned and saved to the cleaned directory')