## 1. Raw buoy data cleaning
This cell cleans the raw buoy data: it removes rows containing NAs, standardizes the lat/lon format so that all longitudes lie between 0 and 360 rather than -180 and 180, removes buoy locations outside of the Arctic (latitudes below 66), and removes buoy tracks with fewer than 50 rows (these buoys are assumed to have been destroyed or to have suffered instrument failure).

The resulting cleaned data will be stored in the data/cleaned/buoydata/past folder.

In [None]:
# Cleaning buoy data: removing rows with NAs, ensuring all longitude values are 0 to 360,
# removing rows with Lat values less than 66, and removing files with fewer than 50 rows

# TO DO: remove data on land

import os
import pandas as pd
import geopandas as gpd

input_directory = '../data/raw/buoydata/past'
output_directory = '../data/cleaned/buoydata/past'

# Minimum number of rows a cleaned track must have to be kept. Shorter tracks
# are treated as unusable and are not written to the output directory.
min_track_rows = 50

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    # Only CSV files are buoy tracks; skip anything else in the folder.
    if not filename.endswith('.csv'):
        continue

    input_file_path = os.path.join(input_directory, filename)

    # Read the CSV file
    df = pd.read_csv(input_file_path)

    # Remove rows with NAs in any column. The explicit copy makes df_cleaned an
    # independent frame, so the column assignment below cannot raise
    # pandas' SettingWithCopyWarning.
    df_cleaned = df.dropna().copy()

    # Shift negative longitudes by 360 so every value is on the 0 to 360 scale
    # instead of -180 to 180. Non-negative values are left untouched.
    lon = df_cleaned['Lon']
    df_cleaned['Lon'] = lon.mask(lon < 0, lon + 360)

    # Remove rows with Lat values less than 66
    df_cleaned = df_cleaned[df_cleaned['Lat'] >= 66]

    # Save the cleaned data unless the track has fewer than min_track_rows
    # rows. A track with exactly min_track_rows rows is kept, matching the
    # "less than 50 rows" wording of the message below.
    if len(df_cleaned) >= min_track_rows:
        output_file_path = os.path.join(output_directory, filename)
        df_cleaned.to_csv(output_file_path, index=False)
    else:
        print(f'{filename} has been deleted for having less than 50 rows')

# Print a message to indicate that the script has finished
print('All files have been cleaned and saved to the cleaned directory')

## 2. Current buoy data cleaning
This cell cleans the current buoy data: it removes rows containing NAs, standardizes the lat/lon format so that all longitudes lie between 0 and 360, and removes buoy locations outside of the Arctic (latitudes below 66). It is not practical to remove files for having fewer than 50 rows in this case, as collections of current buoy tracks necessarily have fewer transmissions in total.

The resulting cleaned data will be stored in the data/cleaned/buoydata/current folder.

In [None]:
# Cleaning current buoy data: removing rows with NAs, ensuring all longitude
# values are 0 to 360, and removing rows with Lat values less than 66.
# No minimum row count is applied to these files.

input_directory_current = '../data/raw/buoydata/current'
output_directory_current = '../data/cleaned/buoydata/current'

# Create the output directory if it doesn't exist
os.makedirs(output_directory_current, exist_ok=True)

for filename in os.listdir(input_directory_current):
    # Only CSV files are buoy tracks; skip anything else in the folder.
    if not filename.endswith('.csv'):
        continue

    input_file_path_current = os.path.join(input_directory_current, filename)

    # Read the CSV file
    df_current = pd.read_csv(input_file_path_current)

    # Remove rows with NAs in any column. The explicit copy makes
    # df_cleaned_current an independent frame, so the column assignment below
    # cannot raise pandas' SettingWithCopyWarning.
    df_cleaned_current = df_current.dropna().copy()

    # Shift negative longitudes by 360 so every value is on the 0 to 360 scale
    # instead of -180 to 180. Non-negative values are left untouched.
    lon_current = df_cleaned_current['Lon']
    df_cleaned_current['Lon'] = lon_current.mask(lon_current < 0, lon_current + 360)

    # Remove rows with Lat values less than 66
    df_cleaned_current = df_cleaned_current[df_cleaned_current['Lat'] >= 66]

    # Save the cleaned data to the output directory (written even when empty)
    output_file_path_current = os.path.join(output_directory_current, filename)
    df_cleaned_current.to_csv(output_file_path_current, index=False)

# Print a message to indicate that the script has finished
print('All current buoy data files have been cleaned and saved to the cleaned directory')