## 1. Raw buoy data cleaning
This cell cleans the buoy data: it removes rows with NAs, standardizes longitude values to the 0 to 360 range, removes buoy positions outside of the Arctic (latitude below 66), and discards buoy tracks with fewer than 50 rows (these are assumed to come from buoys that were destroyed or whose instruments failed).

The resulting cleaned data will be stored in the data/cleaned/buoydata/past folder.

In [1]:
# Clean the buoy data: remove rows with NAs, convert all longitude values to the 0 to 360 range,
# remove rows with Lat values less than 66, and skip files with fewer than 50 rows

# TO DO: remove data on land

import os
import pandas as pd
import geopandas as gpd

input_directory = '../data/raw/buoydata/past'
output_directory = '../data/cleaned/buoydata/past'

# Rows with a 'Lat' value below this threshold are dropped.
MIN_LATITUDE = 66
# Tracks with fewer rows than this (after cleaning) are not written out.
MIN_ROWS = 50


def clean_buoy_track(df, min_latitude=MIN_LATITUDE):
    """Return a cleaned copy of a single buoy track.

    The input frame is not modified. Cleaning steps, in order:

    1. Drop every row that contains at least one NA value.
    2. Shift negative 'Lon' values by +360 so longitudes are expressed
       as 0 to 360 instead of -180 to 180.
    3. Keep only rows whose 'Lat' value is >= ``min_latitude``.

    Args:
        df: DataFrame with at least the columns 'Lat' and 'Lon'.
        min_latitude: Lowest 'Lat' value that is kept.

    Returns:
        A new DataFrame containing the surviving rows.

    Raises:
        KeyError: If 'Lat' or 'Lon' is missing from ``df``.
    """
    cleaned = df.dropna()

    # Vectorized replacement for a per-element lambda; `assign` builds a new
    # frame, so no column is written onto the result of `dropna()` (which is
    # what can trigger pandas' SettingWithCopyWarning).
    lon = cleaned['Lon']
    cleaned = cleaned.assign(Lon=lon.mask(lon < 0, lon + 360))

    return cleaned[cleaned['Lat'] >= min_latitude]


# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Sorted so that files are processed (and messages printed) in a
# reproducible order; os.listdir makes no ordering guarantee.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.csv'):
        continue

    input_file_path = os.path.join(input_directory, filename)

    # Read the CSV file and clean it
    df = pd.read_csv(input_file_path)
    df_cleaned = clean_buoy_track(df)

    # Save the cleaned data unless the track has fewer than MIN_ROWS rows.
    # A track with exactly MIN_ROWS rows is kept.
    if len(df_cleaned) >= MIN_ROWS:
        output_file_path = os.path.join(output_directory, filename)
        df_cleaned.to_csv(output_file_path, index=False)
    else:
        # NOTE: nothing is removed from disk here; the file is simply not
        # written. A cleaned copy left by an earlier run stays in place.
        print(f'{filename} has been deleted for having less than {MIN_ROWS} rows')

# Print a message to indicate that the script has finished
print('All files have been cleaned and saved to the cleaned directory')

300025010734900.csv has been deleted for having less than 50 rows
300234062644380.csv has been deleted for having less than 50 rows
300234066034140.csv has been deleted for having less than 50 rows
300234066215880.csv has been deleted for having less than 50 rows
300234066216670.csv has been deleted for having less than 50 rows
300234066216690.csv has been deleted for having less than 50 rows
300234066216700.csv has been deleted for having less than 50 rows
300234067874480.csv has been deleted for having less than 50 rows
300234067877380.csv has been deleted for having less than 50 rows
300234067976260.csv has been deleted for having less than 50 rows
300234068346100.csv has been deleted for having less than 50 rows
300234068447190.csv has been deleted for having less than 50 rows
300234068448290.csv has been deleted for having less than 50 rows
300234068767570.csv has been deleted for having less than 50 rows
300234068769540.csv has been deleted for having less than 50 rows
3005340628