# Data Downloading
## 1. Download a collection of past buoy transmissions from the IABP website
This section downloads a collection of past buoy transmissions from the IABP website. The data is used to train machine learning algorithms.

In [None]:
import os
import pandas as pd
import requests

# Purpose: download the 2023 IABP Level-1 CSV, then split it into one CSV per
# buoy (keyed on the BuoyID column) under ../data/raw/buoydata/past.

# URL of the file to download
file_url = 'https://iabp.apl.uw.edu/Data_Products/LEVEL1_DATA/LEVEL1_2023.csv'

# Directory to save the downloaded file
temp_dir = '../data/raw/buoydata/past/temp'
os.makedirs(temp_dir, exist_ok=True)

# Path to save the downloaded file
temp_file_path = os.path.join(temp_dir, 'IABP_Level1_2023all.csv')

# Download the file. The body is streamed to disk in chunks instead of being
# buffered whole in memory, and a (connect, read) timeout in seconds keeps the
# cell from hanging forever if the server stops responding.
with requests.get(file_url, stream=True, timeout=(10, 60)) as response:
    response.raise_for_status()  # Check if the request was successful

    # Save the file
    with open(temp_file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)

print(f'Downloaded file to {temp_file_path}')

# Read the downloaded CSV file
df = pd.read_csv(temp_file_path)

# Directory to save the separated CSV files
output_dir = '../data/raw/buoydata/past'
os.makedirs(output_dir, exist_ok=True)

# Clear all files in the directory before saving new data, except the "temp" folder.
# This runs only after the download and parse succeeded, so a failed download
# does not wipe the previous per-buoy files.
for filename in os.listdir(output_dir):
    file_path = os.path.join(output_dir, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path) and filename != 'temp':
            # os.rmdir only removes empty directories; a non-empty one is
            # reported below and left in place.
            os.rmdir(file_path)
    except OSError as e:
        print(f'Failed to delete {file_path}. Reason: {e}')

# Separate the file into discrete CSV files based on the BuoyID column
for buoy_id, group in df.groupby('BuoyID'):
    output_file_path = os.path.join(output_dir, f'{buoy_id}.csv')
    group.to_csv(output_file_path, index=False)
    print(f'Saved {output_file_path}')

## 2. Download a collection of real-time buoy data for use in predictions
This section downloads real-time buoy data from the IABP website. Every buoy that has reported in the last 24 hours is queried and downloaded. The API occasionally returns server errors; buoys whose request produces an HTTP 500 error are skipped.

In [None]:
# Download the last n days of buoy data (change `ndays` below) for use in predictions. Buoy IDs are selected automatically: every buoy that has reported in the last 24 hours is included.
# The data will be saved in the data/raw/buoydata/current folder. Note that buoys that produce a 500 error will be skipped.

import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone

# Purpose: read the IABP table of current Arctic buoys, keep the buoys whose
# last report is at most 24 hours old, and download the last `ndays` days of
# data for each of them into ../data/raw/buoydata/current.

# (connect, read) timeout in seconds so a stalled server cannot hang the cell
request_timeout = (10, 60)

# URL to get the table of all buoys
table_url = 'https://iabp.apl.uw.edu/TABLES/ArcticTable_Current.txt'

# Fetch the table
response = requests.get(table_url, timeout=request_timeout)
response.raise_for_status()

# Convert the table to a DataFrame without a header
data = response.text.splitlines()
rows = [line.split(';') for line in data]
df = pd.DataFrame(rows)

# Manually select the first column (buoy ID) and the seventh column (date)
df = df[[0, 6]]
df.columns = ['BuoyID', 'Date']

# Get the current time
current_time = datetime.now(timezone.utc)
print(f"Current time: {current_time}")

# Filter buoy IDs that have reported in the last 24 hours
max_report_age = timedelta(hours=24)
bids = []
for buoy_id, date_text in zip(df['BuoyID'], df['Date']):
    # Rows with fewer than seven fields are padded with missing values by
    # pandas; strptime would raise TypeError on those, so skip them up front.
    if not isinstance(buoy_id, str) or not isinstance(date_text, str):
        continue
    try:
        # Dates are expected as MM/DD/YYYY HH:MM:SS and are treated as UTC
        report_time = datetime.strptime(
            date_text.strip(), '%m/%d/%Y %H:%M:%S'
        ).replace(tzinfo=timezone.utc)
    except ValueError:
        continue  # Skip rows with invalid date format
    if current_time - report_time <= max_report_age:
        # Strip padding so the ID is safe to use in the URL and the filename
        bids.append(buoy_id.strip())

print(f'Selected buoy IDs: {bids}')

# Directory to save the downloaded CSV files
output_dir = '../data/raw/buoydata/current'

# Create the directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Clear all files in the directory before downloading new data
for filename in os.listdir(output_dir):
    file_path = os.path.join(output_dir, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            # os.rmdir only removes empty directories; a non-empty one is
            # reported below and left in place.
            os.rmdir(file_path)
    except OSError as e:
        print(f'Failed to delete {file_path}. Reason: {e}')

# Number of days to download data for
ndays = 2

# Base URL for the API
base_url = 'https://iabp.apl.uw.edu/download'

# Iterate over each bid value
for bid in bids:
    # Construct the URL for the current bid
    url = f'{base_url}?bid={bid}&ndays={ndays}'

    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()  # Check if the request was successful

        # Construct the filename and save path
        filename = f'{bid}.csv'
        file_path = os.path.join(output_dir, filename)

        # Save the CSV file
        with open(file_path, 'wb') as file:
            file.write(response.content)

        print(f'Downloaded {filename} to {file_path}')
    except requests.exceptions.HTTPError as e:
        # Read the status from the exception's own response rather than from
        # whatever the outer `response` variable currently holds.
        if e.response is not None and e.response.status_code == 500:
            print(f"Skipping {bid} due to HTTP 500 error")
        else:
            print(f"HTTP error occurred for {bid}: {e}")
    except Exception as e:
        # Best-effort: one failing buoy (timeout, connection error, disk
        # error) must not abort the remaining downloads.
        print(f"An error occurred for {bid}: {e}")

## 3. Download ERA5 surface wind (u and v) products for interpolation with past buoy data
This section downloads ERA5 reanalysis wind data (u and v components) for 2023 and saves each component as a netCDF file. These data will be interpolated with the past buoy data and used as training data for machine learning algorithms.

In [16]:
# This script downloads ERA5 reanalysis wind data for 2023 from the CDS API in netCDF format: the u- and v-components of the wind at the 1 hPa pressure level, for every hour of every day in the requested months (only January is requested by default; add the other months to the request's month list to download the full year). The requested area covers latitudes 23°N to 90°N across all longitudes. The u and v components are saved as separate files in the data/raw/reanalyses/ERA5 directory.
# If the directory does not exist, it will be created. Files already in the directory are not deleted beforehand.
# WARNING: the output files are large (several hundred MB each for a single month) and the download may take a long time. Make sure you have enough disk space and a stable internet connection before running this script.

import os
import cdsapi

# Purpose: request the u and v wind components from the ERA5 pressure-level
# reanalysis through the CDS API and save each as its own netCDF file.

# Create the output directory if it does not exist
os.makedirs("../data/raw/reanalyses/ERA5", exist_ok=True)

# Set the CDSAPI_RC environment variable to the path of your .cdsapirc file
os.environ['CDSAPI_RC'] = '../.cdsapirc'

dataset = "reanalysis-era5-pressure-levels"

# Months of 2023 to request. Only January is enabled by default; add
# "02" ... "12" to download the rest of the year (each month adds to the
# download size and time).
era5_months = ["01"]


def _era5_wind_request(variable):
    """Return the CDS request dict for one wind component.

    Both components share every setting except the variable name, so the
    request is built in one place to keep the u and v downloads in sync.

    Args:
        variable: CDS variable name, e.g. "u_component_of_wind".

    Returns:
        A dict to pass as the request argument of ``client.retrieve``.
    """
    return {
        "product_type": ["reanalysis"],
        "variable": [variable],
        "year": ["2023"],
        "month": list(era5_months),
        # "01" .. "31"
        "day": [f"{day:02d}" for day in range(1, 32)],
        # "00:00" .. "23:00", i.e. every hour of the day
        "time": [f"{hour:02d}:00" for hour in range(24)],
        # NOTE(review): the section heading says "surface wind", but this is
        # the 1 hPa pressure level. Confirm the intended level (or dataset)
        # before relying on these files as surface winds.
        "pressure_level": ["1"],
        "data_format": "netcdf",
        "download_format": "unarchived",
        # Bounding box, presumably [North, West, South, East] per the CDS
        # API convention: 23N to 90N, all longitudes.
        "area": [90, -180, 23, 180],
    }


# One client is enough for both requests
client = cdsapi.Client()

# u component
request = _era5_wind_request("u_component_of_wind")
target_file = "../data/raw/reanalyses/ERA5/era5_uwnd_2023.nc"
client.retrieve(dataset, request, target_file)

# v component (left as the cell's last expression so the notebook still
# displays the returned target path)
request = _era5_wind_request("v_component_of_wind")
target_file = "../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc"
client.retrieve(dataset, request, target_file)

2024-11-07 15:32:11,048 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-11-07 15:32:11,049 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2024-11-07 15:32:11,050 INFO [2024-09-16T00:00:00] Remember that you need to have an ECMWF account to use the new CDS. **Your old CDS credentials will not work in new CDS!**
[Forum announcement](https://forum.ecmwf.int/t/final-validated-era5-product-to-differ-from-era5t-in-july-2024/6685)
for details and watch it for further updates on this.
2024-11-07 15:32:11,511 INFO Request ID is cafaf517-7c79-4f55-bc87-6d94b36d29b0
2024-11-07

f490db7cc025062d24651ab80175da7f.nc:   0%|          | 0.00/425M [00:00<?, ?B/s]

2024-11-07 15:34:14,677 INFO [2024-09-28T00:00:00] **Welcome to the New Climate Data Store (CDS)!** This new system is in its early days of full operations and still undergoing enhancements and fine tuning. Some disruptions are to be expected. Your 
[feedback](https://jira.ecmwf.int/plugins/servlet/desk/portal/1/create/202) is key to improve the user experience on the new CDS for the benefit of everyone. Thank you.
2024-11-07 15:34:14,679 INFO [2024-09-26T00:00:00] Watch our [Forum](https://forum.ecmwf.int/) for Announcements, news and other discussed topics.
2024-11-07 15:34:14,679 INFO [2024-09-16T00:00:00] Remember that you need to have an ECMWF account to use the new CDS. **Your old CDS credentials will not work in new CDS!**
[Forum announcement](https://forum.ecmwf.int/t/final-validated-era5-product-to-differ-from-era5t-in-july-2024/6685)
for details and watch it for further updates on this.
2024-11-07 15:34:15,270 INFO Request ID is d33ed661-3f3d-4163-90ce-31bcffeb5d6f
2024-11-07

1b4bc0e1230e9c9c199946081909c444.nc:   0%|          | 0.00/446M [00:00<?, ?B/s]

'../data/raw/reanalyses/ERA5/era5_vwnd_2023.nc'