# Data Downloading
## 1. Download a collection of past buoy transmissions from the IABP website (with interpolated MERRA-2 data)
This section downloads a collection of past buoy transmissions from a (hidden) section of the IABP website, where the buoy data were interpolated with MERRA-2 reanalysis data for another project. We reuse that data here to train machine learning algorithms.

In [None]:
#Download all the csv files from the IABP interpolated data storage website and save them in the data/raw/buoydata folder for later use

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URL of the webpage to scrape
url = 'https://iabp.apl.uw.edu/Data_Products/Daily_Interp/BuoyData_2024/'

# Directory to save the downloaded CSV files
output_dir = '../data/raw/buoydata/past'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection and the cell never finishes.
request_timeout = (10, 120)

# Create the directory if it does not exist (no error if it already does)
os.makedirs(output_dir, exist_ok=True)

# Reuse a single HTTP session so the connection is kept alive across the
# index page and every CSV download instead of reconnecting for each file
with requests.Session() as session:
    # Fetch the directory listing page
    response = session.get(url, timeout=request_timeout)
    response.raise_for_status()  # Stop here if the listing could not be fetched

    # Parse the webpage content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links ending with .csv
    csv_links = soup.find_all('a', href=lambda href: href and href.endswith('.csv'))

    # Download each CSV file; any failed request raises and aborts the loop
    for link in csv_links:
        # hrefs on the listing page may be relative, so resolve against the page URL
        csv_url = urljoin(url, link['href'])
        csv_response = session.get(csv_url, timeout=request_timeout)
        csv_response.raise_for_status()

        # Use the last path component of the URL as the local file name
        filename = os.path.join(output_dir, os.path.basename(csv_url))

        # Save the CSV file
        with open(filename, 'wb') as file:
            file.write(csv_response.content)

        print(f'Downloaded {filename}')

if csv_links:
    print('All files downloaded successfully!')
else:
    # Do not claim success when the page layout changed or the listing is empty
    print(f'No CSV links were found at {url}; nothing was downloaded.')

## 2. Download a collection of real-time buoy data for use in predictions
This section will download real-time buoy data from the IABP website. All buoys that have reported in the last 24 hours will be queried and downloaded. Sometimes server errors can occur with the API so those that produce a 500 error will be skipped.

In [None]:
# Download the last n days of data (ndays, which you can change below) for every buoy that has reported in the last 24 hours
# The data will be saved in the data/raw/buoydata/current folder. Note that buoys that produce a 500 error will be skipped.

import os
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone

# URL to get the table of all buoys
table_url = 'https://iabp.apl.uw.edu/TABLES/ArcticTable_Current.txt'

# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection.
request_timeout = (10, 120)

# Fetch the table
response = requests.get(table_url, timeout=request_timeout)
response.raise_for_status()

# Convert the table to a DataFrame without a header.
# Blank lines are dropped so they do not become rows with missing columns.
data = response.text.splitlines()
rows = [line.split(';') for line in data if line.strip()]
df = pd.DataFrame(rows)

# Fail with a clear message (instead of a KeyError below) if the table is
# empty or no longer has the expected layout
if df.shape[1] < 7:
    raise ValueError(
        f'Expected at least 7 ";"-separated columns in {table_url}, found {df.shape[1]}'
    )

# Manually select the first column (buoy ID) and the seventh column (date)
df = df[[0, 6]].copy()
df.columns = ['BuoyID', 'Date']

# Get the current time
current_time = datetime.now(timezone.utc)
print(f"Current time: {current_time}")

# Filter buoy IDs that have reported in the last 24 hours
# NOTE(review): report times are treated as UTC, as in the original code - confirm against the table docs
cutoff = current_time - timedelta(hours=24)
bids = []
for buoy_id, raw_date in zip(df['BuoyID'], df['Date']):
    # Rows shorter than 7 fields are padded with None by pandas; skip them
    # rather than letting strptime raise a TypeError
    if buoy_id is None or raw_date is None:
        continue
    buoy_id = buoy_id.strip()
    if not buoy_id:
        continue
    try:
        # Dates are expected as MM/DD/YYYY HH:MM:SS; strip padding around the field
        report_time = datetime.strptime(raw_date.strip(), '%m/%d/%Y %H:%M:%S').replace(tzinfo=timezone.utc)
    except ValueError:
        continue  # Skip rows with invalid date format (e.g. a header line)
    if report_time >= cutoff:
        bids.append(buoy_id)

print(f'Selected buoy IDs: {bids}')

# Directory to save the downloaded CSV files
output_dir = '../data/raw/buoydata/current'

# Create the directory if it does not exist
os.makedirs(output_dir, exist_ok=True)

# Clear all files in the directory before downloading new data.
# Sub-directories are only removed when empty (os.rmdir); a failure is reported and skipped.
for filename in os.listdir(output_dir):
    file_path = os.path.join(output_dir, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            os.rmdir(file_path)
    except OSError as e:
        print(f'Failed to delete {file_path}. Reason: {e}')

# Number of days to download data for
ndays = 2

# Base URL for the API
base_url = 'https://iabp.apl.uw.edu/download'

# Reuse one HTTP session for all buoys instead of reconnecting per request
with requests.Session() as session:
    # Iterate over each bid value
    for bid in bids:
        # Construct the URL for the current bid
        url = f'{base_url}?bid={bid}&ndays={ndays}'

        try:
            # Send a GET request to the URL
            response = session.get(url, timeout=request_timeout)
            response.raise_for_status()  # Check if the request was successful

            # Construct the filename and save path
            filename = f'{bid}.csv'
            file_path = os.path.join(output_dir, filename)

            # Save the CSV file
            with open(file_path, 'wb') as file:
                file.write(response.content)

            print(f'Downloaded {filename} to {file_path}')
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception itself so it always refers to
            # the request that failed, not a response left over from earlier
            status = e.response.status_code if e.response is not None else None
            if status == 500:
                print(f"Skipping {bid} due to HTTP 500 error")
            else:
                print(f"HTTP error occurred for {bid}: {e}")
        except (requests.exceptions.RequestException, OSError) as e:
            # Network problems (timeouts, connection resets) and file write
            # errors skip this buoy but let the remaining ones download
            print(f"An error occurred for {bid}: {e}")

## 3. Download NCEP surface wind (u and v) products for interpolation with past buoy data
This section downloads the 2024 NCEP reanalysis surface winds (u and v components) and saves them as netCDF files. These data will be interpolated with the past buoy data to produce training data for machine learning algorithms.

In [None]:
#Download the NCEP surface winds (u and v) reanalysis data for the year 2024
#The data is stored in NetCDF format on an FTP server. We will download the files and save them in the data/raw/reanalyses/ncep folder.

import ftplib
import os

# FTP server details
ftp_server = 'ftp.cdc.noaa.gov'
ftp_path = '/Datasets/ncep/'

# Seconds to wait on any single socket operation before giving up
ftp_timeout = 60

# Surface wind components to fetch: u (uwnd) and v (vwnd) for 2024
filenames = ['uwnd.sfc.2024.nc', 'vwnd.sfc.2024.nc']

# Local directory to save the downloaded files
local_dir = '../data/raw/reanalyses/ncep'
os.makedirs(local_dir, exist_ok=True)

# Use a single connection for both files. The context manager sends QUIT and
# closes the socket on exit, including when a download raises, so no
# connection is left open.
with ftplib.FTP(ftp_server, timeout=ftp_timeout) as ftp:
    ftp.login()  # Anonymous login

    # Change to the specified directory
    ftp.cwd(ftp_path)

    for filename in filenames:
        local_filename = os.path.join(local_dir, filename)
        try:
            # Stream the remote file straight to disk in binary mode
            with open(local_filename, 'wb') as file:
                ftp.retrbinary(f'RETR {filename}', file.write)
        except ftplib.all_errors:
            # Do not leave a truncated NetCDF behind that later steps could
            # mistake for a complete download
            if os.path.exists(local_filename):
                os.remove(local_filename)
            raise

        print(f'Downloaded {filename} to {local_filename}')

## 4. Download the most recent GFS forecast to interpolate with the real-time buoy data
This section will access the NOAA FTP repository and download the most recent GFS forecast data.

In [None]:
# Download the most recent GFS forecast data from the NOAA FTP server
# The data is stored in netCDF format in the data/raw/forecasts/gfs directory
# Be aware that the file is large and may take some time to download and that all files in the gfs directory will be wiped before downloading the new data

import ftplib
from datetime import datetime
import os
from ftplib import FTP

from datetime import timedelta, timezone


def _parse_list_mtime(fields, now):
    """Return the modification time from a whitespace-split Unix-style LIST line.

    The three fields before the name are either ``Mon DD HH:MM`` (recent
    entries, year omitted) or ``Mon DD YYYY`` (older entries). For the first
    form the year is inferred: the current year is tried first, then the
    previous one, and the first result that is not in the future is used.
    This keeps ordering correct across a New Year boundary and lets Feb 29
    parse (a year-less strptime defaults to 1900, which is not a leap year).

    Raises ValueError if the fields cannot be interpreted as a timestamp.
    """
    month, day, year_or_clock = fields[-4:-1]
    if ':' not in year_or_clock:
        return datetime.strptime(f'{year_or_clock} {month} {day}', '%Y %b %d')
    for year in (now.year, now.year - 1):
        try:
            stamp = datetime.strptime(f'{year} {month} {day} {year_or_clock}', '%Y %b %d %H:%M')
        except ValueError:
            continue  # e.g. Feb 29 in a non-leap year
        # One day of slack absorbs clock/time-zone differences with the server
        if stamp <= now + timedelta(days=1):
            return stamp
    raise ValueError(f'cannot infer year for {month} {day} {year_or_clock}')


def _timestamped_entries(lines, keep, now):
    """Return (name, modification time) tuples for the LIST lines accepted by keep(line, name)."""
    entries = []
    for line in lines:
        fields = line.split()
        if len(fields) < 4:
            continue  # blank lines or summary lines such as "total 12"
        name = fields[-1]
        if not keep(line, name):
            continue
        try:
            entries.append((name, _parse_list_mtime(fields, now)))
        except ValueError as e:
            print(f'Skipping unparseable listing line {line!r}: {e}')
    return entries


def _most_recent(entries, description):
    """Return the entry with the latest modification time, or raise a clear error if there is none."""
    if not entries:
        raise RuntimeError(f'No {description} found on the FTP server')
    return max(entries, key=lambda entry: entry[1])


# Define the directory path to the gfs folder. This is relative to the notebook
# location, like every other data path in this notebook, and is the same folder
# the forecast is downloaded into below.
gfs_directory = '../data/raw/forecasts/gfs'
local_dir = gfs_directory

# Ensure the local directory exists
os.makedirs(local_dir, exist_ok=True)

# FTP server details
ftp_server = 'ftp.ncep.noaa.gov'
ftp_path = '/pub/data/nccf/com/gfs/prod/'

# Seconds to wait on any single socket operation before giving up
ftp_timeout = 60

# Reference time used to infer the year of listing timestamps
# NOTE(review): assumes the server lists times close to UTC - the one-day slack above covers offsets
now = datetime.now(timezone.utc).replace(tzinfo=None)

# The context manager sends QUIT and closes the socket on exit, even on error
with ftplib.FTP(ftp_server, timeout=ftp_timeout) as ftp:
    ftp.login()  # Anonymous login

    # Change to the specified directory
    ftp.cwd(ftp_path)

    # List directories and their modification times
    directories = []
    ftp.retrlines('LIST', directories.append)

    # Keep entries with "gfs" in their name and find the most recently edited one
    gfs_dirs = _timestamped_entries(directories, lambda line, name: 'gfs' in name, now)
    most_recent_dir = _most_recent(gfs_dirs, 'GFS run directories')

    # Enter the most recently edited directory
    ftp.cwd(most_recent_dir[0])
    print(f"Entered directory: {most_recent_dir[0]}")

    # List subdirectories and their modification times
    subdirectories = []
    ftp.retrlines('LIST', subdirectories.append)

    # Keep directory entries only (LIST lines for directories start with "d")
    sub_dirs = _timestamped_entries(subdirectories, lambda line, name: line.startswith('d'), now)
    most_recent_subdir = _most_recent(sub_dirs, 'forecast cycle subdirectories')

    # Enter the most recently edited subdirectory
    ftp.cwd(most_recent_subdir[0])
    print(f"Entered subdirectory: {most_recent_subdir[0]}")

    # Enter the "atmos" directory
    ftp.cwd('atmos')
    print("Entered directory: atmos")

    # List files in the directory
    files = []
    ftp.retrlines('LIST', files.append)

    # Keep .nc files with "atmf" in their name and find the most recently edited one
    nc_files = _timestamped_entries(
        files, lambda line, name: name.endswith('.nc') and 'atmf' in name, now
    )
    most_recent_nc_file = _most_recent(nc_files, '"atmf" .nc files')

    # Remove all files in the gfs directory before downloading new data.
    # This happens only once a replacement file has been located, so a
    # failed listing does not leave the folder empty.
    for filename in os.listdir(gfs_directory):
        file_path = os.path.join(gfs_directory, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                os.rmdir(file_path)
        except OSError as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

    # Download the most recently edited .nc file
    local_filename = os.path.join(local_dir, most_recent_nc_file[0])
    try:
        with open(local_filename, 'wb') as file:
            ftp.retrbinary(f'RETR {most_recent_nc_file[0]}', file.write)
    except ftplib.all_errors:
        # Do not leave a truncated forecast file behind
        if os.path.exists(local_filename):
            os.remove(local_filename)
        raise

    print(f'Downloaded {most_recent_nc_file[0]} to {local_filename}')

## 5. Download a raster dataset of the land in our area of interest

In [2]:
import os
import gdown

# Local destination: make sure the geospatial folder exists, then build the
# full path of the raster inside it
output_dir = '../data/raw/geospatial'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'arctic_land.tif')

# Google Drive location of the land raster
url = 'https://drive.google.com/uc?id=1u6FwmRsOH7FfY7jhb0XQJ6yurWwy3AOW'

# Fetch the file from Google Drive, showing gdown's progress output
gdown.download(url, output_file, quiet=False)

print(f'Downloaded file to {output_file}')

Downloading...
From: https://drive.google.com/uc?id=1u6FwmRsOH7FfY7jhb0XQJ6yurWwy3AOW
To: c:\Users\benco\Documents\GitHub\MLGEO2024_AObuoypredict\data\raw\geospatial\arctic_land.tif
100%|██████████| 5.15M/5.15M [00:00<00:00, 24.3MB/s]

Downloaded file to ../data/raw/geospatial\arctic_land.tif



