In [6]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Download the CTD data files for a range of HOT cruises.
#
# For each cruise directory on the server, this cell fetches the HTML directory
# listing, picks out the links that end in a known data-file extension, and
# streams each file into a per-cruise folder under `base_download_folder`.

# Define the base folder path where files will be downloaded
base_download_folder = r"F:\aloha\ALOHA Cabled Observatory Database\ACO\hot"

# Only links ending in one of these extensions are downloaded
DATA_EXTENSIONS = ('.cnv', '.txt', '.dat', '.zip', '.csv', '.ctd')

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the cell forever
REQUEST_TIMEOUT = 30

# Loop through the directories from hot-315 to hot-317 (range end is exclusive)
for i in range(315, 318):
    # Define the URL to scrape files from for each hot directory
    base_url = f"https://hahana.soest.hawaii.edu/FTP/hot/ctd/hot-{i}/"

    # Define the folder where files will be downloaded for each hot directory
    download_folder = os.path.join(base_download_folder, f"hot-{i}")

    # Create the folder if it doesn't exist
    if not os.path.exists(download_folder):
        os.makedirs(download_folder, exist_ok=True)
        print(f"Created folder: {download_folder}")
    else:
        print(f"Using existing folder: {download_folder}")

    # Fetch the page content; a network failure on one cruise must not abort
    # the remaining cruises
    try:
        response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(f"Failed to access {base_url}: {e}")
        continue
    if response.status_code != 200:
        print(f"Failed to access {base_url}")
        continue

    # Parse the HTML page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the page
    links = soup.find_all('a')

    # Check if any files are found on the page
    if not links:
        print(f"No files found on the page: {base_url}")
        continue
    else:
        print(f"Found {len(links)} links on the page for hot-{i}.")

    # Loop through all links and download the files
    for link in links:
        # `href` is None for anchors that carry no href attribute
        file_name = link.get('href')

        # Skip anything that is not a data file (sort links, parent directory,
        # anchors without an href, ...)
        if not file_name or not file_name.endswith(DATA_EXTENSIONS):
            print(f"Skipping non-data file: {file_name}")
            continue

        file_url = urljoin(base_url, file_name)

        # Keep only the final path component of the href so that an absolute
        # or nested href can never write outside `download_folder`
        local_name = os.path.basename(file_name.rsplit('/', 1)[-1])
        if not local_name:
            print(f"Skipping non-data file: {file_name}")
            continue

        # Construct the full local file path
        local_file_path = os.path.join(download_folder, local_name)
        # Stream into a temporary sibling first so an interrupted transfer
        # never leaves a truncated file under the final name
        partial_file_path = local_file_path + ".part"

        print(f"Downloading {file_url} to {local_file_path}...")

        try:
            # Download the file
            with requests.get(file_url, stream=True, timeout=REQUEST_TIMEOUT) as r:
                r.raise_for_status()  # Check if the request was successful
                # Check if content length is reasonable (non-empty file)
                content_length = r.headers.get('Content-Length')
                if content_length and int(content_length) == 0:
                    print(f"Warning: {file_name} is an empty file. Skipping download.")
                    continue

                # Save the file locally
                with open(partial_file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

            # Promote the finished download to its final name
            os.replace(partial_file_path, local_file_path)
            print(f"Downloaded: {local_file_path}")
        except Exception as e:
            print(f"Error downloading {file_url}: {e}")
            # Discard whatever was written before the failure
            if os.path.exists(partial_file_path):
                os.remove(partial_file_path)

print("Download process completed.")


Using existing folder: F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-315
Found 23 links on the page for hot-315.
Skipping non-data file: ?C=N;O=D
Skipping non-data file: ?C=M;O=A
Skipping non-data file: ?C=S;O=A
Skipping non-data file: ?C=D;O=A
Skipping non-data file: /FTP/hot/ctd/
Downloading https://hahana.soest.hawaii.edu/FTP/hot/ctd/hot-315/h315a0101.ctd to F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-315\h315a0101.ctd...
Downloaded: F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-315\h315a0101.ctd
Downloading https://hahana.soest.hawaii.edu/FTP/hot/ctd/hot-315/h315a0201.ctd to F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-315\h315a0201.ctd...
Downloaded: F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-315\h315a0201.ctd
Downloading https://hahana.soest.hawaii.edu/FTP/hot/ctd/hot-315/h315a0202.ctd to F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-315\h315a0202.ctd...
Downloaded: F:\aloha\ALOHA Cabled Observatory Database\ACO\hot\hot-