In [2]:
import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests
from tqdm import tqdm

# Create a session with retry logic
def create_retry_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Build a ``requests.Session`` whose HTTP and HTTPS adapters retry requests.

    Args:
        retries: Used for the ``total``, ``read`` and ``connect`` limits of ``Retry``.
        backoff_factor: Forwarded to ``Retry`` as ``backoff_factor``.
        status_forcelist: HTTP status codes forwarded to ``Retry`` as ``status_forcelist``.

    Returns:
        The configured session.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    # The same adapter instance serves both URL schemes.
    for scheme in ('http://', 'https://'):
        http.mount(scheme, retrying_adapter)
    return http

# Download with aria2 (parallel download)
def download_with_aria2(url, folder):
    """Download ``url`` into ``folder`` by running the ``aria2c`` command-line tool.

    The output file name is the last component of the URL path. aria2c is
    invoked with ``-x 16 -s 16 -c`` and writes into ``folder``.

    Args:
        url: Address of the file to download.
        folder: Directory passed to aria2c as the download directory (``-d``).

    Returns:
        The file name when aria2c exits with code 0. ``None`` when aria2c exits
        with a non-zero code or cannot be launched at all (e.g. not installed),
        so the caller can fall back to another download method.
    """
    file_name = os.path.basename(urlparse(url).path)
    command = ['aria2c', '-x', '16', '-s', '16', '-c', '-o', file_name, '-d', folder, url]
    try:
        result = subprocess.run(command)
    except OSError:
        # aria2c is missing or not executable: report failure instead of
        # raising, otherwise the caller's fallback path is never reached.
        return None
    return file_name if result.returncode == 0 else None

# Download with requests (fallback)
def download_file(url, folder, chunk_size=1024):
    """Stream ``url`` into a file inside ``folder`` using a retrying requests session.

    The output file name is the last component of the URL path. Progress is
    shown with a tqdm bar sized from the ``Content-Length`` header when present.

    Args:
        url: Address of the file to download.
        folder: Existing directory that receives the file.
        chunk_size: Chunk size passed to ``response.iter_content``.

    Returns:
        The name (not the full path) of the written file.

    Raises:
        requests.HTTPError: If the server responds with an error status, so an
            error page is never saved and reported as a successful download.
    """
    file_name = os.path.basename(urlparse(url).path)
    file_path = os.path.join(folder, file_name)

    # Context managers release the connection and session even if the transfer fails.
    with create_retry_session() as session, session.get(url, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        with open(file_path, 'wb') as file, tqdm(
            desc=file_name,
            # Unknown size: let tqdm show a plain counter instead of a 0-byte total.
            total=total_size or None,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
            miniters=1
        ) as bar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    file.write(chunk)
                    bar.update(len(chunk))
    return file_name

# Function to handle the download process from the CSV
def process_downloads(csv_file, download_folder):
    """Download every not-yet-completed entry listed in ``csv_file``.

    The CSV must provide 'Package' and 'Download Link' columns. A 'Status'
    column is added (initialised to 'Incomplete') when missing. Rows whose
    status is 'Completed' are skipped. After each attempted download the row's
    status is set to 'Completed', 'Incomplete' or 'Error' and the whole table
    is written back to ``csv_file``, so progress survives an interrupted run.

    Args:
        csv_file: Path of the CSV that is read and rewritten in place.
        download_folder: Directory for downloaded files; created if missing.
    """
    # Read the CSV and add 'Status' if it doesn't exist
    df = pd.read_csv(csv_file)
    if 'Status' not in df.columns:
        df['Status'] = 'Incomplete'

    os.makedirs(download_folder, exist_ok=True)

    # Loop through the CSV and download files
    for index, row in df.iterrows():
        # Read the package name first: the skip message below needs it.
        package = row['Package']

        if df.loc[index, 'Status'] == 'Completed':
            print (f"Skip {package}. Completed")
            continue

        download_link = row['Download Link']
        print(f"Downloading {package}...")

        try:
            # Download via aria2 or fallback to requests
            file_name = download_with_aria2(download_link, download_folder)
            if not file_name:
                file_name = download_file(download_link, download_folder)

            if file_name:
                df.loc[index, 'Status'] = 'Completed'
                df.loc[index, 'Downloaded File Name'] = file_name
                print(f"{file_name} downloaded successfully.")
            else:
                df.loc[index, 'Status'] = 'Incomplete'
                print(f"Download failed for {package}.")

        except Exception as e:
            # Best effort: record the failure and move on to the next row.
            print(f"Error downloading {package}: {e}")
            df.loc[index, 'Status'] = 'Error'

        # Update the original CSV after each download
        df.to_csv(csv_file, index=False)

    print("Download process completed. All updates saved.")

# Usage example: run the CSV-driven download loop.
# csv_file is read by process_downloads and rewritten in place after each
# attempted download; download_folder is created if it does not exist.
# NOTE(review): download_folder is an absolute, machine-specific path; adjust
# it before running on another machine.
csv_file = 'DTM_download_V2.csv'
download_folder = '/Users/shuyang/Data/DTM_ZIP'
process_downloads(csv_file, download_folder)


Downloading Lake Erie J...

09/23 19:56:21 [[1;32mNOTICE[0m] Downloading 1 item(s)
[#9afca7 890MiB/2.8GiB(30%) CN:16 DL:18MiB ETA:1m50s]
[#9afca7 910MiB/2.8GiB(31%) CN:16 DL:18MiB ETA:1m44s]
[#9afca7 0.9GiB/2.8GiB(32%) CN:16 DL:19MiB ETA:1m38s]
[#9afca7 0.9GiB/2.8GiB(32%) CN:16 DL:18MiB ETA:1m47s]
[#9afca7 0.9GiB/2.8GiB(32%) CN:16 DL:15MiB ETA:2m8s]
[#9afca7 0.9GiB/2.8GiB(32%) CN:16 DL:12MiB ETA:2m29s]
[#9afca7 0.9GiB/2.8GiB(33%) CN:16 DL:13MiB ETA:2m26s]
[#9afca7 0.9GiB/2.8GiB(33%) CN:16 DL:13MiB ETA:2m24s]
[#9afca7 0.9GiB/2.8GiB(34%) CN:16 DL:13MiB ETA:2m22s]
[#9afca7 0.9GiB/2.8GiB(34%) CN:16 DL:13MiB ETA:2m17s]
[#9afca7 1.0GiB/2.8GiB(35%) CN:16 DL:14MiB ETA:2m5s]
[#9afca7 1.0GiB/2.8GiB(36%) CN:16 DL:15MiB ETA:1m54s]
[#9afca7 1.0GiB/2.8GiB(37%) CN:16 DL:16MiB ETA:1m50s]
[#9afca7 1.0GiB/2.8GiB(38%) CN:16 DL:18MiB ETA:1m36s]
[#9afca7 1.1GiB/2.8GiB(39%) CN:16 DL:21MiB ETA:1m21s]
[#9afca7 1.1GiB/2.8GiB(40%) CN:16 DL:23MiB ETA:1m11s]
[#9afca7 1.1GiB/2.8GiB(41%) CN:16 DL:25MiB ETA:1m5s]


KeyboardInterrupt: 

In [None]:
import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests
from tqdm import tqdm

# Create a session with retry logic
def create_retry_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Return a ``requests.Session`` that retries requests over HTTP and HTTPS.

    Args:
        retries: Limit applied to total, read and connect retries alike.
        backoff_factor: ``backoff_factor`` handed to ``Retry``.
        status_forcelist: HTTP status codes handed to ``Retry`` as ``status_forcelist``.

    Returns:
        A session with a retrying ``HTTPAdapter`` mounted on both schemes.
    """
    retry_options = {
        'total': retries,
        'read': retries,
        'connect': retries,
        'backoff_factor': backoff_factor,
        'status_forcelist': status_forcelist,
    }
    shared_adapter = HTTPAdapter(max_retries=Retry(**retry_options))

    client = requests.Session()
    for prefix in ('http://', 'https://'):
        client.mount(prefix, shared_adapter)
    return client

# Function to download a file with aria2 (parallel download)
def download_with_aria2(url, folder):
    """Download ``url`` into ``folder`` by running the ``aria2c`` command-line tool.

    The output file name is the last component of the URL path. aria2c is
    invoked with ``-x 16 -s 16 -c`` and writes into ``folder``.

    Args:
        url: Address of the file to download.
        folder: Directory passed to aria2c as the download directory (``-d``).

    Returns:
        The file name when aria2c exits with code 0. ``None`` when aria2c exits
        with a non-zero code or cannot be launched at all (e.g. not installed),
        so the caller can retry with another download method.
    """
    file_name = os.path.basename(urlparse(url).path)
    # Use aria2c for parallel download with retries
    command = ['aria2c', '-x', '16', '-s', '16', '-c', '-o', file_name, '-d', folder, url]
    try:
        result = subprocess.run(command)
    except OSError:
        # aria2c is missing or not executable: signal failure rather than
        # raising, otherwise the requests fallback in the caller never runs.
        return None
    if result.returncode == 0:
        return file_name
    return None

# Function to download a file using requests (with retry logic)
def download_file(url, folder, chunk_size=1024):
    """Stream ``url`` into a file inside ``folder`` using a retrying requests session.

    The output file name is the last component of the URL path. Progress is
    shown with a tqdm bar sized from the ``Content-Length`` header when present.

    Args:
        url: Address of the file to download.
        folder: Existing directory that receives the file.
        chunk_size: Chunk size passed to ``response.iter_content``.

    Returns:
        The name (not the full path) of the written file.

    Raises:
        requests.HTTPError: If the server responds with an error status, so an
            error page is never saved and reported as a successful download.
    """
    file_name = os.path.basename(urlparse(url).path)
    file_path = os.path.join(folder, file_name)

    # Context managers release the connection and session even if the transfer fails.
    with create_retry_session() as session, session.get(url, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))

        # Download with a progress bar
        with open(file_path, 'wb') as file, tqdm(
            desc=file_name,
            # Unknown size: let tqdm show a plain counter instead of a 0-byte total.
            total=total_size or None,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
            miniters=1
        ) as bar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    file.write(chunk)
                    bar.update(len(chunk))
    return file_name

# Read the CSV file that lists the packages to download
df = pd.read_csv('DTM_download.csv')

# Add a new 'Status' column to track the download progress if it doesn't already exist
if 'Status' not in df.columns:
    df['Status'] = 'Incomplete'

# NOTE(review): progress is written to 'DTM_download_updated.csv' below, but the
# table is always read from 'DTM_download.csv'. Rows are therefore only skipped
# on a re-run if the input file itself already carries 'Completed' statuses.

# Set the download folder (customize as needed)
download_folder = '/Users/shuyang/Data/DTM_ZIP'

# Create the download folder if it doesn't exist
os.makedirs(download_folder, exist_ok=True)

# Loop through each row in the CSV and download the file
for index, row in df.iterrows():
    project = row['Project']
    package = row['Package']
    download_link = row['Download Link']

    # Check if the file is already marked as 'Completed'
    if df.loc[index, 'Status'] == 'Completed':
        print(f"{download_link} is already downloaded.")
        continue

    # The size is only used in the progress message, so a malformed value must
    # not abort the whole batch: fall back to the raw cell content.
    try:
        size = float(row['Size'])
    except (TypeError, ValueError):
        size = row['Size']

    try:
        # Download the file using aria2 (for robust parallel download) or fallback to requests
        print(f"Downloading {package} from {project} ({size} GB)...")
        file_name = download_with_aria2(download_link, download_folder)

        # If aria2 fails, try with requests
        if not file_name:
            print(f"Aria2 download failed for {package}, retrying with requests.")
            file_name = download_file(download_link, download_folder)

        # Mark as completed if the file was successfully downloaded
        if file_name:
            df.loc[index, 'Status'] = 'Completed'
            df.loc[index, 'Downloaded File Name'] = file_name
            print(f"{file_name} downloaded successfully.")
        else:
            df.loc[index, 'Status'] = 'Incomplete'
            print(f"Download of {package} failed or incomplete.")

    except Exception as e:
        # Best effort: record the failure and continue with the next row.
        print(f"Error downloading {download_link}: {e}")
        df.loc[index, 'Status'] = 'Error'

    # Update the CSV after each file so progress survives an interrupted run
    df.to_csv('DTM_download_updated.csv', index=False)
    print(f"CSV updated after processing {package}.")

print("Download process completed. All updates saved in 'DTM_download_updated.csv'.")
