In [1]:
pip install requests tqdm pandas


Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pytz-2024.2-py2.py3-none-any.whl (508 kB)
Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2024.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests
from tqdm import tqdm

# Build a requests session whose HTTP and HTTPS adapters retry failed requests
def create_retry_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Return a ``requests.Session`` with a retrying adapter mounted.

    Args:
        retries: Limit used for the total, read and connect retry counters.
        backoff_factor: Backoff factor handed to ``Retry``.
        status_forcelist: HTTP status codes handed to ``Retry`` as the
            statuses that force a retry.

    Returns:
        The configured session; the same adapter instance serves both the
        ``http://`` and ``https://`` prefixes.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    for prefix in ('http://', 'https://'):
        http_session.mount(prefix, retrying_adapter)
    return http_session

# Function to download a file with aria2 (parallel download)
def download_with_aria2(url, folder):
    """Download ``url`` into ``folder`` by running the ``aria2c`` program.

    Args:
        url: Address of the file to fetch. The output file name is the
            basename of the URL's path component (query string excluded).
        folder: Directory passed to aria2c as the download directory.

    Returns:
        The output file name when aria2c exits with status 0, otherwise
        ``None``. ``None`` is also returned when aria2c cannot be launched
        at all (for example, it is not installed), so that a caller can fall
        back to another download method instead of getting an exception.
    """
    file_name = os.path.basename(urlparse(url).path)
    # -x / -s: up to 16 connections per server / 16 splits per file,
    # -c: continue a partially downloaded file, -o / -d: output name / directory.
    command = ['aria2c', '-x', '16', '-s', '16', '-c', '-o', file_name, '-d', folder, url]
    try:
        result = subprocess.run(command)
    except OSError as exc:
        # Raised when the executable is missing or not runnable; report it as
        # an ordinary failure rather than propagating.
        print(f"Could not run aria2c: {exc}")
        return None
    if result.returncode == 0:
        return file_name
    return None

# Function to download a file using requests (with retry logic)
def download_file(url, folder, chunk_size=1024, timeout=(10, 60)):
    """Stream ``url`` to a file in ``folder`` while showing a progress bar.

    Args:
        url: Address of the file to fetch. The output file name is the
            basename of the URL's path component.
        folder: Existing directory in which the file is written.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: ``(connect, read)`` timeout in seconds passed to requests,
            so a stalled server raises instead of blocking forever.

    Returns:
        The name (not the full path) of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            The check happens before the output file is opened, so an error
            page is never saved under the download's name.
        requests.RequestException: For connection problems or timeouts.
    """
    file_name = os.path.basename(urlparse(url).path)
    file_path = os.path.join(folder, file_name)

    # Context managers release the connection and the session's pool even
    # when the transfer fails part-way through.
    with create_retry_session() as session:
        with session.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            # Download with a progress bar
            with open(file_path, 'wb') as file, tqdm(
                desc=file_name,
                # No Content-Length header: let tqdm show an open-ended counter.
                total=total_size or None,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
                miniters=1
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        file.write(chunk)
                        bar.update(len(chunk))
    return file_name

# Read the CSV file
df = pd.read_csv('DTM_download.csv')

# Add a new 'Status' column to track the download progress if it doesn't already exist
if 'Status' not in df.columns:
    df['Status'] = 'Incomplete'

# Progress is saved to 'DTM_download_updated.csv' rather than to the input
# file, so carry over every link already recorded there as 'Completed'.
# Without this a re-run forgets earlier downloads. Delete that file to start
# over from the input CSV alone.
progress_csv = 'DTM_download_updated.csv'
if os.path.exists(progress_csv):
    previous = pd.read_csv(progress_csv)
    if {'Download Link', 'Status'}.issubset(previous.columns):
        completed = (
            previous[previous['Status'] == 'Completed']
            .dropna(subset=['Download Link'])
            .drop_duplicates('Download Link')
            .set_index('Download Link')
        )
        already_done = df['Download Link'].isin(completed.index)
        if already_done.any():
            df.loc[already_done, 'Status'] = 'Completed'
            if 'Downloaded File Name' in completed.columns:
                # Keep the recorded file name next to the restored status.
                df.loc[already_done, 'Downloaded File Name'] = (
                    df.loc[already_done, 'Download Link'].map(completed['Downloaded File Name'])
                )

# Set the download folder; the DTM_DOWNLOAD_DIR environment variable overrides
# the default so the notebook also runs on other machines.
download_folder = os.environ.get('DTM_DOWNLOAD_DIR', '/Users/shuyang/Data/DTM_ZIP')

# Create the download folder if it doesn't exist (no error if it already does)
os.makedirs(download_folder, exist_ok=True)

# Loop through each row in the CSV and download the file
for index, row in df.iterrows():
    project = row['Project']
    package = row['Package']
    download_link = row['Download Link']

    # Size only appears in the progress message (taken to be GB), so a blank
    # or malformed cell must not abort the whole batch: show it as-is instead.
    try:
        size = float(row['Size'])
    except (TypeError, ValueError):
        size = row['Size']

    # Check if the file is already marked as 'Completed'
    if row['Status'] == 'Completed':
        print(f"{download_link} is already downloaded.")
        continue

    try:
        # Download the file using aria2 (for robust parallel download) or fallback to requests
        print(f"Downloading {package} from {project} ({size} GB)...")
        try:
            file_name = download_with_aria2(download_link, download_folder)
        except OSError as aria2_error:
            # aria2c could not be started (e.g. not installed); still give
            # the requests fallback a chance instead of recording an error.
            print(f"Could not run aria2c: {aria2_error}")
            file_name = None

        # If aria2 fails, try with requests
        if not file_name:
            print(f"Aria2 download failed for {package}, retrying with requests.")
            file_name = download_file(download_link, download_folder)

        # Mark as completed if the file was successfully downloaded
        if file_name:
            df.loc[index, 'Status'] = 'Completed'
            df.loc[index, 'Downloaded File Name'] = file_name
            print(f"{file_name} downloaded successfully.")
        else:
            df.loc[index, 'Status'] = 'Incomplete'
            print(f"Download of {package} failed or incomplete.")

    except Exception as e:
        # Best effort: record the failure and move on to the next package.
        print(f"Error downloading {download_link}: {e}")
        df.loc[index, 'Status'] = 'Error'

    # Update the CSV after each file so an interrupted run keeps its progress
    df.to_csv('DTM_download_updated.csv', index=False)
    print(f"CSV updated after processing {package}.")

# Write once more so the file exists even when every row was skipped, then
# summarise the outcome, since rows may have ended as 'Error' or 'Incomplete'.
df.to_csv('DTM_download_updated.csv', index=False)
print(df['Status'].value_counts().to_string())
print("Download process completed. All updates saved in 'DTM_download_updated.csv'.")


https://ws.gisetl.lrc.gov.on.ca/fmedatadownload/Packages/LIDAR2016to18_DTM-Crne-A.zip is already downloaded.
https://ws.gisetl.lrc.gov.on.ca/fmedatadownload/Packages/LIDAR2016to18_DTM-Crne-B.zip is already downloaded.
https://ws.gisetl.lrc.gov.on.ca/fmedatadownload/Packages/LIDAR2016to18_DTM-Crne-C.zip is already downloaded.
Downloading Cochrane D from OMAFRA Lidar 2016-18 (3.07 GB)...

09/23 16:08:06 [[1;32mNOTICE[0m] Downloading 1 item(s)

09/23 16:08:06 [[1;32mNOTICE[0m] Allocating disk space. Use --file-allocation=none to disable it. See --file-allocation option in man page for more details.
[#82ccf6 7.1MiB/2.9GiB(0%) CN:16 DL:10MiB ETA:4m59s]
[#82ccf6 17MiB/2.9GiB(0%) CN:16 DL:9.9MiB ETA:4m58s]
[#82ccf6 32MiB/2.9GiB(1%) CN:16 DL:11MiB ETA:4m8s]
[#82ccf6 47MiB/2.9GiB(1%) CN:16 DL:12MiB ETA:3m51s]
[#82ccf6 61MiB/2.9GiB(2%) CN:16 DL:13MiB ETA:3m43s]
[#82ccf6 79MiB/2.9GiB(2%) CN:16 DL:13MiB ETA:3m29s]
[#82ccf6 89MiB/2.9GiB(2%) CN:16 DL:13MiB ETA:3m36s]
[#82ccf6 104MiB/2.9GiB(3%) C