In [6]:
import os
import subprocess
import pandas as pd
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests
from tqdm import tqdm

# Create a session with retry logic
def create_retry_session(retries=5, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Build a ``requests`` session that retries failed requests automatically.

    Args:
        retries: Retry budget; the same number is used for the total, read
            and connect counters of the retry policy.
        backoff_factor: Backoff factor handed to the retry policy.
        status_forcelist: HTTP status codes that should trigger a retry.

    Returns:
        A ``requests.Session`` whose ``http://`` and ``https://`` prefixes are
        both served by one retry-enabled ``HTTPAdapter``.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    # The same adapter instance is shared by both URL schemes.
    for scheme in ('http://', 'https://'):
        http.mount(scheme, retrying_adapter)
    return http

# Download with aria2 (parallel download)
def download_with_aria2(url, folder):
    """Download ``url`` into ``folder`` using the ``aria2c`` command-line tool.

    The target file name is the basename of the URL path. aria2c is invoked
    with ``-x 16 -s 16`` (the progress output reports 16 connections) and
    ``-c`` so an interrupted download can be continued.

    Args:
        url: Address of the file to fetch.
        folder: Directory handed to aria2c via ``-d``.

    Returns:
        The file name when aria2c exits with status 0. ``None`` when aria2c
        exits with a non-zero status or cannot be started at all, so that the
        caller can fall back to another download method.
    """
    file_name = os.path.basename(urlparse(url).path)
    command = ['aria2c', '-x', '16', '-s', '16', '-c', '-o', file_name, '-d', folder, url]
    try:
        result = subprocess.run(command)
    except OSError as exc:
        # aria2c is not installed or not executable. Report failure through
        # the normal return value instead of raising, otherwise the caller's
        # fallback downloader would never get a chance to run.
        print(f"aria2c could not be started: {exc}")
        return None
    return file_name if result.returncode == 0 else None

# Download with requests (fallback)
def download_file(url, folder, chunk_size=1024, timeout=(10, 60)):
    """Stream ``url`` into ``folder`` with ``requests`` (fallback downloader).

    The target file name is the basename of the URL path. Progress is shown
    with a ``tqdm`` bar sized from the ``content-length`` header.

    Args:
        url: Address of the file to fetch.
        folder: Existing directory in which the file is written.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``session.get`` so a stalled server cannot hang the run forever.

    Returns:
        The name of the file written inside ``folder``.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status. The
            check happens before the target file is opened, so an error page
            is never saved as if it were the download.
        requests.RequestException: Any other connection or timeout failure.
    """
    file_name = os.path.basename(urlparse(url).path)
    file_path = os.path.join(folder, file_name)

    # Context managers release the connection pool and the streamed response
    # even when the transfer fails part-way through.
    with create_retry_session() as session:
        with session.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            total_size = int(response.headers.get('content-length', 0))

            with open(file_path, 'wb') as file, tqdm(
                desc=file_name,
                # An absent content-length means "unknown", not "zero bytes".
                total=total_size or None,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
                miniters=1
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty keep-alive chunks
                        file.write(chunk)
                        bar.update(len(chunk))
    return file_name

# Function to handle the download process from the CSV
def process_downloads(csv_file, download_folder):
    """Download every not-yet-completed entry listed in ``csv_file``.

    The CSV is read for its 'Download Link' and 'Package' columns. A 'Status'
    column is added (initialised to 'Incomplete') when absent, and rows whose
    status is already 'Completed' are skipped. Each remaining row is first
    tried with ``download_with_aria2`` and, if that yields nothing, with
    ``download_file``. The row's 'Status' becomes 'Completed', 'Incomplete'
    or 'Error', and 'Downloaded File Name' is filled in on success.

    Args:
        csv_file: Path of the CSV to read; it is rewritten in place after
            every attempted download so progress survives an interruption.
        download_folder: Destination directory, created if necessary.
    """
    table = pd.read_csv(csv_file)
    if 'Status' not in table.columns:
        table['Status'] = 'Incomplete'

    os.makedirs(download_folder, exist_ok=True)

    for idx, record in table.iterrows():
        # Rows finished in an earlier run are left untouched.
        if table.loc[idx, 'Status'] == 'Completed':
            continue

        url = record['Download Link']
        pkg_name = record['Package']
        print(f"Downloading {pkg_name}...")

        try:
            # aria2 first; the requests downloader only runs when aria2
            # returned nothing.
            saved_name = (
                download_with_aria2(url, download_folder)
                or download_file(url, download_folder)
            )

            if not saved_name:
                table.loc[idx, 'Status'] = 'Incomplete'
                print(f"Download failed for {pkg_name}.")
            else:
                table.loc[idx, 'Status'] = 'Completed'
                table.loc[idx, 'Downloaded File Name'] = saved_name
                print(f"{saved_name} downloaded successfully.")

        except Exception as err:
            print(f"Error downloading {pkg_name}: {err}")
            table.loc[idx, 'Status'] = 'Error'

        # Persist the bookkeeping after each attempted row.
        table.to_csv(csv_file, index=False)

    print("Download process completed. All updates saved.")


In [8]:

# Example run: fetch every package listed in the CSV into the ZIP folder.
csv_file = 'DTM_download_V2.csv'
download_folder = '/Users/shuyang/Data/DTM_ZIP'
process_downloads(csv_file=csv_file, download_folder=download_folder)


Downloading Sudbury-DTM-09...

10/07 17:57:23 [[1;32mNOTICE[0m] Downloading 1 item(s)

10/07 17:57:23 [[1;32mNOTICE[0m] Allocating disk space. Use --file-allocation=none to disable it. See --file-allocation option in man page for more details.
[#58e9d7 4.0MiB/3.1GiB(0%) CN:16 DL:17MiB ETA:3m8s]
[#58e9d7 23MiB/3.1GiB(0%) CN:16 DL:19MiB ETA:2m45s]
[#58e9d7 39MiB/3.1GiB(1%) CN:16 DL:17MiB ETA:2m59s]
[#58e9d7 58MiB/3.1GiB(1%) CN:16 DL:18MiB ETA:2m52s]
[#58e9d7 73MiB/3.1GiB(2%) CN:16 DL:17MiB ETA:2m59s]
[#58e9d7 93MiB/3.1GiB(2%) CN:16 DL:18MiB ETA:2m52s]
[#58e9d7 109MiB/3.1GiB(3%) CN:16 DL:17MiB ETA:2m54s]
[#58e9d7 126MiB/3.1GiB(3%) CN:16 DL:17MiB ETA:2m54s]
[#58e9d7 144MiB/3.1GiB(4%) CN:16 DL:17MiB ETA:2m53s]
[#58e9d7 165MiB/3.1GiB(5%) CN:16 DL:18MiB ETA:2m49s]
[#58e9d7 182MiB/3.1GiB(5%) CN:16 DL:17MiB ETA:2m49s]
[#58e9d7 203MiB/3.1GiB(6%) CN:16 DL:18MiB ETA:2m44s]
[#58e9d7 225MiB/3.1GiB(7%) CN:16 DL:18MiB ETA:2m39s]
[#58e9d7 245MiB/3.1GiB(7%) CN:16 DL:19MiB ETA:2m34s]
[#58e9d7 264MiB/

In [2]:
import os
import pandas as pd

# Function to verify downloaded files
def verify_downloads(csv_file, download_folder):
    """Report rows marked 'Completed' whose file is absent from ``download_folder``.

    Only rows with status 'Completed' are inspected. Rows in any other state
    usually have no file name recorded (NaN after ``read_csv``), and building
    a path from such a value would raise ``TypeError``.

    Args:
        csv_file: Path of the CSV holding the 'Status' and
            'Downloaded File Name' columns.
        download_folder: Directory expected to contain the downloaded files.

    Raises:
        ValueError: The CSV lacks one of the two required columns.
    """
    # Read the CSV file
    df = pd.read_csv(csv_file)

    # Check if the necessary columns exist
    if 'Downloaded File Name' not in df.columns or 'Status' not in df.columns:
        raise ValueError("CSV file must contain 'Downloaded File Name' and 'Status' columns.")

    missing_files = []

    # Restrict the check to rows that claim to be finished.
    completed = df[df['Status'] == 'Completed']
    for index, file_name in completed['Downloaded File Name'].items():
        if pd.isna(file_name):
            # Marked as done but no name was stored: nothing can be on disk.
            missing_files.append(f"<no file name recorded for row {index}>")
            continue

        file_name = str(file_name)
        if not os.path.isfile(os.path.join(download_folder, file_name)):
            missing_files.append(file_name)

    # Print the missing files
    if missing_files:
        print("The following files are marked as 'Completed' but are missing in the download folder:")
        for file in missing_files:
            print(file)
    else:
        print("All files marked as 'Completed' are present in the download folder.")



In [5]:
# Example run: check the 'Completed' entries of the CSV against the external drive folder.
csv_file = 'DTM_download_V2.csv'
download_folder = '/Volumes/Seagate Bac/DTM_ZIP'
verify_downloads(csv_file=csv_file, download_folder=download_folder)


The following files are marked as 'Completed' but are missing in the download folder:
LIDAR2016to18_DTM-Crne-C.zi
Sudbury-DTM-09.zip
Sudbury-DTM-10.zip
Sudbury-DTM-11.zip
Sudbury-DTM-12.zip
Sudbury-DTM-13.zip
Sudbury-DTM-14.zip
