In [1]:
import os
import re
from urllib.parse import urlsplit

import requests
from tqdm import tqdm

# Endpoint every request is sent to; the real target is passed in its `url` query parameter.
CLOUDFLARE_WORKER_URL = "https://cdn.zhuo.li"
# Directory the downloaded file is written to (and where partial files are looked up for resuming).
DOWNLOAD_PATH = "/root"
# Bytes requested per iter_content() step. 1 MiB instead of 1 KiB: with multi-gigabyte
# files a 1 KiB chunk means hundreds of millions of Python-level write/progress calls.
CHUNK_SIZE = 1024 * 1024
# Upper bound on attempts before download() gives up.
MAX_RETRIES = 10

def _filename_from_response(content_disposition, target_url):
    """Choose a bare local file name from the Content-Disposition header or the URL."""
    candidate = ""
    if content_disposition:
        # Raw string on purpose: in a normal string literal "\2" is the control
        # character chr(2), not a backreference to the opening quote, so quoted
        # values (e.g. ones containing ';') were never matched as a whole.
        match = re.search(r"filename[^;=\n]*=\s*((['\"]).*?\2|[^;\n]*)", content_disposition)
        if match:
            candidate = match.group(1).strip()
            if match.group(2):
                # Properly quoted value: drop only the surrounding quotes.
                candidate = candidate[1:-1]
            else:
                candidate = candidate.replace('"', '').replace("'", "")

    # The header comes from the remote side; keep only the final path component
    # so a value like "../x" cannot escape the download directory.
    candidate = os.path.basename(candidate.replace("\\", "/"))
    if candidate in ("", ".", ".."):
        # Fall back to the URL path (without query string or fragment).
        candidate = os.path.basename(urlsplit(target_url).path)
    if candidate in ("", ".", ".."):
        candidate = "download"
    return candidate


def download(target_url, timeout=30):
    """Download ``target_url`` through CLOUDFLARE_WORKER_URL into DOWNLOAD_PATH, resuming if possible.

    A HEAD request supplies the expected size and file name. If a file of that
    name already exists, a ``Range`` request continues from its current size.
    The attempt is repeated up to MAX_RETRIES times on errors, unexpected
    status codes, or a size mismatch after the transfer.

    Args:
        target_url: URL of the file to fetch; it is passed verbatim as the
            ``url`` query parameter of the worker endpoint.
        timeout: Seconds passed to ``requests`` as connect/read timeout so a
            stalled connection raises (and is retried) instead of hanging.

    Returns:
        None. Progress and the final outcome are printed.
    """
    # NOTE(review): target_url is interpolated without URL-encoding, so a target
    # containing '&' or '#' may be split by the worker's query parsing. Left as is
    # because the worker's parsing rules are not visible here - confirm before changing.
    request_url = f"{CLOUDFLARE_WORKER_URL}?url={target_url}"

    for _ in range(MAX_RETRIES):
        try:
            # requests.head() does not follow redirects unless asked to.
            head = requests.head(request_url, allow_redirects=True, timeout=timeout)
            if head.ok:
                # 0 means "size unknown" from here on.
                file_size = int(head.headers.get("Content-Length", 0))
                content_disposition = head.headers.get("Content-Disposition")
            else:
                # Metadata unavailable; still try the GET, whose status is reported below.
                file_size = 0
                content_disposition = None

            fname = os.path.join(
                DOWNLOAD_PATH, _filename_from_response(content_disposition, target_url)
            )

            # Size of any partial file decides where the transfer resumes.
            downloaded_size = os.path.getsize(fname) if os.path.exists(fname) else 0

            # Only a known size can prove completeness; with an unknown size (0)
            # the comparison would always be true and nothing would be fetched.
            if file_size > 0 and downloaded_size >= file_size:
                print(f"{fname} has been fully downloaded.")
                return

            headers = {"Range": f"bytes={downloaded_size}-"}
            with requests.get(request_url, headers=headers, stream=True, timeout=timeout) as response:
                status = response.status_code

                # Size unknown and the server says the requested start is past
                # the end of the resource: the local file already holds everything.
                if status == 416 and file_size == 0 and downloaded_size > 0:
                    print(f"{fname} has been fully downloaded.")
                    return

                if status not in (200, 206):
                    print(f"Failed to download the file. Status code: {status}")
                    continue

                if status == 200:
                    # The Range header was ignored and the body is the whole file:
                    # overwrite instead of appending to the partial data.
                    downloaded_size = 0
                    mode = "wb"
                else:
                    mode = "ab"

                with open(fname, mode) as f:
                    with tqdm(total=file_size or None, initial=downloaded_size,
                              unit="B", unit_scale=True) as progress:
                        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                            f.write(chunk)
                            progress.update(len(chunk))

            # Integrity check by size, possible only when the expected size is known.
            if file_size > 0 and os.path.getsize(fname) != file_size:
                print("Downloaded file size does not match the expected size. Retrying.")
                continue

            print(f"Download completed: {fname}")
            return
        except Exception as e:
            print(f"An error occurred: {str(e)}. Retrying.")

    print(f"Failed to download the file after {MAX_RETRIES} attempts.")

In [2]:
# Fetch the gen4.tar archive via CLOUDFLARE_WORKER_URL; if a partial file already
# exists in DOWNLOAD_PATH, re-running this cell resumes from its current size.
download("https://download.ifi.uzh.ch/rpg/RVT/datasets/preprocessed/gen4.tar")

  0%|          | 69.5M/190G [00:07<9:57:05, 5.31MB/s] 