In [2]:
import argparse
import os
from pathlib import Path
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple
from numpy import indices
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm

# Directory URL that each file name is appended to (challenge-2019, training set A).
BASE_URL = "https://physionet.org/files/challenge-2019/1.0.0/training/training_setA"
# File-name pattern: "p" + index zero-padded to six digits, e.g. idx=1 -> "p000001.psv".
FNAME_TMPL = "p{idx:06d}.psv"

def create_session(timeout: int = 30, max_retries: int = 5, backoff_factor: float = 1.0, user_agent: str = None) -> requests.Session:
     """Create a requests.Session whose HTTP and HTTPS adapters retry failed requests.

     Args:
          timeout: Stored on the returned session as ``request_timeout``. It is
               only recorded as an attribute here; download_one passes its own
               ``timeout`` argument on every request.
          max_retries: Passed to Retry as ``total``.
          backoff_factor: Passed to Retry as ``backoff_factor``.
          user_agent: Value for the User-Agent header; a falsy value selects
               "physionet-bulk-downloader/1.0".

     Returns:
          The configured session.
     """
     retry_policy = Retry(
          total=max_retries,
          backoff_factor=backoff_factor,
          status_forcelist=[429, 500, 502, 503, 504],
          allowed_methods=frozenset(["GET", "HEAD"]),
     )
     retrying_adapter = HTTPAdapter(max_retries=retry_policy)

     session = requests.Session()
     # One adapter instance serves both schemes, mounted in the same order as before.
     for scheme in ("https://", "http://"):
          session.mount(scheme, retrying_adapter)
     session.headers.update({"User-Agent": user_agent or "physionet-bulk-downloader/1.0"})
     session.request_timeout = timeout
     return session

def download_one(session: requests.Session, idx: int, out_dir: Path, timeout: int, skip_if_exists: bool, throttle: float) -> Tuple[int, str]:
     """Download the file for one index into out_dir and report the outcome.

     The response body is streamed into a sibling ``.part`` file and moved onto
     the final name with os.replace only after it has been written completely,
     so the final path never holds a partially written file.

     Args:
          session: Session used to issue the GET request.
          idx: Index substituted into FNAME_TMPL to build the file name.
          out_dir: Directory the file is written to.
          timeout: Passed as ``timeout`` to session.get.
          skip_if_exists: When true, return "skipped" if the target file exists.
          throttle: Seconds to sleep after a successful download; values that
               are zero, negative or falsy disable the pause.

     Returns:
          ``(idx, status)`` where status is one of "skipped", "ok", "not_found",
          "status_<code>" or "error:<message>".
     """
     fname = FNAME_TMPL.format(idx=idx)
     url = f"{BASE_URL}/{fname}"
     out_path = out_dir / fname

     if skip_if_exists and out_path.exists():
          return idx, "skipped"

     tmp_path = out_path.with_suffix(".part")
     try:
          with session.get(url, stream=True, timeout=timeout) as r:
               if r.status_code == 404:
                    return idx, "not_found"
               if r.status_code != 200:
                    return idx, f"status_{r.status_code}"
               with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=16_384):
                         if chunk:
                              f.write(chunk)
          # The response is closed at this point; publish the finished file.
          os.replace(tmp_path, out_path)
     except Exception as e:
          # Best-effort removal of the partial file; a missing file is fine,
          # any other failure is logged instead of being silently dropped.
          try:
               tmp_path.unlink()
          except FileNotFoundError:
               pass
          except OSError as cleanup_err:
               logging.warning("Could not remove partial file %s: %s", tmp_path, cleanup_err)
          return idx, f"error:{e}"

     # Sleep outside the try/with: the connection is already released, and a
     # negative throttle can no longer turn a finished download into "error:".
     if throttle and throttle > 0:
          time.sleep(throttle)
     return idx, "ok"

def main(argv=None):
     """Parse command-line options and bulk-download the indexed .psv files.

     With --make-url-list only the URL list file is written. Otherwise every
     index in [--start, --end] is fetched through a thread pool, results that
     are neither "ok" nor "skipped" are logged, and a summary is printed.

     Args:
          argv: Optional list of argument strings (without the program name).
               When None, argparse falls back to sys.argv[1:], so existing
               ``main()`` calls behave as before.

     Raises:
          SystemExit: On invalid command-line arguments (raised by argparse).
     """
     parser = argparse.ArgumentParser(description="Bulk download PhysioNet .psv files (p000001.psv ...)")
     parser.add_argument("--start", type=int, default=1, help="Start index (inclusive)")
     parser.add_argument("--end", type=int, default=20643, help="End index (inclusive)")
     parser.add_argument("--out-dir", type=str, default="./Data-Set-A", help="Output directory")
     parser.add_argument("--workers", type=int, default=8, help="Number of concurrent workers")
     parser.add_argument("--timeout", type=int, default=30, help="Request timeout (s)")
     parser.add_argument("--retries", type=int, default=5, help="Max retries per request")
     parser.add_argument("--backoff", type=float, default=1.0, help="Backoff factor for retries")
     parser.add_argument("--throttle", type=float, default=0.0, help="Sleep (s) after each successful download (per worker) to reduce server load")
     parser.add_argument("--skip-if-exists", action="store_true", help="Skip download if file already exists")
     parser.add_argument("--user-agent", type=str, default="physionet-bulk-downloader/1.0", help="User-Agent header")
     parser.add_argument("--make-url-list", action="store_true", help="Only generate a file urls.txt with all URLs (no downloading)")
     parser.add_argument("--url-list-path", type=str, default="urls.txt", help="Path to save url list if --make-url-list")
     parser.add_argument("--log", type=str, default="download.log", help="Log file")
     args = parser.parse_args(argv)

     # Reject this up front with a usage message; ThreadPoolExecutor would
     # otherwise fail later with a bare ValueError.
     if args.workers < 1:
          parser.error("--workers must be at least 1")

     logging.basicConfig(filename=args.log, level=logging.INFO, format="%(asctime)s %(levelname)s:%(message)s")
     root_logger = logging.getLogger("")
     # main() can be called more than once in the same interpreter; attach the
     # console handler only once so warnings are not printed multiple times.
     console_name = "physionet-console"
     if not any(h.get_name() == console_name for h in root_logger.handlers):
          console = logging.StreamHandler()
          console.set_name(console_name)
          console.setLevel(logging.WARNING)
          root_logger.addHandler(console)

     # Created before the URL-list branch because --url-list-path may point
     # inside the output directory.
     out_dir = Path(args.out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)

     if args.make_url_list:
          with open(args.url_list_path, "w", encoding="utf-8") as f:
               for i in range(args.start, args.end + 1):
                    fname = FNAME_TMPL.format(idx=i)
                    url = f"{BASE_URL}/{fname}"
                    f.write(url + "\n")
          print(f"Wrote URL list to {args.url_list_path}. Use `aria2c -i {args.url_list_path}` or `wget -i {args.url_list_path}` to download.")
          return

     indices = list(range(args.start, args.end + 1))
     results = {}
     errors = []

     # The executor exits first (waiting for all workers), then the session is
     # closed so its pooled connections are released.
     with create_session(timeout=args.timeout, max_retries=args.retries, backoff_factor=args.backoff, user_agent=args.user_agent) as session, ThreadPoolExecutor(max_workers=args.workers) as ex:
          futures = {ex.submit(download_one, session, i, out_dir, args.timeout, args.skip_if_exists, args.throttle): i for i in indices}
          with tqdm(total=len(indices), desc="Downloading", unit="file") as pbar:
               for fut in as_completed(futures):
                    idx = futures[fut]
                    try:
                         idx_ret, status = fut.result()
                    except Exception as e:
                         idx_ret, status = idx, f"exception:{e}"
                    results[idx_ret] = status
                    if status not in ("ok", "skipped"):
                         errors.append((idx_ret, status))
                         logging.info("Index %06d -> %s", idx_ret, status)
                    pbar.update(1)

     statuses = list(results.values())
     total = len(indices)
     ok = statuses.count("ok")
     skipped = statuses.count("skipped")
     notfound = statuses.count("not_found")
     other = total - ok - skipped - notfound

     print("\nSummary")
     print(f"  Total requested: {total}")
     print(f"  OK:              {ok}")
     print(f"  Skipped:         {skipped}")
     print(f"  404 not found:   {notfound}")
     print(f"  Other errors:    {other}")
     if errors:
          print("\nSome errors / non-200 responses (sample up to 50):")
          for e in errors[:50]:
               print(" ", e)

     print(f"\nLog file: {args.log}")

if __name__ == "__main__":
     # main() parses sys.argv, so replace it with the options for this run
     # before calling main().
     import sys
     sys.argv = [
          "download_physionet_bulk.py",
          "--start", "1",
          "--end", "20643",
          "--skip-if-exists",
     ]
     main()

Downloading: 100%|██████████| 20643/20643 [17:41<00:00, 19.45file/s]


Summary
  Total requested: 20643
  OK:              20336
  Skipped:         0
  404 not found:   307
  Other errors:    0

Some errors / non-200 responses (sample up to 50):
  (19313, 'not_found')
  (20001, 'not_found')
  (20002, 'not_found')
  (20003, 'not_found')
  (20004, 'not_found')
  (20005, 'not_found')
  (20006, 'not_found')
  (20010, 'not_found')
  (20016, 'not_found')
  (20011, 'not_found')
  (20008, 'not_found')
  (20013, 'not_found')
  (20009, 'not_found')
  (20012, 'not_found')
  (20014, 'not_found')
  (20015, 'not_found')
  (20017, 'not_found')
  (20018, 'not_found')
  (20019, 'not_found')
  (20022, 'not_found')
  (20021, 'not_found')
  (20020, 'not_found')
  (20023, 'not_found')
  (20024, 'not_found')
  (20025, 'not_found')
  (20026, 'not_found')
  (20027, 'not_found')
  (20029, 'not_found')
  (20030, 'not_found')
  (20032, 'not_found')
  (20031, 'not_found')
  (20028, 'not_found')
  (20033, 'not_found')
  (20034, 'not_found')
  (20035, 'not_found')
  (20036, 'not_found')




In [1]:
import argparse
import os
from pathlib import Path
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple
from numpy import indices
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm

# Directory URL that each file name is appended to (challenge-2019, training set B).
BASE_URL = "https://physionet.org/files/challenge-2019/1.0.0/training/training_setB"
# File-name pattern: "p1" + index zero-padded to five digits, e.g. idx=1 -> "p100001.psv".
FNAME_TMPL = "p1{idx:05d}.psv"

def create_session(timeout: int = 30, max_retries: int = 5, backoff_factor: float = 1.0, user_agent: str = None) -> requests.Session:
     """Create a requests.Session whose HTTP and HTTPS adapters retry failed requests.

     Args:
          timeout: Stored on the returned session as ``request_timeout``. It is
               only recorded as an attribute here; download_one passes its own
               ``timeout`` argument on every request.
          max_retries: Passed to Retry as ``total``.
          backoff_factor: Passed to Retry as ``backoff_factor``.
          user_agent: Value for the User-Agent header; a falsy value selects
               "physionet-bulk-downloader/1.0".

     Returns:
          The configured session.
     """
     retry_policy = Retry(
          total=max_retries,
          backoff_factor=backoff_factor,
          status_forcelist=[429, 500, 502, 503, 504],
          allowed_methods=frozenset(["GET", "HEAD"]),
     )
     retrying_adapter = HTTPAdapter(max_retries=retry_policy)

     session = requests.Session()
     # One adapter instance serves both schemes, mounted in the same order as before.
     for scheme in ("https://", "http://"):
          session.mount(scheme, retrying_adapter)
     session.headers.update({"User-Agent": user_agent or "physionet-bulk-downloader/1.0"})
     session.request_timeout = timeout
     return session

def download_one(session: requests.Session, idx: int, out_dir: Path, timeout: int, skip_if_exists: bool, throttle: float) -> Tuple[int, str]:
     """Download the file for one index into out_dir and report the outcome.

     The response body is streamed into a sibling ``.part`` file and moved onto
     the final name with os.replace only after it has been written completely,
     so the final path never holds a partially written file.

     Args:
          session: Session used to issue the GET request.
          idx: Index substituted into FNAME_TMPL to build the file name.
          out_dir: Directory the file is written to.
          timeout: Passed as ``timeout`` to session.get.
          skip_if_exists: When true, return "skipped" if the target file exists.
          throttle: Seconds to sleep after a successful download; values that
               are zero, negative or falsy disable the pause.

     Returns:
          ``(idx, status)`` where status is one of "skipped", "ok", "not_found",
          "status_<code>" or "error:<message>".
     """
     fname = FNAME_TMPL.format(idx=idx)
     url = f"{BASE_URL}/{fname}"
     out_path = out_dir / fname

     if skip_if_exists and out_path.exists():
          return idx, "skipped"

     tmp_path = out_path.with_suffix(".part")
     try:
          with session.get(url, stream=True, timeout=timeout) as r:
               if r.status_code == 404:
                    return idx, "not_found"
               if r.status_code != 200:
                    return idx, f"status_{r.status_code}"
               with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=16_384):
                         if chunk:
                              f.write(chunk)
          # The response is closed at this point; publish the finished file.
          os.replace(tmp_path, out_path)
     except Exception as e:
          # Best-effort removal of the partial file; a missing file is fine,
          # any other failure is logged instead of being silently dropped.
          try:
               tmp_path.unlink()
          except FileNotFoundError:
               pass
          except OSError as cleanup_err:
               logging.warning("Could not remove partial file %s: %s", tmp_path, cleanup_err)
          return idx, f"error:{e}"

     # Sleep outside the try/with: the connection is already released, and a
     # negative throttle can no longer turn a finished download into "error:".
     if throttle and throttle > 0:
          time.sleep(throttle)
     return idx, "ok"

def main(argv=None):
     """Parse command-line options and bulk-download the indexed .psv files.

     With --make-url-list only the URL list file is written. Otherwise every
     index in [--start, --end] is fetched through a thread pool, results that
     are neither "ok" nor "skipped" are logged, and a summary is printed.

     Args:
          argv: Optional list of argument strings (without the program name).
               When None, argparse falls back to sys.argv[1:], so existing
               ``main()`` calls behave as before.

     Raises:
          SystemExit: On invalid command-line arguments (raised by argparse).
     """
     # The example name matches this cell's FNAME_TMPL ("p1" + five digits).
     parser = argparse.ArgumentParser(description="Bulk download PhysioNet .psv files (p100001.psv ...)")
     parser.add_argument("--start", type=int, default=1, help="Start index (inclusive)")
     parser.add_argument("--end", type=int, default=20000, help="End index (inclusive)")
     parser.add_argument("--out-dir", type=str, default="./Data-Set-B", help="Output directory")
     parser.add_argument("--workers", type=int, default=8, help="Number of concurrent workers")
     parser.add_argument("--timeout", type=int, default=30, help="Request timeout (s)")
     parser.add_argument("--retries", type=int, default=5, help="Max retries per request")
     parser.add_argument("--backoff", type=float, default=1.0, help="Backoff factor for retries")
     parser.add_argument("--throttle", type=float, default=0.0, help="Sleep (s) after each successful download (per worker) to reduce server load")
     parser.add_argument("--skip-if-exists", action="store_true", help="Skip download if file already exists")
     parser.add_argument("--user-agent", type=str, default="physionet-bulk-downloader/1.0", help="User-Agent header")
     parser.add_argument("--make-url-list", action="store_true", help="Only generate a file urls.txt with all URLs (no downloading)")
     parser.add_argument("--url-list-path", type=str, default="urls.txt", help="Path to save url list if --make-url-list")
     parser.add_argument("--log", type=str, default="download.log", help="Log file")
     args = parser.parse_args(argv)

     # Reject this up front with a usage message; ThreadPoolExecutor would
     # otherwise fail later with a bare ValueError.
     if args.workers < 1:
          parser.error("--workers must be at least 1")

     logging.basicConfig(filename=args.log, level=logging.INFO, format="%(asctime)s %(levelname)s:%(message)s")
     root_logger = logging.getLogger("")
     # main() can be called more than once in the same interpreter; attach the
     # console handler only once so warnings are not printed multiple times.
     console_name = "physionet-console"
     if not any(h.get_name() == console_name for h in root_logger.handlers):
          console = logging.StreamHandler()
          console.set_name(console_name)
          console.setLevel(logging.WARNING)
          root_logger.addHandler(console)

     # Created before the URL-list branch because --url-list-path may point
     # inside the output directory.
     out_dir = Path(args.out_dir)
     out_dir.mkdir(parents=True, exist_ok=True)

     if args.make_url_list:
          with open(args.url_list_path, "w", encoding="utf-8") as f:
               for i in range(args.start, args.end + 1):
                    fname = FNAME_TMPL.format(idx=i)
                    url = f"{BASE_URL}/{fname}"
                    f.write(url + "\n")
          print(f"Wrote URL list to {args.url_list_path}. Use `aria2c -i {args.url_list_path}` or `wget -i {args.url_list_path}` to download.")
          return

     indices = list(range(args.start, args.end + 1))
     results = {}
     errors = []

     # The executor exits first (waiting for all workers), then the session is
     # closed so its pooled connections are released.
     with create_session(timeout=args.timeout, max_retries=args.retries, backoff_factor=args.backoff, user_agent=args.user_agent) as session, ThreadPoolExecutor(max_workers=args.workers) as ex:
          futures = {ex.submit(download_one, session, i, out_dir, args.timeout, args.skip_if_exists, args.throttle): i for i in indices}
          with tqdm(total=len(indices), desc="Downloading", unit="file") as pbar:
               for fut in as_completed(futures):
                    idx = futures[fut]
                    try:
                         idx_ret, status = fut.result()
                    except Exception as e:
                         idx_ret, status = idx, f"exception:{e}"
                    results[idx_ret] = status
                    if status not in ("ok", "skipped"):
                         errors.append((idx_ret, status))
                         logging.info("Index %06d -> %s", idx_ret, status)
                    pbar.update(1)

     statuses = list(results.values())
     total = len(indices)
     ok = statuses.count("ok")
     skipped = statuses.count("skipped")
     notfound = statuses.count("not_found")
     other = total - ok - skipped - notfound

     print("\nSummary")
     print(f"  Total requested: {total}")
     print(f"  OK:              {ok}")
     print(f"  Skipped:         {skipped}")
     print(f"  404 not found:   {notfound}")
     print(f"  Other errors:    {other}")
     if errors:
          print("\nSome errors / non-200 responses (sample up to 50):")
          for e in errors[:50]:
               print(" ", e)

     print(f"\nLog file: {args.log}")

if __name__ == "__main__":
     # main() parses sys.argv, so replace it with the options for this run
     # before calling main().
     import sys
     sys.argv = [
          "download_physionet_bulk.py",
          "--start", "1",
          "--end", "20000",
          "--skip-if-exists",
     ]
     main()

Downloading: 100%|██████████| 20000/20000 [13:55<00:00, 23.93file/s]


Summary
  Total requested: 20000
  OK:              20000
  Skipped:         0
  404 not found:   0
  Other errors:    0

Log file: download.log



