# Downloading necessary data for Ireland et al., 2025
### Data have been deposited to Zenodo (doi: 10.5281/zenodo.15857302)
Files can be downloaded manually from [Zenodo](https://zenodo.org/records/15857303), but for the code in this repository to run without modification, the data must be located in the `../data` directory.  

This notebook downloads any missing data files from the Zenodo archive directly into the `../data` directory. 

In [None]:
import os
import csv
import requests
import time
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

In [None]:
# Settings for the Zenodo download
ZENODO_RECORD_ID = "15857303"  # record number used in the Zenodo URL
BASE_URL = "https://zenodo.org/records/" + ZENODO_RECORD_ID + "/files"

# Local paths, relative to the directory the notebook is run from
DATA_DIR = Path("../data")
MANIFEST_FILE = DATA_DIR / "zenodo_manifest.csv"  # CSV whose first column holds file names
ERROR_LOG_FILE = "download_errors.log"

# Download behaviour
MAX_RETRIES = 3
THREADS = 4  # You can increase based on bandwidth/CPU



In [None]:
# Fail early, with a clear message, if the data directory cannot be found.
# An explicit raise is used instead of `assert` so the check still runs when
# Python is started with -O (which strips assert statements).
if not DATA_DIR.is_dir():
    raise FileNotFoundError(
        f"Could not find data directory {DATA_DIR}; "
        "ensure this notebook is being executed from the code directory"
    )

# Load filenames from the first column of the manifest.
# NOTE(review): assumes the manifest is UTF-8 (plain ASCII names qualify) — confirm.
with open(MANIFEST_FILE, newline="", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    names = (row[0].strip() for row in reader if row)
    # Drop blank entries and duplicates (keeping first-seen order) so the same
    # file is never downloaded by two threads at once.
    filenames = list(dict.fromkeys(name for name in names if name))

# Filter files that are already present
missing_files = [f for f in filenames if not Path(DATA_DIR, f).exists()]

# Start a fresh error log for this run. UTF-8 is explicit because the failure
# messages appended to this file contain non-ASCII characters.
with open(ERROR_LOG_FILE, "w", encoding="utf-8") as log:
    log.write("Download errors:\n")

# Download function with retry and progress bar
def download_file(filename):
    """Download one file from the Zenodo record into ``DATA_DIR``.

    The response is streamed into a temporary ``<filename>.part`` file that is
    renamed to its final name only after the transfer has finished. A failed
    or interrupted download therefore never leaves a truncated file that a
    later run would treat as already downloaded.

    Args:
        filename: Name of the file to fetch; appended to ``BASE_URL`` and used
            as the file name inside ``DATA_DIR``.

    Returns:
        A status string: "✅ Downloaded: <filename>" on success, or
        "❌ Failed: ..." after ``MAX_RETRIES`` unsuccessful attempts. Failures
        are also appended to ``ERROR_LOG_FILE``.
    """
    url = f"{BASE_URL}/{filename}?download=1"
    target_path = DATA_DIR / filename
    partial_path = target_path.with_name(target_path.name + ".part")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            with requests.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                total = int(response.headers.get('content-length', 0))
                written = 0
                with open(partial_path, "wb") as f, tqdm(
                    total=total,
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                    desc=filename,
                    initial=0,
                    ascii=True,
                    leave=False
                ) as bar:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                        written += len(chunk)
                        bar.update(len(chunk))
                # Treat a short body as a failure so it is retried. The check is
                # skipped for encoded responses, where Content-Length describes
                # the encoded size rather than the bytes written.
                if total and "content-encoding" not in response.headers and written != total:
                    raise OSError(
                        f"incomplete download: got {written} of {total} bytes"
                    )
            # Only a fully received file is moved to its final name.
            partial_path.replace(target_path)
            return f"✅ Downloaded: {filename}"
        except Exception as e:
            # Broad on purpose: one bad file must not abort the other
            # downloads; the failure is reported via the return value and log.
            try:
                partial_path.unlink()
            except OSError:
                pass  # best-effort cleanup; nothing was written or it is already gone
            if attempt < MAX_RETRIES:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                error_message = f"❌ Failed: {filename} (after {MAX_RETRIES} attempts): {e}"
                # Explicit UTF-8: the message contains non-ASCII characters that
                # some platform default encodings cannot represent.
                with open(ERROR_LOG_FILE, "a", encoding="utf-8") as log:
                    log.write(error_message + "\n")
                return error_message



In [None]:
# Parallel downloading: run download_file for every missing file on a thread
# pool, print each status as it arrives, and summarise failures at the end.
print(f"⬇️  Starting downloads for {len(missing_files)} files using {THREADS} threads...\n")

failed_count = 0
with ThreadPoolExecutor(max_workers=THREADS) as executor:
    futures = {executor.submit(download_file, f): f for f in missing_files}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Overall Progress", ascii=True):
        result = future.result()
        # download_file reports success with a "✅" prefix; anything else
        # counts as a failure.
        if not str(result).startswith("✅"):
            failed_count += 1
        # tqdm.write prints without corrupting the active progress bars.
        tqdm.write(str(result))

# Only claim success when every file was actually downloaded.
if failed_count:
    print(f"\n❌ {failed_count} of {len(missing_files)} downloads failed; see {ERROR_LOG_FILE} for details.")
else:
    print("\n✅ All downloads complete.")