# Download GBIF occurrence data

This notebook fetches the latest occurrence records for **Poaceae** (grasses) and
**Cyperaceae** (sedges) from **Montserrat** via the GBIF **asynchronous download API**.

This uses the proper GBIF download workflow:
1. Submit a download request (requires GBIF credentials)
2. Poll until GBIF finishes preparing the archive
3. Download the Darwin Core Archive ZIP
4. Extract `occurrence.txt` and `multimedia.txt`

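**Credentials:** Set the `GBIF_USER` and `GBIF_PWD` environment variables before running (in GitHub Actions, expose the repository secrets under those names). The first code cell exits with an error if either one is missing.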

**Output files** (tab-separated, Darwin Core Archive format):
- `data/occurrence.txt` — occurrence records
- `data/multimedia.txt` — media records linked to occurrences

In [None]:
import csv
import os
import shutil
import sys
import time
import zipfile

import requests

# Everything this notebook writes ends up beneath data/.
os.makedirs("data", exist_ok=True)

# The download API needs a GBIF account; both values are read from the
# environment so they never appear in the notebook itself.
GBIF_USER = os.getenv("GBIF_USER", "")
GBIF_PWD = os.getenv("GBIF_PWD", "")

# Stop right away when either credential is missing or empty.
if not (GBIF_USER and GBIF_PWD):
    for message in (
        "ERROR: GBIF_USER and GBIF_PWD environment variables must be set.",
        "Register at https://www.gbif.org/user/profile to get credentials.",
    ):
        print(message)
    sys.exit(1)

# Endpoint that accepts new download requests.
GBIF_DOWNLOAD_API = "https://api.gbif.org/v1/occurrence/download/request"

# Country code used in the request predicate (Montserrat).
COUNTRY = "MS"

# Upper bound, in seconds, on how long to wait for the archive (one hour).
MAX_WAIT = 60 * 60

# Pause, in seconds, between two consecutive status checks.
POLL_INTERVAL = 30

# Echo the effective settings (the password is deliberately not printed).
print(f"GBIF user: {GBIF_USER}")
print(f"Country: {COUNTRY}")
print("Download API ready.")

In [None]:
# Describe what to download, then hand the request to GBIF.
# Family name -> taxonKey: Poaceae (3073) and Cyperaceae (7708).
FAMILIES = dict(Poaceae="3073", Cyperaceae="7708")

# Request body: a Darwin Core Archive of records that satisfy ALL three
# filters below (country, occurrence status, taxon). No e-mail notification
# is requested; the notebook polls for completion instead.
download_request = dict(
    creator=GBIF_USER,
    notificationAddresses=[],
    sendNotification=False,
    format="DWCA",
    predicate=dict(
        type="and",
        predicates=[
            # Records located in the configured country.
            dict(type="equals", key="COUNTRY", value=COUNTRY, matchCase=False),
            # Records whose occurrence status is "present".
            dict(type="equals", key="OCCURRENCE_STATUS", value="present", matchCase=False),
            # Records belonging to any of the configured families.
            dict(type="in", key="TAXON_KEY", values=[*FAMILIES.values()], matchCase=False),
        ],
    ),
)

family_names = ", ".join(FAMILIES)
print(f"Families: {family_names}")
print("Submitting download request to GBIF...")

# Authenticated POST; any HTTP error status aborts the cell.
resp = requests.post(
    GBIF_DOWNLOAD_API,
    json=download_request,
    auth=(GBIF_USER, GBIF_PWD),
    headers={"Content-Type": "application/json"},
    timeout=30,
)
resp.raise_for_status()

# The response body is used verbatim (minus surrounding whitespace) as the
# download key for the status and tracking URLs.
download_key = resp.text.strip()
print(f"Download request submitted: {download_key}")
print(f"Track at: https://www.gbif.org/occurrence/download/{download_key}")

In [None]:
# Poll the GBIF status endpoint until the download reaches a terminal state.
#
# Reads:   download_key, POLL_INTERVAL, MAX_WAIT
# Defines: status_data, total_records, download_link, file_size
#          (download_link is consumed by the download cell that follows)
# Exits:   on timeout, on a failed/killed/cancelled/erased download, or when
#          GBIF reports success without providing a download link.
status_url = f"https://api.gbif.org/v1/occurrence/download/{download_key}"

# Monotonic clock: the timeout must not be skewed by wall-clock adjustments
# (NTP corrections, DST) during a wait that can last up to MAX_WAIT seconds.
start_time = time.monotonic()

# Terminal states after which no archive will become available.
FAILURE_STATUSES = ("FAILED", "KILLED", "CANCELLED", "FILE_ERASED")

print(f"Waiting for GBIF to prepare the download (polling every {POLL_INTERVAL}s, max {MAX_WAIT}s)...")
print()

while True:
    elapsed = time.monotonic() - start_time
    if elapsed > MAX_WAIT:
        print(f"ERROR: Download timed out after {MAX_WAIT}s")
        sys.exit(1)

    minutes, seconds = divmod(int(elapsed), 60)
    stamp = f"[{minutes:02d}:{seconds:02d}]"

    try:
        resp = requests.get(status_url, timeout=30)
    except (requests.ConnectionError, requests.Timeout) as exc:
        # A single dropped connection should not discard a long wait; retry on
        # the next tick. The MAX_WAIT check above still bounds the total time.
        print(f"  {stamp} Status check failed ({type(exc).__name__}); retrying...")
        time.sleep(POLL_INTERVAL)
        continue

    # HTTP error statuses (e.g. 404 for an unknown key) are not retried.
    resp.raise_for_status()
    status_data = resp.json()
    status = status_data.get("status", "UNKNOWN")
    print(f"  {stamp} Status: {status}")

    if status == "SUCCEEDED":
        total_records = status_data.get("totalRecords", 0)
        download_link = status_data.get("downloadLink", "")
        file_size = status_data.get("size", 0)
        if not download_link:
            # Fail here with a clear message instead of letting the next cell
            # attempt to fetch an empty URL.
            print("ERROR: Download SUCCEEDED but GBIF returned no downloadLink")
            sys.exit(1)
        print()
        print("Download ready!")
        print(f"  Records: {total_records:,}")
        print(f"  Size: {file_size:,} bytes")
        print(f"  Link: {download_link}")
        break
    elif status in FAILURE_STATUSES:
        print(f"ERROR: Download {status}")
        sys.exit(1)

    # Any other status (still being prepared, unknown, ...) -> keep waiting.
    time.sleep(POLL_INTERVAL)

In [None]:
# Download the prepared archive and unpack it.
#
# Reads:   download_link (set by the polling cell)
# Defines: zip_path, extract_dir (used by the copy/summary cell)
zip_path = "data/gbif_download.zip"
extract_dir = "data/gbif_download"

print(f"Downloading {download_link} ...")
downloaded = 0

# Stream the body in chunks so the archive is never held in memory as a whole.
# The context manager releases the connection even if writing fails part-way.
with requests.get(download_link, stream=True, timeout=300) as resp:
    resp.raise_for_status()

    # 0 when the server does not announce a length; progress is then skipped.
    total_size = int(resp.headers.get("content-length", 0))

    with open(zip_path, "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
            f.write(chunk)
            downloaded += len(chunk)
            if total_size > 0:
                pct = downloaded * 100 // total_size
                # "\r" keeps the progress on a single, continuously updated line.
                print(f"  Downloaded {downloaded:,} / {total_size:,} bytes ({pct}%)", end="\r")

actual_size = os.path.getsize(zip_path)
print(f"  Downloaded {actual_size:,} bytes to {zip_path}")
print()

# Extract the ZIP
print(f"Extracting to {extract_dir}/ ...")

# Start from an empty directory: files left behind by an earlier run (for
# example a multimedia.txt that this archive does not contain) must not be
# mistaken for part of the current download.
if os.path.isdir(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, "r") as zf:
    entries = zf.infolist()
    print(f"  Archive contains {len(entries)} files:")
    for info in entries:
        print(f"    {info.filename} ({info.file_size:,} bytes)")
    zf.extractall(extract_dir)

print("Extraction complete.")

In [None]:
import shutil
import glob

# Publish occurrence.txt / multimedia.txt from the extracted archive into
# data/, print a short summary, and remove the downloaded ZIP.
#
# Reads: extract_dir, zip_path, download_key, COUNTRY (set by earlier cells)
occ_src = os.path.join(extract_dir, "occurrence.txt")
media_src = os.path.join(extract_dir, "multimedia.txt")

occ_dst = "data/occurrence.txt"
media_dst = "data/multimedia.txt"

if not os.path.exists(occ_src):
    print(f"ERROR: {occ_src} not found in archive!")
    # List what IS in the archive to help diagnose the problem.
    for path in glob.glob(os.path.join(extract_dir, "*")):
        print(f"  Found: {os.path.basename(path)}")
    # Stop here: continuing would either crash in read_csv or silently
    # summarise a stale data/occurrence.txt left over from a previous run.
    sys.exit(1)

shutil.copy2(occ_src, occ_dst)
occ_size = os.path.getsize(occ_dst)
print(f"Copied occurrence.txt ({occ_size:,} bytes)")

if os.path.exists(media_src):
    shutil.copy2(media_src, media_dst)
    media_size = os.path.getsize(media_dst)
    print(f"Copied multimedia.txt ({media_size:,} bytes)")
else:
    # Write a header-only multimedia.txt so the output file always exists.
    with open(media_dst, "w", encoding="utf-8") as f:
        f.write("gbifID\ttype\tformat\tidentifier\treferences\ttitle\tdescription\tsource\taudience\tcreated\tcreator\tcontributor\tpublisher\tlicense\trightsHolder\n")
    print("No multimedia.txt in archive — created empty file.")

# Quick summary
import pandas as pd

print()
print("=== GBIF Download Summary ===")
print(f"Download key: {download_key}")
print(f"Country: Montserrat ({COUNTRY})")
print("Scope: Poaceae + Cyperaceae")

# GBIF's Darwin Core Archive text files are tab-separated WITHOUT field
# quoting, so a literal '"' inside a value must be read as data. With pandas'
# default quote handling an unbalanced quote swallows the following rows and
# corrupts every count below.
df = pd.read_csv(
    occ_dst,
    sep="\t",
    quoting=csv.QUOTE_NONE,
    on_bad_lines="warn",
    low_memory=False,
)
print(f"Total occurrences: {len(df)}")

# Each statistic falls back to 0 when the archive lacks the column.
n_species = df["species"].nunique() if "species" in df.columns else 0
print(f"Unique species: {n_species}")

n_coords = df["decimalLatitude"].notna().sum() if "decimalLatitude" in df.columns else 0
print(f"With coordinates: {n_coords}")

if "kingdom" in df.columns:
    print()
    print("Kingdom breakdown:")
    for k, c in df["kingdom"].value_counts().items():
        print(f"  {k}: {c} records")

# Clean up ZIP (keep extracted dir for reference)
os.remove(zip_path)
print()
print(f"Cleaned up {zip_path}")
print("Done!")