# Download GBIF occurrence data

This notebook fetches the latest occurrence records for **Poaceae** (grasses) and
**Cyperaceae** (sedges) from **Montserrat** via the GBIF API.

**Two modes:**
- **With GBIF credentials** (`GBIF_USER` + `GBIF_PWD`): uses the async download API
  (no record limit, proper Darwin Core Archive)
- **Without credentials**: falls back to the synchronous search API
  (max 100k records, fine for small datasets)

**Output files** (tab-separated, Darwin Core Archive format):
- `data/occurrence.txt` — occurrence records
- `data/multimedia.txt` — media records linked to occurrences

In [None]:
import os
import sys
import time
import zipfile
import shutil
import requests
import pandas as pd

# All output files of this notebook live under ./data.
os.makedirs("data", exist_ok=True)

# Optional GBIF account, read from the environment. Both values must be
# non-empty for the async download API to be used.
GBIF_USER = os.getenv("GBIF_USER", "")
GBIF_PWD = os.getenv("GBIF_PWD", "")

# Country filter (code used for Montserrat).
COUNTRY = "MS"

# Family name -> GBIF taxon key used as the taxonomic filter.
FAMILIES = {"Poaceae": 3073, "Cyperaceae": 7708}

# GBIF API endpoints: async download requests and synchronous search.
GBIF_DOWNLOAD_API = "https://api.gbif.org/v1/occurrence/download/request"
GBIF_SEARCH_API = "https://api.gbif.org/v1/occurrence/search"

# True only when both credential values are set.
USE_DOWNLOAD_API = bool(GBIF_USER) and bool(GBIF_PWD)

# Report which mode the following cells will run in.
if not USE_DOWNLOAD_API:
    print("No GBIF credentials — using synchronous search API.")
    print("(Set GBIF_USER and GBIF_PWD for the async download API.)")
else:
    print(f"GBIF credentials found (user: {GBIF_USER})")
    print("Will use the async download API.")

family_names = ", ".join(FAMILIES)
print(f"Country: {COUNTRY}")
print(f"Families: {family_names}")

In [None]:
# ============================================================
# MODE A: Async download API (with credentials)
# ============================================================

if USE_DOWNLOAD_API:
    # Filter: records located in COUNTRY, flagged as present, and belonging
    # to any of the configured family taxon keys (sent as strings).
    taxon_keys = [str(key) for key in FAMILIES.values()]
    predicates = [
        {"type": "equals", "key": "COUNTRY", "value": COUNTRY, "matchCase": False},
        {"type": "equals", "key": "OCCURRENCE_STATUS", "value": "present", "matchCase": False},
        {"type": "in", "key": "TAXON_KEY", "values": taxon_keys, "matchCase": False},
    ]

    # Request a Darwin Core Archive with e-mail notification switched off.
    download_request = {
        "creator": GBIF_USER,
        "notificationAddresses": [],
        "sendNotification": False,
        "format": "DWCA",
        "predicate": {"type": "and", "predicates": predicates},
    }

    print("Submitting download request to GBIF...")
    resp = requests.post(
        GBIF_DOWNLOAD_API,
        json=download_request,
        auth=(GBIF_USER, GBIF_PWD),
        headers={"Content-Type": "application/json"},
        timeout=30,
    )
    resp.raise_for_status()

    # The response body (whitespace-stripped) is used as the download key.
    download_key = resp.text.strip()
    print(f"Download key: {download_key}")
    print(f"Track at: https://www.gbif.org/occurrence/download/{download_key}")
    print()

    # Poll the download status until it succeeds, fails, or MAX_WAIT seconds
    # have passed. On success `download_link` holds the archive URL.
    MAX_WAIT = 3600
    POLL_INTERVAL = 30
    FAILURE_STATES = ("FAILED", "KILLED", "CANCELLED")
    status_url = f"https://api.gbif.org/v1/occurrence/download/{download_key}"
    start = time.time()
    print("Waiting for GBIF to prepare the download...")
    while True:
        elapsed = time.time() - start
        if elapsed > MAX_WAIT:
            print(f"ERROR: Timed out after {MAX_WAIT}s")
            sys.exit(1)

        resp = requests.get(status_url, timeout=30)
        resp.raise_for_status()
        status_info = resp.json()
        status = status_info.get("status", "UNKNOWN")

        # Progress line: elapsed time as MM:SS followed by the reported status.
        mins = int(elapsed // 60)
        secs = int(elapsed % 60)
        print(f"  [{mins:02d}:{secs:02d}] {status}")

        if status in FAILURE_STATES:
            print(f"ERROR: Download {status}")
            sys.exit(1)
        if status != "SUCCEEDED":
            # Any other status: wait and ask again.
            time.sleep(POLL_INTERVAL)
            continue

        download_link = status_info["downloadLink"]
        total_records = status_info.get("totalRecords", 0)
        size_bytes = status_info.get("size", 0)
        print(f"  Ready! {total_records:,} records, {size_bytes:,} bytes")
        break

    # Download the archive to disk, then unpack it into a scratch directory.
    zip_path = "data/gbif_download.zip"
    extract_dir = "data/gbif_download"
    print("Downloading ZIP...")
    # Use the streamed response as a context manager so the connection is
    # released even if writing the file fails part-way through.
    with requests.get(download_link, stream=True, timeout=300) as resp:
        resp.raise_for_status()
        with open(zip_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                if chunk:  # skip empty chunks
                    f.write(chunk)
    print(f"  {os.path.getsize(zip_path):,} bytes")

    # Start from an empty scratch directory so files left behind by an
    # interrupted earlier run cannot be mistaken for this archive's content.
    shutil.rmtree(extract_dir, ignore_errors=True)
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(extract_dir)

    # Copy the extracted files to their standard locations under data/.
    for fname in ["occurrence.txt", "multimedia.txt"]:
        src = os.path.join(extract_dir, fname)
        dst = os.path.join("data", fname)
        if os.path.exists(src):
            shutil.copy2(src, dst)
            print(f"  {fname}: {os.path.getsize(dst):,} bytes")
        elif fname == "multimedia.txt":
            # No media file in the archive: write a header-only placeholder
            # so data/multimedia.txt always exists after this cell.
            cols = "gbifID\ttype\tformat\tidentifier\treferences\ttitle\tdescription\tsource\taudience\tcreated\tcreator\tcontributor\tpublisher\tlicense\trightsHolder"
            with open(dst, "w", encoding="utf-8") as mf:
                mf.write(cols + "\n")
            print(f"  {fname}: created empty")
        else:
            # occurrence.txt is required. Stop here instead of silently
            # leaving a missing or stale data/occurrence.txt behind; the ZIP
            # and the extracted files are kept on disk for inspection.
            print(f"ERROR: {fname} not found in the GBIF archive")
            sys.exit(1)

    # Remove the temporary archive and scratch directory.
    os.remove(zip_path)
    shutil.rmtree(extract_dir, ignore_errors=True)
    print("Async download complete.")

else:
    print("Skipping async download — will use search API in next cell.")

In [None]:
# ============================================================
# MODE B: Synchronous search API (no credentials needed)
# ============================================================

if not USE_DOWNLOAD_API:
    def fetch_all_occurrences(taxon_key, country, page_size=300):
        """Fetch every matching occurrence from the GBIF search API.

        Pages through ``GBIF_SEARCH_API`` for records with the given taxon
        key and country whose occurrence status is PRESENT.

        Args:
            taxon_key: GBIF taxon key to filter on.
            country: Country code to filter on.
            page_size: Number of records requested per page.

        Returns:
            A ``(records, count)`` tuple: the list of raw result dicts and the
            ``count`` reported by the last response (falling back to the
            number of records collected when the response has no ``count``).

        Raises:
            requests.HTTPError: If any page request returns an error status.
        """
        all_records = []
        offset = 0
        while True:
            params = {
                "taxonKey": taxon_key, "country": country,
                "occurrenceStatus": "PRESENT", "limit": page_size, "offset": offset,
            }
            resp = requests.get(GBIF_SEARCH_API, params=params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            results = data.get("results", [])
            all_records.extend(results)
            if data.get("endOfRecords", True) or not results:
                break
            # Advance by the number of records actually received. The server
            # may return fewer than `page_size` (e.g. if it caps the page
            # size); stepping by `page_size` would then skip records.
            offset += len(results)
            time.sleep(0.3)  # brief pause between page requests
        return all_records, data.get("count", len(all_records))

    # Query each configured family in turn and pool the raw API records.
    all_occurrences = []
    for family_name in FAMILIES:
        taxon_key = FAMILIES[family_name]
        records, total = fetch_all_occurrences(taxon_key, COUNTRY)
        all_occurrences += records
        print(f"{family_name} (key={taxon_key}): {len(records)} records (total: {total})")
    print(f"Total: {len(all_occurrences)} records")

    # Column order of the occurrence table written to data/occurrence.txt.
    # Every name must appear exactly once: a repeated name would be written
    # as a duplicate column and renamed (e.g. "stateProvince.1") when the
    # file is read back with pandas.
    OCCURRENCE_COLUMNS = [
        "gbifID", "accessRights", "bibliographicCitation", "language", "license",
        "modified", "publisher", "references", "rightsHolder", "type",
        "institutionID", "collectionID", "datasetID", "institutionCode",
        "collectionCode", "datasetName", "ownerInstitutionCode", "basisOfRecord",
        "informationWithheld", "dataGeneralizations", "dynamicProperties",
        "occurrenceID", "catalogNumber", "recordNumber", "recordedBy",
        "recordedByID", "individualCount", "organismQuantity",
        "organismQuantityType", "sex", "lifeStage", "reproductiveCondition",
        "caste", "behavior", "vitality", "establishmentMeans",
        "degreeOfEstablishment", "pathway", "georeferenceVerificationStatus",
        "occurrenceStatus", "preparations", "disposition",
        "associatedOccurrences", "associatedReferences", "associatedSequences",
        "associatedTaxa", "otherCatalogNumbers", "occurrenceRemarks",
        "organismID", "organismName", "organismScope", "eventID", "parentEventID",
        "fieldNumber", "eventDate", "eventTime", "startDayOfYear", "endDayOfYear",
        "year", "month", "day", "verbatimEventDate", "habitat", "samplingProtocol",
        "sampleSizeValue", "sampleSizeUnit", "samplingEffort", "fieldNotes",
        "eventRemarks", "locationID", "continent", "waterBody", "islandGroup",
        "island", "country", "countryCode", "stateProvince", "county",
        "municipality", "locality", "verbatimLocality", "minimumElevationInMeters",
        "maximumElevationInMeters", "verbatimElevation",
        "minimumDepthInMeters", "maximumDepthInMeters", "verbatimDepth",
        "minimumDistanceAboveSurfaceInMeters", "maximumDistanceAboveSurfaceInMeters",
        "locationRemarks", "decimalLatitude", "decimalLongitude",
        "geodeticDatum", "coordinateUncertaintyInMeters",
        "coordinatePrecision", "pointRadiusSpatialFit",
        "verbatimCoordinates", "verbatimLatitude", "verbatimLongitude",
        "verbatimCoordinateSystem", "verbatimSRS", "footprintWKT",
        "footprintSRS", "footprintSpatialFit", "georeferencedBy",
        "georeferencedDate", "georeferenceProtocol", "georeferenceSources",
        "georeferenceRemarks", "geologicalContextID", "earliestEonOrLowestEonothem",
        "latestEonOrHighestEonothem", "earliestEraOrLowestErathem",
        "latestEraOrHighestErathem", "earliestPeriodOrLowestSystem",
        "latestPeriodOrHighestSystem", "earliestEpochOrLowestSeries",
        "latestEpochOrHighestSeries", "earliestAgeOrLowestStage",
        "latestAgeOrHighestStage", "lowestBiostratigraphicZone",
        "highestBiostratigraphicZone", "lithostratigraphicTerms", "group",
        "formation", "member", "bed", "identificationID", "verbatimIdentification",
        "identificationQualifier", "typeStatus", "identifiedBy", "identifiedByID",
        "dateIdentified", "identificationReferences", "identificationVerificationStatus",
        "identificationRemarks", "taxonID", "scientificNameID", "acceptedNameUsageID",
        "parentNameUsageID", "originalNameUsageID", "nameAccordingToID",
        "namePublishedInID", "taxonConceptID", "scientificName",
        "acceptedScientificName", "parentNameUsage", "originalNameUsage",
        "nameAccordingTo", "namePublishedIn", "namePublishedInYear",
        "higherClassification", "kingdom", "phylum", "class", "order", "family",
        "subfamily", "genus", "genericName", "subgenus", "infragenericEpithet",
        "specificEpithet", "infraspecificEpithet", "cultivarEpithet", "taxonRank",
        "verbatimTaxonRank", "scientificNameAuthorship", "vernacularName",
        "nomenclaturalCode", "taxonomicStatus", "nomenclaturalStatus",
        "taxonRemarks", "datasetKey", "publishingCountry", "lastInterpreted",
        "elevation", "elevationAccuracy", "depth", "depthAccuracy",
        "distanceFromCentroidInMeters", "issue",
        "mediaType", "hasCoordinate", "hasGeospatialIssues", "taxonKey",
        "acceptedTaxonKey", "kingdomKey", "phylumKey", "classKey", "orderKey",
        "familyKey", "genusKey", "subgenusKey", "speciesKey", "species",
        "iucnRedListCategory", "verbatimScientificName",
    ]

    def _join_agent_ids(ids):
        """Join the ``value`` of each identifier dict in *ids* with ``|``."""
        if not isinstance(ids, list):
            return ""
        return "|".join(d.get("value", "") for d in ids if isinstance(d, dict))

    def _occurrence_value(rec, col):
        """Return the cell value for column *col* of search-API record *rec*.

        Most columns are read from the key of the same name; a few are taken
        from a differently named or nested key. Lists are flattened to a
        ``;``-separated string, dicts and ``None`` become an empty string.
        """
        if col == "gbifID":
            val = rec.get("gbifID", rec.get("key", ""))
        elif col == "publisher":
            val = rec.get("publishingOrgKey", "")
        elif col == "issue":
            issues = rec.get("issues", [])
            val = ";".join(issues) if isinstance(issues, list) else str(issues)
        elif col == "mediaType":
            media = rec.get("media", [])
            # Sort the distinct types: plain set order for strings can change
            # between interpreter runs, which would make the output file
            # differ from run to run for identical input.
            val = ";".join(sorted({m.get("type", "") for m in media if m.get("type")}))
        elif col in ("hasCoordinate", "hasGeospatialIssues"):
            val = str(rec.get(col, "")).upper()
        elif col == "recordedByID":
            val = _join_agent_ids(rec.get("recordedByIDs", []))
        elif col == "identifiedByID":
            val = _join_agent_ids(rec.get("identifiedByIDs", []))
        else:
            val = rec.get(col, "")
            if isinstance(val, list):
                val = ";".join(str(v) for v in val)
            elif isinstance(val, dict):
                val = ""
        return "" if val is None else val

    # One flat row per record, in OCCURRENCE_COLUMNS order.
    rows = [
        {col: _occurrence_value(rec, col) for col in OCCURRENCE_COLUMNS}
        for rec in all_occurrences
    ]

    df_occ = pd.DataFrame(rows, columns=OCCURRENCE_COLUMNS)
    df_occ.to_csv("data/occurrence.txt", sep="\t", index=False)
    print(f"Saved {len(df_occ)} occurrences to data/occurrence.txt")

    # Build the multimedia table: one row per media item of every record,
    # keyed by the gbifID of the occurrence the item belongs to.
    MEDIA_COLUMNS = [
        "gbifID", "type", "format", "identifier", "references",
        "title", "description", "source", "audience", "created",
        "creator", "contributor", "publisher", "license", "rightsHolder",
    ]
    media_rows = []
    for rec in all_occurrences:
        gbif_id = rec.get("gbifID", rec.get("key", ""))
        for item in rec.get("media", []):
            media_row = {}
            for col in MEDIA_COLUMNS:
                if col == "gbifID":
                    # Taken from the parent occurrence, not the media item.
                    media_row[col] = gbif_id
                else:
                    media_row[col] = item.get(col, "")
            media_rows.append(media_row)

    df_media = pd.DataFrame(media_rows, columns=MEDIA_COLUMNS)
    df_media.to_csv("data/multimedia.txt", sep="\t", index=False)
    print(f"Saved {len(df_media)} media records to data/multimedia.txt")

    print("Search API download complete.")
else:
    print("Already downloaded via async API — skipping search.")

In [None]:
# Summary: reload the occurrence table written by whichever mode ran above
# and print headline figures plus the sizes of both output files.
df = pd.read_csv(
    "data/occurrence.txt",
    sep="\t",
    on_bad_lines="warn",
    low_memory=False,
)

print("=== GBIF Download Summary ===")
ts = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"Date: {ts}")
print(f"Country: Montserrat ({COUNTRY})")
print(f"Families: {', '.join(FAMILIES.keys())}")

if USE_DOWNLOAD_API:
    mode = "async download API"
else:
    mode = "synchronous search API"
print(f"Mode: {mode}")
print(f"Total occurrences: {len(df)}")

# Counts default to 0 when the corresponding column is absent.
n_species = 0
if "species" in df.columns:
    n_species = df["species"].nunique()
print(f"Unique species: {n_species}")

n_coords = 0
if "decimalLatitude" in df.columns:
    n_coords = df["decimalLatitude"].notna().sum()
print(f"With coordinates: {n_coords}")
print()

if "basisOfRecord" in df.columns:
    basis_counts = df["basisOfRecord"].value_counts()
    print("Basis of record:")
    print(basis_counts.to_string())

occ_size = os.path.getsize("data/occurrence.txt")
media_size = os.path.getsize("data/multimedia.txt")
print()
print(f"Files: occurrence.txt ({occ_size:,} bytes), multimedia.txt ({media_size:,} bytes)")