# Download GBIF occurrence data

This notebook fetches the latest occurrence records for **Poaceae** (grasses) and
**Cyperaceae** (sedges) from **Montserrat** via the GBIF occurrence search API.

No authentication is required — the search API is public and synchronous.
For ~300 records this completes in seconds.

**Output files** (tab-separated, compatible with Darwin Core Archive format):
- `data/occurrence.txt` — occurrence records
- `data/multimedia.txt` — media records (images, sounds) linked to occurrences

In [None]:
import os
import time
import requests
import pandas as pd

# Output directory used by the later cells; create it up front if missing
os.makedirs('data', exist_ok=True)

# Family name -> GBIF taxon key
FAMILIES = dict(
    Poaceae=3073,     # grasses
    Cyperaceae=7708,  # sedges
)

# Country code for Montserrat
COUNTRY = 'MS'

# GBIF occurrence search endpoint
GBIF_API = 'https://api.gbif.org/v1/occurrence/search'

# Iterating a dict yields its keys, so this is the comma-separated family list
family_list = ", ".join(FAMILIES)
print('Dependencies loaded.')
print(f'Will fetch occurrences for {family_list} in country={COUNTRY}')

In [None]:
def fetch_all_occurrences(taxon_key, country, page_size=300):
    """Fetch all occurrences for a taxon key in a country, handling pagination.

    Only records with ``occurrenceStatus=PRESENT`` are requested.

    Args:
        taxon_key: Value sent as the ``taxonKey`` query parameter.
        country: Value sent as the ``country`` query parameter.
        page_size: Value sent as ``limit`` (records requested per page).
            The GBIF API documentation states that the occurrence search
            caps ``limit`` (300 at the time of writing), so a page may hold
            fewer records than requested.

    Returns:
        A ``(records, count)`` tuple: the list of result dicts collected from
        every page, and the ``count`` field of the last response (falling
        back to ``len(records)`` when the response has no ``count``).

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    all_records = []
    offset = 0

    while True:
        params = {
            'taxonKey': taxon_key,
            'country': country,
            'occurrenceStatus': 'PRESENT',
            'limit': page_size,
            'offset': offset,
        }
        resp = requests.get(GBIF_API, params=params, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        results = data.get('results', [])
        all_records.extend(results)

        # Stop on the last page, or on an empty page so we can never loop forever
        if data.get('endOfRecords', True) or not results:
            break

        # Advance by what was actually received, not by what was requested:
        # if the server returns a shorter page than `page_size`, stepping by
        # `page_size` would silently skip the records in between.
        offset += len(results)
        time.sleep(0.5)  # polite rate limiting

    return all_records, data.get('count', len(all_records))


# Download each family in turn and pool the records into a single list
all_occurrences = []

for family_name in FAMILIES:
    taxon_key = FAMILIES[family_name]
    records, total = fetch_all_occurrences(taxon_key, COUNTRY)
    all_occurrences += records
    n_fetched = len(records)
    print(f'{family_name} (taxonKey={taxon_key}): {n_fetched} records fetched (total available: {total})')

n_all = len(all_occurrences)
print(f'\nTotal: {n_all} occurrence records')

In [None]:
# Define columns to match the Darwin Core Archive format
# These are the columns used by inspectData.ipynb and generateTaxaPages.ipynb
# Every name must appear exactly once: the list becomes the header row of the
# output table, and a repeated name makes column lookups ambiguous.
OCCURRENCE_COLUMNS = [
    'gbifID', 'accessRights', 'bibliographicCitation', 'language', 'license',
    'modified', 'publisher', 'references', 'rightsHolder', 'type',
    'institutionID', 'collectionID', 'datasetID', 'institutionCode',
    'collectionCode', 'datasetName', 'ownerInstitutionCode', 'basisOfRecord',
    'informationWithheld', 'dataGeneralizations', 'dynamicProperties',
    'occurrenceID', 'catalogNumber', 'recordNumber', 'recordedBy',
    'recordedByID', 'individualCount', 'organismQuantity',
    'organismQuantityType', 'sex', 'lifeStage', 'reproductiveCondition',
    'caste', 'behavior', 'vitality', 'establishmentMeans',
    'degreeOfEstablishment', 'pathway', 'georeferenceVerificationStatus',
    'occurrenceStatus', 'preparations', 'disposition',
    'associatedOccurrences', 'associatedReferences', 'associatedSequences',
    'associatedTaxa', 'otherCatalogNumbers', 'occurrenceRemarks',
    'organismID', 'organismName', 'organismScope', 'eventID', 'parentEventID',
    'fieldNumber', 'eventDate', 'eventTime', 'startDayOfYear', 'endDayOfYear',
    'year', 'month', 'day', 'verbatimEventDate', 'habitat', 'samplingProtocol',
    'sampleSizeValue', 'sampleSizeUnit', 'samplingEffort', 'fieldNotes',
    'eventRemarks', 'locationID', 'continent', 'waterBody', 'islandGroup',
    'island', 'country', 'countryCode', 'stateProvince', 'county',
    'municipality', 'locality', 'verbatimLocality', 'minimumElevationInMeters',
    'maximumElevationInMeters', 'verbatimElevation',
    'minimumDepthInMeters', 'maximumDepthInMeters', 'verbatimDepth',
    'minimumDistanceAboveSurfaceInMeters', 'maximumDistanceAboveSurfaceInMeters',
    'locationRemarks', 'decimalLatitude', 'decimalLongitude',
    'geodeticDatum', 'coordinateUncertaintyInMeters',
    'coordinatePrecision', 'pointRadiusSpatialFit',
    'verbatimCoordinates', 'verbatimLatitude', 'verbatimLongitude',
    'verbatimCoordinateSystem', 'verbatimSRS', 'footprintWKT',
    'footprintSRS', 'footprintSpatialFit', 'georeferencedBy',
    'georeferencedDate', 'georeferenceProtocol', 'georeferenceSources',
    'georeferenceRemarks', 'geologicalContextID', 'earliestEonOrLowestEonothem',
    'latestEonOrHighestEonothem', 'earliestEraOrLowestErathem',
    'latestEraOrHighestErathem', 'earliestPeriodOrLowestSystem',
    'latestPeriodOrHighestSystem', 'earliestEpochOrLowestSeries',
    'latestEpochOrHighestSeries', 'earliestAgeOrLowestStage',
    'latestAgeOrHighestStage', 'lowestBiostratigraphicZone',
    'highestBiostratigraphicZone', 'lithostratigraphicTerms', 'group',
    'formation', 'member', 'bed', 'identificationID', 'verbatimIdentification',
    'identificationQualifier', 'typeStatus', 'identifiedBy', 'identifiedByID',
    'dateIdentified', 'identificationReferences', 'identificationVerificationStatus',
    'identificationRemarks', 'taxonID', 'scientificNameID', 'acceptedNameUsageID',
    'parentNameUsageID', 'originalNameUsageID', 'nameAccordingToID',
    'namePublishedInID', 'taxonConceptID', 'scientificName',
    'acceptedScientificName', 'parentNameUsage', 'originalNameUsage',
    'nameAccordingTo', 'namePublishedIn', 'namePublishedInYear',
    'higherClassification', 'kingdom', 'phylum', 'class', 'order', 'family',
    'subfamily', 'genus', 'genericName', 'subgenus', 'infragenericEpithet',
    'specificEpithet', 'infraspecificEpithet', 'cultivarEpithet', 'taxonRank',
    'verbatimTaxonRank', 'scientificNameAuthorship', 'vernacularName',
    'nomenclaturalCode', 'taxonomicStatus', 'nomenclaturalStatus',
    'taxonRemarks', 'datasetKey', 'publishingCountry', 'lastInterpreted',
    'elevation', 'elevationAccuracy', 'depth', 'depthAccuracy',
    'distanceFromCentroidInMeters', 'issue',
    'mediaType', 'hasCoordinate', 'hasGeospatialIssues', 'taxonKey',
    'acceptedTaxonKey', 'kingdomKey', 'phylumKey', 'classKey', 'orderKey',
    'familyKey', 'genusKey', 'subgenusKey', 'speciesKey', 'species',
    'iucnRedListCategory', 'verbatimScientificName',
]

# Build occurrence rows — map API field names to DwC column names
def _join_agent_ids(ids):
    """Pipe-join the 'value' entries of a list of identifier dicts ('' if not a list)."""
    if not isinstance(ids, list):
        return ''
    return '|'.join(d.get('value', '') for d in ids if isinstance(d, dict))


def _occurrence_value(rec, col):
    """Return the cell value for output column `col` taken from API record `rec`.

    Most columns are copied from the key of the same name; a few are read
    from a differently named key or flattened to a delimited string.
    Missing values and None become ''.
    """
    if col == 'gbifID':
        # Fall back to 'key' when the record carries no 'gbifID'
        val = rec.get('gbifID', rec.get('key', ''))
    elif col == 'publisher':
        val = rec.get('publishingOrgKey', '')
    elif col == 'issue':
        issues = rec.get('issues', [])
        val = ';'.join(issues) if isinstance(issues, list) else str(issues)
    elif col == 'mediaType':
        media = rec.get('media', [])
        # sorted() keeps the output reproducible: plain set iteration order
        # for strings can differ from one interpreter run to the next.
        val = ';'.join(sorted({m.get('type', '') for m in media if m.get('type')}))
    elif col in ('hasCoordinate', 'hasGeospatialIssues'):
        # Booleans are written upper-case (TRUE / FALSE)
        val = str(rec.get(col, '')).upper()
    elif col == 'recordedByID':
        val = _join_agent_ids(rec.get('recordedByIDs', []))
    elif col == 'identifiedByID':
        val = _join_agent_ids(rec.get('identifiedByIDs', []))
    else:
        val = rec.get(col, '')
        if isinstance(val, list):
            val = ';'.join(str(v) for v in val)
        elif isinstance(val, dict):
            # Nested objects have no flat representation in the table
            val = ''

    return '' if val is None else val


rows = [
    {col: _occurrence_value(rec, col) for col in OCCURRENCE_COLUMNS}
    for rec in all_occurrences
]

df_occ = pd.DataFrame(rows, columns=OCCURRENCE_COLUMNS)

# Missing values are stored as '' (not NaN), so notna()/nunique() on the raw
# columns would count empty cells as data. Filter the blanks out explicitly.
species_col = df_occ['species']
n_species = species_col[species_col != ''].nunique()
lat = pd.to_numeric(df_occ['decimalLatitude'], errors='coerce')
lon = pd.to_numeric(df_occ['decimalLongitude'], errors='coerce')
n_with_coords = (lat.notna() & lon.notna()).sum()

print(f'Occurrence DataFrame: {len(df_occ)} rows x {len(df_occ.columns)} columns')
print(f'Species: {n_species} unique')
print(f'With coordinates: {n_with_coords}')
print('Basis of record distribution:')
print(df_occ['basisOfRecord'].value_counts().to_string())

In [None]:
# Flatten the media list nested inside each occurrence into one row per item
MEDIA_COLUMNS = [
    'gbifID', 'type', 'format', 'identifier', 'references',
    'title', 'description', 'source', 'audience', 'created',
    'creator', 'contributor', 'publisher', 'license', 'rightsHolder'
]

media_rows = []
for occurrence in all_occurrences:
    # Same ID rule as the occurrence table: 'gbifID', else 'key', else ''
    occurrence_id = occurrence.get('gbifID', occurrence.get('key', ''))
    media_rows.extend(
        {
            'gbifID': occurrence_id,
            # Every column after gbifID is copied straight from the media item
            **{field: item.get(field, '') for field in MEDIA_COLUMNS[1:]},
        }
        for item in occurrence.get('media', [])
    )

df_media = pd.DataFrame(media_rows, columns=MEDIA_COLUMNS)
type_counts = df_media['type'].value_counts().to_string()
print(f'Multimedia DataFrame: {len(df_media)} rows')
print(f'Media types: {type_counts}')

In [None]:
# Save occurrence data
occ_path = 'data/occurrence.txt'
df_occ.to_csv(occ_path, sep='\t', index=False)
print(f'Saved {len(df_occ)} occurrences to {occ_path}')

# Save multimedia data
media_path = 'data/multimedia.txt'
df_media.to_csv(media_path, sep='\t', index=False)
print(f'Saved {len(df_media)} media records to {media_path}')

# The row-building cell stores missing values as '' rather than NaN, so
# notna()/nunique() on the raw columns would treat blanks as real data.
# Count only non-empty species names and coordinates that parse as numbers.
species_col = df_occ['species']
n_species = species_col[species_col != ''].nunique()
lat = pd.to_numeric(df_occ['decimalLatitude'], errors='coerce')
lon = pd.to_numeric(df_occ['decimalLongitude'], errors='coerce')
n_coords = (lat.notna() & lon.notna()).sum()

# Summary
print(f'\n=== GBIF Download Summary ===')
print(f'Date: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")}')
print(f'Country: Montserrat ({COUNTRY})')
print(f'Families: {", ".join(FAMILIES.keys())}')
print(f'Total occurrences: {len(df_occ)}')
print(f'Total media records: {len(df_media)}')
print(f'Species with records: {n_species}')
print(f'With coordinates: {n_coords}')
print(f'\nFiles written:')
print(f'  {occ_path} ({os.path.getsize(occ_path):,} bytes)')
print(f'  {media_path} ({os.path.getsize(media_path):,} bytes)')