# Generate taxa pages with OSM maps

This notebook generates enriched species pages for the Visible Nature Atlas.
For each species, it:
1. Fetches worldwide GBIF occurrence records (with coordinates)
2. Generates an interactive Leaflet/OpenStreetMap map
3. Fetches a Wikipedia introduction
4. Writes a complete markdown page with embedded map HTML

**Prerequisites**: Run `inspectData.ipynb` first to generate `gbifMontserrat.ttl`.

In [None]:
import os
import time
import requests
import pandas as pd
import folium
import yaml
import tqdm
from rdflib import Graph, Namespace
from rdflib.namespace import RDFS

# Wikidata namespaces: entities (wd:) and direct-claim properties (wdt:).
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")

# Output folders for the generated pages and maps; existing folders are reused.
for output_dir in ('taxa', 'maps'):
    os.makedirs(output_dir, exist_ok=True)

print('Dependencies loaded.')

## Load the local RDF graph

We load `gbifMontserrat.ttl` generated by `inspectData.ipynb` to get the list of species and their Wikidata URIs.

In [None]:
# Parse the Turtle file into an in-memory rdflib graph.
g = Graph()
g.parse('gbifMontserrat.ttl', format='turtle')

# SPARQL: every taxon referenced through wdt:P225, together with its label.
taxa_query = """
PREFIX wd:  <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?taxon ?taxonLabel WHERE {
    ?obs wdt:P225 ?taxon .
    ?taxon rdfs:label ?taxonLabel .
}
"""

# Index the result by label: {taxon_label: {'qid': ..., 'uri': ...}}.
# A label that occurs more than once keeps the last row seen.
taxa = {}
for row in g.query(taxa_query):
    taxon_uri = str(row.taxon)
    taxa[str(row.taxonLabel)] = {
        'qid': taxon_uri.replace('http://www.wikidata.org/entity/', ''),
        'uri': taxon_uri,
    }

print(f'Found {len(taxa)} distinct taxa:')
for name in sorted(taxa):
    print(f'  {name} ({taxa[name]["qid"]})')

## Helper: fetch worldwide occurrences from GBIF API

We query the GBIF occurrence API to get georeferenced worldwide records for each species. 
This gives us coordinates to plot on a global OSM map.

In [None]:
def fetch_gbif_occurrences(taxon_name, limit=300):
    """Fetch georeferenced GBIF occurrences for a species by scientific name.

    The GBIF occurrence search returns at most 300 records per request, so a
    larger ``limit`` is served by paging with ``offset`` until ``limit``
    records were read or GBIF reports the end of the result set.

    Args:
        taxon_name: Scientific name passed as GBIF's ``scientificName`` filter.
        limit: Maximum number of records to retrieve.

    Returns:
        A ``(records, total_count)`` tuple. ``records`` is a list of dicts
        with the keys ``lat``, ``lon``, ``country``, ``year``,
        ``institution``, ``gbifID`` and ``basisOfRecord``; ``total_count`` is
        the ``count`` field GBIF reports for the query. If a request fails, a
        warning is printed and whatever was collected so far is returned
        (``([], 0)`` when the first request fails).
    """
    url = 'https://api.gbif.org/v1/occurrence/search'
    max_page_size = 300  # GBIF's per-request ceiling for `limit`
    results = []
    total_count = 0
    offset = 0
    try:
        while True:
            params = {
                'scientificName': taxon_name,
                'hasCoordinate': 'true',
                'hasGeospatialIssue': 'false',
                'limit': max(0, min(max_page_size, limit - offset)),
                'offset': offset,
            }
            resp = requests.get(url, params=params, timeout=20)
            resp.raise_for_status()
            data = resp.json()
            total_count = data.get('count', 0)
            page = data.get('results', [])
            for rec in page:
                lat = rec.get('decimalLatitude')
                lon = rec.get('decimalLongitude')
                # Defensive: skip records without usable coordinates even
                # though the query already asks for hasCoordinate=true.
                if lat is not None and lon is not None:
                    results.append({
                        'lat': lat,
                        'lon': lon,
                        'country': rec.get('country', ''),
                        'year': rec.get('year', ''),
                        'institution': rec.get('institutionCode', ''),
                        'gbifID': rec.get('gbifID', ''),
                        'basisOfRecord': rec.get('basisOfRecord', ''),
                    })
            offset += len(page)
            # Stop on an empty page, once enough records were read, or when
            # GBIF says there is nothing more (a missing flag is treated as
            # "end" so the loop can never spin).
            if not page or offset >= limit or data.get('endOfRecords', True):
                break
    except Exception as e:
        # Deliberately broad: one failing species must not abort the whole
        # page-generation run; the failure is reported instead of raised.
        print(f'  Warning: could not fetch GBIF occurrences for {taxon_name}: {e}')
    return results, total_count


def get_wikipedia_intro(taxon_name):
    """Fetch the first paragraph of the English Wikipedia article on a taxon.

    Args:
        taxon_name: Scientific name; spaces are turned into underscores to
            form the article title, and redirects are followed.

    Returns:
        The first non-empty line of the plain-text intro extract, or ``''``
        when the article is missing, has no extract, or the request fails
        (in which case a warning is printed).
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': taxon_name.replace(' ', '_'),
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
        'redirects': 1
    }
    # NOTE(review): Wikimedia asks API clients to send a descriptive
    # User-Agent header; consider adding one with project contact details.
    try:
        resp = requests.get(url, params=params, timeout=10)
        # Surface HTTP errors (e.g. 403/429) instead of parsing an error body.
        resp.raise_for_status()
        data = resp.json()
        pages = data.get('query', {}).get('pages', {})
        page = next(iter(pages.values()), {})
        extract = page.get('extract', '').strip()
    except Exception as e:
        # Best-effort: a page without an intro is acceptable, but report the
        # failure so a systematic problem does not go unnoticed.
        print(f'  Warning: could not fetch Wikipedia intro for {taxon_name}: {e}')
        return ''

    # Paragraphs in the plain-text extract are separated by newlines;
    # keep only the first non-empty one.
    paragraphs = [p.strip() for p in extract.split('\n') if p.strip()]
    return paragraphs[0] if paragraphs else ''


def resolve_license_label(license_uri):
    """Convert a license URI into a ``(label, url)`` tuple.

    Wikidata entity URIs are looked up in a fixed QID table; an unknown QID
    is returned as ``(qid, uri)``. Other values are scanned for a Creative
    Commons code: CC0 is recognised by ``publicdomain/zero``, and the
    attribution family by ``by`` with any ``-nc`` / ``-sa`` / ``-nd``
    suffixes. The version is taken from the value itself (e.g. ``3.0``) and
    falls back to ``4.0`` only when none is present. Anything else is
    returned unchanged as ``(value, value)``.

    Args:
        license_uri: License identifier; converted with ``str()``.

    Returns:
        ``(label, url)``; for non-Wikidata input ``url`` is the input string.
    """
    import re  # stdlib; imported locally so the helper is self-contained

    # NOTE(review): these QID -> license pairs are carried over unchanged and
    # were not re-checked against Wikidata; verify them before relying on them.
    license_map = {
        'Q18199165': ('CC BY 4.0',       'https://creativecommons.org/licenses/by/4.0/'),
        'Q20007257': ('CC BY-SA 4.0',    'https://creativecommons.org/licenses/by-sa/4.0/'),
        'Q6938433':  ('CC0 1.0',         'https://creativecommons.org/publicdomain/zero/1.0/'),
        'Q19068220': ('CC BY-NC 4.0',    'https://creativecommons.org/licenses/by-nc/4.0/'),
        'Q26952697': ('CC BY-NC-SA 4.0', 'https://creativecommons.org/licenses/by-nc-sa/4.0/'),
        'Q35254':    ('CC BY-SA 3.0',    'https://creativecommons.org/licenses/by-sa/3.0/'),
        'Q24082749': ('CC BY-SA 4.0',    'https://creativecommons.org/licenses/by-sa/4.0/'),
    }
    s = str(license_uri)
    if 'wikidata.org/entity/Q' in s:
        qid = s.split('/')[-1]
        return license_map.get(qid, (qid, s))

    s_lower = s.lower()
    if 'publicdomain/zero' in s_lower:
        return 'CC0 1.0', s

    # "by" must stand alone (not be part of a longer word such as "ruby"),
    # optionally followed by -nc / -sa / -nd modifiers.
    cc_match = re.search(r'(?<![a-z])by((?:-(?:nc|sa|nd))*)(?![a-z])', s_lower)
    if cc_match:
        # The version, when present, directly follows the license code,
        # e.g. ".../by-nc/3.0/" or "cc-by-4.0".
        version_match = re.match(r'[/\s_-]*(\d+\.\d+)', s_lower[cc_match.end():])
        version = version_match.group(1) if version_match else '4.0'
        return f'CC BY{cc_match.group(1).upper()} {version}', s
    return s, s


# Status line printed once the helper definitions above have been executed.
print('Helper functions defined.')


## Generate Folium map for a species

Creates an interactive OpenStreetMap map with one circle marker per observation, colour-coded by basis of record.
The function saves the map as an HTML file and returns its path; the markdown page later embeds that file.

In [None]:
def generate_species_map(taxon_name, occurrences, output_path):
    """Generate a Folium map with observation points and save it as HTML.

    Each occurrence becomes a circle marker coloured by its basis of record,
    with a popup (taxon, country, year, institution, GBIF link) and a short
    tooltip. A legend explaining the colours is added to the page.

    Args:
        taxon_name: Scientific name of the species.
        occurrences: List of occurrence dicts with the keys lat, lon, country,
            year, institution, gbifID and basisOfRecord.
        output_path: Path to save the HTML map file.

    Returns:
        ``output_path`` once the file is written, or ``None`` (nothing is
        written) when ``occurrences`` is empty.
    """
    from html import escape  # stdlib; local so the function is self-contained

    if not occurrences:
        return None

    # Centre the initial view on the arithmetic mean of the coordinates.
    lats = [o['lat'] for o in occurrences]
    lons = [o['lon'] for o in occurrences]
    center = [sum(lats) / len(lats), sum(lons) / len(lons)]

    # tiles='OpenStreetMap' already provides the OSM base layer together with
    # its attribution, so no second (duplicate) tile layer is added.
    m = folium.Map(
        location=center,
        zoom_start=3,
        tiles='OpenStreetMap',
        width='100%',
        height='450px'
    )

    # Colour-code by basis of record; unknown values fall back to grey.
    color_map = {
        'HUMAN_OBSERVATION': '#2ecc71',
        'PRESERVED_SPECIMEN': '#e67e22',
        'MACHINE_OBSERVATION': '#3498db',
        'LITERATURE': '#9b59b6',
        'MATERIAL_SAMPLE': '#e74c3c',
    }

    # Values below come from an external API and end up in HTML, so every
    # piece of text is escaped before it is interpolated.
    safe_taxon = escape(str(taxon_name))

    for occ in occurrences:
        color = color_map.get(occ.get('basisOfRecord', ''), '#95a5a6')
        country = escape(str(occ.get('country') or ''))
        year = escape(str(occ['year'])) if occ.get('year') else ''
        # "Country, year" without stray separators when one part is missing.
        place_and_year = ', '.join(part for part in (country, year) if part)

        inst_str = ''
        if occ.get('institution'):
            inst_str = f"<br><em>{escape(str(occ['institution']))}</em>"
        gbif_link = ''
        if occ.get('gbifID'):
            gbif_id = escape(str(occ['gbifID']))
            gbif_link = (
                f'<br><a href="https://www.gbif.org/occurrence/{gbif_id}" '
                f'target="_blank">GBIF record</a>'
            )

        popup_html = (
            f"<b>{safe_taxon}</b><br>"
            f"{place_and_year}"
            f"{inst_str}"
            f"{gbif_link}"
        )
        folium.CircleMarker(
            location=[occ['lat'], occ['lon']],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=folium.Popup(popup_html, max_width=250),
            # No tooltip at all rather than an empty bubble.
            tooltip=place_and_year or None
        ).add_to(m)

    # Legend: one entry per colour in color_map plus the grey fallback.
    legend_html = '''
    <div style="position:fixed; bottom:20px; left:20px; z-index:9999;
                background:white; padding:10px; border-radius:6px;
                border:1px solid #ccc; font-size:12px; line-height:1.8;">
        <b>Basis of record</b><br>
        <span style="color:#2ecc71;">●</span> Human observation<br>
        <span style="color:#e67e22;">●</span> Preserved specimen<br>
        <span style="color:#3498db;">●</span> Machine observation<br>
        <span style="color:#9b59b6;">●</span> Literature<br>
        <span style="color:#e74c3c;">●</span> Material sample<br>
        <span style="color:#95a5a6;">●</span> Other
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    m.save(output_path)
    return output_path


# Status line printed once generate_species_map above has been defined.
print('Map generation function defined.')

## Load taxa information from the existing pipeline

We query the local GBIF RDF graph and build `taxonpages`, a dictionary that groups each taxon's observations (with their media and licence) by publisher.

In [None]:
# SPARQL over the local graph: one row per (media item, observation) pair,
# joined with the observation's taxon, GBIF reference and publisher.
obs_query = """
PREFIX wd:  <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dc:   <http://purl.org/dc/elements/1.1/>

SELECT DISTINCT ?taxon ?taxonLabel ?publisher ?publisherLabel 
                ?observation ?gbifObservation ?media_url ?license WHERE {
    ?media wdt:P2699 ?media_url ;
           wdt:P275 ?license ;
           wdt:P361 ?observation .
    ?observation wdt:P225 ?taxon ;
                 wdt:P854 ?gbifObservation ;
                 wdt:P123 ?publisher .
    ?taxon rdfs:label ?taxonLabel .
    ?publisher rdfs:label ?publisherLabel .
}
"""

# Nested index:
#   taxonpages[taxon_label] = {'qid', 'uri', 'publishers'}
#   ...['publishers'][publisher_label][obs_id] = {'obs_id', 'media', 'license'}
taxonpages = {}

for row in g.query(obs_query):
    taxon_uri = str(row.taxon)
    obs_id = str(row.gbifObservation)
    media_url = str(row.media_url)

    # setdefault creates each level the first time its key shows up.
    page = taxonpages.setdefault(str(row.taxonLabel), {
        'qid': taxon_uri.replace('http://www.wikidata.org/entity/', ''),
        'uri': taxon_uri,
        'publishers': {},
    })
    by_observation = page['publishers'].setdefault(str(row.publisherLabel), {})
    # The licence stored for an observation is the one from its first row.
    record = by_observation.setdefault(obs_id, {
        'obs_id': obs_id,
        'media': [],
        'license': str(row.license),
    })

    # Collect every distinct media URL, skipping empty and 'nan' values.
    if media_url and media_url != 'nan' and media_url not in record['media']:
        record['media'].append(media_url)

print(f'Loaded {len(taxonpages)} taxa with observation data.')
for name in sorted(taxonpages):
    publishers = taxonpages[name]['publishers']
    n_obs = sum(len(obs) for obs in publishers.values())
    print(f'  {name}: {len(publishers)} publishers, {n_obs} observations')

## Generate enriched taxa pages

For each species we:
1. Fetch worldwide GBIF occurrences (for the map)
2. Fetch the Wikipedia introduction
3. Generate a Folium/OSM map saved as `maps/<Species_name>.html`
4. Write a complete markdown page that embeds the map via an `<iframe>`

In [None]:
# Percent-encoding for names placed inside URLs (stdlib).
from urllib.parse import quote

# One {'file': 'taxa/<Species_name>'} entry per generated page, consumed by
# the `_toc.yml` update cell.
toc_taxa = []

for taxon_name in tqdm.tqdm(sorted(taxonpages.keys()), desc='Generating taxa pages'):
    info = taxonpages[taxon_name]
    qid = info['qid']
    
    # --- 1. Wikipedia introduction ---
    wiki_intro = get_wikipedia_intro(taxon_name)
    time.sleep(0.5)  # be polite to the API

    # --- 2. Worldwide GBIF occurrences for the map ---
    occurrences, total_count = fetch_gbif_occurrences(taxon_name, limit=300)
    time.sleep(0.5)

    # --- 3. Generate map ---
    # No file is written when there are no occurrences; the page then shows a
    # "no records" note instead of the iframe (see the map section below).
    safe_name = taxon_name.replace(' ', '_')
    map_path = f'maps/{safe_name}.html'
    generate_species_map(taxon_name, occurrences, map_path)

    # --- 4. Write markdown page ---
    page_path = f'taxa/{safe_name}.md'
    toc_taxa.append({'file': f'taxa/{safe_name}'})

    lines = []

    # Title
    lines.append(f'# *{taxon_name}*')
    lines.append('')

    # Wikidata / Scholia / GBIF links. The name is percent-encoded so that
    # characters other than spaces cannot break the query string or the
    # markdown link.
    gbif_search_url = f'https://www.gbif.org/species/search?q={quote(taxon_name, safe="")}'
    lines.append(
        f'[Wikidata ({qid})](https://www.wikidata.org/wiki/{qid}) · '
        f'[Scholia](https://scholia.toolforge.org/taxon/{qid}) · '
        f'[GBIF species page]({gbif_search_url})'
    )
    lines.append('')

    # Wikipedia introduction
    if wiki_intro:
        lines.append('## About this species')
        lines.append('')
        lines.append(wiki_intro)
        lines.append('')
        wp_url = f'https://en.wikipedia.org/wiki/{quote(taxon_name.replace(" ", "_"))}'
        lines.append(f'Read more on [English Wikipedia]({wp_url}).')
        lines.append('')

    # Map section
    lines.append('## Global distribution')
    lines.append('')
    if occurrences:
        lines.append(
            f'Map shows **{len(occurrences)}** georeferenced GBIF records '
            f'(out of {total_count} total) for *{taxon_name}* worldwide. '
            f'Click markers for details.'
        )
        lines.append('')
        # Embed the map as a raw HTML iframe
        # The path is relative to the taxa/ directory, so we go up one level
        lines.append('```{raw} html')
        lines.append(f'<iframe src="../{map_path}" width="100%" height="500px" '
                     f'frameborder="0" scrolling="no" '
                     f'style="border-radius:8px; margin-bottom:1rem;"></iframe>')
        lines.append('```')
        lines.append('')
        lines.append(
            f'Map data © [OpenStreetMap](https://www.openstreetmap.org/copyright) contributors. '
            f'Occurrence data from [GBIF](https://www.gbif.org/).'
        )
    else:
        lines.append('*No georeferenced occurrence records found in GBIF for this species.*')
    lines.append('')

    # Observations from Montserrat
    lines.append('## Observations from Montserrat')
    lines.append('')
    lines.append('The following observations are from the GBIF dataset covering Montserrat grasses and sedges.')
    lines.append('')

    # One sub-section per publisher, listing each observation with its
    # licence and images.
    for org_name, obs_dict in sorted(info['publishers'].items()):
        lines.append(f'### {org_name}')
        lines.append('')
        for obs_id, obs_data in obs_dict.items():
            license_label, license_url = resolve_license_label(obs_data['license'])
            # Two trailing spaces force a markdown line break.
            lines.append(f'**Observation:** [{obs_id}]({obs_id})  ')
            lines.append(f'**License:** [{license_label}]({license_url})')
            lines.append('')
            for media_url in obs_data['media']:
                if media_url and media_url != 'nan':
                    # Prefer medium-sized images over square thumbnails
                    display_url = media_url.replace('square', 'medium')
                    lines.append(f'![{taxon_name} — {org_name}]({display_url})')
                    lines.append('')

    # Write the file
    with open(page_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

print(f'\nGenerated {len(toc_taxa)} taxa pages.')


## Update `_toc.yml`

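Update the table of contents with the newly generated pages. The code cell for this step is the last one in the notebook, after the Montserrat overview map below.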

## Generate Montserrat overview map

Build a single map of all georeferenced GBIF observations on Montserrat,
coloured by species. Saved as `maps/montserrat_overview.html` and embedded in `intro.md`.

In [None]:
import pandas as pd
import colorsys

# Read the tab-separated GBIF download for Montserrat; malformed rows
# trigger a warning instead of an error.
df_occ = pd.read_csv(
    'data/0002020-240626123714530/occurrence.txt',
    sep='\t',
    on_bad_lines='warn',
    low_memory=False,
)

# Keep rows that have both coordinates and are identified to species rank.
has_coordinates = df_occ['decimalLatitude'].notna() & df_occ['decimalLongitude'].notna()
is_species_rank = df_occ['taxonRank'] == 'SPECIES'
df_coords = df_occ[has_coordinates & is_species_rank].copy()

print(f'Georeferenced records: {len(df_coords)} across {df_coords["species"].nunique()} species')

# Alphabetical species list; its length sets the spacing of the colour hues.
species_list = sorted(df_coords['species'].dropna().unique())
n = len(species_list)

def hsl_to_hex(h, s, l):
    """Return the ``#rrggbb`` hex string for a hue/saturation/lightness colour.

    All three inputs are fractions in the range 0-1. ``colorsys`` expects the
    order hue, lightness, saturation, hence the swapped arguments. Each
    channel is scaled to 0-255 and truncated to an integer.
    """
    channels = colorsys.hls_to_rgb(h, l, s)
    return '#' + ''.join(f'{int(channel * 255):02x}' for channel in channels)

# One colour per species, with hues spread evenly around the colour wheel.
species_colours = {
    sp: hsl_to_hex(i / n, 0.75, 0.42)
    for i, sp in enumerate(species_list)
}

# Build the overview map centred on Montserrat
overview_map = folium.Map(
    location=[16.745, -62.202],
    zoom_start=12,
    tiles='OpenStreetMap',
    width='100%',
    height='500px'
)

for _, row in df_coords.iterrows():
    # Missing cells arrive as NaN from pandas; map them to '' so the popup
    # never shows the literal text "nan".
    raw_species = row.get('species', '')
    sp = raw_species if pd.notna(raw_species) else ''
    raw_institution = row.get('institutionCode', '')
    inst_str = str(raw_institution) if pd.notna(raw_institution) else ''
    year_str = str(int(row['year'])) if pd.notna(row.get('year')) else ''

    colour = species_colours.get(sp, '#888888')  # grey for unlisted species
    gbif_link = 'https://www.gbif.org/occurrence/' + str(int(row['gbifID']))
    # "Institution year", without a stray space when one part is missing.
    source_line = ' '.join(part for part in (inst_str, year_str) if part)
    popup_html = (
        '<b><i>' + str(sp) + '</i></b><br>' +
        source_line + '<br>' +
        '<a href="' + gbif_link + '" target="_blank">GBIF record</a>'
    )
    folium.CircleMarker(
        location=[row['decimalLatitude'], row['decimalLongitude']],
        radius=7,
        color=colour,
        fill=True,
        fill_color=colour,
        fill_opacity=0.85,
        popup=folium.Popup(popup_html, max_width=220),
        tooltip='<i>' + str(sp) + '</i>'
    ).add_to(overview_map)

# Species legend: one coloured bullet per species, in alphabetical order.
legend_items = ''.join(
    '<span style="color:' + c + ';">&#9679;</span> <i>' + sp + '</i><br>'
    for sp, c in sorted(species_colours.items())
)
legend_html = (
    '<div style="position:fixed; bottom:20px; left:20px; z-index:9999;'
    ' background:white; padding:10px 14px; border-radius:6px;'
    ' border:1px solid #ccc; font-size:11px; line-height:1.8;'
    ' max-height:280px; overflow-y:auto;">'
    '<b>Species</b><br>' + legend_items + '</div>'
)
overview_map.get_root().html.add_child(folium.Element(legend_html))

overview_map.save('maps/montserrat_overview.html')
print('Saved maps/montserrat_overview.html')


In [None]:
# Read and write `_toc.yml` explicitly as UTF-8: the dump below uses
# allow_unicode=True, so non-ASCII titles are written verbatim and would
# otherwise depend on the platform's default encoding.
with open('_toc.yml', 'r', encoding='utf-8') as f:
    toc = yaml.safe_load(f)

# NOTE(review): assumes `_toc.yml` already has at least two entries under
# `parts` (index 0 = organisations, index 1 = taxa) — confirm against the file.

# Update Part 2 (taxa) with sorted list
toc_taxa_sorted = sorted(toc_taxa, key=lambda x: x['file'])
toc['parts'][1]['chapters'] = toc_taxa_sorted

# Also update Part 1 (organisation) from directory listing
org_chapters = []
for fname in sorted(os.listdir('organisation')):
    if fname.endswith('.md'):
        # Chapter entries reference the file without its '.md' extension.
        org_chapters.append({'file': f'organisation/{fname[:-3]}'})
toc['parts'][0]['chapters'] = org_chapters

with open('_toc.yml', 'w', encoding='utf-8') as f:
    # sort_keys=False keeps the key order of the loaded document.
    yaml.dump(toc, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

print('_toc.yml updated.')
print(f'  Part 1 (collections): {len(org_chapters)} chapters')
print(f'  Part 2 (taxa):        {len(toc_taxa_sorted)} chapters')

## Done

All taxa pages have been generated. You can now build the Jupyter Book:

```bash
jupyter-book build .
```