# Generate multilingual taxa pages with OSM maps

This notebook generates enriched, multilingual species pages for the Visible Nature Atlas.

**For each species, it:**
1. Fetches up to 300 georeferenced GBIF occurrence records worldwide
2. Generates an interactive Leaflet/OpenStreetMap map
3. Discovers available Wikipedia language editions via Wikidata SPARQL
4. Fetches Wikipedia introductions **per language**
5. Fetches Plazi TreatmentBank treatments and BHL literature
6. Generates a **Wikipedia language availability table** (which species has articles in which languages)
7. Writes complete markdown pages per language: `taxa_{lang}/Genus_species.md`
8. Generates per-language `_config_{lang}.yml`, `_toc_{lang}.yml`, `intro_{lang}.md`
9. Creates a landing page (`index.html`) with a language picker

**Prerequisites**: Run `inspectData.ipynb` first to generate `gbifMontserrat.ttl`.

In [None]:
import colorsys
import json
import os
import re
import time
from html import escape as html_escape
from urllib.parse import unquote

import folium
import pandas as pd
import requests
import tqdm
import yaml
from rdflib import Graph, Namespace
from rdflib.namespace import RDFS

# RDF namespaces for Wikidata entities and direct properties
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")

# Wikipedia editions with thin, bot-generated content; excluded from language discovery
BOT_WIKIS = {'ceb', 'war', 'min', 'shn'}

# A language only gets its own book when at least this many species
# have a Wikipedia article in it
MIN_SPECIES_PER_LANG = 3

# BHL API key, read from the environment (empty string when the variable is unset)
BHL_API_KEY = os.environ.get('BHL_API_KEY', '')

# Output folder for the generated map HTML files
os.makedirs('maps', exist_ok=True)

bhl_status = (
    f'BHL API key loaded ({len(BHL_API_KEY)} chars).'
    if BHL_API_KEY
    else 'No BHL_API_KEY set — BHL sections will be skipped.'
)
print(bhl_status)

print('Dependencies loaded.')

## Load the local RDF graph

We load `gbifMontserrat.ttl` generated by `inspectData.ipynb` to get the list of species and their Wikidata URIs.

In [None]:
# Parse the Turtle file named in the prerequisites (gbifMontserrat.ttl)
g = Graph()
g.parse('gbifMontserrat.ttl', format='turtle')

# Every taxon that is the object of a wdt:P225 statement, together with its rdfs:label
taxa_query = """
PREFIX wd:  <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?taxon ?taxonLabel WHERE {
    ?obs wdt:P225 ?taxon .
    ?taxon rdfs:label ?taxonLabel .
}
"""

# {taxon_label: {'qid': ..., 'uri': ...}}; a later row with the same label replaces an earlier one
taxa = {
    str(row.taxonLabel): {
        'qid': str(row.taxon).replace('http://www.wikidata.org/entity/', ''),
        'uri': str(row.taxon),
    }
    for row in g.query(taxa_query)
}

print(f'Found {len(taxa)} distinct taxa:')
for name in sorted(taxa):
    print(f'  {name} ({taxa[name]["qid"]})')

## Helper functions

All data-fetching and utility functions: GBIF occurrences, Wikipedia intros (multilingual),
license resolution, Plazi treatments, BHL publications, and **Wikidata language discovery**.

In [None]:
def fetch_gbif_occurrences(taxon_name, limit=300):
    """Query the GBIF occurrence search API for georeferenced records of a taxon.

    Args:
        taxon_name: Scientific name sent as the ``scientificName`` filter.
        limit: Page size requested from the API (a single page is fetched).

    Returns:
        A ``(records, count)`` tuple. ``records`` holds one dict per result that
        has both coordinates, with keys lat, lon, country, year, institution,
        gbifID and basisOfRecord. ``count`` is the ``count`` field of the
        response (0 when absent). On any error a warning is printed and
        ``([], 0)`` is returned.
    """
    # (output key, GBIF field) pairs that are copied with '' as default
    optional_fields = (
        ('country', 'country'),
        ('year', 'year'),
        ('institution', 'institutionCode'),
        ('gbifID', 'gbifID'),
        ('basisOfRecord', 'basisOfRecord'),
    )
    search_params = {
        'scientificName': taxon_name,
        'hasCoordinate': 'true',
        'hasGeospatialIssue': 'false',
        'limit': limit,
        'offset': 0
    }
    try:
        response = requests.get(
            'https://api.gbif.org/v1/occurrence/search',
            params=search_params,
            timeout=20,
        )
        response.raise_for_status()
        payload = response.json()

        records = []
        for hit in payload.get('results', []):
            latitude = hit.get('decimalLatitude')
            longitude = hit.get('decimalLongitude')
            # Skip results that lack either coordinate
            if latitude is None or longitude is None:
                continue
            entry = {'lat': latitude, 'lon': longitude}
            entry.update((out_key, hit.get(src_key, '')) for out_key, src_key in optional_fields)
            records.append(entry)
        return records, payload.get('count', 0)
    except Exception as e:
        print(f'  Warning: could not fetch GBIF occurrences for {taxon_name}: {e}')
        return [], 0


def get_wikipedia_intro(taxon_name, lang='en'):
    """Fetch the first introductory paragraph of a Wikipedia article.

    Args:
        taxon_name: Article title. May be a plain name ("Genus species") or a
            percent-encoded title taken from a sitelink URL (as produced by
            ``discover_wikipedia_languages``); it is decoded before use.
        lang: Wikipedia language subdomain, e.g. ``'en'`` or ``'ru'``.

    Returns:
        The first non-empty line of the plain-text intro extract, or ``''``
        when the page has no extract or all three attempts fail.
    """
    url = f'https://{lang}.wikipedia.org/w/api.php'
    # Sitelink-derived titles are percent-encoded; requests would encode the '%'
    # again, so decode first or non-ASCII titles never resolve.
    title = unquote(taxon_name).replace(' ', '_')
    params = {
        'action': 'query', 'format': 'json',
        'titles': title,
        'prop': 'extracts', 'exintro': True,
        'explaintext': True, 'redirects': 1
    }
    # Same identifying User-Agent the Wikidata helper in this notebook sends
    headers = {'User-Agent': 'VisibleNatureAtlas/1.0 (https://github.com/VisibleNatureAtlas)'}
    for attempt in range(3):
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=10)
            if resp.status_code != 200:
                # Linear back-off: 1s, 2s, 3s
                time.sleep(1 * (attempt + 1))
                continue
            data = resp.json()
            # A single title was requested, so only the first page entry is read
            page = next(iter(data['query']['pages'].values()))
            extract = page.get('extract', '').strip()
            paragraphs = [p.strip() for p in extract.split('\n') if p.strip()]
            return paragraphs[0] if paragraphs else ''
        except Exception:
            # Network/parse failure: back off and retry
            time.sleep(1 * (attempt + 1))
    return ''


def resolve_license_label(license_uri):
    """Convert a license URI (Wikidata entity or Creative Commons URL) to (label, url).

    Args:
        license_uri: Wikidata entity URI, Creative Commons URL, or any other
            value; it is converted with ``str()``.

    Returns:
        ``(label, url)``. Known Wikidata QIDs map to a fixed label and canonical
        URL; unknown QIDs return ``(qid, uri)``. Creative Commons identifiers
        are labelled with the license elements and version found in the string
        (e.g. ``.../by-nc-nd/3.0/`` -> ``'CC BY-NC-ND 3.0'``); when no version
        is present, 4.0 is assumed. Anything else returns ``(s, s)``.
    """
    # NOTE(review): the QID -> license pairs below are unverified; in particular
    # Q18199165 / Q20007257 look swapped (BY vs BY-SA) — confirm against Wikidata.
    license_map = {
        'Q18199165': ('CC BY 4.0',       'https://creativecommons.org/licenses/by/4.0/'),
        'Q20007257': ('CC BY-SA 4.0',    'https://creativecommons.org/licenses/by-sa/4.0/'),
        'Q6938433':  ('CC0 1.0',         'https://creativecommons.org/publicdomain/zero/1.0/'),
        'Q19068220': ('CC BY-NC 4.0',    'https://creativecommons.org/licenses/by-nc/4.0/'),
        'Q26952697': ('CC BY-NC-SA 4.0', 'https://creativecommons.org/licenses/by-nc-sa/4.0/'),
        'Q35254':    ('CC BY-SA 3.0',    'https://creativecommons.org/licenses/by-sa/3.0/'),
        'Q24082749': ('CC BY-SA 4.0',    'https://creativecommons.org/licenses/by-sa/4.0/'),
    }
    s = str(license_uri)
    if 'wikidata.org/entity/Q' in s:
        qid = s.split('/')[-1]
        return license_map.get(qid, (qid, s))

    s_lower = s.lower()
    if 'publicdomain/zero' in s_lower:
        return 'CC0 1.0', s

    # "by" as a standalone token (not inside a word such as "ruby" or "byte"),
    # optionally followed by -nc and -nd/-sa, then an optional "/<version>".
    match = re.search(
        r'(?<![a-z])by((?:-nc)?(?:-(?:nd|sa))?)(?![a-z])(?:[/ -](\d+(?:\.\d+)?))?',
        s_lower,
    )
    if match:
        elements = match.group(1).upper()   # '', '-NC', '-SA', '-NC-ND', ...
        version = match.group(2) or '4.0'   # previous behaviour: assume 4.0
        return f'CC BY{elements} {version}', s
    return s, s


def fetch_plazi_treatments(genus, species):
    """Look up Plazi taxonomic treatments for a binomial via the LINDAS SPARQL endpoint.

    Args:
        genus: Genus name, matched against ``dwc:genus``.
        species: Specific epithet, matched against ``dwc:species``.

    Returns:
        A list of dicts (``treatment_url``, ``treatment_page``, ``creator``,
        ``pub_title``), one per distinct treatment URI in result order; the
        first binding seen for a treatment wins. On any error a warning is
        printed and ``[]`` is returned.
    """
    sparql = '''
PREFIX dwc: <http://rs.tdwg.org/dwc/terms/>
PREFIX dc:  <http://purl.org/dc/elements/1.1/>
PREFIX treat: <http://plazi.org/vocab/treatment#>

SELECT DISTINCT ?treatment ?creator
       (SAMPLE(?title) AS ?pubTitle)
WHERE {{
  ?treatment a treat:Treatment .
  ?treatment (treat:deprecates|treat:augmentsTaxonConcept|treat:definesTaxonConcept) ?tc .
  ?tc dwc:species "{sp}" .
  ?tc dwc:genus "{ge}" .
  OPTIONAL {{ ?treatment dc:creator ?creator . }}
  OPTIONAL {{ ?treatment treat:publishedIn ?pub . ?pub dc:title ?title . }}
}}
GROUP BY ?treatment ?creator
LIMIT 10
'''.format(ge=genus, sp=species)
    request_headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': 'VisibleNatureAtlas/1.0',
    }
    try:
        response = requests.get(
            'https://lindas.admin.ch/query',
            params={'query': sparql},
            headers=request_headers,
            timeout=20
        )
        response.raise_for_status()
        bindings = response.json().get('results', {}).get('bindings', [])

        # Keyed by treatment URI; dict insertion order keeps the result order
        by_treatment = {}
        for binding in bindings:
            treatment_uri = binding.get('treatment', {}).get('value', '')
            if treatment_uri not in by_treatment:
                by_treatment[treatment_uri] = {
                    'treatment_url': treatment_uri,
                    'treatment_page': treatment_uri.replace('http://', 'https://'),
                    'creator': binding.get('creator', {}).get('value', ''),
                    'pub_title': binding.get('pubTitle', {}).get('value', ''),
                }
        return list(by_treatment.values())
    except Exception as e:
        print(f'  Plazi warning for {genus} {species}: {e}')
        return []


def fetch_bhl_publications(taxon_name, api_key, limit=5):
    """Search the Biodiversity Heritage Library for publications mentioning a taxon.

    Args:
        taxon_name: Search term sent to the ``PublicationSearch`` operation.
        api_key: BHL API key; when empty the search is skipped.
        limit: Page size requested and maximum number of results returned.

    Returns:
        A list of dicts with string values for ``title``, ``authors``, ``date``
        and ``bhl_url``. Returns ``[]`` when no key is given, the response
        status is not ``'ok'``, or any error occurs. Progress and diagnostics
        are printed; the API key is redacted from error output.
    """
    if not api_key:
        print(f'  BHL: skipping {taxon_name} — no API key')
        return []
    try:
        resp = requests.get(
            'https://www.biodiversitylibrary.org/api3',
            params={
                'op': 'PublicationSearch', 'searchterm': taxon_name,
                'searchtype': '', 'page': 1, 'pagesize': limit,
                'apikey': api_key, 'format': 'json',
            },
            timeout=15
        )
        print(f'  BHL HTTP {resp.status_code} for {taxon_name}')
        resp.raise_for_status()
        data = resp.json()
        status = data.get('Status', '(missing)')
        error_msg = data.get('ErrorMessage', '')
        result_raw = data.get('Result', [])
        n_raw = len(result_raw) if isinstance(result_raw, list) else f'type={type(result_raw).__name__}'
        print(f'  BHL response: Status={status}, ErrorMessage={error_msg}, Result count={n_raw}')
        if status != 'ok':
            print(f'  BHL: Status is not "ok" — returning empty for {taxon_name}')
            return []
        results = []
        for item in result_raw[:limit]:
            if not isinstance(item, dict):
                print(f'  BHL: unexpected Result item type: {type(item).__name__} = {repr(item)[:100]}')
                continue
            # NOTE(review): PartUrl/ItemUrl/TitleUrl are the URL fields BHL API v3
            # is believed to return — confirm against a live response.
            pub_url = (item.get('Url') or item.get('PartUrl')
                       or item.get('ItemUrl') or item.get('TitleUrl') or '')
            if not pub_url and item.get('TitleID'):
                pub_url = 'https://www.biodiversitylibrary.org/title/' + str(item['TitleID'])
            results.append({
                'title': item.get('Title', 'Untitled'),
                'authors': _bhl_authors_to_text(item.get('Authors', '')),
                'date': item.get('Date', ''),
                'bhl_url': pub_url,
            })
        print(f'  BHL: {len(results)} publications found for {taxon_name}')
        return results
    except Exception as e:
        # HTTP error messages can embed the request URL, which carries the API key;
        # redact it so the key never lands in saved notebook output.
        message = str(e).replace(api_key, '***')
        print(f'  BHL error for {taxon_name}: {type(e).__name__}: {message}')
        return []


def _bhl_authors_to_text(authors):
    """Flatten a BHL ``Authors`` value (string, or list of {'Name': ...} / strings) to one string."""
    if isinstance(authors, str):
        return authors
    if isinstance(authors, (list, tuple)):
        names = [
            str(a.get('Name', '')).strip() if isinstance(a, dict) else str(a).strip()
            for a in authors
        ]
        return '; '.join(name for name in names if name)
    return str(authors) if authors else ''


def discover_wikipedia_languages(qids):
    """Find which Wikipedia editions have an article for each Wikidata item.

    All sitelinks are requested from the Wikidata SPARQL endpoint in a single
    query. Editions listed in ``BOT_WIKIS`` are ignored.

    Args:
        qids: Mapping ``{taxon_label: wikidata_qid}``.

    Returns:
        A tuple ``(taxon_langs, lang_taxa)`` where ``taxon_langs`` is
        ``{taxon_label: {lang_code: article_title}}`` and ``lang_taxa`` is
        ``{lang_code: set(taxon_labels)}``. If the request fails, a warning is
        printed and an English-only fallback covering every label is returned.
    """
    # Space-separated "wd:Q..." terms for the VALUES clause
    values = ' '.join('wd:' + qid for qid in qids.values())

    query = f'''
SELECT ?item ?itemLabel ?sitelink ?site WHERE {{
  VALUES ?item {{ {values} }}
  ?sitelink schema:about ?item ;
            schema:isPartOf ?site ;
            schema:name ?name .
  ?site wikibase:wikiGroup "wikipedia" .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }}
}}
'''
    try:
        response = requests.get(
            'https://query.wikidata.org/sparql',
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'VisibleNatureAtlas/1.0 (https://github.com/VisibleNatureAtlas)'},
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f'Warning: Wikidata language discovery failed: {e}')
        # Fallback: pretend every taxon has an English article titled after its label
        english_only = {label: {'en': label.replace(' ', '_')} for label in qids}
        return english_only, {'en': set(qids)}

    label_by_qid = {qid: label for label, qid in qids.items()}
    taxon_langs = {}
    lang_taxa = {}

    for binding in data.get('results', {}).get('bindings', []):
        qid = binding.get('item', {}).get('value', '').split('/')[-1]
        taxon_label = label_by_qid.get(qid, '')
        if not taxon_label:
            continue

        # The language code is the subdomain of a site URL such as https://en.wikipedia.org/
        site_url = binding.get('site', {}).get('value', '')
        if '.wikipedia.org' not in site_url:
            continue
        lang = site_url.split('//')[1].split('.')[0] if '//' in site_url else ''
        if not lang or lang in BOT_WIKIS:
            continue

        # Article title is whatever follows /wiki/ in the sitelink URL
        sitelink_url = binding.get('sitelink', {}).get('value', '')
        if '/wiki/' in sitelink_url:
            article_title = sitelink_url.split('/wiki/')[-1]
        else:
            article_title = taxon_label.replace(' ', '_')

        taxon_langs.setdefault(taxon_label, {})[lang] = article_title
        lang_taxa.setdefault(lang, set()).add(taxon_label)

    return taxon_langs, lang_taxa


# Language names for display: Wikipedia language code → English name.
# Keys are matched against the subdomain codes extracted from Wikipedia site
# URLs, so not every key is an ISO 639-1 code (e.g. 'simple', 'ast', 'nds').
# Codes missing from this table are displayed as-is by get_lang_name().
LANG_NAMES = {
    'en': 'English', 'fr': 'French', 'de': 'German', 'es': 'Spanish',
    'pt': 'Portuguese', 'it': 'Italian', 'nl': 'Dutch', 'sv': 'Swedish',
    'pl': 'Polish', 'ru': 'Russian', 'ja': 'Japanese', 'zh': 'Chinese',
    'ko': 'Korean', 'ar': 'Arabic', 'ca': 'Catalan', 'cs': 'Czech',
    'da': 'Danish', 'fi': 'Finnish', 'el': 'Greek', 'he': 'Hebrew',
    'hi': 'Hindi', 'hu': 'Hungarian', 'id': 'Indonesian', 'ms': 'Malay',
    'no': 'Norwegian', 'fa': 'Persian', 'ro': 'Romanian', 'sk': 'Slovak',
    'th': 'Thai', 'tr': 'Turkish', 'uk': 'Ukrainian', 'vi': 'Vietnamese',
    'eu': 'Basque', 'gl': 'Galician', 'hr': 'Croatian', 'lt': 'Lithuanian',
    'lv': 'Latvian', 'sr': 'Serbian', 'sl': 'Slovenian', 'bg': 'Bulgarian',
    'et': 'Estonian', 'simple': 'Simple English', 'nb': 'Norwegian Bokm\u00e5l',
    'nn': 'Norwegian Nynorsk', 'eo': 'Esperanto', 'az': 'Azerbaijani',
    'ta': 'Tamil', 'te': 'Telugu', 'bn': 'Bengali', 'ur': 'Urdu',
    'pa': 'Punjabi', 'ml': 'Malayalam', 'kn': 'Kannada', 'gu': 'Gujarati',
    'mr': 'Marathi', 'af': 'Afrikaans', 'sw': 'Swahili', 'ga': 'Irish',
    'cy': 'Welsh', 'la': 'Latin', 'sh': 'Serbo-Croatian', 'ast': 'Asturian',
    'oc': 'Occitan', 'an': 'Aragonese', 'mg': 'Malagasy', 'tl': 'Tagalog',
    'qu': 'Quechua', 'nds': 'Low German',
}


def get_lang_name(code):
    """Look up the English display name of a language code; unknown codes are returned unchanged."""
    try:
        return LANG_NAMES[code]
    except KeyError:
        return code


print('All helper functions defined (including multilingual + Plazi + BHL).')

## Generate Folium map for a species

Creates an interactive OpenStreetMap map with observation markers colour-coded by basis of record.
Returns the path to the saved HTML file (embedded in the markdown page).

In [None]:
def generate_species_map(taxon_name, occurrences, output_path):
    """Generate a Folium map with one circle marker per occurrence and save it as HTML.

    Markers are colour-coded by ``basisOfRecord`` and carry a popup (taxon,
    country, year, institution, GBIF link) and a tooltip (country, year).
    All values interpolated into popup/tooltip HTML are escaped, because they
    come from external GBIF records.

    Args:
        taxon_name: Scientific name of the species (shown in each popup).
        occurrences: List of occurrence dicts with ``lat`` and ``lon`` and,
            optionally, ``country``, ``year``, ``institution``, ``gbifID`` and
            ``basisOfRecord``.
        output_path: Path to save the HTML map file.

    Returns:
        ``output_path`` after saving, or ``None`` if ``occurrences`` is empty
        (no file is written in that case).
    """
    if not occurrences:
        return None

    # Arithmetic mean of the coordinates is used as the initial map centre
    lats = [o['lat'] for o in occurrences]
    lons = [o['lon'] for o in occurrences]
    center = [sum(lats) / len(lats), sum(lons) / len(lons)]

    m = folium.Map(
        location=center,
        zoom_start=3,
        tiles='OpenStreetMap',
        width='100%',
        height='450px'
    )

    # Explicit OSM tile layer carrying the attribution string
    folium.TileLayer(
        tiles='https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png',
        attr='© <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors',
        name='OpenStreetMap',
        overlay=False,
        control=True
    ).add_to(m)

    # Colour per basis of record; anything else falls back to grey (#95a5a6)
    color_map = {
        'HUMAN_OBSERVATION': '#2ecc71',
        'PRESERVED_SPECIMEN': '#e67e22',
        'MACHINE_OBSERVATION': '#3498db',
        'LITERATURE': '#9b59b6',
        'MATERIAL_SAMPLE': '#e74c3c',
    }

    safe_taxon = html_escape(str(taxon_name))

    for occ in occurrences:
        color = color_map.get(occ.get('basisOfRecord', ''), '#95a5a6')
        country = html_escape(str(occ.get('country') or ''))
        year_str = f", {html_escape(str(occ['year']))}" if occ.get('year') else ''
        inst_str = (
            f"<br><em>{html_escape(str(occ['institution']))}</em>"
            if occ.get('institution') else ''
        )
        gbif_link = ''
        if occ.get('gbifID'):
            gbif_id = html_escape(str(occ['gbifID']))
            gbif_link = f'<br><a href="https://www.gbif.org/occurrence/{gbif_id}" target="_blank">GBIF record</a>'

        popup_html = (
            f"<b>{safe_taxon}</b><br>"
            f"{country}{year_str}"
            f"{inst_str}"
            f"{gbif_link}"
        )
        # "Country, year" without a stray space before the comma; when the
        # country is missing the leading ", " is dropped as well.
        tooltip_text = f"{country}{year_str}".lstrip(', ')
        folium.CircleMarker(
            location=[occ['lat'], occ['lon']],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            popup=folium.Popup(popup_html, max_width=250),
            # No tooltip at all when there is nothing to show
            tooltip=tooltip_text or None
        ).add_to(m)

    # Legend: one entry per colour in color_map, plus the grey fallback
    legend_html = '''
    <div style="position:fixed; bottom:20px; left:20px; z-index:9999;
                background:white; padding:10px; border-radius:6px;
                border:1px solid #ccc; font-size:12px; line-height:1.8;">
        <b>Basis of record</b><br>
        <span style="color:#2ecc71;">●</span> Human observation<br>
        <span style="color:#e67e22;">●</span> Preserved specimen<br>
        <span style="color:#3498db;">●</span> Machine observation<br>
        <span style="color:#9b59b6;">●</span> Literature<br>
        <span style="color:#e74c3c;">●</span> Material sample<br>
        <span style="color:#95a5a6;">●</span> Other
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    m.save(output_path)
    return output_path


print('Map generation function defined.')

## Load taxa information from the RDF graph

Extract taxon data from `gbifMontserrat.ttl` — observations grouped by publisher,
with media URLs and license information.

In [None]:
# Pull every media item together with its observation, taxon and publisher
# out of the local RDF graph
obs_query = """
PREFIX wd:  <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dc:   <http://purl.org/dc/elements/1.1/>

SELECT DISTINCT ?taxon ?taxonLabel ?publisher ?publisherLabel 
                ?observation ?gbifObservation ?media_url ?license WHERE {
    ?media wdt:P2699 ?media_url ;
           wdt:P275 ?license ;
           wdt:P361 ?observation .
    ?observation wdt:P225 ?taxon ;
                 wdt:P854 ?gbifObservation ;
                 wdt:P123 ?publisher .
    ?taxon rdfs:label ?taxonLabel .
    ?publisher rdfs:label ?publisherLabel .
}
"""

# Nested structure:
# {taxon_label: {'qid', 'uri', 'publishers': {publisher: {obs_id: {'obs_id', 'media', 'license'}}}}}
taxonpages = {}

for row in g.query(obs_query):
    taxon_label = str(row.taxonLabel)
    publisher = str(row.publisherLabel)
    obs_id = str(row.gbifObservation)
    media_url = str(row.media_url)

    # First row seen for a taxon / publisher / observation creates its entry
    page = taxonpages.setdefault(taxon_label, {
        'qid': str(row.taxon).replace('http://www.wikidata.org/entity/', ''),
        'uri': str(row.taxon),
        'publishers': {},
    })
    observation = page['publishers'].setdefault(publisher, {}).setdefault(obs_id, {
        'obs_id': obs_id,
        'media': [],
        'license': str(row.license),
    })

    # Collect each media URL once per observation, ignoring 'nan' placeholders
    if media_url and media_url != 'nan' and media_url not in observation['media']:
        observation['media'].append(media_url)

print(f'Loaded {len(taxonpages)} taxa with observation data.')
for name in sorted(taxonpages):
    publishers = taxonpages[name]['publishers']
    n_obs = sum(len(obs) for obs in publishers.values())
    print(f'  {name}: {len(publishers)} publishers, {n_obs} observations')

## Step 1: Discover available Wikipedia languages

Query Wikidata SPARQL for all Wikipedia sitelinks of our species. This determines
which languages will get their own Jupyter Book edition.

In [None]:
# {taxon_label: QID} for every taxon found in the graph
qid_map = {label: taxa[label]['qid'] for label in taxa}

print(f'Discovering Wikipedia language editions for {len(qid_map)} taxa via Wikidata SPARQL...')
taxon_langs, lang_taxa = discover_wikipedia_languages(qid_map)

# Keep only the languages that cover enough species
active_langs = {}
for lang, species_set in lang_taxa.items():
    if len(species_set) >= MIN_SPECIES_PER_LANG:
        active_langs[lang] = species_set

# English is kept whenever it was discovered, even below the threshold
if 'en' in lang_taxa and 'en' not in active_langs:
    active_langs['en'] = lang_taxa['en']

print(f'\nWikipedia language availability:')
langs_by_coverage = sorted(active_langs, key=lambda code: len(active_langs[code]), reverse=True)
for lang in langs_by_coverage:
    n = len(active_langs[lang])
    print(f'  {lang:>8s} ({get_lang_name(lang):>20s}): {n}/{len(taxonpages)} species')

print(f'\n{len(active_langs)} languages with >= {MIN_SPECIES_PER_LANG} species articles (+ English).')
print(f'Filtered out bot wikis: {BOT_WIKIS}')

## Step 2: Fetch language-independent data (GBIF, maps, Plazi, BHL)

These are fetched once — they don't depend on language. We store the results
in `shared_data[taxon_name]` for reuse across all language editions.

In [None]:
# Language-independent data per taxon, reused by every language edition
shared_data = {}

for taxon_name in tqdm.tqdm(sorted(taxonpages), desc='Fetching shared data'):
    info = taxonpages[taxon_name]
    qid = info['qid']
    safe_name = taxon_name.replace(' ', '_')
    map_path = f'maps/{safe_name}.html'

    # GBIF occurrences, then the map built from them
    occurrences, total_count = fetch_gbif_occurrences(taxon_name, limit=300)
    time.sleep(0.5)
    generate_species_map(taxon_name, occurrences, map_path)

    # Plazi needs genus and epithet separately; names without an epithet are skipped
    genus, _, species = taxon_name.partition(' ')
    if species:
        treatments = fetch_plazi_treatments(genus, species)
    else:
        treatments = []
    time.sleep(0.3)

    # BHL publications
    bhl_pubs = fetch_bhl_publications(taxon_name, BHL_API_KEY)
    time.sleep(0.3)

    shared_data[taxon_name] = dict(
        qid=qid,
        safe_name=safe_name,
        occurrences=occurrences,
        total_count=total_count,
        treatments=treatments,
        bhl_pubs=bhl_pubs,
        map_path=map_path,
    )

# Summary of BHL results
bhl_counts = {tn: len(sd['bhl_pubs']) for tn, sd in shared_data.items()}
bhl_total = sum(1 for count in bhl_counts.values() if count)
bhl_pubs_total = sum(bhl_counts.values())
print(f'\nShared data fetched for {len(shared_data)} species.')
print(f'BHL summary: {bhl_total}/{len(shared_data)} species have BHL publications ({bhl_pubs_total} total)')
for tn in sorted(shared_data):
    n_bhl = bhl_counts[tn]
    print(f'  {tn}: {n_bhl} BHL publications' if n_bhl > 0 else f'  {tn}: no BHL publications')

In [None]:
# ---- Generate Wikipedia language availability table ----
# This table shows which species have Wikipedia articles in which languages.
# It will be included in the intro page of each language edition.

def generate_wikipedia_lang_table(taxon_langs, active_langs, all_taxa):
    """Build a markdown table of Wikipedia coverage (species x language).

    Args:
        taxon_langs: ``{taxon_label: {lang_code: article_title}}``.
        active_langs: ``{lang_code: set(taxon_labels)}``; its keys become the
            columns, ordered by descending coverage and then by code.
        all_taxa: Iterable of taxon labels; one row each, sorted.

    Returns:
        The table as a single newline-joined markdown string. A cell holds a
        link to the article when one exists, otherwise an em dash.
    """
    columns = sorted(active_langs, key=lambda code: (-len(active_langs[code]), code))

    header_cells = ''.join(f' {get_lang_name(code)} ({code}) |' for code in columns)
    table = [
        '| Species |' + header_cells,
        '|---------|' + ':-:|' * len(columns),
    ]

    for taxon in sorted(all_taxa):
        titles = taxon_langs.get(taxon, {})
        cells = []
        for code in columns:
            if code not in titles:
                cells.append(' — |')
                continue
            wp_url = f'https://{code}.wikipedia.org/wiki/{titles[code]}'
            cells.append(f' [{code}]({wp_url}) |')
        table.append(f'| *{taxon}* |' + ''.join(cells))

    return '\n'.join(table)


# Build the coverage table once for all taxa that have observation data
wiki_table_md = generate_wikipedia_lang_table(
    taxon_langs=taxon_langs,
    active_langs=active_langs,
    all_taxa=taxonpages.keys(),
)
print('Wikipedia language availability table generated.')
print(f'Table: {len(taxonpages)} species x {len(active_langs)} languages')

In [None]:
# ---- Pre-fetch all Wikipedia intros across all languages ----
# Each (language, taxon) intro is requested exactly once and cached in
# wiki_intros[lang][taxon_name], so page generation needs no further API calls.

wiki_intros = {}  # {lang: {taxon_name: intro_text}}

# Number of (language, taxon) pairs that actually have an article
total_fetches = sum(
    lang in taxon_langs.get(tn, {})
    for lang in active_langs
    for tn in taxonpages
)
print(f'Pre-fetching {total_fetches} Wikipedia intros across {len(active_langs)} languages...')

fetch_count = 0
for lang in tqdm.tqdm(sorted(active_langs.keys()), desc='Fetching Wikipedia intros'):
    wiki_intros[lang] = {}

    for taxon_name in sorted(taxonpages):
        titles = taxon_langs.get(taxon_name, {})
        if lang in titles:
            wiki_intros[lang][taxon_name] = get_wikipedia_intro(titles[lang], lang=lang)
            fetch_count += 1

            # Polite rate limiting: pause briefly after every 10th request
            if fetch_count % 10 == 0:
                time.sleep(0.5)

print(f'Fetched {fetch_count} Wikipedia intros.')

In [None]:
# ---- Step 3: Generate per-language taxa pages (using pre-fetched intros) ----
# For every active language, one markdown file per taxon is written to
# taxa_{lang}/{Genus_species}.md. Each page is assembled as a list of lines in
# this order: title, link bar, Wikipedia intro, other-language links, map,
# Plazi treatments, BHL literature, Montserrat observations.
# Reads: active_langs, taxonpages, shared_data, taxon_langs, wiki_intros.

for lang in tqdm.tqdm(sorted(active_langs.keys()), desc='Generating language editions'):
    lang_dir = f'taxa_{lang}'
    os.makedirs(lang_dir, exist_ok=True)
    lang_name = get_lang_name(lang)

    print(f'\n--- {lang_name} ({lang}) ---')

    # Page entries for this language; only its length is used in this cell
    toc_taxa_lang = []

    for taxon_name in sorted(taxonpages.keys()):
        info = taxonpages[taxon_name]
        sd = shared_data[taxon_name]
        qid = sd['qid']
        safe_name = sd['safe_name']
        occurrences = sd['occurrences']
        total_count = sd['total_count']
        treatments = sd['treatments']
        bhl_pubs = sd['bhl_pubs']

        # Use pre-fetched Wikipedia intro
        # has_wiki: an article exists in this language; wiki_intro may still be
        # empty if the fetch returned no text
        has_wiki = lang in taxon_langs.get(taxon_name, {})
        wiki_intro = wiki_intros.get(lang, {}).get(taxon_name, '')

        # Write markdown page
        page_path = f'{lang_dir}/{safe_name}.md'
        toc_taxa_lang.append({'file': f'{lang_dir}/{safe_name}'})

        lines = []

        # Title
        lines.append(f'# *{taxon_name}*')
        lines.append('')

        # Wikidata / Scholia / GBIF link bar
        lines.append(
            f'[Wikidata ({qid})](https://www.wikidata.org/wiki/{qid}) \u00b7 '
            f'[Scholia](https://scholia.toolforge.org/taxon/{qid}) \u00b7 '
            f'[GBIF](https://www.gbif.org/species/search?q={taxon_name.replace(" ", "%20")})'
        )
        lines.append('')

        # Wikipedia introduction
        # With intro text: full "About" section. With an article but no text:
        # just a link. Without an article: nothing.
        if wiki_intro:
            lines.append('## About this species')
            lines.append('')
            lines.append(wiki_intro)
            lines.append('')
            article = taxon_langs[taxon_name][lang]
            wp_url = f'https://{lang}.wikipedia.org/wiki/{article}'
            lines.append(f'Read more on [{lang_name} Wikipedia]({wp_url}).')
            lines.append('')
        elif has_wiki:
            article = taxon_langs[taxon_name][lang]
            wp_url = f'https://{lang}.wikipedia.org/wiki/{article}'
            lines.append(f'Read about this species on [{lang_name} Wikipedia]({wp_url}).')
            lines.append('')

        # Other language editions
        # Only languages that are themselves active are linked
        other_langs = {l for l in taxon_langs.get(taxon_name, {}).keys() if l != lang and l in active_langs}
        if other_langs:
            lang_links = []
            for ol in sorted(other_langs):
                article = taxon_langs[taxon_name][ol]
                lang_links.append(f'[{get_lang_name(ol)}](https://{ol}.wikipedia.org/wiki/{article})')
            lines.append('Also available in: ' + ' \u00b7 '.join(lang_links))
            lines.append('')

        # Map section
        lines.append('## Global distribution')
        lines.append('')
        if occurrences:
            lines.append(
                f'Map shows **{len(occurrences)}** georeferenced GBIF records '
                f'(out of {total_count} total) for *{taxon_name}* worldwide. '
                'Click markers for details.'
            )
            lines.append('')
            # The map HTML is referenced one directory up from the page (../maps/)
            lines.append('```{raw} html')
            lines.append(f'<iframe src="../maps/{safe_name}.html" width="100%" height="500px" '
                         'frameborder="0" scrolling="no" '
                         'style="border-radius:8px; margin-bottom:1rem;"></iframe>')
            lines.append('```')
            lines.append('')
            lines.append(
                'Map data \u00a9 [OpenStreetMap](https://www.openstreetmap.org/copyright) contributors. '
                'Occurrence data from [GBIF](https://www.gbif.org/).'
            )
        else:
            lines.append('*No georeferenced occurrence records found in GBIF for this species.*')
        lines.append('')

        # Plazi TreatmentBank section
        if treatments:
            lines.append('## Taxonomic treatments')
            lines.append('')
            lines.append('Taxonomic treatments from [Plazi TreatmentBank](https://plazi.org/treatmentbank/):')
            lines.append('')
            for tr in treatments:
                tr_url = tr['treatment_page']
                creator = tr['creator']
                pub = tr['pub_title']
                # Link text is "creator — *publication*", using whichever parts exist
                label_parts = []
                if creator:
                    label_parts.append(creator)
                if pub:
                    label_parts.append(f'*{pub}*')
                label = ' \u2014 '.join(label_parts) if label_parts else 'Treatment'
                lines.append(f'- [{label}]({tr_url})')
                lines.append('')
            syno_url = f'https://synospecies.plazi.org/#{taxon_name.replace(" ", "%20")}'
            lines.append(f'[View all treatments for *{taxon_name}* on Synospecies]({syno_url})')
            lines.append('')

        # BHL Literature section
        if bhl_pubs:
            lines.append('## Literature')
            lines.append('')
            lines.append('Publications from the [Biodiversity Heritage Library](https://www.biodiversitylibrary.org/):')
            lines.append('')
            for pub in bhl_pubs:
                title = pub['title']
                url = pub['bhl_url']
                authors = pub['authors']
                date = pub['date']
                # Optional " — authors, date" suffix after the linked title
                suffix_parts = []
                if authors:
                    suffix_parts.append(authors)
                if date:
                    suffix_parts.append(date)
                suffix = ' \u2014 ' + ', '.join(suffix_parts) if suffix_parts else ''
                lines.append(f'- [{title}]({url}){suffix}')
                lines.append('')
            bhl_search = f'https://www.biodiversitylibrary.org/search?SearchTerm={taxon_name.replace(" ", "+")}'
            lines.append(f'[Search for *{taxon_name}* on BHL]({bhl_search})')
            lines.append('')

        # Observations from Montserrat
        lines.append('## Observations from Montserrat')
        lines.append('')
        lines.append('Observations from the GBIF dataset covering Montserrat grasses and sedges.')
        lines.append('')

        # One sub-section per publisher, listing each observation with its
        # license and images
        for org_name, obs_dict in sorted(info['publishers'].items()):
            lines.append(f'### {org_name}')
            lines.append('')
            for obs_id, obs_data in obs_dict.items():
                license_label, license_url = resolve_license_label(obs_data['license'])
                # obs_id is used both as link text and as link target
                lines.append(f'**Observation:** [{obs_id}]({obs_id})  ')
                lines.append(f'**License:** [{license_label}]({license_url})')
                lines.append('')
                for media_url in obs_data['media']:
                    if media_url and media_url != 'nan':
                        # Swaps 'square' for 'medium' in the URL — presumably to
                        # request a larger image variant; verify against the media host
                        display_url = media_url.replace('square', 'medium')
                        lines.append(f'![{taxon_name} \u2014 {org_name}]({display_url})')
                        lines.append('')

        with open(page_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    print(f'  Generated {len(toc_taxa_lang)} pages in {lang_dir}/')

print(f'\nAll language editions generated.')

## Step 4: Generate Montserrat overview map + per-language configs

Generate the overview map (shared across all languages), then create
`_config_{lang}.yml`, `_toc_{lang}.yml`, `intro_{lang}.md`, and the landing page.

In [None]:
# ---- Montserrat overview map (shared, language-independent) ----

# Tab-separated occurrence table; malformed lines are reported as warnings
df_occ = pd.read_csv(
    'data/0002020-240626123714530/occurrence.txt',
    sep='\t',
    on_bad_lines='warn',
    low_memory=False,
)

# Keep species-rank records that have both coordinates
has_coordinates = df_occ['decimalLatitude'].notna() & df_occ['decimalLongitude'].notna()
is_species_rank = df_occ['taxonRank'] == 'SPECIES'
df_coords = df_occ.loc[has_coordinates & is_species_rank].copy()

print(f'Georeferenced records: {len(df_coords)} across {df_coords["species"].nunique()} species')

species_list = sorted(df_coords['species'].dropna().unique())
n = len(species_list)

def hsl_to_hex(h, s, l):
    """Return the '#rrggbb' hex string for a hue/saturation/lightness colour.

    The three components are handed to ``colorsys.hls_to_rgb``, which takes
    them in hue, lightness, saturation order, hence the swapped arguments.
    Each resulting channel is scaled by 255 and truncated to an integer
    (not rounded) before being formatted as two lowercase hex digits.
    """
    channels = colorsys.hls_to_rgb(h, l, s)
    return '#' + ''.join('{:02x}'.format(int(channel * 255)) for channel in channels)

# Give every species its own hue, evenly spaced around the colour wheel,
# with a fixed saturation (0.75) and lightness (0.42).
species_colours = {sp: hsl_to_hex(i / n, 0.75, 0.42) for i, sp in enumerate(species_list)}

overview_map = folium.Map(
    location=[16.745, -62.202], zoom_start=12,
    tiles='OpenStreetMap', width='100%', height='500px'
)

# One circle marker per georeferenced record, coloured by species.
for _, row in df_coords.iterrows():
    # Missing cells arrive as NaN; str(NaN) would print the literal text "nan"
    # in the popup/tooltip, so every optional field is guarded the same way
    # the year already was.
    sp = row.get('species')
    sp = sp if pd.notna(sp) else ''
    colour = species_colours.get(sp, '#888888')  # grey for species without an assigned colour
    gbif_link = 'https://www.gbif.org/occurrence/' + str(int(row['gbifID']))

    year = row.get('year')
    year_str = str(int(year)) if pd.notna(year) else ''
    inst_code = row.get('institutionCode')
    inst_str = str(inst_code) if pd.notna(inst_code) else ''
    # Join only the parts that are present so a missing value leaves no stray text.
    details = ' '.join(part for part in (inst_str, year_str) if part)

    popup_html = (
        f'<b><i>{sp}</i></b><br>{details}<br>'
        f'<a href="{gbif_link}" target="_blank">GBIF record</a>'
    )
    folium.CircleMarker(
        location=[row['decimalLatitude'], row['decimalLongitude']],
        radius=7, color=colour, fill=True, fill_color=colour,
        fill_opacity=0.85,
        popup=folium.Popup(popup_html, max_width=220),
        tooltip=f'<i>{sp}</i>'
    ).add_to(overview_map)

# Fixed-position, scrollable legend listing every species next to its colour dot.
legend_items = ''.join(
    f'<span style="color:{c};">&#9679;</span> <i>{sp}</i><br>'
    for sp, c in sorted(species_colours.items())
)
legend_html = (
    '<div style="position:fixed; bottom:20px; left:20px; z-index:9999;'
    ' background:white; padding:10px 14px; border-radius:6px;'
    ' border:1px solid #ccc; font-size:11px; line-height:1.8;'
    ' max-height:280px; overflow-y:auto;">'
    '<b>Species</b><br>' + legend_items + '</div>'
)
overview_map.get_root().html.add_child(folium.Element(legend_html))
overview_map.save('maps/montserrat_overview.html')
print('Saved maps/montserrat_overview.html')

In [None]:
# ---- Generate per-language _config, _toc, intro files ----

# Everything that is identical for all editions is computed once, up front,
# instead of being rebuilt on every loop iteration.
edition_langs = sorted(active_langs.keys())
edition_names = {code: get_lang_name(code) for code in edition_langs}
n_total = len(taxonpages)

# Organisation chapters (shared across languages): every markdown file in
# organisation/, referenced without its ".md" extension.
org_chapters = [
    {'file': f'organisation/{fname[:-3]}'}
    for fname in sorted(os.listdir('organisation'))
    if fname.endswith('.md')
]

for lang in edition_langs:
    lang_name = edition_names[lang]
    lang_dir = f'taxa_{lang}'

    # Sorted taxa page list for this language (spaces in taxon names become "_").
    toc_taxa_sorted = [
        {'file': page}
        for page in sorted(f'{lang_dir}/{tn.replace(" ", "_")}' for tn in taxonpages.keys())
    ]

    # ---- _toc_{lang}.yml ----
    toc_data = {
        'format': 'jb-book',
        'root': f'intro_{lang}',
        'parts': [
            {
                'caption': 'Collections with grasses and sedges from Montserrat',
                'chapters': org_chapters
            },
            {
                'caption': 'Taxa originally from Montserrat',
                'chapters': toc_taxa_sorted
            }
        ]
    }

    toc_path = f'_toc_{lang}.yml'
    with open(toc_path, 'w', encoding='utf-8') as f:
        yaml.dump(toc_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # ---- _config_{lang}.yml ----
    # Source paths that must not be picked up when building this edition.
    exclude_patterns = ['node_modules', 'myenv', '_build', 'maps', 'GrassesMontserrat']
    # Also exclude other language taxa dirs to avoid confusion
    exclude_patterns.extend(
        f'taxa_{other_lang}' for other_lang in active_langs if other_lang != lang
    )
    # Exclude the old taxa/ dir too
    exclude_patterns.append('taxa')
    # The other editions' root pages live next to this one; without excluding
    # them they would be built as orphan pages inside this edition as well.
    exclude_patterns.extend(
        f'intro_{other_lang}.md' for other_lang in active_langs if other_lang != lang
    )

    config_data = {
        'title': f'Grasses and sedges of Montserrat ({lang_name})',
        'author': 'Sofie Meeus, Quentin Groom, Andra Waagmeester',
        'logo': 'logo.png',
        'execute': {
            'execute_notebooks': 'off'  # No execution — pages are pre-generated markdown
        },
        'latex': {
            'latex_documents': {'targetname': 'book.tex'}
        },
        'bibtex_bibfiles': ['references.bib'],
        'repository': {
            'url': 'https://github.com/VisibleNatureAtlas/Grasses-and-sedges-of-Montserrat',
            'path_to_book': '.',
            'branch': 'main'
        },
        'html': {
            'use_issues_button': True,
            'use_repository_button': True,
            'extra_navbar': 'Powered by <a href="https://jupyterbook.org">Jupyter Book</a>',
            'extra_footer': (
                '<p>Data sourced from <a href="https://www.gbif.org/">GBIF</a> and '
                '<a href="https://www.wikidata.org/">Wikidata</a> under open licenses. '
                'Maps \u00a9 <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors.</p>'
            )
        },
        'sphinx': {
            'config': {
                'html_show_copyright': False,
                'exclude_patterns': exclude_patterns
            }
        }
    }

    config_path = f'_config_{lang}.yml'
    with open(config_path, 'w', encoding='utf-8') as f:
        yaml.dump(config_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # ---- intro_{lang}.md ----
    # Count Wikipedia coverage for this language
    n_with_wiki = len(active_langs[lang])

    intro_lines = [
        '# Visible Nature Atlas: Grasses and sedges of Montserrat',
        '',
        f'*{lang_name} edition* \u2014 Species descriptions from [{lang_name} Wikipedia](https://{lang}.wikipedia.org/).',
        '',
        (
            'This is a volume in the **Visible Nature Atlas** series \u2014 an open, data-driven collection of '
            'biodiversity books covering the flora and fauna of specific regions worldwide.'
        ),
        '',
    ]

    # Language picker: links to the sibling editions. The section (heading
    # included) is only written when at least one other edition exists, so a
    # single-language build does not end up with an empty section.
    other_editions = [
        f'[{edition_names[ol]} ({len(active_langs[ol])} species)](../{ol}/index.html)'
        for ol in edition_langs
        if ol != lang
    ]
    if other_editions:
        intro_lines += [
            '## Other language editions',
            '',
            ' \u00b7 '.join(other_editions),
            '',
        ]

    intro_lines += [
        '## About this atlas',
        '',
        (
            'This atlas documents the **grasses (Poaceae) and sedges (Cyperaceae)** of **Montserrat**, a small '
            'British Overseas Territory in the Caribbean Lesser Antilles. Despite its modest size (102 km\u00b2), '
            'Montserrat harbours a rich diversity of native and introduced grass and sedge species.'
        ),
        '',
        f'This edition includes Wikipedia descriptions for **{n_with_wiki}** out of {n_total} species.',
        '',
    ]

    # Overview map — intro is the root page, so maps/ is a sibling directory (no ../)
    intro_lines += [
        '## Observations on Montserrat',
        '',
        '```{raw} html',
        '<iframe src="maps/montserrat_overview.html"',
        '        width="100%" height="520px"',
        '        frameborder="0" scrolling="no"',
        '        style="border-radius:8px; margin-bottom:0.5rem;"></iframe>',
        '```',
        '',
        (
            'Map data \u00a9 [OpenStreetMap](https://www.openstreetmap.org/copyright) contributors. '
            'Occurrence data from [GBIF](https://www.gbif.org/).'
        ),
        '',
    ]

    # Wikipedia language availability table
    intro_lines += [
        '## Wikipedia language availability',
        '',
        (
            'The table below shows which species have Wikipedia articles in each language. '
            'Click a language code to visit the Wikipedia article.'
        ),
        '',
        wiki_table_md,
        '',
    ]

    # Data sources
    intro_lines += [
        '## Data sources',
        '',
        '| Source | Role |',
        '|--------|------|',
        '| **[GBIF](https://www.gbif.org/)** | Occurrence records |',
        '| **[Wikidata](https://www.wikidata.org/)** | Taxonomic identifiers, metadata |',
        f'| **[Wikipedia](https://{lang}.wikipedia.org/)** | Species descriptions |',
        '| **[Plazi TreatmentBank](https://plazi.org/)** | Taxonomic treatments |',
        '| **[BHL](https://www.biodiversitylibrary.org/)** | Historical literature |',
        '| **[OpenStreetMap](https://www.openstreetmap.org/)** | Map tiles |',
        '',
    ]

    intro_lines += [
        '## Authors',
        '',
        '- **Sofie Meeus** \u2014 [Meise Botanic Garden](https://www.botanicgarden.be/)',
        '- **Quentin Groom** \u2014 [Meise Botanic Garden](https://www.botanicgarden.be/)',
        '- **Andra Waagmeester** \u2014 [Gene Wiki](https://www.wikidata.org/wiki/User:Andrawaag)',
        '',
        '```{note}',
        'The interactive maps require JavaScript to be enabled in your browser.',
        '```',
        '',
    ]

    intro_path = f'intro_{lang}.md'
    with open(intro_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(intro_lines))

    print(f'{lang}: {config_path}, {toc_path}, {intro_path}')

print(f'\nGenerated configs for {len(active_langs)} languages.')

In [None]:
# ---- Write .languages.txt for the build script ----
sorted_langs_list = sorted(active_langs.keys())
# One language code per line; written as UTF-8 like every other generated file.
with open('.languages.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted_langs_list) + '\n')
print(f'Wrote .languages.txt with {len(sorted_langs_list)} languages: {", ".join(sorted_langs_list)}')

# ---- Generate landing page (index.html) ----
# This is a simple HTML page that links to each language edition.

# The taxa count is the same for every card, so it is computed once. The loop
# uses its own descriptive names so that it does not rebind the notebook-wide
# `n` (species count set by the overview-map cell), `lang` or `name`.
total_species = len(taxonpages)

# Cards are ordered by Wikipedia coverage (highest first), ties broken by code.
card_chunks = []
for edition_code in sorted(sorted_langs_list, key=lambda code: (-len(active_langs[code]), code)):
    edition_name = get_lang_name(edition_code)
    covered = len(active_langs[edition_code])
    pct = round(100 * covered / total_species) if total_species > 0 else 0
    card_chunks.append(f'''
        <a href="{edition_code}/index.html" class="lang-card">
            <div class="lang-code">{edition_code}</div>
            <div class="lang-name">{edition_name}</div>
            <div class="lang-stats">{covered}/{total_species} species with Wikipedia ({pct}%)</div>
        </a>
''')
# Assemble the cards in one pass instead of growing a string with `+=`.
lang_cards = ''.join(card_chunks)

landing_html = f'''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Visible Nature Atlas: Grasses and sedges of Montserrat</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: #f7f9fc;
            color: #2c3e50;
            min-height: 100vh;
        }}
        .hero {{
            background: linear-gradient(135deg, #1a5632 0%, #2d8a56 50%, #4aa564 100%);
            color: white;
            padding: 3rem 2rem;
            text-align: center;
        }}
        .hero h1 {{ font-size: 2.2rem; margin-bottom: 0.5rem; }}
        .hero p {{ font-size: 1.1rem; opacity: 0.9; max-width: 600px; margin: 0 auto; }}
        .container {{
            max-width: 900px;
            margin: 2rem auto;
            padding: 0 1.5rem;
        }}
        h2 {{
            font-size: 1.4rem;
            margin-bottom: 1rem;
            color: #2c3e50;
        }}
        .lang-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
            gap: 1rem;
            margin-bottom: 2rem;
        }}
        .lang-card {{
            background: white;
            border: 1px solid #e0e6ed;
            border-radius: 8px;
            padding: 1.2rem;
            text-decoration: none;
            color: inherit;
            transition: transform 0.15s, box-shadow 0.15s;
        }}
        .lang-card:hover {{
            transform: translateY(-2px);
            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        }}
        .lang-code {{
            font-size: 1.6rem;
            font-weight: 700;
            color: #1a5632;
        }}
        .lang-name {{
            font-size: 1rem;
            margin: 0.3rem 0;
        }}
        .lang-stats {{
            font-size: 0.85rem;
            color: #7f8c8d;
        }}
        .footer {{
            text-align: center;
            padding: 2rem;
            font-size: 0.85rem;
            color: #7f8c8d;
        }}
        .footer a {{ color: #1a5632; }}
    </style>
</head>
<body>
    <div class="hero">
        <h1>Grasses and sedges of Montserrat</h1>
        <p>A multilingual biodiversity atlas from the Visible Nature Atlas series.
           Choose a language edition below.</p>
    </div>
    <div class="container">
        <h2>Available editions ({len(sorted_langs_list)} languages)</h2>
        <div class="lang-grid">
            {lang_cards}
        </div>
    </div>
    <div class="footer">
        Data from <a href="https://www.gbif.org/">GBIF</a>,
        <a href="https://www.wikidata.org/">Wikidata</a>,
        <a href="https://en.wikipedia.org/">Wikipedia</a>,
        <a href="https://plazi.org/">Plazi</a> &amp;
        <a href="https://www.biodiversitylibrary.org/">BHL</a>.
        Maps &copy; <a href="https://www.openstreetmap.org/copyright">OpenStreetMap</a> contributors.<br>
        Built with <a href="https://jupyterbook.org/">Jupyter Book</a>.
        Source on <a href="https://github.com/VisibleNatureAtlas/Grasses-and-sedges-of-Montserrat">GitHub</a>.
    </div>
</body>
</html>'''

with open('_build_landing.html', 'w', encoding='utf-8') as f:
    f.write(landing_html)

print('Landing page written to _build_landing.html')
print(f'\nDone! Ready to build {len(sorted_langs_list)} language editions.')