In [1]:
from mdutils.mdutils import MdUtils
import tqdm
import math
import requests
import pandas as pd
import time
from wikidataintegrator import wdi_core, wdi_login
from rdflib import Graph, Literal, RDF, URIRef, Namespace, BNode
from rdflib.namespace import DCTERMS, FOAF, XSD, DC, RDFS
import shutil
import os

# Load occurence data

In [34]:
df = pd.read_csv('data/0002020-240626123714530/occurrence.txt', sep='\t', on_bad_lines = 'warn')
df

Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
0,4876300936,,,,CC_BY_4_0,2023-03-20T00:00:00Z,,,Meise Botanic Garden,PhysicalObject,...,EUROPE,MSR,Montserrat,MSR.3_1,Saint Peter,,,,,LC
1,4876284841,,,,CC_BY_4_0,2023-03-21T00:00:00Z,,,Meise Botanic Garden,PhysicalObject,...,EUROPE,MSR,Montserrat,MSR.3_1,Saint Peter,,,,,NE
2,4607368012,,,,CC_BY_NC_4_0,2024-05-24T09:26:47Z,,https://www.inaturalist.org/observations/20511...,slauren,,...,NORTH_AMERICA,MSR,Montserrat,MSR.3_1,Saint Peter,,,,,NE
3,4536015362,,,,CC0_1_0,2024-04-26T16:57:26Z,,https://www.inaturalist.org/observations/20102...,monkeyjodey,,...,EUROPE,MSR,Montserrat,MSR.3_1,Saint Peter,,,,,NE
4,4536000017,,,,CC0_1_0,2024-04-26T16:57:26Z,,https://www.inaturalist.org/observations/20125...,monkeyjodey,,...,EUROPE,MSR,Montserrat,MSR.1_1,Saint Anthon,,,,,NE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,1319347135,,,,CC0_1_0,2017-10-31T12:14:00Z,,,,,...,NORTH_AMERICA,,,,,,,,,LC
83,1318020074,,,,CC0_1_0,2017-10-31T12:10:00Z,,,,,...,NORTH_AMERICA,,,,,,,,,NE
84,1317614158,,,,CC0_1_0,2021-03-06T11:43:00Z,,,,,...,NORTH_AMERICA,,,,,,,,,NE
85,1317605282,,,,CC0_1_0,2021-03-08T09:38:00Z,,,,,...,NORTH_AMERICA,,,,,,,,,NE


# Fetch institute QID's for institutes

In [35]:
user_agent = 'TaxonQueries/1.0 (andra@micelio.be) SPARQL queries'  # Replace with your actual User-Agent string

result_df = pd.DataFrame(columns=['Institute', 'Institute_QID'])
batchSize = 5
retry_wait_time = 1  # Initial wait time in seconds
institutes = set(df["rightsHolder"])
print(institutes)
instituteQID = dict()
i = 0
notFound = {}
for institute in institutes:
    en = wdi_core.WDItemEngine.get_wd_search_results(institute, language='en')
    if len(en) > 0:
        instituteQID[institute] = en[0]
        continue
    nl = wdi_core.WDItemEngine.get_wd_search_results(institute, language='nl')
    if len(nl) > 0:
        instituteQID[institute] = nl[0]
        continue
    de = wdi_core.WDItemEngine.get_wd_search_results(institute, language='de')
    if len(de) > 0:
        instituteQID[institute] = de[0]
        continue
    fr = wdi_core.WDItemEngine.get_wd_search_results(institute, language='fr')
    if len(fr) > 0:
        instituteQID[institute] = fr[0]
        continue
    es = wdi_core.WDItemEngine.get_wd_search_results(institute, language='es')
    if len(es) > 0:
        instituteQID[institute] = es[0]
        continue
    pt = wdi_core.WDItemEngine.get_wd_search_results(institute, language='pt')
    if len(pt) > 0:
        instituteQID[institute] = pt[0]
        continue
    else:
        i+=1
        print(str(i)+f" Not found institute={institute}")
        notFound[institute] = requests.get("https://api.gbif.org/v1/organization?q="+institute).json()

{'University of Florida', 'Quentin Groom', 'Carnegie Museum of Natural History', 'The New York Botanical Garden', 'Sofie Meeus', nan, 'Delmaude C Ryan', 'Meise Botanic Garden', 'slauren', 'classical', 'gorillarebecca', 'monkeyjodey'}
1 Not found institute=Delmaude C Ryan
2 Not found institute=slauren
3 Not found institute=gorillarebecca
4 Not found institute=monkeyjodey


In [36]:
df2 = df[df["taxonRank"] == "SPECIES"]
df3 = df2[["species", "taxonKey", "taxonRank"]].drop_duplicates()
species_counts = df3["species"].value_counts()
species_counts

grouped = df3.groupby('species').apply(lambda x: list(x.taxonKey))
for species, taxon_keys in grouped.items():
    if len(taxon_keys) > 1:
        taxon_keys = list(map(int, taxon_keys))
        print(f"Species: {species}, Taxon Keys: {taxon_keys}")

  grouped = df3.groupby('species').apply(lambda x: list(x.taxonKey))


In [37]:
taxonname =  df2[["species", "taxonKey", "taxonRank", "taxonRank", "genus", "genericName","taxonomicStatus", "acceptedNameUsageID", "scientificName", "specificEpithet"]].drop_duplicates()
taxonnames = []
for index, row in taxonname.iterrows():
    if isinstance(row["specificEpithet"], str):
        taxonnames.append("\""+row["genericName"] + " " + row["specificEpithet"]+"\"")
    else:
        continue
    taxonnames.append("\""+row["genericName"] + " " + row["specificEpithet"]+"\"")
taxonnames = list(set(taxonnames))

# resolve taxonnames and kingdom

In [38]:
url = 'https://query.wikidata.org/sparql'
user_agent = 'TaxonQueries/1.0 ( andra@micelio.be) SPARQL queries'  # Replace with your actual User-Agent string

result_df = pd.DataFrame(columns=['taxonId', 'taxonname', 'taxon', 'kingdomname', 'kingdom' ])
batchSize = 10
retry_wait_time = 1  # Initial wait time in seconds

for i in tqdm.tqdm(range(math.ceil(len(taxonnames)/batchSize)), desc="Processing batches"):
    batch = taxonnames[i*batchSize:(i+1)*batchSize]
    query = """
            SELECT ?taxon ?taxonId ?taxonname ?kingdomname ?kingdom ?article
                WHERE {
                    VALUES ?taxonname {""" + " ".join(batch) + """}
                    ?taxon wdt:P225 ?taxonname .
                    ?taxon wdt:P171* ?kingdom .
                    OPTIONAL {?article schema:about ?taxon ;
                               schema:isPartOf <https://en.wikipedia.org/> .}
                    ?kingdom wdt:P105 wd:Q36732 .
                    ?kingdom rdfs:label ?kingdomname filter (lang(?kingdomname) = "en") .
                    OPTIONAL {?taxon wdt:P846 ?taxonId .}
            }
            """
    # print(query)

    success = False
    while not success:
        try:
            headers = {'User-Agent': user_agent}
            r = requests.get(url, params={'format': 'json', 'query': query}, headers=headers)
            r.raise_for_status()  # Raise an HTTPError for bad responses
            data = r.json()
            temp_df = pd.DataFrame([(item.get("taxonId", {}).get("value", ""),
                                     item.get("taxonname", {}).get("value", ""),
                                     item.get("taxon", {}).get("value", ""),
                                     item.get("kingdomname", {}).get("value", ""),
                                     item.get("kingdom", {}).get("value", ""),
                                     item.get("article", {}).get("value", "")
                                    )
                                    for item in data["results"]["bindings"]],
                                    columns=['taxonId', 'taxonname', 'taxon', 'kingdomname', 'kingdom', 'article' ])

            # Concatenate the temporary DataFrame with the main result DataFrame
            result_df = pd.concat([result_df, temp_df], ignore_index=True)
            success = True  # Set to True to break out of the while loop on success

        except requests.exceptions.HTTPError as err:
            print(f"Error: {err}")
            print(f"Retrying in {retry_wait_time} seconds...")
            time.sleep(retry_wait_time)
            retry_wait_time *= 2  # Double the wait time
            if retry_wait_time > 60:  # Limit the maximum wait time to 60 seconds
                retry_wait_time = 60
        else:
            retry_wait_time = 1  # Reset to 1 second after a successful query

# Now, result_df contains all the data
result_df


Processing batches: 100%|██████████| 3/3 [00:01<00:00,  2.51it/s]


Unnamed: 0,taxonId,taxonname,taxon,kingdomname,kingdom,article
0,5289739,Cenchrus echinatus,http://www.wikidata.org/entity/Q4925284,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Cenchrus_echinatus
1,2704745,Sporobolus virginicus,http://www.wikidata.org/entity/Q7579252,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Sporobolus_virgi...
2,2717421,Eleocharis flavescens,http://www.wikidata.org/entity/Q15584858,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Eleocharis_flave...
3,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens
4,2705068,Panicum trichoides,http://www.wikidata.org/entity/Q10912779,plant,http://www.wikidata.org/entity/Q756,
5,2708680,Fimbristylis complanata,http://www.wikidata.org/entity/Q11075670,plant,http://www.wikidata.org/entity/Q756,
6,2702854,Lasiacis sorghoidea,http://www.wikidata.org/entity/Q15508551,plant,http://www.wikidata.org/entity/Q756,
7,2702852,Lasiacis divaricata,http://www.wikidata.org/entity/Q15508581,plant,http://www.wikidata.org/entity/Q756,
8,2702522,Eriochloa punctata,http://www.wikidata.org/entity/Q15516186,plant,http://www.wikidata.org/entity/Q756,
9,5291147,Machaerina restioides,http://www.wikidata.org/entity/Q15550242,plant,http://www.wikidata.org/entity/Q756,


# Merge taxonnames and kingdom

In [39]:
result_df[['genericName', 'specificEpithet']] = result_df['taxonname'].str.split(expand=True, n=1)

# Print the updated DataFrame
wdtaxqid = dict()
for index, row in result_df.iterrows():
    if row["taxonId"] != "":
        wdtaxqid[row["taxonId"]] = row["taxon"]
result_df

Unnamed: 0,taxonId,taxonname,taxon,kingdomname,kingdom,article,genericName,specificEpithet
0,5289739,Cenchrus echinatus,http://www.wikidata.org/entity/Q4925284,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Cenchrus_echinatus,Cenchrus,echinatus
1,2704745,Sporobolus virginicus,http://www.wikidata.org/entity/Q7579252,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Sporobolus_virgi...,Sporobolus,virginicus
2,2717421,Eleocharis flavescens,http://www.wikidata.org/entity/Q15584858,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Eleocharis_flave...,Eleocharis,flavescens
3,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens,Melinis,repens
4,2705068,Panicum trichoides,http://www.wikidata.org/entity/Q10912779,plant,http://www.wikidata.org/entity/Q756,,Panicum,trichoides
5,2708680,Fimbristylis complanata,http://www.wikidata.org/entity/Q11075670,plant,http://www.wikidata.org/entity/Q756,,Fimbristylis,complanata
6,2702854,Lasiacis sorghoidea,http://www.wikidata.org/entity/Q15508551,plant,http://www.wikidata.org/entity/Q756,,Lasiacis,sorghoidea
7,2702852,Lasiacis divaricata,http://www.wikidata.org/entity/Q15508581,plant,http://www.wikidata.org/entity/Q756,,Lasiacis,divaricata
8,2702522,Eriochloa punctata,http://www.wikidata.org/entity/Q15516186,plant,http://www.wikidata.org/entity/Q756,,Eriochloa,punctata
9,5291147,Machaerina restioides,http://www.wikidata.org/entity/Q15550242,plant,http://www.wikidata.org/entity/Q756,,Machaerina,restioides


In [40]:
merged_df = pd.merge(df, result_df, on=['genericName', 'specificEpithet'], how='inner')
merged_df

Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,level2Name,level3Gid,level3Name,iucnRedListCategory,taxonId,taxonname,taxon,kingdomname,kingdom_y,article
0,4876300936,,,,CC_BY_4_0,2023-03-20T00:00:00Z,,,Meise Botanic Garden,PhysicalObject,...,,,,LC,2718767,Cyperus brevifolius,http://www.wikidata.org/entity/Q21265100,plant,http://www.wikidata.org/entity/Q756,
1,4876284841,,,,CC_BY_4_0,2023-03-21T00:00:00Z,,,Meise Botanic Garden,PhysicalObject,...,,,,NE,2721307,Rhynchospora nervosa,http://www.wikidata.org/entity/Q15555289,plant,http://www.wikidata.org/entity/Q756,
2,4607368012,,,,CC_BY_NC_4_0,2024-05-24T09:26:47Z,,https://www.inaturalist.org/observations/20511...,slauren,,...,,,,NE,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens
3,4536015362,,,,CC0_1_0,2024-04-26T16:57:26Z,,https://www.inaturalist.org/observations/20102...,monkeyjodey,,...,,,,NE,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens
4,4536000017,,,,CC0_1_0,2024-04-26T16:57:26Z,,https://www.inaturalist.org/observations/20125...,monkeyjodey,,...,,,,NE,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,1319347135,,,,CC0_1_0,2017-10-31T12:14:00Z,,,,,...,,,,LC,2712691,Scleria secans,http://www.wikidata.org/entity/Q15564658,plant,http://www.wikidata.org/entity/Q756,
83,1318020074,,,,CC0_1_0,2017-10-31T12:10:00Z,,,,,...,,,,NE,2715182,Cyperus sphacelatus,http://www.wikidata.org/entity/Q15535602,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Cyperus_sphacelatus
84,1317614158,,,,CC0_1_0,2021-03-06T11:43:00Z,,,,,...,,,,NE,2705827,Brachiaria fasciculata,http://www.wikidata.org/entity/Q15509620,plant,http://www.wikidata.org/entity/Q756,
85,1317605282,,,,CC0_1_0,2021-03-08T09:38:00Z,,,,,...,,,,NE,2702854,Lasiacis sorghoidea,http://www.wikidata.org/entity/Q15508551,plant,http://www.wikidata.org/entity/Q756,


In [41]:
merged_df[['taxonKey', 'taxonId', 'taxonname', 'taxon', 'genericName', 'specificEpithet', 'kingdom_y', 'kingdomname', 'article']].drop_duplicates()

Unnamed: 0,taxonKey,taxonId,taxonname,taxon,genericName,specificEpithet,kingdom_y,kingdomname,article
0,2718767,2718767,Cyperus brevifolius,http://www.wikidata.org/entity/Q21265100,Cyperus,brevifolius,http://www.wikidata.org/entity/Q756,plant,
1,2721307,2721307,Rhynchospora nervosa,http://www.wikidata.org/entity/Q15555289,Rhynchospora,nervosa,http://www.wikidata.org/entity/Q756,plant,
2,2702504,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,Melinis,repens,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Melinis_repens
5,7661971,2705758,Bambusa vulgaris,http://www.wikidata.org/entity/Q3219428,Bambusa,vulgaris,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Bambusa_vulgaris
9,5289739,5289739,Cenchrus echinatus,http://www.wikidata.org/entity/Q4925284,Cenchrus,echinatus,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Cenchrus_echinatus
11,2702852,2702852,Lasiacis divaricata,http://www.wikidata.org/entity/Q15508581,Lasiacis,divaricata,http://www.wikidata.org/entity/Q756,plant,
15,2715182,2715182,Cyperus sphacelatus,http://www.wikidata.org/entity/Q15535602,Cyperus,sphacelatus,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Cyperus_sphacelatus
16,2704745,2704745,Sporobolus virginicus,http://www.wikidata.org/entity/Q7579252,Sporobolus,virginicus,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Sporobolus_virgi...
19,2705924,2705924,Axonopus compressus,http://www.wikidata.org/entity/Q4830676,Axonopus,compressus,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Axonopus_compressus
22,2717421,2717421,Eleocharis flavescens,http://www.wikidata.org/entity/Q15584858,Eleocharis,flavescens,http://www.wikidata.org/entity/Q756,plant,https://en.wikipedia.org/wiki/Eleocharis_flave...


In [42]:
taxoninformation = dict()

In [43]:
def get_wikipedia_snippet(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,
        "explaintext": True
    }
    response = requests.get(url, params=params)
    data = response.json()
    page = next(iter(data['query']['pages'].values()))
    snippet = page.get("extract", "")
    return snippet



In [44]:
# Assuming `result_df` is your DataFrame and it has columns named 'article' and 'taxonname'
taxonname_list = result_df[result_df["article"] != ""]["taxonname"].tolist()
taxonname_list

['Cenchrus echinatus',
 'Sporobolus virginicus',
 'Eleocharis flavescens',
 'Melinis repens',
 'Cymbopogon citratus',
 'Bambusa vulgaris',
 'Paspalum conjugatum',
 'Aristida adscensionis',
 'Anthephora hermaphrodita',
 'Cyperus sphacelatus',
 'Coix lacryma-jobi',
 'Axonopus compressus',
 'Cyperus alopecuroides',
 'Digitaria insularis']

In [45]:
query = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    SELECT DISTINCT ?taxon  WHERE {
           ?observation wdt:P225 ?taxon .
           ?taxon rdfs:label ?taxonLabel .
            }"""
# results = gbifRDF.query(query)
for taxonname in tqdm.tqdm(taxonname_list):
    if taxonname not in taxoninformation.keys():
        taxoninformation[taxonname] = dict()
        taxoninformation[taxonname]["wikiintroduction"] = get_wikipedia_snippet(taxonname.replace(" ", "_"))


100%|██████████| 14/14 [00:02<00:00,  4.73it/s]


In [46]:
taxon_info_df = pd.DataFrame.from_dict(taxoninformation, orient='index').reset_index()
taxon_info_df.rename(columns={'index': 'taxonname'}, inplace=True)

# Merge the dictionary DataFrame with your existing DataFrame
merged_df2 = merged_df.merge(taxon_info_df, on='taxonname', how='left')
merged_df2

Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,level3Gid,level3Name,iucnRedListCategory,taxonId,taxonname,taxon,kingdomname,kingdom_y,article,wikiintroduction
0,4876300936,,,,CC_BY_4_0,2023-03-20T00:00:00Z,,,Meise Botanic Garden,PhysicalObject,...,,,LC,2718767,Cyperus brevifolius,http://www.wikidata.org/entity/Q21265100,plant,http://www.wikidata.org/entity/Q756,,
1,4876284841,,,,CC_BY_4_0,2023-03-21T00:00:00Z,,,Meise Botanic Garden,PhysicalObject,...,,,NE,2721307,Rhynchospora nervosa,http://www.wikidata.org/entity/Q15555289,plant,http://www.wikidata.org/entity/Q756,,
2,4607368012,,,,CC_BY_NC_4_0,2024-05-24T09:26:47Z,,https://www.inaturalist.org/observations/20511...,slauren,,...,,,NE,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens,Melinis repens is a species of grass known by ...
3,4536015362,,,,CC0_1_0,2024-04-26T16:57:26Z,,https://www.inaturalist.org/observations/20102...,monkeyjodey,,...,,,NE,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens,Melinis repens is a species of grass known by ...
4,4536000017,,,,CC0_1_0,2024-04-26T16:57:26Z,,https://www.inaturalist.org/observations/20125...,monkeyjodey,,...,,,NE,2702504,Melinis repens,http://www.wikidata.org/entity/Q24700355,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Melinis_repens,Melinis repens is a species of grass known by ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,1319347135,,,,CC0_1_0,2017-10-31T12:14:00Z,,,,,...,,,LC,2712691,Scleria secans,http://www.wikidata.org/entity/Q15564658,plant,http://www.wikidata.org/entity/Q756,,
83,1318020074,,,,CC0_1_0,2017-10-31T12:10:00Z,,,,,...,,,NE,2715182,Cyperus sphacelatus,http://www.wikidata.org/entity/Q15535602,plant,http://www.wikidata.org/entity/Q756,https://en.wikipedia.org/wiki/Cyperus_sphacelatus,Cyperus sphacelatus is a species of sedge that...
84,1317614158,,,,CC0_1_0,2021-03-06T11:43:00Z,,,,,...,,,NE,2705827,Brachiaria fasciculata,http://www.wikidata.org/entity/Q15509620,plant,http://www.wikidata.org/entity/Q756,,
85,1317605282,,,,CC0_1_0,2021-03-08T09:38:00Z,,,,,...,,,NE,2702854,Lasiacis sorghoidea,http://www.wikidata.org/entity/Q15508551,plant,http://www.wikidata.org/entity/Q756,,


In [55]:
gbifRDF = Graph()
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
gbifRDF.bind("wd", WD)
gbifRDF.bind("wdt", WDT)
gbifRDF.bind("dcterms", DCTERMS)
gbifRDF.bind("geo", Namespace("http://www.opengis.net/ont/geosparql#"))

for index, row in tqdm.tqdm(merged_df2.iterrows()):
    GBIFURI = URIRef("https://gbif.semscape.org/occurrence/"+str(row["gbifID"]))
    gbifRDF.add((GBIFURI, WDT.P854 , URIRef("https://www.gbif.org/occurrence/"+str(row["gbifID"]))))
    gbifRDF.add((GBIFURI, DCTERMS.identifier , Literal(str(row["gbifID"]))))

    # species
    tempkey = str(row['taxonKey']).replace('.0', '')
    if tempkey in wdtaxqid.keys():
        gbifRDF.add((GBIFURI, WDT.P225, URIRef(wdtaxqid[tempkey])))
        gbifRDF.add((URIRef(wdtaxqid[tempkey]), RDFS.label, Literal(row["taxonname"], lang="en")))

    # lat long
    try:
        if not math.isnan(float(row["decimalLatitude"])):
            gbifRDF.add((GBIFURI, WDT.P625, Literal("Point("+str(row["decimalLongitude"])+" "+str(row["decimalLatitude"])+")", datatype="http://www.opengis.net/ont/geosparql#wktLiteral")))
    except (TypeError, ValueError) as e:
        print(f"Error for gbifID={row['gbifID']}, Latitude={row['decimalLatitude']}, Longitude={row['decimalLongitude']}")
        print(f"Error details: {e}")
        print(f"Error occurred in this line: {e.__traceback__.tb_lineno}")

    # type
    typeDict = {
    "Collection": "Q2668072","Colletion": "Q2668072","collection": "Q2668072",
    "Event": "Q1656682",
    "Objeto físico": "Q223557","Physical Object": "Q223557","Physical object": "Q223557","PhysicalObject": "Q223557","PhysicalSpecimen": "Q223557","http://purl.org/dc/dcmitype/PhysicalObject": "Q223557",
    "Sound": "Q11442",
    "StillImage": "Q478798","http://purl.org/dc/dcmitype/StillImage": "Q478798",
    "Text": "Q1145976","text": "Q1145976"
}
    if row["type"] in typeDict.keys():
        gbifRDF.add((GBIFURI, WDT.P31, URIRef("http://www.wikidata.org/entity/"+typeDict[row["type"]])))
    else:
        pass
        # print(f"Error for gbifID={row['gbifID']}, type={row['type']}")

    # language
    langDict = {
        "EN": "Q1860", "en": "Q1860","En": "Q1860", "English": "Q1860",
        "PT": "Q5146","português": "Q5146",
        "es": "Q1321",
        "nl": "Q7411",
        "fr": "Q150", "FR": "Q150",
        "Latin": "Q397", "la": "Q397"
    }

    if not pd.isna(row["language"]) and row["language"] in langDict:
        gbifRDF.add((GBIFURI, WDT.P407, URIRef("http://www.wikidata.org/entity/"+langDict[row["language"]])))
    else:
        pass
        # print(f"Error for gbifID={row['gbifID']}, language={row['language']}")

    # IUCN Red List Category
    iucnDict = {
        "NE": "Q3350324", "Not Evaluated": "Q3350324",
        "LC": "Q211005", "Least Concern": "Q211005",
        "NT": "Q719675", "Near Threatened": "Q719675",
        "VU": "Q278113", "Vulnerable": "Q278113",
        "EN": "Q11394", "Endangered": "Q11394",
        "CR": "Q219127", "Critically Endangered": "Q219127",
        "EW": "Q239509", "Extinct in the Wild": "Q239509",
        "EX": "Q237350", "Extinct": "Q237350",
        "DD": "Q3245245", "Data Deficient": "Q3245245"
    }
    if not pd.notna(row["iucnRedListCategory"]) and row["iucnRedListCategory"] in iucnDict:
        gbifRDF.add((GBIFURI, WDT.P141, URIRef("http://www.wikidata.org/entity/"+iucnDict[row["iucnRedListCategory"]])))
    else:
        pass
        # print(f"Error for gbifID={row['gbifID']}, iucnRedListCategory={row['iucnRedListCategory']}")

    # taxonank
    taxonrankDict = {
        "class": "Q37517", "Class": "Q37517",
        "family": "Q35409", "Family": "Q35409",
        "genus": "Q34740", "Genus": "Q34740",
        "kingdom": "Q36732", "Kingdom": "Q36732",
        "order": "Q36602", "Order": "Q36602",
        "phylum": "Q38348", "Phylum": "Q38348",
        "species": "Q7432", "Species": "Q7432",
        "subspecies": "Q68947", "Subspecies": "Q68947",
        "variety": "Q767728", "Variety": "Q767728",
        "form": "Q279749", "Form": "Q279749"
    }

    if not pd.isna(row["taxonRank"]) and row["taxonRank"] in taxonrankDict:
        gbifRDF.add((GBIFURI, WDT.P105, URIRef("http://www.wikidata.org/entity/"+taxonrankDict[row["taxonRank"]])))
    else:
        pass
        # print(f"Error for gbifID={row['gbifID']}, taxonRank={row['taxonRank']}")
    # license
    licenseDict = {
        "CC_BY_NC_ND_2_5": "Q19113746", "http://creativecommons.org/licenses/by-nc-nd/2.5/": "Q19113746",
        "CC_BY_NC_ND_4_0": "Q24082749", "http://creativecommons.org/licenses/by-nc-nd/4.0/": "Q24082749",
        "CC_BY_NC_SA_2_5": "Q19068212", "http://creativecommons.org/licenses/by-nc-sa/2.5/": "Q19068212",
        "CC_BY_NC_SA_3_0": "Q15643954", "http://creativecommons.org/licenses/by-nc-sa/3.0/": "Q15643954",
        "CC_BY_NC_SA_4_0": "Q42553662", "http://creativecommons.org/licenses/by-nc-sa/4.0/": "Q42553662",
        "CC_BY_NC_4_0": "Q24082749", "http://creativecommons.org/licenses/by-nc/4.0/": "Q24082749",
        "CC_BY_NC_3_0": "Q18810331", "http://creativecommons.org/licenses/by-nc/3.0/": "Q18810331",
        "CC_BY_ND_2_5": "Q18810338", "http://creativecommons.org/licenses/by-nd/2.5/": "Q18810338",
        "CC_BY_ND_3_0": "Q18810160", "http://creativecommons.org/licenses/by-nd/3.0/": "Q18810160",
        "CC_BY_ND_4_0": "Q36795408", "http://creativecommons.org/licenses/by-nd/4.0/": "Q36795408",
         "CC_BY_SA_3_0": "Q14946043","http://creativecommons.org/licenses/by-sa/3.0/": "Q14946043",
        "CC_BY_SA_4_0": "Q18199165", "http://creativecommons.org/licenses/by-sa/4.0/": "Q18199165",
        "CC_BY_4_0": "Q18199165", "http://creativecommons.org/licenses/by/4.0/": "Q18199165",
        "CC_BY_3_0": "Q14947546", "http://creativecommons.org/licenses/by/3.0/": "Q14947546",
        "CC0_1_0": "Q18199165", "http://creativecommons.org/publicdomain/zero/1.0/": "Q18199165",
        "https://huh.harvard.edu/access-digital-reproductions-works-public-domain":"Q18199165"
    }
    if not pd.isna(row["license"]) and row["license"] in licenseDict:
        gbifRDF.add((GBIFURI, WDT.P275, URIRef("http://www.wikidata.org/entity/"+licenseDict[row["license"]])))
    else:
        pass
        # print(f"Error for gbifID={row['gbifID']}, license={row['license']}")

    # sex
    sexDict = {
        "FEMALE": "Q43445",
        "MALE": "Q44148"
    }
    if not pd.isna(row["sex"]) and row["sex"] in sexDict:
        gbifRDF.add((GBIFURI, WDT.P21, URIRef("http://www.wikidata.org/entity/"+sexDict[row["sex"]])))
    else:
        pass

    # recordedByID
    if not pd.isna(row["recordedByID"]):
        gbifRDF.add((GBIFURI, WDT.P170, Literal(str(row["recordedByID"]), datatype=XSD.string)))
    else:
        pass

    # associatedSequences
    if not pd.isna(row["associatedSequences"]):
        gbifRDF.add((GBIFURI, WDT.P4333, Literal(str(row["associatedSequences"]), datatype=XSD.string)))
    else:
        pass

    # occurrenceID
    if not pd.isna(row["occurrenceID"]):
        gbifRDF.add((GBIFURI, DC.identifier, Literal(str(row["occurrenceID"]), datatype=XSD.string)))

    # publisher
    if row["rightsHolder"] in instituteQID.keys():
        gbifRDF.add((GBIFURI, WDT.P123, URIRef("http://www.wikidata.org/entity/"+instituteQID[row["rightsHolder"]])))
        gbifRDF.add((URIRef("http://www.wikidata.org/entity/"+instituteQID[row["rightsHolder"]]), RDFS.label, Literal(row["rightsHolder"], lang="en")))

    # kingfom
    if not pd.isna(row["kingdom_y"]):
        gbifRDF.add((GBIFURI, WDT.P910, URIRef(row["kingdom_y"]))) #chapter
        gbifRDF.add((URIRef(row["kingdom_y"]), RDFS.label, Literal(row["kingdomname"], lang="en")))
    else:
        pass

    #Wiki article
    if not pd.isna(row["article"]):
        gbifRDF.add((GBIFURI, DC.description, URIRef(row["article"]))) #chapter
    else:
        pass

87it [00:00, 3431.42it/s]


In [56]:
df_image = pd.read_csv('data/0002020-240626123714530/multimedia.txt', sep='\t', on_bad_lines = 'warn')
for index, row in df_image.iterrows():
    GBIFURI = URIRef("https://gbif.semscape.org/occurrence/"+str(row["gbifID"]))
    mediaURI = BNode()
    if row["type"] == "StillImage":
        gbifRDF.add((mediaURI, WDT.P31, WD.Q17538176))
    elif row["type"] == "Sound":
        gbifRDF.add((mediaURI, WDT.P31, WD.Q11461))
    gbifRDF.add((mediaURI, DCTERMS.format, Literal(row["format"], datatype=XSD.string)))
    if not pd.isna(row["license"]) and row["license"] in licenseDict:
        gbifRDF.add((mediaURI, WDT.P275, URIRef("http://www.wikidata.org/entity/"+licenseDict[row["license"]])))
    else:
        pass
    gbifRDF.add((mediaURI, WDT.P2699, Literal(row["identifier"], datatype=XSD.string)))
    gbifRDF.add((mediaURI, WDT.P361, GBIFURI))

In [57]:
wdtaxqid

{'5289739': 'http://www.wikidata.org/entity/Q4925284',
 '2704745': 'http://www.wikidata.org/entity/Q7579252',
 '2717421': 'http://www.wikidata.org/entity/Q15584858',
 '2702504': 'http://www.wikidata.org/entity/Q24700355',
 '2705068': 'http://www.wikidata.org/entity/Q10912779',
 '2708680': 'http://www.wikidata.org/entity/Q11075670',
 '2702854': 'http://www.wikidata.org/entity/Q15508551',
 '2702852': 'http://www.wikidata.org/entity/Q15508581',
 '2702522': 'http://www.wikidata.org/entity/Q15516186',
 '5291147': 'http://www.wikidata.org/entity/Q15550242',
 '2705275': 'http://www.wikidata.org/entity/Q33913',
 '2705758': 'http://www.wikidata.org/entity/Q3219428',
 '2705656': 'http://www.wikidata.org/entity/Q3367780',
 '2706277': 'http://www.wikidata.org/entity/Q4790962',
 '2702936': 'http://www.wikidata.org/entity/Q15504902',
 '2715182': 'http://www.wikidata.org/entity/Q15535602',
 '2706079': 'http://www.wikidata.org/entity/Q15504478',
 '2720622': 'http://www.wikidata.org/entity/Q15557622',


In [58]:
# Define the URI prefix you are interested in
prefix = "http://www.wikidata.org/entity/"

# Initialize a set to hold all matching URIs
wikidata_uris = set()

# Iterate through all nodes in the graph, both subjects and objects
for node in gbifRDF.all_nodes():
    if isinstance(node, URIRef) and str(node).startswith(prefix):
        wikidata_uris.add(str(node))


In [59]:
gbifRDF.serialize(destination='gbifMontserrat.ttl', format='turtle', encoding="utf-8")

<Graph identifier=Nbf56de9dedd24e20b7021749cf540ce8 (<class 'rdflib.graph.Graph'>)>

In [60]:
wdGraph = Graph()

# Wrapping wikidata_uris with tqdm to create a progress bar
for wduri in tqdm.tqdm(wikidata_uris, desc="Processing URIs"):
    try:
        print(str(wduri))
        wdGraph.parse(str(wduri), format='turtle')
    except Exception as e:  # It's a good practice to log the exception
        print(f"An error occurred while parsing {wduri}: {e}")
        continue

Processing URIs:   0%|          | 0/41 [00:00<?, ?it/s]

http://www.wikidata.org/entity/Q24700355


Processing URIs:   2%|▏         | 1/41 [00:04<02:48,  4.22s/it]

http://www.wikidata.org/entity/Q15584858


Processing URIs:   5%|▍         | 2/41 [00:06<01:53,  2.90s/it]

http://www.wikidata.org/entity/Q21265100


Processing URIs:   7%|▋         | 3/41 [00:07<01:27,  2.31s/it]

http://www.wikidata.org/entity/Q15535602


Processing URIs:  10%|▉         | 4/41 [00:09<01:14,  2.02s/it]

http://www.wikidata.org/entity/Q4830676


Processing URIs:  12%|█▏        | 5/41 [00:11<01:18,  2.19s/it]

http://www.wikidata.org/entity/Q10912779


Processing URIs:  15%|█▍        | 6/41 [00:13<01:13,  2.09s/it]

http://www.wikidata.org/entity/Q15564658


Processing URIs:  17%|█▋        | 7/41 [00:15<01:10,  2.07s/it]

http://www.wikidata.org/entity/Q33913


Processing URIs:  20%|█▉        | 8/41 [00:19<01:25,  2.58s/it]

http://www.wikidata.org/entity/Q14834635


Processing URIs:  22%|██▏       | 9/41 [00:21<01:14,  2.34s/it]

http://www.wikidata.org/entity/Q15509620


Processing URIs:  24%|██▍       | 10/41 [00:22<01:00,  1.94s/it]

http://www.wikidata.org/entity/Q163571


Processing URIs:  27%|██▋       | 11/41 [00:23<00:52,  1.74s/it]

http://www.wikidata.org/entity/Q756


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#dateTime, Converter=<function parse_datetime at 0x120156ac0>
Traceback (most recent call last):
  File "/Users/andra/projects/Grasses-and-sedges-of-Montserrat/GrassesMontserrat/.venv/lib/python3.12/site-packages/rdflib/term.py", line 2119, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
           ^^^^^^^^^^^^^^^^^^
  File "/Users/andra/projects/Grasses-and-sedges-of-Montserrat/GrassesMontserrat/.venv/lib/python3.12/site-packages/isodate/isodatetime.py", line 55, in parse_datetime
    tmpdate = parse_date(datestring)
              ^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/andra/projects/Grasses-and-sedges-of-Montserrat/GrassesMontserrat/.venv/lib/python3.12/site-packages/isodate/isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '-1200000000

http://www.wikidata.org/entity/Q15504902


Processing URIs:  32%|███▏      | 13/41 [00:29<01:03,  2.26s/it]

http://www.wikidata.org/entity/Q3367780


Processing URIs:  34%|███▍      | 14/41 [00:31<00:57,  2.13s/it]

http://www.wikidata.org/entity/Q827098


Processing URIs:  37%|███▋      | 15/41 [00:33<00:56,  2.15s/it]

http://www.wikidata.org/entity/Q36495


Processing URIs:  39%|███▉      | 16/41 [00:35<00:52,  2.09s/it]

http://www.wikidata.org/entity/Q3052500


Processing URIs:  41%|████▏     | 17/41 [00:38<00:55,  2.33s/it]

http://www.wikidata.org/entity/Q56880932


Processing URIs:  44%|████▍     | 18/41 [00:39<00:48,  2.10s/it]

http://www.wikidata.org/entity/Q15557622


Processing URIs:  46%|████▋     | 19/41 [00:41<00:42,  1.92s/it]

http://www.wikidata.org/entity/Q15246196


Processing URIs:  49%|████▉     | 20/41 [00:42<00:37,  1.78s/it]

http://www.wikidata.org/entity/Q15555289


Processing URIs:  51%|█████     | 21/41 [00:44<00:35,  1.77s/it]

http://www.wikidata.org/entity/Q15508551


Processing URIs:  54%|█████▎    | 22/41 [00:46<00:32,  1.69s/it]

http://www.wikidata.org/entity/Q15550242


Processing URIs:  56%|█████▌    | 23/41 [00:47<00:28,  1.56s/it]

http://www.wikidata.org/entity/Q15508581


Processing URIs:  59%|█████▊    | 24/41 [00:49<00:26,  1.58s/it]

http://www.wikidata.org/entity/Q28913658


Processing URIs:  61%|██████    | 25/41 [00:50<00:26,  1.69s/it]

http://www.wikidata.org/entity/Q1043983


Processing URIs:  63%|██████▎   | 26/41 [01:00<00:58,  3.91s/it]

http://www.wikidata.org/entity/Q4790962


Processing URIs:  66%|██████▌   | 27/41 [01:03<00:51,  3.65s/it]

http://www.wikidata.org/entity/Q17538176


Processing URIs:  68%|██████▊   | 28/41 [01:04<00:37,  2.91s/it]

http://www.wikidata.org/entity/Q6539452


Processing URIs:  71%|███████   | 29/41 [01:07<00:34,  2.91s/it]

http://www.wikidata.org/entity/Q11066188


Processing URIs:  73%|███████▎  | 30/41 [01:09<00:29,  2.72s/it]

http://www.wikidata.org/entity/Q1860


Processing URIs:  76%|███████▌  | 31/41 [01:13<00:31,  3.19s/it]

http://www.wikidata.org/entity/Q4925284


Processing URIs:  78%|███████▊  | 32/41 [01:15<00:25,  2.88s/it]

http://www.wikidata.org/entity/Q18199165


Processing URIs:  80%|████████  | 33/41 [01:18<00:22,  2.79s/it]

http://www.wikidata.org/entity/Q11075670


Processing URIs:  83%|████████▎ | 34/41 [01:20<00:17,  2.46s/it]

http://www.wikidata.org/entity/Q24082749


Processing URIs:  85%|████████▌ | 35/41 [01:21<00:13,  2.17s/it]

http://www.wikidata.org/entity/Q501758


Processing URIs:  88%|████████▊ | 36/41 [01:24<00:11,  2.28s/it]

http://www.wikidata.org/entity/Q636275


Processing URIs:  90%|█████████ | 37/41 [01:26<00:09,  2.33s/it]

http://www.wikidata.org/entity/Q7579252


Processing URIs:  93%|█████████▎| 38/41 [01:28<00:06,  2.09s/it]

http://www.wikidata.org/entity/Q223557


Processing URIs:  95%|█████████▌| 39/41 [01:30<00:04,  2.15s/it]

http://www.wikidata.org/entity/Q15516186


Processing URIs:  98%|█████████▊| 40/41 [01:32<00:01,  1.96s/it]

http://www.wikidata.org/entity/Q15504478


Processing URIs: 100%|██████████| 41/41 [01:34<00:00,  2.31s/it]


In [61]:
query = """
PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    SELECT DISTINCT  ?taxon ?taxonLabel ?publisher ?publisherLabel ?observation ?gbifObservation ?media ?media_url ?license WHERE {
           ?media wdt:P2699 ?media_url ;
                  wdt:P275 ?license ;
                  wdt:P361 ?observation .
          ?observation wdt:P225 ?taxon ;
                       wdt:P854 ?gbifObservation ;
                       wdt:P123 ?publisher .
          ?taxon rdfs:label ?taxonLabel .
          OPTIONAL {?taxon dc:description ?article .}
          ?publisher rdfs:label ?publisherLabel .
            }"""
# Assuming gbifRDF.query(query) has been defined and executed
results = gbifRDF.query(query)
print(results)
taxonpages = {}  # Ensure that taxonpages is defined before the loop

for row in results:
    # Cache commonly used row items to avoid repetitive dictionary lookups and conversions
    taxon_label = str(row["taxonLabel"])
    print(taxon_label)
    publisher_label = str(row["publisherLabel"])

    # If the taxon label is not in taxonpages, initialize it with necessary keys
    if taxon_label not in taxonpages:
        taxonpages[taxon_label] = {
            "wd_uri": str(row["taxon"]),
            "publisher": {},  # Initialize the publisher dictionary here
        }
        # Add the article if it exists in the row
        if "article" in row:
            taxonpages[taxon_label]["article"] = str(row["article"])

    # Ensure the publisher label is initialized in the taxon's publisher dictionary
    if publisher_label not in taxonpages[taxon_label]["publisher"]:
        taxonpages[taxon_label]["publisher"][publisher_label] = {
            "observations": []  # Initialize the observations list here
        }

    # Construct the observation dictionary
    observation = {
        "observation_id": str(row["gbifObservation"]),
        "media": [str(row["media_url"])],  # Initialize media as a list with the media_url
        "license": str(row["license"])
    }
    # Append the observation dictionary to the observations list
    taxonpages[taxon_label]["publisher"][publisher_label]["observations"].append(observation)

# The rest of your processing can continue after this




<rdflib.plugins.sparql.processor.SPARQLResult object at 0x122efaf00>
Cyperus brevifolius
Cyperus brevifolius
Rhynchospora nervosa
Rhynchospora nervosa
Rhynchospora nervosa
Rhynchospora nervosa
Lasiacis divaricata
Axonopus compressus
Rhynchospora nervosa
Rhynchospora nervosa
Eleocharis flavescens
Eriochloa punctata
Cymbopogon citratus
Cyperus sphacelatus
Anthephora hermaphrodita
Lasiacis divaricata
Andropogon bicornis
Cenchrus echinatus
Scleria secans
Cyperus alopecuroides
Cyperus sphacelatus
Coix lacryma-jobi
Aristida adscensionis
Panicum trichoides
Paspalum conjugatum
Panicum trichoides
Aristida adscensionis
Digitaria insularis
Eriochloa punctata
Digitaria insularis
Rhynchospora tenerrima
Fimbristylis complanata
Anthephora hermaphrodita
Anthephora hermaphrodita
Anthephora hermaphrodita
Rhynchospora tenerrima
Cyperus alopecuroides
Cenchrus echinatus
Paspalum fimbriatum
Panicum trichoides
Panicum trichoides
Lasiacis divaricata
Lasiacis sorghoidea
Digitaria insularis
Cyperus sphacelatus


In [62]:
taxonpages

{'Cyperus brevifolius': {'wd_uri': 'http://www.wikidata.org/entity/Q21265100',
  'publisher': {'Meise Botanic Garden': {'observations': [{'observation_id': 'https://www.gbif.org/occurrence/4876300936',
      'media': ['https://iiif-manifest.oxalis.br.fgov.be/specimen/BR0000027525951V/manifest'],
      'license': 'http://www.wikidata.org/entity/Q18199165'},
     {'observation_id': 'https://www.gbif.org/occurrence/4876300936',
      'media': ['https://oxalis.br.fgov.be/images/V/BR0/000/027/525/951/BR0000027525951V.jpg'],
      'license': 'http://www.wikidata.org/entity/Q18199165'}]}}},
 'Rhynchospora nervosa': {'wd_uri': 'http://www.wikidata.org/entity/Q15555289',
  'publisher': {'Meise Botanic Garden': {'observations': [{'observation_id': 'https://www.gbif.org/occurrence/4876284841',
      'media': ['https://iiif-manifest.oxalis.br.fgov.be/specimen/BR0000026554150V/manifest'],
      'license': 'http://www.wikidata.org/entity/Q18199165'},
     {'observation_id': 'https://www.gbif.org/occ

In [63]:
import shutil
import os
shutil.rmtree('./taxa/')
os.makedirs(os.path.dirname('./taxa/'), exist_ok=True)


In [64]:
for taxon in taxonpages.keys():
    mdFile = MdUtils(file_name='taxa/'+ taxon.replace(" ", "_"), title=taxon+" ("+taxonpages[taxon]["wd_uri"].replace("http://www.wikidata.org/entity/", "")+")")

    if URIRef(row["taxon"]) in taxoninformation.keys():
        if "wikiintroduction" in taxoninformation[URIRef(row["taxon"])].keys():
            mdFile.new_line(taxoninformation[URIRef(row["taxon"])]["wikiintroduction"])
            mdFile.new_line(f"Read more on [English Wikipedia](https://en.wikipedia.org/page/{row['taxonLabel'].replace('' , '_')})")
    mdFile.new_line(f"Visualize Wikidata on [Scholia](https://scholia.toolforge.org/taxon/{taxonpages[taxon]['wd_uri'].replace('http://www.wikidata.org/entity/', '')})")
    for organisation in taxonpages[taxon]["publisher"].keys():
        mdFile.new_header(level=1, title=organisation)
        for observation in taxonpages[taxon]["publisher"][organisation]["observations"]:
            mdFile.new_line(f"Observation: [{observation['observation_id']}]({observation['observation_id']})")
            mdFile.new_line(f"License: [{observation['license']}]({observation['license']})")
            for media in observation["media"]:
                if media != "nan":
                    mdFile.new_line(f"![{observation['observation_id']}]({media.replace('square', 'medium')})")
    mdFile.create_md_file()


In [66]:
organization_to_taxa = {}

# Populate the organization_to_taxa dictionary
for taxon, data in taxonpages.items():
    for organisation, org_data in data["publisher"].items():
        if organisation not in organization_to_taxa:
            organization_to_taxa[organisation] = []
        organization_to_taxa[organisation].append({
            "taxon": taxon,
            "taxon_uri": data["wd_uri"],
            "observations": org_data["observations"]
        })

# Generate md files for each organization
for organisation, taxa in organization_to_taxa.items():
    mdFile = MdUtils(file_name='organisation/' + organisation.replace(" ", "_"), title=organisation)

    for taxon_data in taxa:
        taxon = taxon_data["taxon"]
        taxon_uri = taxon_data["taxon_uri"]
        observations = taxon_data["observations"]
        
        mdFile.new_header(level=1, title=taxon + " (" + taxon_uri.replace("http://www.wikidata.org/entity/", "") + ")")
        
        if URIRef(taxon_uri) in taxoninformation:
            taxon_info = taxoninformation[URIRef(taxon_uri)]
            if "wikiintroduction" in taxon_info:
                mdFile.new_line(taxon_info["wikiintroduction"])
                mdFile.new_line(f"Read more on [English Wikipedia](https://en.wikipedia.org/page/{taxon.replace(' ', '_')})")
        
        mdFile.new_line(f"Visualize Wikidata on [Scholia](https://scholia.toolforge.org/taxon/{taxon_uri.replace('http://www.wikidata.org/entity/', '')})")
        
        for observation in observations:
            mdFile.new_line(f"Observation: [{observation['observation_id']}]({observation['observation_id']})")
            mdFile.new_line(f"License: [{observation['license']}]({observation['license']})")
            for media in observation["media"]:
                if media != "nan":
                    mdFile.new_line(f"![{observation['observation_id']}]({media.replace('square', 'medium')})")

    mdFile.create_md_file()

In [89]:
import yaml

with open(r'_toc.yml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    species = yaml.load(file, Loader=yaml.FullLoader)

species["parts"][0]["chapters"] = []
for file in os.listdir('./organisation/'):
    species["parts"][0]["chapters"].append({'file': 'organisation/'+file})

species["parts"][1]["chapters"] = []
for file in os.listdir('./taxa/'):
    species["parts"][1]["chapters"].append({'file': 'taxa/'+file})
with open(r'_toc.yml', 'w+') as file:
    documents = yaml.dump(species, file)

## Below is for reference only

In [72]:
# Assuming `notFound` is your dictionary
data = []  # List to hold data for DataFrame

for institute, details in notFound.items():
    try:
        # Access the first result for the current institute
        if len(details["results"]) == 0:
            # If no results are found, append a dictionary with the institute and an error message
            print(f"No results found for {institute}")
            data.append({
                "Institute": "FOUT" + institute,
                "Error": "No results found"
            })
            continue
        first_result = details["results"][0]

        # Append a dictionary with the required information to the data list
        data.append({
            "Institute": institute,
            "Key": first_result.get("key", None),
            "Title": first_result.get("title", None),
            "Homepage": first_result.get("homepage", None),
            "City": first_result.get("city", None),
            "Country": first_result.get("country", None),
            "PostalCode": first_result.get("postalCode", None),
            "Address": first_result.get("address", None),
            "Phone": first_result.get("phone", None),
            "Email": first_result.get("email", None),
            "Endpoints": first_result.get("endpoints", None)
        })
    except KeyError as e:
        # If some data is not found, append a dictionary with the institute and an error message
        print(f"Error retrieving data for {institute}: {e}")
        data.append({
            "Institute": "FOUT" + institute,
            "Error": details["results"]
        })

# Create DataFrame from the data list
dfinstitutestoBuild = pd.DataFrame(data)
dfinstitutestoBuild


No results found for Delmaude C Ryan
No results found for slauren
No results found for gorillarebecca
No results found for monkeyjodey


Unnamed: 0,Institute,Error
0,FOUTDelmaude C Ryan,No results found
1,FOUTslauren,No results found
2,FOUTgorillarebecca,No results found
3,FOUTmonkeyjodey,No results found


In [73]:
df.groupby('license').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,modified,publisher,references,rightsHolder,type,institutionID,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
license,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC0_1_0,74,0,0,19,53,0,47,46,19,45,...,74,27,27,27,27,0,0,0,0,73
CC_BY_4_0,4,0,0,0,3,0,0,2,2,2,...,4,2,2,2,2,0,0,0,0,4
CC_BY_NC_4_0,9,0,0,0,8,0,8,8,1,2,...,9,7,7,7,7,0,0,0,0,9


In [74]:
df.groupby('iucnRedListCategory').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,gbifRegion,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name
iucnRedListCategory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
LC,41,0,0,11,41,28,0,24,25,13,...,41,41,14,14,14,14,0,0,0,0
NE,45,0,0,8,45,35,0,30,30,9,...,45,45,21,21,21,21,0,0,0,0


In [75]:
df.groupby('higherClassification').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
higherClassification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cyperaceae,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
POACEAE,1,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
"Plantae, Monocotyledonae, Poales, Cyperaceae, Cyperoideae",7,0,0,0,7,7,0,0,0,0,...,7,0,0,0,0,0,0,0,0,7
"Plantae, Monocotyledonae, Poales, Poaceae, Aristidoideae",1,0,0,0,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
"Plantae, Monocotyledonae, Poales, Poaceae, Panicoideae",18,0,0,0,18,18,0,0,0,0,...,18,1,1,1,1,0,0,0,0,18
Plantae|Spermatophyta|Tracheophyta|Magnoliophyta|Monocots|Commelinids|Poales|Cyperaceae|Cyperus,2,0,0,0,2,2,0,2,2,0,...,2,0,0,0,0,0,0,0,0,2
Plantae|Spermatophyta|Tracheophyta|Magnoliophyta|Monocots|Commelinids|Poales|Cyperaceae|Fimbristylis,1,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
Plantae|Spermatophyta|Tracheophyta|Magnoliophyta|Monocots|Commelinids|Poales|Cyperaceae|Scleria,1,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
Plantae|Spermatophyta|Tracheophyta|Magnoliophyta|Monocots|Commelinids|Poales|Poaceae|Anthephora,1,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1
Plantae|Spermatophyta|Tracheophyta|Magnoliophyta|Monocots|Commelinids|Poales|Poaceae|Digitaria,1,0,0,0,1,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [76]:
df.groupby('species').count()[["higherClassification"]]

Unnamed: 0_level_0,higherClassification
species,Unnamed: 1_level_1
Andropogon bicornis,1
Anthephora hermaphrodita,4
Aristida adscensionis,1
Axonopus compressus,0
Bambusa vulgaris,0
Cenchrus echinatus,1
Coix lacryma-jobi,1
Cymbopogon citratus,1
Cyperus alopecuroides,2
Cyperus brevifolius,0


In [77]:
df.groupby('species').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Andropogon bicornis,2,0,0,1,2,1,0,1,1,1,...,2,0,0,0,0,0,0,0,0,2
Anthephora hermaphrodita,5,0,0,1,5,4,0,2,2,1,...,5,1,1,1,1,0,0,0,0,5
Aristida adscensionis,3,0,0,2,3,1,0,2,2,2,...,3,2,2,2,2,0,0,0,0,3
Axonopus compressus,1,0,0,0,1,1,0,1,1,0,...,1,1,1,1,1,0,0,0,0,1
Bambusa vulgaris,1,0,0,0,1,1,0,1,1,0,...,1,1,1,1,1,0,0,0,0,1
Cenchrus echinatus,4,0,0,1,4,3,0,3,3,1,...,4,2,2,2,2,0,0,0,0,4
Coix lacryma-jobi,2,0,0,1,2,1,0,1,1,1,...,2,0,0,0,0,0,0,0,0,2
Cymbopogon citratus,2,0,0,0,2,2,0,1,1,0,...,2,1,1,1,1,0,0,0,0,2
Cyperus alopecuroides,3,0,0,1,3,2,0,1,1,1,...,3,0,0,0,0,0,0,0,0,3
Cyperus brevifolius,1,0,0,0,1,1,0,0,1,1,...,1,1,1,1,1,0,0,0,0,1


In [78]:
df.groupby('sex').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [79]:
df.groupby('taxonRank').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
taxonRank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SPECIES,86,0,0,19,86,63,0,54,55,22,...,86,35,35,35,35,0,0,0,0,86
VARIETY,1,0,0,0,1,1,0,1,1,0,...,1,1,1,1,1,0,0,0,0,0


In [80]:
df.groupby('publishingCountry').count()

Unnamed: 0_level_0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
publishingCountry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BE,2,0,0,0,2,2,0,0,2,2,...,2,2,2,2,2,0,0,0,0,2
DE,1,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
GB,14,0,0,0,14,12,0,11,11,0,...,14,10,10,10,10,0,0,0,0,14
US,70,0,0,19,70,50,0,43,43,20,...,70,24,24,24,24,0,0,0,0,69


In [81]:
result_df.groupby("kingdomname").count()

Unnamed: 0_level_0,taxonId,taxonname,taxon,kingdom,article,genericName,specificEpithet
kingdomname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
plant,28,28,28,28,28,28,28


In [82]:
merged_df[merged_df['taxonId']==""][['taxonKey', 'taxonId', 'taxonname', 'taxon', 'genericName', 'specificEpithet']].drop_duplicates()

Unnamed: 0,taxonKey,taxonId,taxonname,taxon,genericName,specificEpithet


In [83]:
    wdtaxqid

{'5289739': 'http://www.wikidata.org/entity/Q4925284',
 '2704745': 'http://www.wikidata.org/entity/Q7579252',
 '2717421': 'http://www.wikidata.org/entity/Q15584858',
 '2702504': 'http://www.wikidata.org/entity/Q24700355',
 '2705068': 'http://www.wikidata.org/entity/Q10912779',
 '2708680': 'http://www.wikidata.org/entity/Q11075670',
 '2702854': 'http://www.wikidata.org/entity/Q15508551',
 '2702852': 'http://www.wikidata.org/entity/Q15508581',
 '2702522': 'http://www.wikidata.org/entity/Q15516186',
 '5291147': 'http://www.wikidata.org/entity/Q15550242',
 '2705275': 'http://www.wikidata.org/entity/Q33913',
 '2705758': 'http://www.wikidata.org/entity/Q3219428',
 '2705656': 'http://www.wikidata.org/entity/Q3367780',
 '2706277': 'http://www.wikidata.org/entity/Q4790962',
 '2702936': 'http://www.wikidata.org/entity/Q15504902',
 '2715182': 'http://www.wikidata.org/entity/Q15535602',
 '2706079': 'http://www.wikidata.org/entity/Q15504478',
 '2720622': 'http://www.wikidata.org/entity/Q15557622',


In [84]:
df_image.groupby("license").count()

Unnamed: 0_level_0,gbifID,type,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,rightsHolder
license,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
"Partial images provided by this server are released under the Creative Commons cc-by-sa 3.0 (generic) licence [https://creativecommons.org/licenses/by-sa/3.0/de/]. Please credit images to BGBM following our citation guidelines [https://ww2.bgbm.org/Herbarium/cite.cfm]. If you would like to use images in a format or resolution which is not provided here, please contact us (d.roepert[at]bgbm.org).",2,1,2,2,2,0,0,0,0,0,0,0,0,0
http://creativecommons.org/licenses/by-nc/4.0/,8,8,8,8,7,1,1,0,0,7,8,0,7,8
http://creativecommons.org/licenses/by-sa/4.0/,4,4,4,4,0,4,4,0,0,0,4,0,0,4
http://creativecommons.org/licenses/by/4.0/,21,21,20,21,1,1,0,0,0,0,19,0,0,21
http://creativecommons.org/publicdomain/zero/1.0/,55,55,55,55,28,27,27,27,0,28,55,0,55,28


In [85]:
df_image.groupby("type").count()

Unnamed: 0_level_0,gbifID,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
InteractiveResource,2,2,2,0,2,2,0,0,0,2,0,0,2,2
StillImage,87,86,87,37,31,30,27,0,35,84,0,62,87,59


In [86]:
df_image[df_image["type"]=="Sound"]

Unnamed: 0,gbifID,type,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder


In [87]:
df_image.groupby('identifier').count()

Unnamed: 0_level_0,gbifID,type,format,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
http://n2t.net/ark:/65665/m30612c7f4-df88-4fbb-b508-cb574264bef3,1,1,1,0,1,1,1,0,0,1,0,1,1,0
http://n2t.net/ark:/65665/m3140f66a1-e14d-48e1-b859-a9cd019ffd21,1,1,1,0,1,1,1,0,0,1,0,1,1,0
http://n2t.net/ark:/65665/m31c9184db-f3b6-498c-b1fd-6e7356c686fb,1,1,1,0,1,1,1,0,0,1,0,1,1,0
http://n2t.net/ark:/65665/m31d2ae640-4b6b-4399-a7d7-50e02ec7a5ae,1,1,1,0,1,1,1,0,0,1,0,1,1,0
http://n2t.net/ark:/65665/m31fc6a10b-0312-48a6-b36a-e61a21a54136,1,1,1,0,1,1,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://inaturalist-open-data.s3.amazonaws.com/photos/355235906/original.jpg,1,1,1,1,0,0,0,0,1,1,0,1,1,1
https://inaturalist-open-data.s3.amazonaws.com/photos/362637316/original.jpg,1,1,1,1,0,0,0,0,1,1,0,1,1,1
https://inaturalist-open-data.s3.amazonaws.com/photos/63505592/original.jpeg,1,1,1,1,0,0,0,0,1,1,0,1,1,1
https://oxalis.br.fgov.be/images/V/BR0/000/026/554/150/BR0000026554150V.jpg,1,1,1,0,1,1,0,0,0,1,0,0,1,1


In [71]:
len(gbifRDF.all_nodes())

0

In [None]:
len (gbifRDF.all_nodes())

In [33]:
query = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    SELECT DISTINCT ?taxon ?taxonLabel WHERE {
           ?observation wdt:P225 ?taxon .
           ?taxon ?p ?taxonLabel .
            }"""
results = gbifRDF.query(query)
for row in results:
    print(row["taxon"], row["taxonLabel"])
    mdFile = mdutils.MdUtils(file_name='taxa/'+ row["taxonLabel"].replace(" ", "_"), title=row["taxonLabel"]+" ("+row["taxon"]+")")
    if URIRef(row["taxon"]) in taxoninformation.keys():
        if "wikiintroduction" in taxoninformation[URIRef(row["taxon"])].keys():
            mdFile.new_line(taxoninformation[URIRef(row["taxon"])]["wikiintroduction"])
            mdFile.new_line(f"Read more on [English Wikipedia](https://en.wikipedia.org/page/{row['taxonLabel'].replace('' , '_')})")
        mdFile.new_line(f"Visualize Wikidata on [Scholia](https://scholia.toolforge.org/taxon/{row['taxon'].replace('http://www.wikidata.org/entity/', '')})")

http://www.wikidata.org/entity/Q15381185 Sauvagesia erecta


NameError: name 'mdutils' is not defined

In [34]:
import mdutils
for taxon in table.keys():
    if table[taxon]["taxon_name"] in verified:
        mdFile = mdutils.MdUtils(file_name='new_articles/'+table[taxon]["taxon_name"].replace(" ", "_"),title=table[taxon]["taxon_name"])
        mdFile.new_line(f"[iNaturalist taxon id: {taxon}](https://www.inaturalist.org/taxa/{taxon})")
        for image in table[taxon]["photos"]:
            mdFile.new_line(f"![{table[taxon]['taxon_name']}]({image.replace('square', 'medium')})")
        mdFile.create_md_file()

NameError: name 'table' is not defined