# Join French Crop Usage (FCU) with TAXREF-LD via GEVES scientific names

Join FCU with TAXREF-LD using GEVES as an intermediate because GEVES has most FCU varieties and provides scientific names.
Process:
- join FCU with GEVES: where an FCU label (preferred or alternate, lower-cased) = the lower-cased GEVES species name
- join only the matched GEVES species names with the scientific names in TAXREF-LD

### Initializations

In [13]:
import sys
import json
import os
from string import Template
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON, POST
from time import sleep
from math import isnan, nan

In [14]:
# Make the `utils` module, located two directories above this notebook, importable.
sys.path.append('../..')
# NOTE(review): the star import presumably provides `prefixes`, `exec_sparql` and
# `dataframe_preview` used in the cells below — confirm, and prefer importing them explicitly.
from utils import *

In [15]:
# SPARQL endpoints used in this notebook.
# The queries are submitted to the local endpoint, and FCU is currently read from that same
# local endpoint (public alternative: http://ontology.inrae.fr/frenchcropusage/sparql).
local_endpoint  = "http://localhost:8080/sparql"
fcu_endpoint    = "http://localhost:8080/sparql"

# Remote endpoints
geves_endpoint  = "http://graph.i3s.unice.fr/repositories/geves"
taxref_endpoint = "https://taxref.mnhn.fr/sparql"

---
## Join FCU with GEVES: FCU variety names = GEVES species names

In [20]:
# Federated SPARQL query joining FCU with GEVES:
# - the first SERVICE clause fetches the FCU concepts under the "cultivated plants" and
#   "multi-usage" branches together with their lower-cased preferred/alternate labels;
# - the OPTIONAL SERVICE clause fetches the GEVES species names and scientific names, and keeps
#   only those whose lower-cased species name equals an FCU label.
# FCU labels with no GEVES match are kept, with unbound GEVES columns.
# Both service URLs come from the endpoint configuration cell (`fcu_endpoint`, `geves_endpoint`),
# so changing an endpoint there is enough to retarget the query.
query =  prefixes + '''
select distinct ?fcu_concept ?fcu_name ?fcu_name_type ?geves_lab_spe_dus ?geves_scientific_name where {

    # Query FCU crops
    service <''' + fcu_endpoint + '''> {
        select distinct ?fcu_concept ?fcu_name ?fcu_name_type where {

            # Select cultivated and multi-usage crops
            { <http://ontology.inrae.fr/frenchcropusage/Usages_plantes_cultivees> skos:narrower+ ?fcu_concept. }
            UNION
            { <http://ontology.inrae.fr/frenchcropusage/Multi_usages> skos:narrower+ ?fcu_concept. }

            # Get prefered and alternate labels and keep track of the type of label
            ?fcu_concept a skos:Concept.
            { ?fcu_concept skos:prefLabel ?lb. bind("pref" as ?fcu_name_type) }
            UNION
            { ?fcu_concept skos:altLabel  ?lb. bind("alt" as ?fcu_name_type) }

            bind(str(lcase(?lb)) as ?fcu_name)
        } order by ?fcu_concept
    }

    # Query GEVES
    optional {
        service <''' + geves_endpoint + '''> {
            [] 
                api:lab_spe_bota ?lab_spe_bota;
                api:lab_spe_dus ?geves_lab_spe_dus.
            bind(str(lcase(?lab_spe_bota))      as ?geves_scientific_name)
            bind(str(lcase(?geves_lab_spe_dus)) as ?geves_species)
        }

        # Match the GEVES species names and the FCU preferred/alternate labels:
        filter (?fcu_name = ?geves_species) 
    }
}'''

In [21]:
# Submit the federated FCU/GEVES query to the local endpoint; %time reports the wall time.
# The result is used as a pandas DataFrame in the following cells.
%time df_fcu_geves = exec_sparql(local_endpoint, query)

Wall time: 11.8 s


In [25]:
# Show the line count, per-column unique counts and first rows of the FCU/GEVES join
# (`end=10` presumably bounds the rows displayed — confirm against utils.dataframe_preview).
dataframe_preview(df_fcu_geves, end=10)

== Number of lines: 1712
== Number of unique values:
fcu_concept               524
fcu_name                 1656
fcu_name_type               2
geves_lab_spe_dus         104
geves_scientific_name      98
dtype: int64


Unnamed: 0,fcu_concept,fcu_name,fcu_name_type,geves_lab_spe_dus,geves_scientific_name
0,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier,pref,Abricotier,prunus armeniaca l.
1,http://ontology.inrae.fr/frenchcropusage/Abric...,abricot,alt,,
2,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier pays,pref,,
3,http://ontology.inrae.fr/frenchcropusage/Abric...,abricot pays,alt,,
4,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier des antilles,alt,,
5,http://ontology.inrae.fr/frenchcropusage/Abric...,mamey,alt,,
6,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier-pays,alt,,
7,http://ontology.inrae.fr/frenchcropusage/Actin...,actinidia,pref,,
8,http://ontology.inrae.fr/frenchcropusage/Actin...,groseille de chine,alt,,
9,http://ontology.inrae.fr/frenchcropusage/Actin...,kiwi,alt,,


In [27]:
# Export the complete FCU/GEVES join, including the FCU labels that have no GEVES match.
df_fcu_geves.to_excel("result1_fcu_geves.xlsx")

#### Count only matches

In [28]:
# dropna() removes every row with a missing value, i.e. the FCU labels without a GEVES match.
dataframe_preview(df_fcu_geves.dropna())

== Number of lines: 115
== Number of unique values:
fcu_concept              108
fcu_name                 104
fcu_name_type              2
geves_lab_spe_dus        104
geves_scientific_name     98
dtype: int64


Unnamed: 0,fcu_concept,fcu_name,fcu_name_type,geves_lab_spe_dus,geves_scientific_name
0,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier,pref,Abricotier,prunus armeniaca l.
13,http://ontology.inrae.fr/frenchcropusage/Ails,ail,pref,Ail,allium sativum l.
21,http://ontology.inrae.fr/frenchcropusage/Amand...,amandier,pref,Amandier,prunus amygdalus bartock
68,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,Artichaut,cynara scolymus
69,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,Artichaut,cynara cardunculus l.
71,http://ontology.inrae.fr/frenchcropusage/Asperges,asperge,pref,Asperge,asparagus officinalis l.
80,http://ontology.inrae.fr/frenchcropusage/Auber...,aubergine,pref,Aubergine,solanum melongena l.
86,http://ontology.inrae.fr/frenchcropusage/Avoin...,avoine d'hiver,pref,Avoine d'hiver,avena sativa l.
90,http://ontology.inrae.fr/frenchcropusage/Avoin...,avoine nue d'hiver,pref,Avoine nue d'hiver,avena nuda l.
92,http://ontology.inrae.fr/frenchcropusage/Avoin...,avoine nue de printemps,pref,Avoine nue de printemps,avena nuda l.


### Matches:
- FCU: 108 unique concepts, 104 unique labels
- GEVES: 104 unique species, 98 unique scientific names

---
## Join GEVES with TAXREF-LD on scientific name, only on the GEVES species matched with FCU

In [29]:
# SPARQL query to TAXREF-LD to match a single GEVES scientific name (placeholder $geves_scientific_name)
# The placeholder is substituted twice: once as the literal projected in the result rows and
# once as the prefix tested against the lower-cased TAXREF name.
# The substituted value lands inside double-quoted SPARQL literals, so it must not contain an
# unescaped double quote or backslash.
# NOTE(review): prefix matching over-matches when the GEVES name carries no authority, e.g.
# "cynara scolymus" also matches "Cynara scolymus subsp. ..." and "Cynara scolymus var. ...".
queryTpl = Template(prefixes + '''
select distinct ("$geves_scientific_name" as ?geves_scientific_name) ?taxref_full_name ?taxref_name_type ?taxon ?rank 
from <http://taxref.mnhn.fr/lod/graph/classes/15.0>
from <http://taxref.mnhn.fr/lod/graph/vernacular/15.0>
from <http://taxref.mnhn.fr/lod/graph/concepts>
where {
    ?name
       a                      skos:Concept, <http://rs.tdwg.org/ontology/voc/TaxonName#TaxonName>;
       rdfs:label             ?taxref_full_name.

    { ?name taxrefp:isReferenceNameOf ?taxon. bind("pref" as ?taxref_name_type) }
    union
    { ?name taxrefp:isSynonymOf       ?taxon. bind("alt" as ?taxref_name_type) }

    ?taxon
       taxrefp:hasRank        ?rank.

    # All ranks up to spcecies but not above
    filter (?rank in (
        taxrefrk:Species,  taxrefrk:SemiSpecies, taxrefrk:MicroSpecies, taxrefrk:SubSpecies, taxrefrk:Natio, 
        taxrefrk:Varietas, taxrefrk:SubVarietas, taxrefrk:Forma,        taxrefrk:SubForma,   taxrefrk:FormaSpecies,
        taxrefrk:Linea,    taxrefrk:Clone,       taxrefrk:Race,         taxrefrk:Cultivar,   taxrefrk:Morpha,
        taxrefrk:Abberatio ))

    # TAXREF names have the authority and date, whereas GEVES names do not have the date.
    # => match the GEVES name as a subpart of the TAXREF name
    bind(str(lcase(?taxref_full_name)) as ?taxref_scn_sl)
    filter(strstarts(?taxref_scn_sl, "$geves_scientific_name"))
}
''')

In [30]:
# Set a max number of queries to submit. 0 = unlimited.
MAX_QUERIES = 0

# Result DataFrame
df_geves_taxref = pd.DataFrame()

idx = 1
df_fcu_geves_matched = df_fcu_geves.dropna()
unique_names = df_fcu_geves_matched.geves_scientific_name.unique()
for geves_scientific_name in unique_names:
    query = queryTpl.substitute(geves_scientific_name = geves_scientific_name.strip().lower())
    #print(query)
    
    print(f"---- Running query {idx}/{len(unique_names)} - geves_scientific_name = {geves_scientific_name}")
    %time _df = exec_sparql(taxref_endpoint, query)
    print(f'Number of results: {_df.shape[0]}')
    df_geves_taxref = df_geves_taxref.append(_df)
    
    # Keep track of GEVES names not matched with TAXREF
    if _df.shape[0] == 0:
        nomatch_row = {'taxref_full_name': None, 'taxref_name_type': None, 'taxon': None, 'geves_scientific_name': geves_scientific_name.strip().lower()}
        df_geves_taxref = df_geves_taxref.append(nomatch_row, ignore_index=True)
    
    idx = idx + 1
    if MAX_QUERIES > 0 and idx > MAX_QUERIES:
        break

---- Running query 1/98 - geves_scientific_name = prunus armeniaca l.
Wall time: 8.79 s
Number of results: 1
---- Running query 2/98 - geves_scientific_name = allium sativum l.
Wall time: 10.1 s
Number of results: 1
---- Running query 3/98 - geves_scientific_name = prunus amygdalus bartock
Wall time: 10.3 s
Number of results: 0
---- Running query 4/98 - geves_scientific_name = cynara scolymus
Wall time: 8.53 s
Number of results: 3
---- Running query 5/98 - geves_scientific_name = cynara cardunculus l.
Wall time: 10.4 s
Number of results: 1
---- Running query 6/98 - geves_scientific_name = asparagus officinalis l.
Wall time: 8.85 s
Number of results: 1
---- Running query 7/98 - geves_scientific_name = solanum melongena l.
Wall time: 9.28 s
Number of results: 1
---- Running query 8/98 - geves_scientific_name = avena sativa l.
Wall time: 8.93 s
Number of results: 1
---- Running query 9/98 - geves_scientific_name = avena nuda l.
Wall time: 8.41 s
Number of results: 1
---- Running query 10/

In [31]:
# Export the GEVES/TAXREF-LD results, including the GEVES names that have no TAXREF match.
df_geves_taxref.to_excel("result2_geves_taxref.xlsx")

In [32]:
# Show the line count, per-column unique counts and first rows of the GEVES/TAXREF-LD results.
dataframe_preview(df_geves_taxref)

== Number of lines: 100
== Number of unique values:
geves_scientific_name    98
taxref_full_name         72
taxref_name_type          2
taxon                    70
rank                      2
dtype: int64


Unnamed: 0,geves_scientific_name,taxref_full_name,taxref_name_type,taxon,rank
0,prunus armeniaca l.,"Prunus armeniaca L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/116041,http://taxref.mnhn.fr/lod/taxrank/Species
1,allium sativum l.,"Allium sativum L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/81505,http://taxref.mnhn.fr/lod/taxrank/Species
2,prunus amygdalus bartock,,,,
3,cynara scolymus,"Cynara scolymus L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
4,cynara scolymus,Cynara scolymus subsp. cardunculus (L.) Bonnie...,alt,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
5,cynara scolymus,"Cynara scolymus var. redonensis N.H.F.Desp., 1838",alt,http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
6,cynara cardunculus l.,"Cynara cardunculus L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
7,asparagus officinalis l.,"Asparagus officinalis L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/84279,http://taxref.mnhn.fr/lod/taxrank/Species
8,solanum melongena l.,"Solanum melongena L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/124075,http://taxref.mnhn.fr/lod/taxrank/Species
9,avena sativa l.,"Avena sativa L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/85357,http://taxref.mnhn.fr/lod/taxrank/Species


#### Count only matches

In [34]:
# dropna() removes the placeholder rows of the GEVES names that have no TAXREF match.
dataframe_preview(df_geves_taxref.dropna())

== Number of lines: 72
== Number of unique values:
geves_scientific_name    70
taxref_full_name         72
taxref_name_type          2
taxon                    70
rank                      2
dtype: int64


Unnamed: 0,geves_scientific_name,taxref_full_name,taxref_name_type,taxon,rank
0,prunus armeniaca l.,"Prunus armeniaca L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/116041,http://taxref.mnhn.fr/lod/taxrank/Species
1,allium sativum l.,"Allium sativum L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/81505,http://taxref.mnhn.fr/lod/taxrank/Species
3,cynara scolymus,"Cynara scolymus L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
4,cynara scolymus,Cynara scolymus subsp. cardunculus (L.) Bonnie...,alt,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
5,cynara scolymus,"Cynara scolymus var. redonensis N.H.F.Desp., 1838",alt,http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
6,cynara cardunculus l.,"Cynara cardunculus L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
7,asparagus officinalis l.,"Asparagus officinalis L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/84279,http://taxref.mnhn.fr/lod/taxrank/Species
8,solanum melongena l.,"Solanum melongena L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/124075,http://taxref.mnhn.fr/lod/taxrank/Species
9,avena sativa l.,"Avena sativa L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/85357,http://taxref.mnhn.fr/lod/taxrank/Species
10,avena nuda l.,"Avena nuda L., 1756",pref,http://taxref.mnhn.fr/lod/taxon/85323,http://taxref.mnhn.fr/lod/taxrank/Species


### Matches:
- GEVES: 70 unique scientific names
- TAXREF-LD: 70 unique taxa

=> 28 GEVES names not matched with TAXREF-LD

---
## Join intermediate results: FCU-GEVES and GEVES-TAXREFLD

In [37]:
# Left join on the GEVES scientific name: every FCU/GEVES row is kept, and a row is repeated
# once per TAXREF name matched for its GEVES scientific name. Exact duplicate rows are removed.
df_merge = (
    df_fcu_geves
    .merge(df_geves_taxref, how='left', on="geves_scientific_name")
    .drop_duplicates()
)
dataframe_preview(df_merge)

== Number of lines: 1714
== Number of unique values:
fcu_concept               524
fcu_name                 1656
fcu_name_type               2
geves_lab_spe_dus         104
geves_scientific_name      98
taxref_full_name           72
taxref_name_type            2
taxon                      70
rank                        2
dtype: int64


Unnamed: 0,fcu_concept,fcu_name,fcu_name_type,geves_lab_spe_dus,geves_scientific_name,taxref_full_name,taxref_name_type,taxon,rank
0,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier,pref,Abricotier,prunus armeniaca l.,"Prunus armeniaca L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/116041,http://taxref.mnhn.fr/lod/taxrank/Species
1,http://ontology.inrae.fr/frenchcropusage/Abric...,abricot,alt,,,,,,
2,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier pays,pref,,,,,,
3,http://ontology.inrae.fr/frenchcropusage/Abric...,abricot pays,alt,,,,,,
4,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier des antilles,alt,,,,,,
5,http://ontology.inrae.fr/frenchcropusage/Abric...,mamey,alt,,,,,,
6,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier-pays,alt,,,,,,
7,http://ontology.inrae.fr/frenchcropusage/Actin...,actinidia,pref,,,,,,
8,http://ontology.inrae.fr/frenchcropusage/Actin...,groseille de chine,alt,,,,,,
9,http://ontology.inrae.fr/frenchcropusage/Actin...,kiwi,alt,,,,,,


### Count only matches

In [38]:
# dropna() keeps only the rows matched all the way from FCU through GEVES to TAXREF-LD.
dataframe_preview(df_merge.dropna())

== Number of lines: 87
== Number of unique values:
fcu_concept              81
fcu_name                 78
fcu_name_type             2
geves_lab_spe_dus        78
geves_scientific_name    70
taxref_full_name         72
taxref_name_type          2
taxon                    70
rank                      2
dtype: int64


Unnamed: 0,fcu_concept,fcu_name,fcu_name_type,geves_lab_spe_dus,geves_scientific_name,taxref_full_name,taxref_name_type,taxon,rank
0,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier,pref,Abricotier,prunus armeniaca l.,"Prunus armeniaca L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/116041,http://taxref.mnhn.fr/lod/taxrank/Species
13,http://ontology.inrae.fr/frenchcropusage/Ails,ail,pref,Ail,allium sativum l.,"Allium sativum L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/81505,http://taxref.mnhn.fr/lod/taxrank/Species
68,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,Artichaut,cynara scolymus,"Cynara scolymus L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
69,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,Artichaut,cynara scolymus,Cynara scolymus subsp. cardunculus (L.) Bonnie...,alt,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
70,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,Artichaut,cynara scolymus,"Cynara scolymus var. redonensis N.H.F.Desp., 1838",alt,http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
71,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,Artichaut,cynara cardunculus l.,"Cynara cardunculus L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
73,http://ontology.inrae.fr/frenchcropusage/Asperges,asperge,pref,Asperge,asparagus officinalis l.,"Asparagus officinalis L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/84279,http://taxref.mnhn.fr/lod/taxrank/Species
82,http://ontology.inrae.fr/frenchcropusage/Auber...,aubergine,pref,Aubergine,solanum melongena l.,"Solanum melongena L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/124075,http://taxref.mnhn.fr/lod/taxrank/Species
88,http://ontology.inrae.fr/frenchcropusage/Avoin...,avoine d'hiver,pref,Avoine d'hiver,avena sativa l.,"Avena sativa L., 1753",pref,http://taxref.mnhn.fr/lod/taxon/85357,http://taxref.mnhn.fr/lod/taxrank/Species
92,http://ontology.inrae.fr/frenchcropusage/Avoin...,avoine nue d'hiver,pref,Avoine nue d'hiver,avena nuda l.,"Avena nuda L., 1756",pref,http://taxref.mnhn.fr/lod/taxon/85323,http://taxref.mnhn.fr/lod/taxrank/Species


### Matches
- FCU: 81 unique concepts, 78 unique labels
- TAXREF-LD: 70 unique taxa from 2 ranks

### Exports

In [39]:
# Export the complete FCU/GEVES/TAXREF-LD join, unmatched rows included.
df_merge.to_excel("result3_fcu_geves_taxref.xlsx")

In [40]:
# Reshape the DataFrame for later merging with other methods:
# keep only the fully matched rows, drop the intermediate columns and rename the remaining ones.
df_merge = (
    df_merge
    .dropna()
    .drop(columns=['fcu_name_type', 'fcu_name', 'geves_scientific_name', 'taxref_name_type'])
    .rename(columns={
        'taxref_full_name': 'taxref_ref_full_name',
        "geves_lab_spe_dus": "geves_name",
    })
)

# Constant columns: the matching method, and an empty placeholder for the EPPO scientific name.
# The positions give the order: fcu_concept, method, geves_name, eppo_scientific_name, ...
df_merge.insert(1, 'method', 'geves')
df_merge.insert(3, 'eppo_scientific_name', '')

In [41]:
# Preview the reshaped matches, then export them to Excel and CSV without the DataFrame index.
dataframe_preview(df_merge)
df_merge.to_excel("result3_fcu_geves_taxref_merge.xlsx", index=False)
df_merge.to_csv("result3_fcu_geves_taxref_merge.csv", index=False)

== Number of lines: 87
== Number of unique values:
fcu_concept             81
method                   1
geves_name              78
eppo_scientific_name     1
taxref_ref_full_name    72
taxon                   70
rank                     2
dtype: int64


Unnamed: 0,fcu_concept,method,geves_name,eppo_scientific_name,taxref_ref_full_name,taxon,rank
0,http://ontology.inrae.fr/frenchcropusage/Abric...,geves,Abricotier,,"Prunus armeniaca L., 1753",http://taxref.mnhn.fr/lod/taxon/116041,http://taxref.mnhn.fr/lod/taxrank/Species
13,http://ontology.inrae.fr/frenchcropusage/Ails,geves,Ail,,"Allium sativum L., 1753",http://taxref.mnhn.fr/lod/taxon/81505,http://taxref.mnhn.fr/lod/taxrank/Species
68,http://ontology.inrae.fr/frenchcropusage/Artic...,geves,Artichaut,,"Cynara scolymus L., 1753",http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
69,http://ontology.inrae.fr/frenchcropusage/Artic...,geves,Artichaut,,Cynara scolymus subsp. cardunculus (L.) Bonnie...,http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
70,http://ontology.inrae.fr/frenchcropusage/Artic...,geves,Artichaut,,"Cynara scolymus var. redonensis N.H.F.Desp., 1838",http://taxref.mnhn.fr/lod/taxon/93795,http://taxref.mnhn.fr/lod/taxrank/Species
71,http://ontology.inrae.fr/frenchcropusage/Artic...,geves,Artichaut,,"Cynara cardunculus L., 1753",http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
73,http://ontology.inrae.fr/frenchcropusage/Asperges,geves,Asperge,,"Asparagus officinalis L., 1753",http://taxref.mnhn.fr/lod/taxon/84279,http://taxref.mnhn.fr/lod/taxrank/Species
82,http://ontology.inrae.fr/frenchcropusage/Auber...,geves,Aubergine,,"Solanum melongena L., 1753",http://taxref.mnhn.fr/lod/taxon/124075,http://taxref.mnhn.fr/lod/taxrank/Species
88,http://ontology.inrae.fr/frenchcropusage/Avoin...,geves,Avoine d'hiver,,"Avena sativa L., 1753",http://taxref.mnhn.fr/lod/taxon/85357,http://taxref.mnhn.fr/lod/taxrank/Species
92,http://ontology.inrae.fr/frenchcropusage/Avoin...,geves,Avoine nue d'hiver,,"Avena nuda L., 1756",http://taxref.mnhn.fr/lod/taxon/85323,http://taxref.mnhn.fr/lod/taxrank/Species
