# Direct join between French Crop Usage (FCU) and TAXREF-LD

Join condition: FCU crop name = TAXREF-LD vernacular name

### Initializations

In [1]:
import sys
import json
import os
from string import Template
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON, POST
from time import sleep
from math import isnan, nan

In [2]:
sys.path.append('../..')
from utils import *

In [3]:
# SPARQL endpoints queried by this notebook.
# TAXREF-LD: public endpoint at taxref.mnhn.fr.
taxref_endpoint = "https://taxref.mnhn.fr/sparql"
# FCU: a local instance is used here. Public alternative:
#   http://ontology.inrae.fr/frenchcropusage/sparql
fcu_endpoint = "http://localhost:8080/sparql"

___
# Get all varieties from FCU
FCU labels = vernacular names

In [4]:
# SPARQL query listing every FCU concept reachable through skos:narrower+ from the
# "Usages_plantes_cultivees" and "Multi_usages" nodes, with one row per label.
# Each label is lower-cased and converted with str() into ?fcu_name, and
# ?fcu_name_type records whether it came from skos:prefLabel ("pref") or
# skos:altLabel ("alt").
# NOTE(review): `prefixes` is not defined in this notebook; presumably it comes
# from the `from utils import *` star import — confirm.
query =  prefixes + '''
select distinct ?fcu_concept ?fcu_name ?fcu_name_type where {

    # Select cultivated and multi-usage crops
    { <http://ontology.inrae.fr/frenchcropusage/Usages_plantes_cultivees> skos:narrower+ ?fcu_concept. }
    UNION
    { <http://ontology.inrae.fr/frenchcropusage/Multi_usages> skos:narrower+ ?fcu_concept. }

    # Get prefered and alternate labels and keep track of the type of label
    ?fcu_concept a skos:Concept.
    { ?fcu_concept skos:prefLabel ?lb. bind("pref" as ?fcu_name_type) }
    UNION
    { ?fcu_concept skos:altLabel  ?lb. bind("alt" as ?fcu_name_type) }

    bind(str(lcase(?lb)) as ?fcu_name)

} order by ?fcu_concept
'''

In [5]:
# Run the FCU query against the FCU endpoint and keep the result in df_fcu
# (iterated row by row further down); %time reports how long the call took.
%time df_fcu = exec_sparql(fcu_endpoint, query)

Wall time: 273 ms


In [6]:
# Summarize the FCU result: number of lines, unique values per column, first rows.
# NOTE(review): presumably `end=5` limits the preview to 5 rows — confirm in utils.
dataframe_preview(df_fcu, end=5)

== Number of lines: 1706
== Number of unique values:
fcu_concept       524
fcu_name         1656
fcu_name_type       2
dtype: int64


Unnamed: 0,fcu_concept,fcu_name,fcu_name_type
0,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier,pref
1,http://ontology.inrae.fr/frenchcropusage/Abric...,abricot,alt
2,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier pays,pref
3,http://ontology.inrae.fr/frenchcropusage/Abric...,abricot pays,alt
4,http://ontology.inrae.fr/frenchcropusage/Abric...,abricotier des antilles,alt


---
# Join FCU with TAXREF-LD
### FCU varieties names = TAXREF-LD vernacular names

The SPARQL endpoint of TAXREF-LD does not support passing all varieties at once in a VALUES clause (the request ends with an "HTTP Error 502: Proxy Error").
To avoid this, we submit one SPARQL query per FCU name. This takes more time, but it completes.

In [7]:
# Per-name query template. $fcu_concept, $fcu_name and $fcu_name_type are filled in
# with Template.substitute() and echoed back as constant columns, so every result
# row carries the FCU values it was matched from. The match itself is an exact
# comparison between $fcu_name and the lower-cased TAXREF vernacular name.
# NOTE(review): the placeholders sit inside double-quoted SPARQL literals and are
# inserted verbatim, so values must have '"' and '\' escaped before substitution.
queryTpl = Template(prefixes + '''
select distinct ("$fcu_concept" as ?fcu_concept) ("$fcu_name" as ?fcu_name) ("$fcu_name_type" as ?fcu_name_type) ?taxref_full_name ?taxon ?rank
from <http://taxref.mnhn.fr/lod/graph/classes/15.0>
from <http://taxref.mnhn.fr/lod/graph/vernacular/15.0>
from <http://taxref.mnhn.fr/lod/graph/concepts>
where {
    ?taxon
       a                      owl:Class;
       taxrefp:hasReferenceName [ rdfs:label ?taxref_full_name ];
       taxrefp:hasRank        ?rank;
       taxrefp:vernacularName ?vn.

    # To lowercase + remove language tag if any
    filter (str(lcase(?vn)) = "$fcu_name")
    
    # All ranks up to spcecies but not above
    filter (?rank in (
        taxrefrk:Species,  taxrefrk:SemiSpecies, taxrefrk:MicroSpecies, taxrefrk:SubSpecies, taxrefrk:Natio, 
        taxrefrk:Varietas, taxrefrk:SubVarietas, taxrefrk:Forma,        taxrefrk:SubForma,   taxrefrk:FormaSpecies,
        taxrefrk:Linea,    taxrefrk:Clone,       taxrefrk:Race,         taxrefrk:Cultivar,   taxrefrk:Morpha,
        taxrefrk:Abberatio ))
}
''')

In [9]:
# Set a max number of queries to submit. 0 = unlimited.
MAX_QUERIES = 0

# Result DataFrame
df_fcu_taxref = pd.DataFrame()

for idx, row in df_fcu.iterrows():
    query = queryTpl.substitute(
        fcu_concept = row['fcu_concept'].strip(), 
        fcu_name = row['fcu_name'].strip().lower(),
        fcu_name_type = row['fcu_name_type'].strip().lower())
    #print(query)
    
    print(f"---- Running query {idx + 1}/{len(df_fcu)} - name = {row['fcu_name']}")
    %time _df = exec_sparql(taxref_endpoint, query)
    print(f'Number of results: {_df.shape[0]}')
    df_fcu_taxref = df_fcu_taxref.append(_df)
    
    # Keep track of the names not matched with TAXREF
    if _df.shape[0] == 0:
        nomatch_row = {'fcu_concept': row['fcu_concept'].strip(), 'fcu_name': row['fcu_name'].strip().lower(), 'fcu_name_type': row['fcu_name_type'].strip().lower(), 'taxref_full_name': None, 'taxon': None, 'rank': None}
        df_fcu_taxref = df_fcu_taxref.append(nomatch_row, ignore_index=True)

    if MAX_QUERIES > 0 and (idx+1) >= MAX_QUERIES:
        break

---- Running query 1/1706 - name = abricotier
Wall time: 619 ms
Number of results: 0
---- Running query 2/1706 - name = abricot
Wall time: 875 ms
Number of results: 0
---- Running query 3/1706 - name = abricotier pays
Wall time: 691 ms
Number of results: 0
---- Running query 4/1706 - name = abricot pays
Wall time: 577 ms
Number of results: 0
---- Running query 5/1706 - name = abricotier des antilles
Wall time: 588 ms
Number of results: 0
---- Running query 6/1706 - name = mamey
Wall time: 861 ms
Number of results: 0
---- Running query 7/1706 - name = abricotier-pays
Wall time: 886 ms
Number of results: 0
---- Running query 8/1706 - name = actinidia
Wall time: 770 ms
Number of results: 0
---- Running query 9/1706 - name = groseille de chine
Wall time: 741 ms
Number of results: 0
---- Running query 10/1706 - name = kiwi
Wall time: 772 ms
Number of results: 0
---- Running query 11/1706 - name = yang tao
Wall time: 865 ms
Number of results: 0
---- Running query 12/1706 - name = actinidier


In [10]:
# Number of matches
df_fcu_taxref_matched = df_fcu_taxref.dropna()
dataframe_preview(df_fcu_taxref_matched)

== Number of lines: 414
== Number of unique values:
fcu_concept         174
fcu_name            196
fcu_name_type         2
taxref_full_name    385
taxon               385
rank                  4
dtype: int64


Unnamed: 0,fcu_concept,fcu_name,fcu_name_type,taxref_full_name,taxon,rank
26,http://ontology.inrae.fr/frenchcropusage/Ananas,ananas,pref,"Ananas comosus (L.) Merr., 1917",http://taxref.mnhn.fr/lod/taxon/447782,http://taxref.mnhn.fr/lod/taxrank/Species
27,http://ontology.inrae.fr/frenchcropusage/Anemones,anémone,pref,Anemone hortensis nothosubsp. fulgens (J.Gay) ...,http://taxref.mnhn.fr/lod/taxon/131396,http://taxref.mnhn.fr/lod/taxrank/SubSpecies
31,http://ontology.inrae.fr/frenchcropusage/Angel...,angélique,pref,"Dicorynia guianensis Amshoff, 1939",http://taxref.mnhn.fr/lod/taxon/733639,http://taxref.mnhn.fr/lod/taxrank/Species
37,http://ontology.inrae.fr/frenchcropusage/Arach...,arachide,pref,"Arachis hypogaea L., 1753",http://taxref.mnhn.fr/lod/taxon/611649,http://taxref.mnhn.fr/lod/taxrank/Species
68,http://ontology.inrae.fr/frenchcropusage/Artic...,artichaut,pref,"Cynara cardunculus L., 1753",http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
71,http://ontology.inrae.fr/frenchcropusage/Asperges,asperge blanche,alt,"Asparagus albus L., 1753",http://taxref.mnhn.fr/lod/taxon/84265,http://taxref.mnhn.fr/lod/taxrank/Species
77,http://ontology.inrae.fr/frenchcropusage/Attiers,pomme cannelle,alt,"Annona squamosa L., 1753",http://taxref.mnhn.fr/lod/taxon/446901,http://taxref.mnhn.fr/lod/taxrank/Species
79,http://ontology.inrae.fr/frenchcropusage/Auber...,aubergine,pref,"Solanum melongena L., 1753",http://taxref.mnhn.fr/lod/taxon/124075,http://taxref.mnhn.fr/lod/taxrank/Species
80,http://ontology.inrae.fr/frenchcropusage/Avoca...,avocatier,pref,"Persea americana Mill., 1768",http://taxref.mnhn.fr/lod/taxon/447273,http://taxref.mnhn.fr/lod/taxrank/Species
84,http://ontology.inrae.fr/frenchcropusage/Avoines,avoine cultivée,alt,"Avena sativa subsp. sativa L., 1753",http://taxref.mnhn.fr/lod/taxon/132016,http://taxref.mnhn.fr/lod/taxrank/SubSpecies


### Matches:
- FCU: 174 unique concepts, 196 unique labels
- TAXREF-LD: 385 unique taxa from 4 ranks

### Exports

In [11]:
# Export the full result (matched and unmatched names) to an Excel file.
df_fcu_taxref.to_excel("result_fcu_taxref.xlsx")

In [12]:
# Reshape the DataFrame for later merging with other methods
df_fcu_taxref_matched.drop(columns=['fcu_name_type', 'fcu_name'], inplace=True)
df_fcu_taxref_matched.rename(columns={'taxref_full_name': 'taxref_ref_full_name'}, inplace=True)
df_fcu_taxref_matched.insert(1, 'method', 'direct')
df_fcu_taxref_matched.insert(2, 'geves_name', '')
df_fcu_taxref_matched.insert(3, 'eppo_scientific_name', '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [13]:
# Preview the reshaped matches, then export them to Excel and CSV.
# index=False leaves the DataFrame index out of both files.
dataframe_preview(df_fcu_taxref_matched)
df_fcu_taxref_matched.to_excel("result_fcu_taxref_merge.xlsx", index=False)
df_fcu_taxref_matched.to_csv("result_fcu_taxref_merge.csv", index=False)

== Number of lines: 414
== Number of unique values:
fcu_concept             174
method                    1
geves_name                1
eppo_scientific_name      1
taxref_ref_full_name    385
taxon                   385
rank                      4
dtype: int64


Unnamed: 0,fcu_concept,method,geves_name,eppo_scientific_name,taxref_ref_full_name,taxon,rank
26,http://ontology.inrae.fr/frenchcropusage/Ananas,direct,,,"Ananas comosus (L.) Merr., 1917",http://taxref.mnhn.fr/lod/taxon/447782,http://taxref.mnhn.fr/lod/taxrank/Species
27,http://ontology.inrae.fr/frenchcropusage/Anemones,direct,,,Anemone hortensis nothosubsp. fulgens (J.Gay) ...,http://taxref.mnhn.fr/lod/taxon/131396,http://taxref.mnhn.fr/lod/taxrank/SubSpecies
31,http://ontology.inrae.fr/frenchcropusage/Angel...,direct,,,"Dicorynia guianensis Amshoff, 1939",http://taxref.mnhn.fr/lod/taxon/733639,http://taxref.mnhn.fr/lod/taxrank/Species
37,http://ontology.inrae.fr/frenchcropusage/Arach...,direct,,,"Arachis hypogaea L., 1753",http://taxref.mnhn.fr/lod/taxon/611649,http://taxref.mnhn.fr/lod/taxrank/Species
68,http://ontology.inrae.fr/frenchcropusage/Artic...,direct,,,"Cynara cardunculus L., 1753",http://taxref.mnhn.fr/lod/taxon/93783,http://taxref.mnhn.fr/lod/taxrank/Species
71,http://ontology.inrae.fr/frenchcropusage/Asperges,direct,,,"Asparagus albus L., 1753",http://taxref.mnhn.fr/lod/taxon/84265,http://taxref.mnhn.fr/lod/taxrank/Species
77,http://ontology.inrae.fr/frenchcropusage/Attiers,direct,,,"Annona squamosa L., 1753",http://taxref.mnhn.fr/lod/taxon/446901,http://taxref.mnhn.fr/lod/taxrank/Species
79,http://ontology.inrae.fr/frenchcropusage/Auber...,direct,,,"Solanum melongena L., 1753",http://taxref.mnhn.fr/lod/taxon/124075,http://taxref.mnhn.fr/lod/taxrank/Species
80,http://ontology.inrae.fr/frenchcropusage/Avoca...,direct,,,"Persea americana Mill., 1768",http://taxref.mnhn.fr/lod/taxon/447273,http://taxref.mnhn.fr/lod/taxrank/Species
84,http://ontology.inrae.fr/frenchcropusage/Avoines,direct,,,"Avena sativa subsp. sativa L., 1753",http://taxref.mnhn.fr/lod/taxon/132016,http://taxref.mnhn.fr/lod/taxrank/SubSpecies
