Matching IUCN -> Checklist -> GBIF

Packages

In [1]:
import numpy as np
import os
import pandas as pd
import re
import sys

In [2]:
def read_set_in_csv(x):
    if x=='set()':
        return set()
    else:
        return set(x.replace("'",'').strip('{}').split(', '))

Package pour des fuzzy matchings de chaînes de caractères

In [2]:
from fuzzywuzzy import process, fuzz
def fuzzy_match_and_score(string,array):
    fuzzy_extraction = process.extractOne(string,array)
    return fuzzy_extraction[0], fuzzy_extraction[1]

# Chargement d'IUCN (Redlist)

In [3]:
iucn = pd.read_csv("/home/joon/data/iucn/taxonomy.csv",sep=',').set_index('internalTaxonId')

On ne garde que les colonnes suivantes, et seulement les plantes vasculaires (TRACHEOPHYTA).
Il y a un encodage bizarre pour "&" dans la colonne authority, on le corrige.

In [4]:
iucn = iucn[['phylumName','familyName','genusName','speciesName','authority']]
iucn = iucn[iucn.phylumName=='TRACHEOPHYTA']
iucn['authority'] = iucn['authority'].str.replace('&amp;',"&",regex=False)

Lorsqu'on fait des tests, on subsample IUCN pour raccourcir les calculs.

In [5]:
# iucn = iucn.sample(n=1000)

# Chargement de la checklist.

In [5]:
checklist = pd.read_csv('/home/joon/data/checklist_taxonomy_preprocessed.csv',index_col='plant_name_id')
checklist = checklist[['taxon_status', 'family', 'genus', 'species', 'taxon_name', 'taxon_authors', 'accepted_plant_name_id']]
checklist.dropna(axis=0,subset=['species'],inplace=True)

# Matching IUCN -> checklist

## Fuzzy Matching des genus IUCN -> checklist
Nécéssaire car certaines genus n'ont pas la même orthographe dans l'IUCN et la checklist
On recherche des matchs appartenant à la même famille, de longueur (de la chaîne de caractères) similaire, et commençant par la même lettre.

Les genus de l'IUCN qui ont nécessité un fuzzy matching sont listés avec le matching score et exporté dans un fichier Excel pour inspection et validation.

In [8]:
checklist_genus_values = checklist.genus.unique()
checklist_family_genus = checklist[['family','genus']].drop_duplicates()
checklist_families = checklist.family.unique()
genus_by_family = {}
for family in checklist_families:
    genus_by_family[family] = checklist.genus[checklist.family==family].values

iucn_family_genus = iucn[['familyName','genusName']].drop_duplicates()
iucn_family_genus.familyName = iucn_family_genus.familyName.str.title()
iucn_genus_values = iucn_family_genus.genusName.unique()

iucn_checklist_genus_matching = {}

In [9]:
iucn_checklist_genus_fuzzy_matching = pd.DataFrame([])

for [family,genus] in iucn_family_genus.values:
    if genus in checklist_genus_values:
        iucn_checklist_genus_matching[genus]=genus
    else:
        if family in checklist_families:
            values_of_family = genus_by_family[family]
        else:
            values_of_family = checklist_genus_values
        considered_values = [value for value in values_of_family if ((len(genus)*3/4-2<=len(value)) & (len(value) <=len(genus)*4/3+2) & (value[0]==genus[0]))]
        genus_guess, guess_score = fuzzy_match_and_score(genus,considered_values)
        iucn_checklist_genus_matching[genus]=genus_guess
        iucn_checklist_genus_fuzzy_matching = iucn_checklist_genus_fuzzy_matching.append(pd.Series([genus, genus_guess, guess_score]),ignore_index=True)
        print(genus, ": Best fuzzy match:", genus_guess, '(Score:',guess_score,')')

Platyspermation : Best fuzzy match: Periomphale (Score: 38 )
Aporusa : Best fuzzy match: Aporosa (Score: 86 )


Hieronima : Best fuzzy match: Hieronyma (Score: 89 )
Engelhardtia : Best fuzzy match: Engelhardia (Score: 96 )


Hyeronima : Best fuzzy match: Hieronyma (Score: 78 )
Hymenopus : Best fuzzy match: Hunga (Score: 36 )
Siphoneugenia : Best fuzzy match: Siphoneugena (Score: 96 )


Dasphyllum : Best fuzzy match: Dasyphyllum (Score: 95 )
Rughidia : Best fuzzy match: Ridolfia (Score: 62 )
Oreofraga : Best fuzzy match: Oreonana (Score: 71 )
Mitrostigma : Best fuzzy match: Mitriostigma (Score: 96 )


Acanthephippium : Best fuzzy match: Acanthophippium (Score: 93 )
Arisarum : Best fuzzy match: Arum (Score: 90 )


Ambrosina : Best fuzzy match: Ambrosinia (Score: 95 )
Leptobalanus : Best fuzzy match: Lepidocarpa (Score: 52 )


In [10]:
iucn_checklist_genus_fuzzy_matching.columns=['genus_in_iucn','matched_genus_in_checklist','score']
iucn_checklist_genus_fuzzy_matching.sort_values('score',ascending=False,inplace=True)

In [11]:
iucn_checklist_genus_fuzzy_matching.to_excel("genus_iucn_chacklist_matching.xlsx")

Détermination des matchings de genus validés. Les genus non validés seront recherchés dans TPL.
Première version à la hâche: on valide les matchs avec un score supérieur à 78.
TODO: Le faire manuellement.

In [12]:
iucn_checklist_genus_validated_matchings = iucn_checklist_genus_fuzzy_matching.score>=78
iucn_genus_discarded = iucn_checklist_genus_fuzzy_matching.loc[~iucn_checklist_genus_validated_matchings,'genus_in_iucn'].values
iucn_genus_values = [genus for genus in iucn_genus_values if genus not in iucn_genus_discarded]
# iucn_checklist_genus_fuzzy_matching['validated'] = iucn_checklist_genus_validated_matchings

In [13]:
# iucn_checklist_genus_validated = iucn_checklist_genus_fuzzy_matching[iucn_checklist_genus_fuzzy_matching.validated==True].genus_in_iucn.values
# iucn_checklist_genus_missing = iucn_checklist_genus_fuzzy_matching[iucn_checklist_genus_fuzzy_matching.validated==False].genus_in_iucn.values

On met de côté les entrées de l'IUCN dont les genus ne sont pas dans la checklist pour éventuellement les rechercher dans TPL:

In [14]:
# iucn_for_tpl = iucn[iucn.genusName.isin(iucn_checklist_genus_missing)]
# iucn.drop(iucn_for_tpl.index,inplace=True)

## Matching des lignes IUCN -> Checklist
iucn_checklist_matching contient pour chanque index iucn l'ensemble des accepted_plant_name_id correspondants dans la checklist

In [15]:
def filter_in_checklist_by_gs(genus,species,checklist=checklist):
    foo = checklist[checklist.genus==genus]
    return foo[foo.species==species]

In [16]:
def filter_in_checklist_by_author(string,df,regex=False):
    return df[df.taxon_authors.str.contains(string,regex=regex,case=True)==True]

In [17]:
def only_keep_best_taxon_status(df):
    worst_status = ['Invalid', 'Illegitimate', 'Misapplied', 'Unplaced', 'Artificial Hybrid', 'Incomplete', 'Orthograhic', 'Synonym', 'Accepted']
    df_tmp = df
    while (len(worst_status) > 0) & (df_tmp.accepted_plant_name_id.nunique() > 0):
        status = worst_status.pop(0)
        df = df_tmp
        df_tmp = df[df.taxon_status != status]
    return df

In [18]:
iucn_checklist_fuzzy_matchings_validated_by_thomas = pd.read_csv("iucn_checklist_fuzzy_matchings_validated_by_thomas.csv", sep=',',index_col=0)

In [19]:
def iucn_checklist_matching_process_genus(genus):
    global genus_iucn_checklist_matching, iucn, checklist

    iucn_checklist_matching = pd.Series([])

    iucn_species_from_genus = iucn.speciesName[iucn.genusName==genus]

    checklist_genus = iucn_checklist_genus_matching[genus]
    search_res_g = checklist[checklist.genus==checklist_genus] # filter by genus only

    for iucn_taxon_id in iucn_species_from_genus.index:
        giveup = False
        go_fuzzy = False
        iucn_row = iucn.loc[iucn_taxon_id]
        species = iucn_row.speciesName
        g_s = genus + ' ' + species
        # taxon = iucn_row.taxon_name
        # var_subsp = taxon.replace(g_s,'').replace(' var. ','').replace(' subsp. ','')
        authority = iucn_row.authority
        if pd.isnull(authority):
            authority = ""
        authority = authority.strip()
    
        search_res_gs = search_res_g[(search_res_g.genus==checklist_genus) & (search_res_g.species==species)]

        # if var_subsp != '':
        #     search_res_gs = search_res_gs[search_res_gs.scientificName.str.contains(re.escape(var_subsp))]

        if search_res_gs.shape[0] == 0: # if match fails with genus+species, consider all genus, and fuzzy match the species
            name_fuzzy_matchings = pd.DataFrame([[ind,name,score] for (name, score,ind) in process.extract(species,search_res_g.species,scorer=fuzz.ratio,limit=100)] ,columns=['ind','name','score']).set_index('ind')
            name_fuzzy_matchings = name_fuzzy_matchings[name_fuzzy_matchings.score>=max(100-2*(100-name_fuzzy_matchings.score.max()),79)]
            search_res_gs = search_res_g[search_res_g.species.isin(name_fuzzy_matchings.name.values)] 
        if search_res_gs.accepted_plant_name_id.nunique() == 1:
            final_search_res = search_res_gs
        else:
            search_res_matching_authority = search_res_gs[search_res_gs.taxon_authors==authority] # try exact match of author name
            if search_res_matching_authority.shape[0] == 0:
                authority_complete = authority
                authority = re.sub('\(.+\)','',authority).strip()
                search_res_matching_authority = search_res_gs[search_res_gs.taxon_authors==authority] # try exact match of author name with simplified authority
            if search_res_matching_authority.accepted_plant_name_id.nunique() >= 1:
                final_search_res = search_res_matching_authority
            else: # if exact match of author name failed
                search_res_gs.dropna(axis=0,subset=['taxon_authors'],inplace=True)
                search_res_containing_authority = search_res_gs[search_res_gs.taxon_authors.str.contains(authority)]
                if search_res_containing_authority.shape[0] == 0:
                    search_res_containing_authority = search_res_gs
                if search_res_containing_authority.accepted_plant_name_id.nunique() == 1:
                    final_search_res = search_res_containing_authority
                elif not giveup:
                    if search_res_containing_authority.accepted_plant_name_id.nunique() >= 2:
                        search_res_gs = search_res_containing_authority
                        name_fuzzy_matchings = pd.DataFrame([[ind,name,score] for (name, score,ind) in process.extract(authority,search_res_gs.taxon_authors,scorer=fuzz.partial_ratio,limit=100)] ,columns=['ind','name','score']).set_index('ind')
                        name_fuzzy_matchings = name_fuzzy_matchings[name_fuzzy_matchings.score>=max(100-2*(100-name_fuzzy_matchings.score.max()),79)]
                        if name_fuzzy_matchings.shape[0] > 1:
                            name_fuzzy_matchings = pd.DataFrame([[ind,name,score] for (name, score,ind) in process.extract(authority_complete,search_res_gs.taxon_authors,scorer=fuzz.partial_ratio,limit=100)] ,columns=['ind','name','score']).set_index('ind')
                            name_fuzzy_matchings = name_fuzzy_matchings[name_fuzzy_matchings.score>=max(100-2*(100-name_fuzzy_matchings.score.max()),79)]
                    final_search_res = search_res_gs[search_res_gs.taxon_authors.isin(name_fuzzy_matchings.name.values)] 

            if final_search_res.accepted_plant_name_id.nunique() > 1:
                print(g_s,authority, 'nonunique matches. Scientific names:',final_search_res.taxon_authors.unique())

        iucn_checklist_matching[iucn_taxon_id] = set(list(final_search_res.accepted_plant_name_id.unique()) + ([] if iucn_taxon_id not in iucn_checklist_matching.index else list(iucn_checklist_matching[iucn_taxon_id])))

    iucn_checklist_matching.to_csv('/home/joon/data/iucn-checklist-matching-'+genus+'.csv',header=False)

In [20]:
os.system('rm /home/joon/data/iucn-checklist-matching-*.csv')
os.system('rm /home/joon/data/iucn-checklist-matching.csv')

from multiprocessing import Pool
with Pool(processes=11) as pool:
    pool.map(iucn_checklist_matching_process_genus, iucn_genus_values)

#fusionner les résultats et supprimer les fichiers temporaires!
os.system('cat /home/joon/data/iucn-checklist-matching-*.csv > /home/joon/data/iucn-checklist-matching.csv')
os.system('rm /home/joon/data/iucn-checklist-matching-*.csv')

Syzygium micklethwaitii Verdc. nonunique matches. Scientific names: ['Verdc.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Eriosyce crispa (F.Ritter) Katt. nonunique matches. Scientific names: ['(F.Ritter) Katt.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Eriosyce paucicostata (F.Ritter) Ferryman nonunique matches. Scientific names: ['(F.Ritter) Ferryman']


Ophrys insectifera L. nonunique matches. Scientific names: ['L.']


Eriosyce recondita (F.Ritter) Katt. nonunique matches. Scientific names: ['(F.Ritter) Katt.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Eriosyce eriosyzoides (F.Ritter) Ferryman nonunique matches. Scientific names: ['(F.Ritter) Ferryman']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Thymus baeticus Boiss. ex Lacaita nonunique matches. Scientific names: ['Boiss. ex Lacaita']


Cistus chinamadensis Bañares & P.Romero nonunique matches. Scientific names: ['Bañares & P.Romero']


Mammillaria herrerae Werderm. nonunique matches. Scientific names: ['Werderm.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Mammillaria dioica K.Brandegee nonunique matches. Scientific names: ['K.Brandegee']


Carex acuta L. nonunique matches. Scientific names: ['L.']


Sparganium androcladum (Engelm.) Morong nonunique matches. Scientific names: ['(Engelm.) Morong']


Athyrium solenopteris (Kunze) T.Moore nonunique matches. Scientific names: ['(Kunze) T.Moore']


Cordyla somalensis J.B.Gillett nonunique matches. Scientific names: ['J.B.Gillett']


Polyalthia elmeri Merr. nonunique matches. Scientific names: ['Merr.']


Carex aperta Boott nonunique matches. Scientific names: ['Boott']


Carex atlantica Bailey nonunique matches. Scientific names: ['(L.H.Bailey) Cronquist' '(L.H.Bailey) Reznicek' 'L.H.Bailey']


Asparagus officinalis L. nonunique matches. Scientific names: ['L.']


Carex exsiccata Bailey nonunique matches. Scientific names: ['(Olney) L.H.Bailey' 'L.H.Bailey']


Barleria mpandensis I.Darbysh. nonunique matches. Scientific names: ['I.Darbysh.']


Carex interior Bailey nonunique matches. Scientific names: ['L.H.Bailey']


Barleria polhillii I.Darbysh. nonunique matches. Scientific names: ['I.Darbysh.']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Bulbophyllum zaratananae Schltr. nonunique matches. Scientific names: ['Schltr.']


Carex aquatilis Wahlenb. nonunique matches. Scientific names: ['Wahlenb.']


Melocactus stramineus Suringar nonunique matches. Scientific names: ['Suringar']


Cleistocactus winteri D.R.Hunt nonunique matches. Scientific names: ['D.R.Hunt']


Echinocereus parkeri N.P.Taylor nonunique matches. Scientific names: ['N.P.Taylor']


Scirpus atrocinctus Fern. nonunique matches. Scientific names: ['(Fernald) S.F.Blake' 'Fernald' '(Fernald) D.S.Carp.']


Ulex argenteus Webb nonunique matches. Scientific names: ['(Welw. ex Webb) D.A.Webb' 'Welw. ex Webb']


Colchicum cupanii Guss. nonunique matches. Scientific names: ['Guss.']


Stenandrium harlingii Wassh. nonunique matches. Scientific names: ['Wassh.']


Echinopsis aurea Briton & Rose nonunique matches. Scientific names: ['(Britton & Rose) M.Lowry' 'Britton & Rose']


Sinapidendron rupestre Lowe nonunique matches. Scientific names: ['Lowe']


Rebutia ritteri (Wessner) Buining & Donald nonunique matches. Scientific names: ['(Wessner) Buining & Donald']


Crataegus azarolus L. nonunique matches. Scientific names: ['L.']


Crocus hyemalis Boiss. & Blanche nonunique matches. Scientific names: ['Boiss. & Blanche']


Crocus hadriaticus Herb. nonunique matches. Scientific names: ['Herb.']


Arnica montana L. nonunique matches. Scientific names: ['L.']


Cryptotaenia calycina C.C.Towns. nonunique matches. Scientific names: ['C.C.Towns.']


Diospyros dicorypheoides H.Perrier nonunique matches. Scientific names: ['H.Perrier']


Pediocactus nigrispinus (Hochstätter) Hochstätter nonunique matches. Scientific names: ['(Hochstätter) Hochstätter']


Acorus calamus L. nonunique matches. Scientific names: ['L.']


Camellia gymnogyna H.T.Chang nonunique matches. Scientific names: ['(Hung T.Chang, F.L.Yu & P.S.Wang) T.L.Ming' 'Hung T.Chang']


Camellia kwangsiensis H.T.Chang nonunique matches. Scientific names: ['(Hung T.Chang, H.S.Wang & B.H.Chen) T.L.Ming' 'Hung T.Chang']


Discocactus diersianus Esteves nonunique matches. Scientific names: ['(Diers & Esteves) P.J.Braun & Esteves'
 '(Diers & Esteves) P.Braun & Esteves' 'Esteves Pereira']




Camellia mileensis T.L.Ming nonunique matches. Scientific names: ['T.L.Ming']


Salix farriae Ball nonunique matches. Scientific names: ['C.R.Ball' 'Coville & C.R.Ball']


Salix myrtillifolia Anderss. nonunique matches. Scientific names: ['(Andersson) Dorn' '(Andersson) C.R.Ball ex Hultén' 'Andersson']


Salix pyrifolia Anderss. nonunique matches. Scientific names: ['Andersson']


Diplotaxis gorgadensis Rustan nonunique matches. Scientific names: ['Rustan']


Orania decipiens Becc. nonunique matches. Scientific names: ['Becc.']


Salix pyrolifolia Ledeb. nonunique matches. Scientific names: ['Ledeb.']


Erigeron elatus Greene nonunique matches. Scientific names: ['Greene']


Prunus arborea Kalkm. nonunique matches. Scientific names: ['Kalkman' '(Hook.f.) Kalkman']


Oryza eichingeri Peter nonunique matches. Scientific names: ['Peter']


Prunus domestica L. nonunique matches. Scientific names: ['L.']


Selaginella sericea A.Braun nonunique matches. Scientific names: ['A.Br.']


Scleria robinsoniana J.Raynal nonunique matches. Scientific names: ['J.Raynal']


Euphorbia cap-saintemariensis Rauh nonunique matches. Scientific names: ['Rauh']


Euphorbia duranii Ursch & Leandri nonunique matches. Scientific names: ['Ursch & Leandri']


Euphorbia famatamboay Friedm. & Cremers nonunique matches. Scientific names: ['F.Friedmann & Cremers']


Euphorbia moratii Rauh nonunique matches. Scientific names: ['Rauh']


Euphorbia neobosseri Rauh nonunique matches. Scientific names: ['Rauh']


Magnolia conifera (Dandy) V.S.Kumar nonunique matches. Scientific names: ['(Dandy) V.S.Kumar']


Euphorbia platyclada Rauh nonunique matches. Scientific names: ['Rauh']


Mentha spicata L. nonunique matches. Scientific names: ['L.']


Tephrocactus articulatus (Pfeiff.) Backeb. nonunique matches. Scientific names: ['(Pfeiff.) Backeb.']


Aloe rigens Reynolds & P.R.O.Bally nonunique matches. Scientific names: ['Reynolds & P.R.O.Bally']


Magnolia virginiana L. nonunique matches. Scientific names: ['L.']


Pygmaeocereus bieblii Diers nonunique matches. Scientific names: ['Diers']


Magnolia officinalis Rehder & E.H.Wilson nonunique matches. Scientific names: ['Rehder & E.H.Wilson']


Euphorbia skottsbergii Sherff nonunique matches. Scientific names: ['Sherff']


Magnolia pacifica A.Vázquez nonunique matches. Scientific names: ['Iltis & A.Vazquez' 'Vazquez']


Brassica oleracea L. nonunique matches. Scientific names: ['L.']


Medicago papillosa Boiss. nonunique matches. Scientific names: ['Boiss.']


Phoenix atlantica A.Chev. nonunique matches. Scientific names: ['A.Chev.']


Memecylon varians Thwaites nonunique matches. Scientific names: ['Thwaites']


Lupinus rivularis Lindl. nonunique matches. Scientific names: ['(Lindl. ex J.Agardh) S.Watson' 'Douglas ex Lindl.']


Lupinus westianus L. nonunique matches. Scientific names: ['(McFarlin ex Beckner) Isely' 'Small']


Fritillaria carica Rix nonunique matches. Scientific names: ['Rix']


Typha angustifolia L. nonunique matches. Scientific names: ['L.']


Juniperus monticola Martínez nonunique matches. Scientific names: ['Martínez']


Gagea chabertii A.Terracc. nonunique matches. Scientific names: ['A.Terracc.']


Agrostis nevadensis Boiss. nonunique matches. Scientific names: ['Boiss.']


Sclerocactus wetlandicus Hochstätter nonunique matches. Scientific names: ['Hochstätter']


Genista insularis Bacch., Brullo & Feoli Chiapella nonunique matches. Scientific names: ['Bacch., Brullo & Feoli Chiapella']


Paphiopedilum glaucophyllum J.J.Sm. nonunique matches. Scientific names: ['J.J.Sm.']


Anacamptis palustris R.M.Bateman & Pridgeon & Chase nonunique matches. Scientific names: ['(Heuff.) R.M.Bateman, Pridgeon & M.W.Chase'
 '(T.Stephenson) R.M.Bateman, Pridgeon & M.W.Chase'
 '(Jacq.) R.M.Bateman, Pridgeon & M.W.Chase']




Coryphantha hintoniorum Dicht & A.Lüthy nonunique matches. Scientific names: ['Dicht & A.Lüthy']


Podocarpus archboldii N.E.Gray nonunique matches. Scientific names: ['N.E.Gray']


Isoetes panchganiensis G.K.Srivast., D.D.Pant & P.K.Shukla nonunique matches. Scientific names: ['G.K.Srivast., D.D.Pant & P.K.Shukla']


Cycas rumphii Miq. nonunique matches. Scientific names: ['Miq.']


Podocarpus pseudobracteatus de Laub. nonunique matches. Scientific names: ['de Laub.']


Juncus articulatus L. nonunique matches. Scientific names: ['L.']


Quercus nigra L. nonunique matches. Scientific names: ['L.']


Cycas arnhemica K.D.Hill nonunique matches. Scientific names: ['K.D.Hill']


Juncus fontanesii J.Gay nonunique matches. Scientific names: ['(J.Gay ex Laharpe) Nyman' 'J.Gay ex Laharpe']


Spergularia media (P.Monnier) Lambinon & Dobignard nonunique matches. Scientific names: ['(P.Monnier) Lambinon & Dobignard']


Juncus pelocarpus E. Mey. nonunique matches. Scientific names: ['(E.Mey.) Engelm.' 'E.Mey.']


Dalbergia glaberrima Bosser & Rabevohitra nonunique matches. Scientific names: ['Bosser & R.Rabev']


Macrozamia heteromera C.Moore nonunique matches. Scientific names: ['C.Moore']


Rhynchospora alba (L.) Vahl nonunique matches. Scientific names: ['(L.) Vahl']


Dioon tomasellii De Luca, Sabato & Vázq.Torres nonunique matches. Scientific names: ['De Luca, Sabato & Vázq.Torres']


Quercus oblongifolia Torr. nonunique matches. Scientific names: ['Torr.']


Coffea sessiliflora Bridson nonunique matches. Scientific names: ['Bridson']


Coffea mufindiensis Bridson nonunique matches. Scientific names: ['Bridson']


Armeria berlengensis Daveau nonunique matches. Scientific names: ['Daveau']


Alnus maximowiczii Callier nonunique matches. Scientific names: ['Callier']


Meistera tomrey Škorničk. & M.F.Newman nonunique matches. Scientific names: ['(Gagnep.) Skornick. & M.F.Newman']


Romulea corsica Jord. & Fourr. nonunique matches. Scientific names: ['Jord. & Fourr.']


Romulea variicolor Mifsud nonunique matches. Scientific names: ['Mifsud']


Cephalotaxus sinensis H.L.Li nonunique matches. Scientific names: ['(Rehder & E.H.Wilson) H.L.Li']


Acacia permixta Burtt Davy nonunique matches. Scientific names: ['Burtt Davy']


Anubias hastifolia Engl. nonunique matches. Scientific names: ['Engl.']


Abies fargesii Franch. nonunique matches. Scientific names: ['Franch.']


Acacia bussei Harms ex Y.Sjöstedt nonunique matches. Scientific names: ['Harms ex Y.Sjöstedt']


Begonia salaziensis Warb. nonunique matches. Scientific names: ['(Warb.) L.B.Sm. & Wassh.' '(Gaudich.) Warb.']


Abies vejarii Martínez nonunique matches. Scientific names: ['Martínez']


Acacia aemula Maslin nonunique matches. Scientific names: ['Maslin']


Paeonia officinalis L. nonunique matches. Scientific names: ['L.']


Acacia phlebopetala Maslin nonunique matches. Scientific names: ['Maslin']


Acalypha dictyoneura Müll, Arg. nonunique matches. Scientific names: ['Müll.Arg.' '(Müll.Arg.) J.F.Macbr.']


Sorbus amabilis Cheng ex Yu nonunique matches. Scientific names: ['Z.X.Yu' 'Cheng ex T.T.Yu & K.C.Kuan']


Dacrydium nidulum de Laub. nonunique matches. Scientific names: ['de Laub.']


Polycarpaea gayi Webb nonunique matches. Scientific names: ['Webb']


Sorbus arranensis Hedl. nonunique matches. Scientific names: ['Hedl.']


Clermontia grandiflora Gaud. nonunique matches. Scientific names: ['(Gaudich.) A.Gray' 'Gaudich.']


Clermontia multiflora Hillebr. nonunique matches. Scientific names: ['Hillebr.']


Nepenthes gracillima Ridley nonunique matches. Scientific names: ['Ridl.']


Cyanea arborea Hillebr. nonunique matches. Scientific names: ['Hillebr.']


Cyanea solenocalyx Hillebr. nonunique matches. Scientific names: ['Hillebr.']


Primula auriculata L. nonunique matches. Scientific names: ['(Ledeb.) Wendelbo' 'Lam.' 'Ledeb.']


Nothofagus crenata Steenis nonunique matches. Scientific names: ['Steenis']


Wahlenbergia pulchella Thulin nonunique matches. Scientific names: ['Thulin']


Dypsis humbertii H.Perrier nonunique matches. Scientific names: ['H.Perrier']


Eugenia koolauensis O.Deg. nonunique matches. Scientific names: ['O.Deg.']


Sophora fernandeziana (Phil.) Skottsb. nonunique matches. Scientific names: ['(Phil.) Skottsb.']


Sophora gypsophila B.L.Turner & A.M.Powell nonunique matches. Scientific names: ['B.L.Turner & A.M.Powell']


Eugenia johorensis Ridley nonunique matches. Scientific names: ['Ridl.']


Cyperus tenuiculmis Boeckeler nonunique matches. Scientific names: ['Boeckeler']


Eugenia kiahii Hend. nonunique matches. Scientific names: ['M.R.Hend.']


Rosa abietina Gren. ex H.Christ nonunique matches. Scientific names: ['Christ']


Cyperus dentatus Torr. nonunique matches. Scientific names: ['Torr.']


Cyperus pennatiformis Kük. nonunique matches. Scientific names: ['Kük.']


Saxifraga thiantha Harry Sm. nonunique matches. Scientific names: ['Harry Sm.']


Miconia cercophora Wurdack nonunique matches. Scientific names: ['Wurdack']


Nesaea parkeri Verdc. nonunique matches. Scientific names: ['Verdc.']


Potamogeton berchtoldii Fieber nonunique matches. Scientific names: ['Fieber']


Phyllanthus conjugatus M.Schmid nonunique matches. Scientific names: ['M.Schmid']


Allium schmitzii Cout. nonunique matches. Scientific names: ['Cout.']


Labordia triflora Hillebr. nonunique matches. Scientific names: ['Hillebr.']


Chassalia bojeri Bremek. nonunique matches. Scientific names: ['Bremek.']


Allium oliganthum Kar. & Kir. nonunique matches. Scientific names: ['Kar. & Kir.']


Neraudia angulata R.S.Cowan nonunique matches. Scientific names: ['R.S.Cowan']


Psychotria rimbachii Standl. nonunique matches. Scientific names: ['Standl.']


Trifolium masaiense J.B.Gillett nonunique matches. Scientific names: ['J.B.Gillett']


Callitriche platycarpa Kütz. nonunique matches. Scientific names: ['Kütz.']


Callitriche palustris L. nonunique matches. Scientific names: ['L.']


Potentilla bryoides Sojak nonunique matches. Scientific names: ['Soják']


Lonchocarpus guatemalensis Benth. nonunique matches. Scientific names: ['Benth.']


Astragalus coltonii M.E.Jones nonunique matches. Scientific names: ['M.E.Jones']


Astragalus lentiginosus Hook. nonunique matches. Scientific names: ['(Douglas ex Hook.) M.E.Jones' 'Douglas ex Hook.']


Schinus engleri Barkley nonunique matches. Scientific names: ['F.A.Barkley']


Cordia collococca L. nonunique matches. Scientific names: ['L.']


Prosopis alba Grisebach nonunique matches. Scientific names: ['Griseb.']


Astragalus sparsiflorus A.Gray nonunique matches. Scientific names: ['A.Gray']


Geissois pruinosa Brongn. & Gris nonunique matches. Scientific names: ['Brongn. & Gris']


Alphonsea maingayi Hook. f. & Thomson nonunique matches. Scientific names: ['(Hook.f. & Thomson) Ridl.' 'Hook.f. & Thomson']


Litsea hirsutissima Gamble nonunique matches. Scientific names: ['Gamble']


Litsea spathacea Gamble nonunique matches. Scientific names: ['Gamble']


Euonymus europaeus L. nonunique matches. Scientific names: ['L.']


Conostegia centronioides Markgr. nonunique matches. Scientific names: ['Markgr.']


Celtis laevigata Willd. nonunique matches. Scientific names: ['Willd.']


Discoclaoxylon occidentale Pax & Hoffm. nonunique matches. Scientific names: ['Pax & K.Hoffm.' '(Müll.Arg.) Pax & K.Hoffm.']


Terniopsis brevis M.Kato nonunique matches. Scientific names: ['M.Kato']


Berberis taronensis Ahrendt nonunique matches. Scientific names: ['Ahrendt']


Gnetum montanum Markgr. nonunique matches. Scientific names: ['Markgr.']


Gnetum neglectum Blume nonunique matches. Scientific names: ['Blume']


Knema oblongifolia (King) Warb. nonunique matches. Scientific names: ['(King) Warb.']


Scalesia villosa Stewart nonunique matches. Scientific names: ['A.Stewart' 'Stewart & J.T.Howell']


Knema conferta (King) Warb. nonunique matches. Scientific names: ['(King) Warb.']


Tephrosia brachyodon Domin nonunique matches. Scientific names: ['Domin']


Tephrosia cephalantha Welw. ex Baker nonunique matches. Scientific names: ['Welw. ex Baker']


Dialium unifoliolatum Capuron ex Du Puy & R.Rabev. nonunique matches. Scientific names: ['Capuron']


Calyptranthes thomasiana O.Berg nonunique matches. Scientific names: ['O.Berg']


Saurauia erythrocarpa C.F.Liang & Y.S.Wang nonunique matches. Scientific names: ['C.F.Liang & Y.S.Wang']


Senna macranthera H.S.Irwin & Barneby nonunique matches. Scientific names: ['H.S.Irwin & Barneby']


Senna pendula H.S.Irwin & Barneby nonunique matches. Scientific names: ['H.S.Irwin & Barneby']


Senna organensis H.S.Irwin & Barneby nonunique matches. Scientific names: ['H.S.Irwin & Barneby']


Flindersia pimenteliana F. Muell nonunique matches. Scientific names: ['(F.Muell.) K.D.Scott, W.K.Harris & Playford' 'F.Muell.']


Alternanthera filifolia J.T. Howell nonunique matches. Scientific names: ['J.T.Howell' '(Hook.f.) J.T.Howell']


Mimosa albida Willd. nonunique matches. Scientific names: ['(Willd.) B.L.Rob.' 'Humb. & Bonpl. ex Willd.']


Mimosa foliolosa Benth. nonunique matches. Scientific names: ['Benth.']


Garcinia cantleyana Whitm. nonunique matches. Scientific names: ['Whitmore']


Vangueria bicolor K.Schum. nonunique matches. Scientific names: ['K.Schum.']


Durio testudinarum Becc. nonunique matches. Scientific names: ['Becc.']


Myrtus communis L. nonunique matches. Scientific names: ['L.']


Bidens trichosperma Britton nonunique matches. Scientific names: ['Britton']


Phyllostegia helleri Sherff nonunique matches. Scientific names: ['Sherff']


Zornia pardina Mohlenbr. nonunique matches. Scientific names: ['Mohlenbr.']


Gnidia daphnifolia L.f. nonunique matches. Scientific names: ['L.f.']


Pitcairnia simulans H.Luther nonunique matches. Scientific names: ['H.Luther']


Neuracanthus tephrophyllus Bidgood & Brummitt nonunique matches. Scientific names: ['Bidgood & Brummitt']


Syngonanthus lanatus Moldenke nonunique matches. Scientific names: ['Moldenke']


Andrachne schweinfurthii Radcl-Sm. nonunique matches. Scientific names: ['Radcl.-Sm.' '(Balf.f.) Radcl.-Sm.']


Nothaphoebe kingiana Gamble nonunique matches. Scientific names: ['Gamble']


Centropogon saltuum E.Wimm. nonunique matches. Scientific names: ['E.Wimm.']


Neriacanthus harlingii Wassh. nonunique matches. Scientific names: ['Wassh.']


Huilaea macrocarpa L.Uribe nonunique matches. Scientific names: ['L.Uribe']


Pappobolus ecuadoriensis Panero nonunique matches. Scientific names: ['Panero']


Anthurium bonplandii G.S.Bunting nonunique matches. Scientific names: ['G.S.Bunting']


Calceolaria adenanthera Molau nonunique matches. Scientific names: ['Molau']


Clitoria sagotii Fantz nonunique matches. Scientific names: ['Fantz']


Calceolaria pedunculata Molau nonunique matches. Scientific names: ['Molau']


Koeleria novozelandica Domin nonunique matches. Scientific names: ['Domin']




Gastrolobium spinosum Benth. nonunique matches. Scientific names: ['Benth.']


Dendrobium poneroides Schltr. nonunique matches. Scientific names: ['Schltr.']


Dyschoriste keniensis Malombe, Mwachala & Vollesen nonunique matches. Scientific names: ['Malombe, Mwachala & Vollesen']


Abrus madagascariensis R.Vig. nonunique matches. Scientific names: ['R.Vig.']


Pleurothallis elegantula Cogn. nonunique matches. Scientific names: ['Cogn.']


Xerochlamys elliptica F.Gérard nonunique matches. Scientific names: ['F.Gérard']


Hedysarum polybotrys Hand.-Mazz. nonunique matches. Scientific names: ['Hand.-Mazz.']


Trichoglottis winkleri J.J.Sm. nonunique matches. Scientific names: ['J.J.Sm.']


Octolepis dioica Capuron nonunique matches. Scientific names: ['Capuron']


Pteridium pinetorum C.N.Page & R.R.Mill nonunique matches. Scientific names: ['C.N.Page & R.R.Mill']


Fraxinus sieboldiana Blume nonunique matches. Scientific names: ['Blume']


Prunella vulgaris L. nonunique matches. Scientific names: ['L.']


Lepidagathis sericea Benoist nonunique matches. Scientific names: ['Benoist']


Dendrosenecio cheranganiensis (Cotton & Blakelock) E.B.Knox nonunique matches. Scientific names: ['(Cotton & Blakelock) E.B.Knox']


Scirpoides holoschoenus (L.) Soják nonunique matches. Scientific names: ['(L.) Soják']


Kadua degeneri (Fosberg) W.L.Wagner & Lorence nonunique matches. Scientific names: ['(Fosberg) W.L.Wagner & Lorence']


Rhopalocarpus macrorhamnifolius Capuron nonunique matches. Scientific names: ['Capuron']


Tylostigma filiforme H.Perrier nonunique matches. Scientific names: ['H.Perrier']


Solidago riddellii Frank ex Riddell nonunique matches. Scientific names: ['Frank']


Foetidia macrocarpa Bosser nonunique matches. Scientific names: ['Bosser']


Ethulia vernonioides M.G. Gilbert nonunique matches. Scientific names: ['M.G.Gilbert' '(Schweinf.) M.G.Gilbert']


Mundulea anceps R.Vig. nonunique matches. Scientific names: ['R.Vig.']


Micronychia tsiramiramy H.Perrier nonunique matches. Scientific names: ['H.Perrier']


Otiophora lebruniana (Bamps) Robbr. & Puff nonunique matches. Scientific names: ['(Bamps) Robbr. & Puff']


Strobilanthes accrescens J.R.I.Wood nonunique matches. Scientific names: ['J.R.I.Wood']


Cunonia pulchella Brongn. & Gris nonunique matches. Scientific names: ['Brongn. & Gris']


Aulotandra trigonocarpa H.Perrier nonunique matches. Scientific names: ['H.Perrier']


Capsicum annuum L. nonunique matches. Scientific names: ['L.']


0

Load the (previously computed and saved) results

In [6]:
iucn_checklist_matching = pd.read_csv('/home/joon/data/iucn-checklist-matching.csv',index_col=0,header=None,squeeze=True,converters={1:read_set_in_csv})

# MATCHING CHECKLIST -> GBIF

On récupère les `accepted_plant_name_ids` concernés, c'est-à-dire sélectionnés par le matching iucn -> checklist.

In [22]:
# accepted_plant_name_ids = {id for l in list(iucn_checklist_matching[iucn_checklist_matching.validated]['accepted_ids_in_checklist']) for id in l}
# accepted_plant_name_ids = {id for l in iucn_checklist_matching[iucn_checklist_species_validated_matchings] for id in l}

In [6]:
#????? à valider LES PLANT NAME IDS CORRESPONDANT AUX PLANTES GBIF
# accepted_plant_name_ids = iucn_checklist_matching.loc[iucn_checklist_species_validated_matchings,'accepted_plant_name_id']
# checklist = checklist[checklist.accepted_plant_name_id.isin(accepted_plant_name_ids)]

# TOUS LES PLANTES DE LA CHECKLIST
accepted_plant_name_ids = checklist.accepted_plant_name_id.unique()

On restraint la checklist aux lignes correspondantes.

## Loading GBIF names

In [7]:
gbif = pd.read_csv("/home/joon/data/gbif-names.csv",sep='	')
gbif = gbif[['taxonKey','scientificName', 'acceptedTaxonKey', 'family', 'genus', 'species']]
gbif = gbif[(gbif.genus.isnull() == False) & (gbif.species.isnull() == False)]

## Matching genus
Nécéssaire car certaines genus n'ont pas la même orthographe dans la checklist et le GBIF
On recherche des matchs appartenant à la même famille, de longueur (de la chaîne de caractères) similaire, et commençant par la même lettre.
Ce n'est pas grave de matcher plusieurs genus de façon parfois approximative, car ça ne sert qu'à restreindre ensuite la recherche des species.

In [305]:
gbif_genus_values = gbif.genus.unique()
checklist_genus_values = checklist.genus.unique()

In [306]:
genus_checklist_gbif_matching = pd.DataFrame()

for genus in checklist_genus_values:
    considered_values = [value for value in gbif_genus_values if ((len(genus)*3/4-2<=len(value)) & (len(value) <=len(genus)*4/3+2) & (value[0].casefold()==genus[0].casefold()))]
    if len(considered_values) >= 1:
        genus_fuzzy_matchings = pd.DataFrame([[value,score] for (value, score) in process.extract(genus,considered_values)] ,columns=['value','score'])
        genus_fuzzy_matchings = genus_fuzzy_matchings[genus_fuzzy_matchings.score>=100-2*(100-genus_fuzzy_matchings.score.max())]
        genus_checklist_gbif_matching = genus_checklist_gbif_matching.append(pd.Series([list(genus_fuzzy_matchings.value.values)],name=genus))
    else:
        genus_checklist_gbif_matching = genus_checklist_gbif_matching.append(pd.Series([],name=genus))
genus_checklist_gbif_matching.columns = ['matched_genus_in_gbif']
genus_checklist_gbif_matching.to_csv('genus_checklist_gbif_matching.csv')

In [307]:
genus_checklist_gbif_matching = pd.read_csv('genus_checklist_gbif_matching.csv',squeeze=True,index_col=0,converters={'matched_genus_in_gbif':lambda x: x.strip("[]").replace("'","").split(", ")})

# Matching des checklist row -> GBIF acceptedTaxonKey
checklist_gbif_matching contient la correspondance plant_name_id -> acceptedTaxonKey

In [28]:
#checklist_gbif_matching.to_csv('checklist_gbif_matching.csv')
#checklist_gbif_matching.to_csv('all_checklist_gbif_matching.csv')

In [1]:
def checklist_gbif_matching_process_genus(genus):
    global genus_checklist_gbif_matching, checklist, gbif

    checklist_gbif_matching = pd.Series([])

    genus_values = genus_checklist_gbif_matching.loc[genus]
    checklist_species_from_genus = checklist.species[checklist.genus==genus]

    search_res_g = gbif[gbif.genus.isin(genus_values)] # filter by genus only

    for plant_name_id in checklist_species_from_genus.index:
        giveup = False
        checklist_row = checklist.loc[plant_name_id]
        species = checklist_row.species
        g_s = genus + ' ' + species
        taxon = checklist_row.taxon_name
        var_subsp = taxon.replace(g_s,'').replace(' var. ','').replace(' subsp. ','')
        authority = checklist_row.taxon_authors

        search_res_gs = search_res_g[search_res_g.species==g_s]

        if var_subsp != '':
            search_res_gs = search_res_gs[search_res_gs.scientificName.str.contains(re.escape(var_subsp))]

        if search_res_gs.shape[0] == 0: # if match fails with genus+species, consider all genus, and fuzzy match the species
            os.system('echo "'+str(plant_name_id)+" > log.txt")
            name_fuzzy_matchings = None
            try:
                name_fuzzy_matchings = pd.DataFrame([[ind,name,score] for (name, score,ind) in process.extract(species,search_res_g.species,scorer=fuzz.ratio,limit=100)],columns=['ind','name','score']).set_index('ind')
            except:
                print(sys.exc_info()[0])
                print(g_s)
            name_fuzzy_matchings = name_fuzzy_matchings[name_fuzzy_matchings.score>=max(100-2*(100-name_fuzzy_matchings.score.max()),79)]
            search_res_gs = search_res_g[search_res_g.species.isin(name_fuzzy_matchings.name.values)] 
            if search_res_gs.shape[0] == 0:
                final_search_res = search_res_gs
                giveup = True

        if pd.isnull(authority):
            authority = ""
        authority = authority.strip()

        search_res_matching_authority = search_res_gs[search_res_gs.scientificName==checklist_row.taxon_name+(' '+authority if not pd.isnull(checklist_row.taxon_authors) else '')] # try exact match of author name

        if search_res_matching_authority.shape[0] == 0:
            authority_complete = authority
            authority = re.sub('\(.+\)','',authority).strip()
            search_res_matching_authority = search_res_gs[search_res_gs.scientificName==checklist_row.taxon_name+(' '+authority if not pd.isnull(checklist_row.taxon_authors) else '')] # try exact match of author name

        if search_res_matching_authority.acceptedTaxonKey.nunique() >= 1:
            final_search_res = search_res_matching_authority
        else: # if exact match of author name failed
            search_res_containing_authority = search_res_gs[search_res_gs.scientificName.str.contains(authority)]
            if search_res_containing_authority.acceptedTaxonKey.nunique() == 1:
                final_search_res = search_res_containing_authority
            elif not giveup:
                if search_res_containing_authority.acceptedTaxonKey.nunique() >= 2:
                    search_res_gs = search_res_containing_authority
                name_fuzzy_matchings = pd.DataFrame([[name,score] for (name, score) in process.extract(var_subsp+' '+authority,search_res_gs.scientificName.apply(lambda name : name.replace(g_s+' ','')).values,scorer=fuzz.partial_ratio,limit=100)] ,columns=['name','score'])
                name_fuzzy_matchings = name_fuzzy_matchings[name_fuzzy_matchings.score>=max(100-2*(100-name_fuzzy_matchings.score.max()),80)]
                if name_fuzzy_matchings.shape[0] > 1:
                    name_fuzzy_matchings = pd.DataFrame([[name,score] for (name, score) in process.extract(var_subsp+' '+authority_complete,search_res_gs.scientificName.apply(lambda name : name.replace(g_s+' ','')).values,scorer=fuzz.partial_ratio,limit=100)] ,columns=['name','score'])
                    name_fuzzy_matchings = name_fuzzy_matchings[name_fuzzy_matchings.score>=max(100-2*(100-name_fuzzy_matchings.score.max()),80)]
                final_search_res = search_res_gs[search_res_gs.scientificName.apply(lambda name : name.replace(g_s+' ','')).isin(name_fuzzy_matchings.name.values)] 

        # if final_search_res.acceptedTaxonKey.nunique() > 1:
            # print(g_s,authority, 'nonunique matches. Scientific names:',final_search_res.scientificName.unique())

        checklist_gbif_matching[plant_name_id] = set(list(final_search_res.acceptedTaxonKey.unique()) + ([] if plant_name_id not in checklist_gbif_matching.index else list(checklist_gbif_matching[plant_name_id])))

    checklist_gbif_matching.to_csv('/home/joon/data/checklist-gbif-matching-'+genus+'.csv',header=False)

In [309]:
os.system('rm /home/joon/data/checklist-gbif-matching-*.csv')
os.system('rm /home/joon/data/checklist-gbif-matching.csv')

0

In [310]:
from multiprocessing import Pool
with Pool(processes=11) as pool:
    pool.map(checklist_gbif_matching_process_genus, checklist_genus_values)











































































In [311]:
#fusionner les résultats et supprimer les fichiers temporaires!
os.system('cat /home/joon/data/checklist-gbif-matching-*.csv > /home/joon/data/checklist-gbif-matching.csv')
os.system('rm /home/joon/data/checklist-gbif-matching-*.csv')

0

In [8]:
checklist_gbif_matching = pd.read_csv('/home/joon/data/checklist-gbif-matching.csv',index_col=0,header=None,squeeze=True,converters={1:read_set_in_csv})

In [11]:
checklist_accepted_gbif_taxonkey_matching = pd.Series([])

for accepted_plant_name_id in accepted_plant_name_ids:
    plant_name_ids = set(checklist[checklist.accepted_plant_name_id==accepted_plant_name_id].index)
    acceptedTaxonKeys = {acceptedTaxonKey for foo in checklist_gbif_matching.loc[plant_name_ids] for acceptedTaxonKey in foo}
    taxonKeys = set(gbif.taxonKey[gbif.acceptedTaxonKey.isin(acceptedTaxonKeys)])
    checklist_accepted_gbif_taxonkey_matching[accepted_plant_name_id] = taxonKeys

checklist_accepted_gbif_taxonkey_matching.to_csv('/home/joon/data/checklist-accepted-gbif-taxonkey-matching.csv',header=False)

In [9]:
checklist_accepted_gbif_taxonkey_matching = pd.read_csv('/home/joon/data/checklist-accepted-gbif-taxonkey-matching.csv',index_col=0,header=None,squeeze=True,converters={1:read_set_in_csv})

## Correspondance iucn index -> acceptedTaxonKey

In [13]:
iucn_gbif_matching = pd.Series([])

# for iucn_taxon_index in iucn_checklist_matching.loc[iucn_checklist_species_validated_matchings,'iucn_index'].unique():
for iucn_taxon_index in iucn_checklist_matching.index:
    accepted_plant_name_ids = iucn_checklist_matching.loc[iucn_taxon_index]
    plant_name_ids = {plant_name_id for accepted_plant_name_id in accepted_plant_name_ids for plant_name_id in checklist.loc[checklist.accepted_plant_name_id==accepted_plant_name_id].index}
    acceptedTaxonKeys = {acceptedTaxonKey for plant_name_id in plant_name_ids for acceptedTaxonKey in checklist_gbif_matching[plant_name_id]}

    iucn_gbif_matching[iucn_taxon_index] = acceptedTaxonKeys

iucn_gbif_matching.to_csv('/home/joon/data/iucn_gbif_matching.csv',header=False)

In [11]:
iucn_gbif_matching = pd.read_csv('/home/joon/data/iucn_gbif_matching.csv',index_col=0,header=None,squeeze=True,converters={1:read_set_in_csv})

iucn_taxon_index -> taxonkey

In [42]:
iucn_gbif_taxonkey_matching = pd.Series([])

for iucn_taxon_index in iucn_checklist_matching.index:
    accepted_plant_name_ids = iucn_checklist_matching.loc[iucn_taxon_index]
    taxonKeys = {taxonKey for foo in checklist_accepted_gbif_taxonkey_matching.loc[accepted_plant_name_ids] for taxonKey in foo}
    iucn_gbif_taxonkey_matching[iucn_taxon_index] = taxonKeys

iucn_gbif_taxonkey_matching.to_csv('/home/joon/data/iucn_gbif_taxonkey_matching.csv',header=False)