In [34]:
"""
Author.      : Aditya Jain
Date Started : July 9, 2022
About        : This script fetches unique IDs for UK and Denmark moth species from GBIF database
               and builds a consolidated database
"""


'\nAuthor.      : Aditya Jain\nDate Started : July 9, 2022\nAbout        : This script fetches unique IDs for UK and Denmark moth species from GBIF database\n               and builds a consolidated database\n'

In [35]:
from pygbif import occurrences as occ
from pygbif import species as species_api
import pandas as pd
import os
import tqdm
import urllib
import json
import time

data_dir              = '/home/mila/a/aditya.jain/mothAI/species_lists/'
uk_species_list       = 'uksi_moths_3-5-22.csv'
denmark_species_list  = 'Denmark_Original_Moths-only_Dec22.csv'
uk_save_name          = 'UK_Moth-List_25Apr2023.csv'
denmark_save_name     = 'Denmark_Moth-List_25Apr2023.csv'
combined_save_name    = 'UK-Denmark_Moth-List_25Apr2023.csv'

In [24]:
def get_gbif_key_backbone(name, place):
    """ given a species name, this function returns the unique gbif key and other 
        attributes using backbone API
    """
    
    # default values
    acc_taxon_key  = [-1]
    order          = ['NotAvail']
    family         = ['NotAvail']
    genus          = ['NotAvail']    
    search_species = [name]
    gbif_species   = ['NotAvail']     # the name returned on search, can be different from the search
    status         = ['NotAvail']
    rank           = ['NotAvail']
    place          = [place]

    data = species_api.name_backbone(name=name, strict=True, rank='species')
    
    # add entries to the fields
    confidence = [data['confidence']]
    match_type = [data['matchType']]
    if 'order' in data.keys():
        order = [data['order']]
    if 'family' in data.keys():
        family = [data['family']]
    if 'genus' in data.keys():
        genus = [data['genus']]  
    if 'status' in data.keys():
        status = [data['status']]
    if 'rank' in data.keys():
        rank = [data['rank']]
        
    if data['matchType'] != 'NONE' and data['matchType'] != 'HIGHERRANK':
        gbif_species  = [data['species']]        
        if 'acceptedUsageKey' in data.keys():
            acc_taxon_key  = [data['acceptedUsageKey']]
        else:
            acc_taxon_key  = [data['usageKey']]
  
    df = pd.DataFrame(list(zip(acc_taxon_key, order, family, genus,
                               search_species, gbif_species, confidence,
                               status, match_type, rank, place)),
                    columns =['accepted_taxon_key', 'order_name', 'family_name',
                              'genus_name', 'search_species_name', 'gbif_species_name',
                              'confidence', 'status', 'match_type', 'rank', 'source'])
    return df


### Finding keys for UK moth list


In [25]:
file             = data_dir + uk_species_list
uk_data          = pd.read_csv(file, index_col=False)
uk_species       = []

for indx in uk_data.index:
    if uk_data['taxon_rank'][indx]=='Species' and uk_data['preferred'][indx]==True:
        uk_species.append(uk_data['preferred_taxon'][indx])

In [26]:
data_final = pd.DataFrame(columns =['accepted_taxon_key', 'order_name', 'family_name',
                              'genus_name', 'search_species_name', 'gbif_species_name',
                              'confidence', 'status', 'match_type', 'rank', 'source'], dtype=object)
for name in uk_species:
    data       = get_gbif_key_backbone(name, 'uksi_09May2022')
    data_final = pd.concat([data_final, data], ignore_index=True)
    
data_final.to_csv(data_dir + uk_save_name, index=False)  

#### Counting the number of not-found entries

In [27]:
count = 0

for indx in data_final.index:
    if data_final['accepted_taxon_key'][indx] == -1:
        count += 1

print(f'The count of not found species for UK: {count}')        

The count of not found species for UK: 10


### Finding keys for Denmark moth list


In [28]:
file             = data_dir + denmark_species_list
denmark_data     = pd.read_csv(file, index_col=False)
denmark_species  = []

for indx in denmark_data.index:
    denmark_species.append(denmark_data['species_name'][indx])

In [29]:
data_final = pd.DataFrame(columns =['accepted_taxon_key', 'order_name', 'family_name',
                              'genus_name', 'search_species_name', 'gbif_species_name',
                              'confidence', 'status', 'match_type', 'rank', 'source'], dtype=object)
for name in denmark_species:
    data       = get_gbif_key_backbone(name, 'denmark_Dec2022')
    data_final = pd.concat([data_final, data], ignore_index=True)
    
data_final.to_csv(data_dir + denmark_save_name, index=False)  

#### Counting the number of not-found entries

In [30]:
count = 0

for indx in data_final.index:
    if data_final['accepted_taxon_key'][indx] == -1:
        count += 1

print(f'The count of not found species for Denmark: {count}')        

The count of not found species for Denmark: 78


### Combining UK and Denmark list


In [39]:
uk_data       = pd.read_csv(data_dir + uk_save_name, index_col=False)
denmark_data  = pd.read_csv(data_dir + denmark_save_name, index_col=False)
data_combined = pd.DataFrame(columns =['accepted_taxon_key', 'order_name', 'family_name',
                              'genus_name', 'search_species_name', 'gbif_species_name',
                              'confidence', 'status', 'match_type', 'rank', 'source'], dtype=object)

uk_total       = 0     # count of total uk species
denmark_unique = 0     # count of unique denmark species
denmark_uk_com = 0     # count of overlapping denmark-uk species

In [40]:
# add all valid UK entries
for _, row in uk_data.iterrows():
    if row['accepted_taxon_key']==-1:  
        data_combined = data_combined.append(row, ignore_index=True)
        uk_total += 1
    elif row['accepted_taxon_key']!=-1 and row['accepted_taxon_key'] not in data_combined['accepted_taxon_key'].tolist():
        data_combined = data_combined.append(row, ignore_index=True)
        uk_total += 1
    else:
        pass
        
print(f'Total unique UK moth species: {uk_total}')

  data_combined = data_combined.append(row, ignore_index=True)
  data_combined = data_combined.append(row, ignore_index=True)


Total unique UK moth species: 2641


In [38]:
# add all valid Denmark entries
for _, row in denmark_data.iterrows():
    if row['accepted_taxon_key']!=-1:
        if row['accepted_taxon_key'] in data_combined['accepted_taxon_key'].tolist():
            idx = data_combined.index[data_combined['accepted_taxon_key']==row['accepted_taxon_key']].tolist()[0]
            data_combined['source'][idx] = data_combined['source'][idx] + ' ' +  row['source']
            denmark_uk_com += 1
        else:
            data_combined = data_combined.append(row, ignore_index=True)
            denmark_unique += 1
    else:
        data_combined = data_combined.append(row, ignore_index=True)
        denmark_uk_com += 1
            
print(f'Total found Denmark species: {denmark_uk_com + denmark_unique}')
print(f'Total unique Denmark species: {denmark_unique}')
print(f'Total unique UK species: {uk_total - denmark_uk_com}')
print(f'Total common UK and Denmark species: {denmark_uk_com}')
print(f'Total found combined UK and Denmark species: {denmark_unique + uk_total}')
print(f'Total UK and Denmark species, including not found: {len(data_combined)}')

data_combined.to_csv(data_dir + combined_save_name, index=False)  

  data_combined = data_combined.append(row, ignore_index=True)
  data_combined = data_combined.append(row, ignore_index=True)


Total found Denmark species: 2424
Total unique Denmark species: 376
Total unique UK species: 583
Total common UK and Denmark species: 2048
Total found combined UK and Denmark species: 3007
Total UK and Denmark species, including not found: 3024


In [10]:
data_combined

Unnamed: 0,accepted_taxon_key,order_name,family_name,genus_name,search_species_name,gbif_species_name,confidence,status,match_type,source
0,1845962,Lepidoptera,Autostichidae,Oegoconia,Oegoconia quadripuncta,Oegoconia quadripuncta,99,ACCEPTED,EXACT,uksi_09May2022 denmark_Dec2022
1,10055273,Lepidoptera,Tineidae,Oinophila,Oinophila v-flava,Oinophila v-flava,99,ACCEPTED,EXACT,uksi_09May2022
2,1742185,Lepidoptera,Tortricidae,Olethreutes,Olethreutes arcuella,Olethreutes arcuella,99,ACCEPTED,EXACT,uksi_09May2022 denmark_Dec2022
3,1741545,Lepidoptera,Tortricidae,Olindia,Olindia schumacherana,Olindia schumacherana,99,ACCEPTED,EXACT,uksi_09May2022 denmark_Dec2022
4,1875120,Lepidoptera,Pyralidae,Oncocera,Oncocera semirubella,Oncocera semirubella,99,ACCEPTED,EXACT,uksi_09May2022 denmark_Dec2022
...,...,...,...,...,...,...,...,...,...,...
3019,1882158,Lepidoptera,Crambidae,Loxostege,Loxostege turbidalis Tr.,Loxostege turbidalis,99,ACCEPTED,EXACT,denmark_Dec2022
3020,1892242,Lepidoptera,Crambidae,Ecpyrrhorrhoe,Ecpyrrhorrhoe rubiginalis Hb.,Ecpyrrhorrhoe rubiginalis,99,ACCEPTED,EXACT,denmark_Dec2022
3021,1890699,Lepidoptera,Crambidae,Pyrausta,Pyrausta porphyralis D.& S.,Pyrausta porphyralis,98,ACCEPTED,EXACT,denmark_Dec2022
3022,10937370,Lepidoptera,Crambidae,Anania,Phlyctaenia stachydalis Germ.,Anania stachydalis,98,SYNONYM,EXACT,denmark_Dec2022
