In [1]:
"""
Author.      : Aditya Jain
Date Started : May 9, 2022
About        : This script fetches unique IDs for UK moth species from GBIF database
"""


'\nAuthor.      : Aditya Jain\nDate Started : May 9, 2022\nAbout        : This script fetches unique IDs for UK moth species from GBIF database\n'

In [2]:
from pygbif import occurrences as occ
from pygbif import species as species_api
import pandas as pd
import os
import tqdm
import urllib
import json
import time

# Directory holding the input species list; the generated GBIF key list is
# written back into this same directory.
# NOTE(review): hard-coded absolute path - must be edited to run on another machine.
data_dir   = '/home/mila/a/aditya.jain/mothAI/other_species_lists/'

In [3]:
def get_gbif_key_backbone(name, place):
    """Look up a species name in the GBIF backbone taxonomy.

    Args:
        name:  species name to search for.
        place: label identifying where the name came from; stored unchanged
               in the 'source' column.

    Returns:
        A single-row pandas DataFrame with the columns
        'taxon_key_guid', 'order_name', 'family_name', 'genus_name',
        'search_species_name', 'gbif_species_name', 'confidence', 'status',
        'match_type' and 'source'. When GBIF reports no match, the row keeps
        the placeholder values (-1 / 'NA' / 'NONE') and only 'confidence' is
        taken from the response.
    """
    # Placeholder row describing a failed lookup; overwritten below on a match.
    record = {
        'taxon_key_guid':      -1,
        'order_name':          'NA',
        'family_name':         'NA',
        'genus_name':          'NA',
        'search_species_name': name,
        'gbif_species_name':   'NA',   # name returned by GBIF, can differ from the search
        'confidence':          '',
        'status':              'NA',
        'match_type':          'NONE',
        'source':              place,
    }

    data = species_api.name_backbone(name=name, strict=True, rank='species')

    # confidence is recorded for matched and unmatched names alike
    record['confidence'] = data.get('confidence', record['confidence'])

    if data.get('matchType', 'NONE') != 'NONE':
        # Use .get so a field missing from the response keeps its placeholder
        # instead of raising KeyError and aborting the caller's loop.
        gbif_fields = (
            ('taxon_key_guid',    'usageKey'),
            ('order_name',        'order'),
            ('family_name',       'family'),
            ('genus_name',        'genus'),
            ('gbif_species_name', 'species'),
            ('status',            'status'),
            ('match_type',        'matchType'),
        )
        for column, gbif_field in gbif_fields:
            record[column] = data.get(gbif_field, record[column])

    return pd.DataFrame([record], columns=list(record))


#### Get the macro-moth species from the UKSI species list


In [4]:
# Load the UKSI moth checklist
uk_species = 'uksi_moths_3-5-22.csv'
file       = data_dir + uk_species
uk_data    = pd.read_csv(file, index_col=False)

# Families whose species are kept as macro-moths
moth_families = ['Cossidae', 'Drepanidae', 'Endromidae', 'Erebidae',
                 'Geometridae', 'Hepialidae', 'Lasiocampidae', 'Limacodidae',
                 'Noctuidae', 'Nolidae', 'Notodontidae', 'Saturniidae',
                 'Sesiidae', 'Sphingidae', 'Zygaenidae']

# Keep rows that are species-rank, flagged as preferred, and in a listed family
is_species      = uk_data['taxon_rank'] == 'Species'
is_preferred    = uk_data['preferred'] == True
in_macro_family = uk_data['family_taxon'].isin(moth_families)

uk_macro_species = uk_data.loc[
    is_species & is_preferred & in_macro_family, 'preferred_taxon'
].tolist()

In [5]:
# Column layout of the output file; used directly only when there is nothing to fetch.
result_columns = ['taxon_key_guid', 'order_name', 'family_name',
                  'genus_name', 'search_species_name', 'gbif_species_name',
                  'confidence', 'status', 'match_type', 'source']

# One GBIF lookup per species, each returning a single-row DataFrame.
# The rows are collected and concatenated once: DataFrame.append was removed
# in pandas 2.0, and appending inside a loop copies the frame every iteration.
species_frames = [get_gbif_key_backbone(name, 'uksi_09May2022')
                  for name in uk_macro_species]

if species_frames:
    data_final = pd.concat(species_frames, ignore_index=True)
else:
    # pd.concat rejects an empty list; still write a header-only file
    data_final = pd.DataFrame(columns=result_columns, dtype=object)

data_final.to_csv(data_dir + 'UK-MacroMoth-List_09May2022.csv', index=False)

## Count the number of not-found entries

In [6]:
# Re-read the list written by the previous cell
uk_species = 'UK-MacroMoth-List_09May2022.csv'
file       = data_dir + uk_species
uk_data    = pd.read_csv(file)

# Rows still carrying the placeholder key -1 are names GBIF did not match
not_found = uk_data['taxon_key_guid'] == -1
no_search = uk_data.loc[not_found, 'search_species_name'].tolist()
count     = len(no_search)

print(count)
print(no_search)
# no_search = pd.DataFrame(no_search)
# no_search.to_csv(DATA_DIR + 'not-found_7April2021.csv', index=False)

1
['Ectropis sp.']
