In [None]:
'''
Author       : Aditya Jain
Date Started : 1st April, 2021
About        : This script fetches unique key for moth species from GBIF database
               and builds a global list of moth species with unique IDs
'''

!pip install pygbif

In [1]:
from pygbif import occurrences as occ
from pygbif import species as species_api
import pandas as pd
import os
import tqdm
import urllib
import json
import time

DATA_DIR   = "/content/drive/My Drive/Data/"

In [3]:
def get_gbif_key_backbone(name, place):
  '''
  Look up a species name with the GBIF backbone API and return the unique
  GBIF key together with the other match attributes.

  Args:
    name  : scientific name to search for
    place : label of the list the name came from; copied unchanged into the
            'source' column

  Returns:
    A one-row pandas DataFrame with the columns 'taxon Key', 'confidence',
    'GBIF species name', 'status', 'match type' and 'source'.
    When the API reports matchType 'NONE' the key, species name and status
    are 'NA'. Fields missing from the response fall back to the same
    defaults ('NA', or '' for confidence) instead of raising KeyError.
  '''
  columns = ['taxon Key', 'confidence', 'GBIF species name',
             'status', 'match type', 'source']

  data = species_api.name_backbone(name=name, strict=True, rank='species')

  match_type = data.get('matchType', 'NONE')
  confidence = data.get('confidence', '')

  if match_type == 'NONE':
    # nothing matched: keep the placeholder values
    taxon_key    = 'NA'
    gbif_species = 'NA'   # the name returned on search, can be different from the search
    status       = 'NA'
  else:
    taxon_key    = data.get('usageKey', 'NA')
    gbif_species = data.get('species', 'NA')
    status       = data.get('status', 'NA')

  row = [taxon_key, confidence, gbif_species, status, match_type, place]
  return pd.DataFrame([row], columns=columns)


### **Pohl list**: Getting unique GBIF IDs for Quebec species


In [None]:
# Load the Pohl (2018) Quebec list and collect the names to look up on GBIF
quebec_species = 'listB_Quebec_Pohl2018.csv'
file           = f'{DATA_DIR}{quebec_species}'
quebec_data    = pd.read_csv(file, index_col=False)
species        = list(quebec_data.loc[:, 'GBIF species name'])

print(species)

**Need to run the below only once to get the keys!**

In [None]:
# One single-row frame per looked-up name; they are stitched together with
# pd.concat because DataFrame.append was removed in pandas 2.0.
key_columns = ['taxon Key', 'confidence', 'GBIF species name',
               'status', 'match type', 'source']
key_frames  = []
data_final  = pd.DataFrame(columns=key_columns)
# to resume from an earlier run:
# key_frames = [pd.read_csv(DATA_DIR + 'key_list.csv')]

for name in species:
  data = get_gbif_key_backbone(name, 'Quebec_Pohl2018')
  print(name)
  key_frames.append(data)
  data_final = pd.concat(key_frames, ignore_index=True)
  # checkpoint after every lookup so an interrupted run keeps its progress;
  # the default index column is written too, exactly as before
  data_final.to_csv(DATA_DIR + 'key_list.csv')

Run the cell below to add the keys to the original file and save the new list.

In [41]:
# Attach the fetched GBIF keys to the first eight columns of the Pohl list
file        = 'Pohl2018_Keylist.csv'
key_data    = pd.read_csv(f'{DATA_DIR}{file}')

quebec_data = quebec_data.iloc[:, :8]

# side-by-side join, then discard the 'Unnamed: 0' column
final_pohl_list = (
    pd.concat([quebec_data, key_data], axis=1)
      .drop(columns=['Unnamed: 0'])
)
print(final_pohl_list)
final_pohl_list.to_csv(f'{DATA_DIR}listB_Quebec_Pohl2018.csv', index=False)

       SP NO.       superfamily  ... match type           source
0      010001  Micropterigoidea  ...      EXACT  Quebec_Pohl2018
1      070001    Eriocranioidea  ...      EXACT  Quebec_Pohl2018
2      070003    Eriocranioidea  ...      EXACT  Quebec_Pohl2018
3      070004    Eriocranioidea  ...      EXACT  Quebec_Pohl2018
4     070008P    Eriocranioidea  ...      EXACT  Quebec_Pohl2018
...       ...               ...  ...        ...              ...
3145   933680        Noctuoidea  ...      EXACT  Quebec_Pohl2018
3146   933682        Noctuoidea  ...      EXACT  Quebec_Pohl2018
3147   933683        Noctuoidea  ...      EXACT  Quebec_Pohl2018
3148   933685        Noctuoidea  ...      EXACT  Quebec_Pohl2018
3149   933688        Noctuoidea  ...      EXACT  Quebec_Pohl2018

[3150 rows x 14 columns]


Run the code below ONLY if the missing cells do not have NA.

In [13]:
# Work on the merged list saved to disk: the in-memory `quebec_data` is cut
# down to its first eight columns in the merge cell and no longer carries a
# 'taxon Key' column when the notebook is run top to bottom.
pohl_list_path = DATA_DIR + 'listB_Quebec_Pohl2018.csv'
quebec_data    = pd.read_csv(pohl_list_path, index_col=False)

# every missing cell (not only the key) becomes "-1", as before;
# the key column is then stored as integers
quebec_data = quebec_data.fillna("-1").astype({"taxon Key": int})
quebec_data.to_csv(pohl_list_path, index=False)

### **Vermont list**: Getting unique GBIF IDs for Vermont species

In [None]:
# Load the Vermont list and collect the names to look up on GBIF
vermont_species = 'listC_Vermont_29March2021.csv'
file            = f'{DATA_DIR}{vermont_species}'
vermont_data    = pd.read_csv(file)
species         = list(vermont_data.loc[:, 'scientificName'])

print(species)
print(len(species))

In [None]:
# One single-row frame per looked-up name; they are stitched together with
# pd.concat because DataFrame.append was removed in pandas 2.0.
key_columns = ['taxon Key', 'confidence', 'GBIF species name',
               'status', 'match type', 'source']
key_frames  = []
data_final  = pd.DataFrame(columns=key_columns)
# to resume from an earlier run:
# key_frames = [pd.read_csv(DATA_DIR + 'Vermont2021_Keylist.csv')]

for name in species:
  print(name)
  data = get_gbif_key_backbone(name, 'Vermont_29March2021')
  key_frames.append(data)
  data_final = pd.concat(key_frames, ignore_index=True)
  # checkpoint after every lookup so an interrupted run keeps its progress
  # NOTE(review): this writes 'key_list.csv', while the merge cell reads
  # 'Vermont2021_Keylist.csv' - presumably the file is renamed by hand; confirm
  data_final.to_csv(DATA_DIR + 'key_list.csv', index=False)

Appending the key list to the original Vermont file and saving

In [21]:
# Attach the fetched GBIF keys to the first ten columns of the Vermont list
file        = 'Vermont2021_Keylist.csv'
key_data    = pd.read_csv(f'{DATA_DIR}{file}')

vermont_data = vermont_data.iloc[:, :10]

# side-by-side join; no 'Unnamed: 0' column is dropped in this cell
final_vermont_list = pd.concat(objs=[vermont_data, key_data], axis=1)
print(final_vermont_list)
final_vermont_list.to_csv(f'{DATA_DIR}listC_Vermont_29March2021.csv', index=False)

      Hodges_No      P3no  group  ...    status match type               source
0           1.0   10001.0  micro  ...  ACCEPTED      EXACT  Vermont_29March2021
1           3.0   70001.0  micro  ...  ACCEPTED      EXACT  Vermont_29March2021
2           5.0   70003.0  micro  ...  ACCEPTED      EXACT  Vermont_29March2021
3          31.0  110011.0  micro  ...  DOUBTFUL      EXACT  Vermont_29March2021
4          18.0  110016.0  micro  ...  ACCEPTED      EXACT  Vermont_29March2021
...         ...       ...    ...  ...       ...        ...                  ...
1935    11043.0  933685.0  macro  ...   SYNONYM      EXACT  Vermont_29March2021
1936    11045.0  933688.0  macro  ...   SYNONYM      EXACT  Vermont_29March2021
1937    10658.0   34151.0  macro  ...  ACCEPTED      EXACT  Vermont_29March2021
1938     3412.2  621291.2  micro  ...  ACCEPTED      EXACT  Vermont_29March2021
1939     6405.0  910822.0  macro  ...  ACCEPTED      EXACT  Vermont_29March2021

[1940 rows x 16 columns]


In [23]:
# Work on the merged list saved to disk: the in-memory `vermont_data` is cut
# down to its first ten columns in the merge cell and no longer carries a
# 'taxon Key' column when the notebook is run top to bottom.
vermont_list_path = DATA_DIR + 'listC_Vermont_29March2021.csv'
vermont_data      = pd.read_csv(vermont_list_path)

# every missing cell (not only the key) becomes "-1", as before;
# the key column is then stored as integers
vermont_data = vermont_data.fillna("-1").astype({"taxon Key": int})
vermont_data.to_csv(vermont_list_path, index=False)

## Making global file 

In [6]:
# Column layout shared by every regional list once it is brought into the
# global moth list (order matters: frames are renamed positionally).
global_columns = [
    'taxon key',
    'superfamily',
    'family',
    'subfamily',
    'genus',
    'name',
    'scientific name',
    'gbif scientific name',
    'confidence',
    'status',
    'match type',
    'source',
]

#### First fetching data from Quebec list to build global file

In [7]:
# Seed the global list with the Quebec (Pohl 2018) entries
quebec_species = 'listB_Quebec_Pohl2018.csv'
file           = f'{DATA_DIR}{quebec_species}'
quebec_data    = pd.read_csv(file, index_col=False)

# Pohl columns, listed in the same order as `global_columns`
pohl_columns = [
    'taxon Key', 'superfamily', 'family', 'subfamily',
    'val genus', 'name', 'Scientific Name', 'GBIF species name',
    'confidence', 'status', 'match type', 'source',
]
global_list = quebec_data.loc[:, pohl_columns]

# positional rename to the shared global header
global_list.columns = global_columns
global_list.to_csv(f'{DATA_DIR}listX_GlobalMothList_7April2021.csv', index=False)

#### Adding Vermont data to global list

In [8]:
# Current global list (so far holding the Quebec entries)
global_species  = 'listX_GlobalMothList_7April2021.csv'
file            = f'{DATA_DIR}{global_species}'
global_data     = pd.read_csv(file)

# Vermont list, reduced to the columns that map onto the global header
vermont_species = 'listC_Vermont_29March2021.csv'
file            = f'{DATA_DIR}{vermont_species}'
vermont_columns = [
    'taxon Key', 'superfamily', 'family', 'subfamily',
    'genus', 'specificEpithet', 'scientificName', 'GBIF species name',
    'confidence', 'status', 'match type', 'source',
]
vermont_data    = pd.read_csv(file).loc[:, vermont_columns]

# positional rename to the shared global header
vermont_data.columns = global_columns
print(vermont_data)

      taxon key       superfamily  ... match type               source
0       1939759  Micropterigoidea  ...      EXACT  Vermont_29March2021
1       1731862    Eriocranioidea  ...      EXACT  Vermont_29March2021
2       1731826    Eriocranioidea  ...      EXACT  Vermont_29March2021
3      10067382       Hepialoidea  ...      EXACT  Vermont_29March2021
4       1829029       Hepialoidea  ...      EXACT  Vermont_29March2021
...         ...               ...  ...        ...                  ...
1935    4301258        Noctuoidea  ...      EXACT  Vermont_29March2021
1936    4301263        Noctuoidea  ...      EXACT  Vermont_29March2021
1937    1771249        Noctuoidea  ...      EXACT  Vermont_29March2021
1938    1738829      Tortricoidea  ...      EXACT  Vermont_29March2021
1939    9453490      Geometroidea  ...      EXACT  Vermont_29March2021

[1940 rows x 12 columns]


In [9]:
# Merge the Vermont list into the global list.
#   * species already present (same taxon key, or same scientific name when the
#     key is -1, i.e. not found on GBIF) only get the Vermont label added to
#     the 'source' of their first matching row
#   * all other species are added as new rows
# Rows are addressed by position and new rows are added with one pd.concat at
# the end: DataFrame.append no longer exists in pandas 2.0, and appending rows
# one at a time kept their Vermont index labels, which clashed with the labels
# already used in the global list.

# position of the first row holding each taxon key / scientific name
key_to_row  = {}
name_to_row = {}
for row, (key, sci_name) in enumerate(zip(global_data['taxon key'],
                                          global_data['scientific name'])):
  key_to_row.setdefault(key, row)
  name_to_row.setdefault(sci_name, row)

sources  = list(global_data['source'])   # 'source' value per global row, updated in place
new_rows = []                            # positions in vermont_data to add to the global list

vermont_rows = zip(vermont_data['taxon key'],
                   vermont_data['scientific name'],
                   vermont_data['source'])

for pos, (taxa_key, taxa_name, source) in enumerate(vermont_rows):
  if taxa_key != -1 and taxa_key in key_to_row:
    row          = key_to_row[taxa_key]
    sources[row] = sources[row] + ' ' + source

  elif taxa_key == -1 and taxa_name in name_to_row:
    row          = name_to_row[taxa_name]
    sources[row] = sources[row] + ' ' + source

  else:
    # register the new row so later Vermont entries can match it as well
    row = len(sources)
    key_to_row.setdefault(taxa_key, row)
    name_to_row.setdefault(taxa_name, row)
    sources.append(source)
    new_rows.append(pos)

if new_rows:
  global_data = pd.concat([global_data, vermont_data.iloc[new_rows]],
                          ignore_index=True)
global_data['source'] = sources


In [11]:
# Write the merged global list back to its CSV file (without the index column)
global_list_path = DATA_DIR + 'listX_GlobalMothList_7April2021.csv'
global_data.to_csv(global_list_path, index=False)

## Count no. of not-found entries

In [14]:
# Reload the global list and collect the species with taxon key -1
global_species  = 'listX_GlobalMothList_7April2021.csv'
file            = f'{DATA_DIR}{global_species}'
global_data     = pd.read_csv(file)

not_found_mask  = global_data['taxon key'] == -1
no_search       = list(global_data.loc[not_found_mask, 'scientific name'])
count           = len(no_search)

print(count)
no_search = pd.DataFrame(no_search)
no_search.to_csv(f'{DATA_DIR}not-found_7April2021.csv', index=False)

139


## Miscellaneous - Not to Run

In [63]:
# `result` is not defined in any visible cell, so this cell raised NameError on
# a fresh kernel. The saved output is a backbone match for "Stigmella cerea",
# so presumably it came from a lookup like the one below - TODO confirm.
result = species_api.name_backbone(name='Stigmella cerea', strict=True, rank='species')
print(json.dumps(result, indent=3))

{
   "usageKey": 9538415,
   "scientificName": "Stigmella cerea (Braun, 1917) Newton et al., 1982",
   "canonicalName": "Stigmella cerea",
   "rank": "SPECIES",
   "status": "ACCEPTED",
   "confidence": 99,
   "matchType": "EXACT",
   "kingdom": "Animalia",
   "phylum": "Arthropoda",
   "order": "Lepidoptera",
   "family": "Nepticulidae",
   "genus": "Stigmella",
   "species": "Stigmella cerea",
   "kingdomKey": 1,
   "phylumKey": 54,
   "classKey": 216,
   "orderKey": 797,
   "familyKey": 7014,
   "genusKey": 1735652,
   "speciesKey": 9538415,
   "synonym": false,
   "class": "Insecta"
}


In [None]:
# Legacy cell (kept under "Miscellaneous - Not to Run"): builds a global list
# from the Pohl key list and the in-memory `quebec_data`, then saves it to
# 'listX_GlobalMothList_2April2021.csv'.
# NOTE(review): the key-list columns used here ('Species Key', 'Taxon Key',
# 'GBIF Species Name', 'Source') differ from the names produced by
# get_gbif_key_backbone ('taxon Key', 'GBIF species name', 'source') -
# presumably an older version of the key file; verify before running.
file     = 'Pohl2018_Keylist.csv'
key_data = pd.read_csv(DATA_DIR + file)

# fetch relevant data from Pohl list
relv_data    = quebec_data[['superfamily', 'family', 'subfamily', 'val genus', 'name']]
# full species name: genus and name joined by a single space
species_n    = quebec_data['val genus'] + ' ' + quebec_data['name']

# place the pieces side by side (column-wise concat)
final_data   = pd.concat([key_data[['Species Key', 'Taxon Key']], 
                          relv_data, species_n, 
                          key_data[['GBIF Species Name', 'Source']],
                          quebec_data['SP NO.']],                         
                          axis=1)

# NOTE(review): passing `columns` here selects by label, so any name in
# `global_columns` that is not a column of `final_data` presumably ends up as
# an empty column - confirm this is intended.
global_data  = pd.DataFrame(final_data, columns = global_columns)
# saved with the default index column (no index=False)
global_data.to_csv(DATA_DIR + 'listX_GlobalMothList_2April2021.csv') 