In [1]:
"""
Author        : Aditya Jain
Date Started  : April 24th, 2023
About         : Analyze taxon keys, names and existing data on gbif
"""

import pandas as pd
from pygbif import species as species_api
import glob

#### Analyze taxon keys and species

In [2]:
# Load the combined UK-Denmark moth checklist
species_list = pd.read_csv('/home/mila/a/aditya.jain/mothAI/species_lists/UK-Denmark_Moth-List_25Apr2023.csv')

search_species_names = species_list['search_species_name'].tolist()

# gbif names processing: keep every entry that is not the 'NotAvail' marker
gbif_name_column   = species_list['gbif_species_name']
gbif_species_names = gbif_name_column[gbif_name_column != 'NotAvail'].tolist()

# taxon keys processing: keep every entry that is not the -1 marker
taxon_key_column = species_list['accepted_taxon_key']
taxon_keys       = taxon_key_column[taxon_key_column != -1].tolist()

# Report "unique / total" for each of the three columns.
# The search-name list is unfiltered, so its length equals len(species_list).
for label, values in (
    ('search species names', search_species_names),
    ('gbif species names', gbif_species_names),
    ('accepted taxon keys', taxon_keys),
):
    print(f'Total unique {label} are {len(set(values))} / {len(values)}.')

Total unique search species names are 3023 / 3023.
Total unique gbif species names are 3006 / 3006.
Total unique accepted taxon keys are 3006 / 3006.


In [13]:
# check duplicates: the disabled expression below collects every name that
# occurs more than once in gbif_species_names (an empty set means no duplicates).
# NOTE(review): the recorded output `set()` presumably comes from a run with the
# line enabled -- confirm, then either re-enable it or delete this cell.
# set([x for x in gbif_species_names if gbif_species_names.count(x) > 1])

set()

#### Process the data statistics file in the moths_world folder

In [34]:
# One-off clean-up of the data statistics CSV; the file is rewritten in place.
# Because the cell overwrites its own input, it must be safe to run more than
# once: the second run sees a file where the column is already dropped/renamed.
filename = '/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/data_statistics.csv'

data = (
    pd.read_csv(filename)
    # remove last total image count column, not required;
    # errors='ignore' avoids a KeyError when the column is already gone
    .drop(columns=['total_occ_count'], errors='ignore')
    # rename taxon key field (rename skips labels that are not present)
    .rename(columns={'taxon_key_gbif_id': 'accepted_taxon_key'})
)

# save the file
data.to_csv(filename, index=False)

#### Delete family entries in Denmark list

In [23]:
# Remove rows from the Denmark checklist that have neither an accepted taxon
# key (-1) nor an order name ('NotAvail'), then overwrite the CSV.
filename = '/home/mila/a/aditya.jain/mothAI/species_lists/Denmark_Moth-List_25Apr2023.csv'
df = pd.read_csv(filename)

has_no_taxon_key = df['accepted_taxon_key'] == -1
has_no_order     = df['order_name'] == 'NotAvail'

# Index labels of the rows matching both conditions
index_to_drop = df.index[(has_no_taxon_key & has_no_order).to_numpy()]
df = df.drop(index=index_to_drop)

df.to_csv(filename, index=False)

#### Test checking name with GBIF backbone

In [6]:
# Look up a single name in the GBIF backbone with strict matching at species
# rank, and print the raw response returned by pygbif.
backbone_query = {
    'name': 'Perigonia lusca ilus',
    'strict': True,
    'rank': 'species',
}
data = species_api.name_backbone(**backbone_query)

print(data)

{'confidence': 100, 'matchType': 'NONE', 'synonym': False}


#### Check missing data

In [5]:
# Report checklist species names / taxon keys that have no row in the
# data statistics file.
species_list   = pd.read_csv('/home/mila/a/aditya.jain/mothAI/species_lists/UK-Denmark_Moth-List_25Apr2023.csv')
datacount_file = pd.read_csv('/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/data_statistics.csv')

gbif_sp_names  = species_list['gbif_species_name']
acc_taxon_keys = species_list['accepted_taxon_key']

# Build the lookups once; previously the column was converted to a list on
# every loop iteration, making each membership test a full linear scan.
known_sp_names   = set(datacount_file['gbif_species_name'])
known_taxon_keys = set(datacount_file['accepted_taxon_key'])

not_found_sp_name = 0
for name in gbif_sp_names:
    if name not in known_sp_names:
        print(name)
        not_found_sp_name += 1
print(f'A total of {not_found_sp_name} species names do not have data.')

not_found_taxon_keys = 0
for key in acc_taxon_keys:
    # -1 keys are skipped rather than reported as missing
    if key not in known_taxon_keys and key!=-1:
        print(key)
        not_found_taxon_keys += 1
print(f'A total of {not_found_taxon_keys} taxon keys do not have data.')

# # for _, row in species_list.iterrows():
# #     if row['gbif_species_name'] in list(datacount_file['gbif_species_name']) and row['accepted_taxon_key'] not in list(datacount_file['accepted_taxon_key']):
# #         print(row['gbif_species_name'])

# # species names in UK-Denmark checklist having zero images
# for _, row in datacount_file.iterrows():
#     if row['image_count']==0 and row['gbif_species_name'] in list(species_list['gbif_species_name']):
#         print(row['gbif_species_name'])
        
# # species keys in UK-Denmark checklist having zero images
# for _, row in datacount_file.iterrows():
#     if row['image_count']==0 and row['accepted_taxon_key'] in list(species_list['accepted_taxon_key']):
#         print(row['accepted_taxon_key'])

A total of 0 species names do not have data.
A total of 0 taxon keys do not have data.


#### Check species that have image data on disk but no entry in the datacount file

In [6]:
# Find checklist species that have .jpg files on disk under
# root_dir/<family>/<genus>/<gbif species>/ but no row in the data statistics
# file, and append a row (with the image count) for each of them to the
# in-memory datacount_file.
species_list   = pd.read_csv('/home/mila/a/aditya.jain/mothAI/species_lists/UK-Denmark_Moth-List_25Apr2023.csv')
datacount_file = pd.read_csv('/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/data_statistics.csv')
root_dir       = '/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/'
columns        = [
                    "accepted_taxon_key",
                    "family_name",
                    "genus_name",
                    "search_species_name",
                    "gbif_species_name",
                    "image_count"
                 ]

# Species names already recorded, built once so each row needs only a set
# lookup instead of converting the whole column to a list.
recorded_species = set(datacount_file['gbif_species_name'])
# New rows are collected here and concatenated once after the loop; growing
# the DataFrame with pd.concat inside the loop copies it on every addition.
missing_rows     = []

for _, row in species_list.iterrows():
    family = row['family_name']
    genus  = row['genus_name']
    search_species = row['search_species_name']
    gbif_species = row['gbif_species_name']
    taxon_key = row['accepted_taxon_key']
    species_data = glob.glob(root_dir + family + '/' + genus + '/' + gbif_species + '/*.jpg')

    if len(species_data)!=0 and gbif_species not in recorded_species:
        print(f'{gbif_species} has a missing entry in datacount file.')

        missing_rows.append([taxon_key, family, genus, search_species, gbif_species, len(species_data)])
        # Remember the name so a repeated checklist row is not added twice
        recorded_species.add(gbif_species)

# Only concatenate when there is something to add, so datacount_file is left
# untouched (including its dtypes) when nothing is missing.
if missing_rows:
    datacount_file = pd.concat(
        [datacount_file, pd.DataFrame(missing_rows, columns=columns)],
        ignore_index=True,
    )

# datacount_file.to_csv('/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/data_statistics.csv', index=False)  