In [7]:
'''
Author.      : Aditya Jain
Date Started : 28th June, 2021
About        : This script fetches unique GBIF keys for non-moth taxa and downloads their images
'''

'\nAuthor.      : Aditya Jain\nDate Started : 28th June, 2021\nAbout        : This script fetches unique key for non-moth taxas\n'

In [8]:
import json
import math
import os
import time
import urllib
import urllib.request

import pandas as pd
import tqdm
from pygbif import occurrences as occ
from pygbif import species as species_api

# Folder that holds the taxa-list CSV written and read by this notebook
DATA_DIR = "/home/mila/a/aditya.jain/mothAI/"

#### Fetching keys

In [15]:
def get_gbif_key_backbone(name, count_download=30000):
    '''
    given a taxa name, this function returns the unique gbif key and other
    attributes using backbone API

    Args:
        name           : taxa name to search for (strict matching is requested)
        count_download : value stored in the 'Count Download' column; defaults
                         to 30000, the value that used to be hard-coded here

    Returns:
        a one-row pandas DataFrame with the columns 'Taxon Key', 'Confidence',
        'Search Name', 'GBIF Taxa Name', 'Rank', 'Status', 'Match Type' and
        'Count Download'. When the API reports match type 'NONE' (or a field is
        absent from the response) the corresponding cells keep their defaults:
        'NA' for key/name/rank/status, '' for confidence, 'NONE' for match type.
    '''
    columns = ['Taxon Key', 'Confidence', 'Search Name', 'GBIF Taxa Name',
               'Rank', 'Status', 'Match Type', 'Count Download']

    response = species_api.name_backbone(name=name, strict=True)

    # start from the "no match" row and overwrite only what the API returned
    record = {
        'Taxon Key'      : 'NA',
        'Confidence'     : response.get('confidence', ''),
        'Search Name'    : name,
        'GBIF Taxa Name' : 'NA',     # the name returned on search, can be different from the search
        'Rank'           : 'NA',
        'Status'         : 'NA',
        'Match Type'     : 'NONE',
        'Count Download' : count_download,
    }

    match_type = response.get('matchType', 'NONE')
    if match_type != 'NONE':
        record['Taxon Key']      = response.get('usageKey', 'NA')
        record['GBIF Taxa Name'] = response.get('scientificName', 'NA')
        record['Status']         = response.get('status', 'NA')
        record['Rank']           = response.get('rank', 'NA')
        record['Match Type']     = match_type

    return pd.DataFrame([record], columns=columns)


In [16]:
# Names of the non-moth taxa to look up in the GBIF backbone
nonmoth_list = [
    'Trichoptera',
    'Formicidae',
    'Ichneumonidae',
    'Diptera',
    'Orthoptera',
    'Hemiptera',
    'Pholcidae',
    'Araneae',
    'Opiliones',
]


In [17]:
# Look up every taxon in nonmoth_list and store the combined table as a CSV.
# The per-taxon frames are collected first and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration.
result_columns = ['Taxon Key', 'Confidence', 'Search Name', 'GBIF Taxa Name',
                  'Rank', 'Status', 'Match Type', 'Count Download']

taxon_frames = [get_gbif_key_backbone(name) for name in nonmoth_list]

if taxon_frames:
    data_final = pd.concat(taxon_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty table with the same header
    data_final = pd.DataFrame(columns=result_columns)

# written once, after all lookups have succeeded
data_final.to_csv(DATA_DIR + 'NonMothList_30June2021.csv', index=False)

In [None]:
# Scratch note (sample lookup row): 1003,99,Trichoptera,Trichoptera,ORDER,ACCEPTED,EXACT

In [6]:
# Manual check of a single strict backbone lookup; prints the raw response dict.
# (A preceding 'Chelicerata' lookup was dropped: its result was overwritten
# before ever being read, so it only cost an extra API request.)
data = species_api.name_backbone(name='Opiliones', strict=True)
print(data)

{'usageKey': 907, 'scientificName': 'Opiliones', 'canonicalName': 'Opiliones', 'rank': 'ORDER', 'status': 'ACCEPTED', 'confidence': 95, 'matchType': 'EXACT', 'kingdom': 'Animalia', 'phylum': 'Arthropoda', 'order': 'Opiliones', 'kingdomKey': 1, 'phylumKey': 54, 'orderKey': 907, 'synonym': False}


#### Downloading data

In [19]:
# ---- configuration for the image download ----
DATA_DIR     = "/home/mila/a/aditya.jain/mothAI/"               # folder containing the taxa list CSV
WRITE_DIR    = "/home/mila/a/aditya.jain/testnm/nonmothdata/"   # root folder the images are written under
INat_KEY     = "50c9509d-22c7-4a22-a47d-8c48425ef4a7"           # iNat key to fetch data from GBIF
LIMIT_DOWN   = 300                                              # GBIF API parameter for max results per page
MAX_DATA_SP  = 5                                                # max. no of images to download for a species
MAX_SEARCHES = 40000                                            # maximum no. of points to iterate

# ---- load the taxa list produced by the key-fetching step ----
nonmoth_taxa = 'NonMothList_30June2021.csv'
file         = f"{DATA_DIR}{nonmoth_taxa}"
nonmoth_data = pd.read_csv(file)

def inat_metadata_gbif(data):
    '''
    Pick the metadata fields of interest out of one GBIF occurrence record.

    Args:
        data : dictionary describing a single occurrence

    Returns:
        dictionary holding exactly the fields listed below, in that order;
        a field that is absent from `data` is stored as an empty string
    '''
    wanted = (
        'decimalLatitude', 'decimalLongitude', 'phylum',
        'order', 'family', 'genus', 'species', 'acceptedScientificName',
        'year', 'month', 'day',
        'datasetName', 'taxonID', 'acceptedTaxonKey',
    )

    return {key: data.get(key, '') for key in wanted}

In [20]:
# Split the taxa table into three parallel lists (same order as the CSV rows):
#   taxon_key       - list of taxon keys
#   taxon_name      - list of names that were searched
#   gbif_taxon_name - list of names returned by gbif [can be different from the searched name]
taxon_key, taxon_name, gbif_taxon_name = (
    list(nonmoth_data[column])
    for column in ('Taxon Key', 'Search Name', 'GBIF Taxa Name')
)


### this snippet is run ONLY if the download is resuming from some point ####
# start              = 111
# end                = ''
# taxon_key          = taxon_key[start:]
# taxon_name         = taxon_name[start:]
# gbif_taxon_name    = gbif_taxon_name[start:]
##########################################################################

In [23]:
# For every taxon: query GBIF for occurrences with still images, download up to
# MAX_DATA_SP of them into WRITE_DIR/<taxon name>/ and store the per-image
# metadata next to them in metadata.txt (JSON).
# NOTE(review): INat_KEY is defined in the configuration cell but is not passed
# to occ.search here, so the queries are not limited to a single dataset --
# confirm whether that is intended.
for i in range(len(taxon_key)):
    print('Downloading for: ', gbif_taxon_name[i])
    begin       = time.time()
    data        = occ.search(taxonKey = taxon_key[i], mediatype='StillImage', limit=1)
    total_count = data['count']

    if total_count == 0:          # the search returned no record with an image
        print('No image record!')
        continue

    image_count = 0                                   # images downloaded for this taxon
    max_count   = min(total_count, MAX_DATA_SP)
    total_pag   = math.ceil(MAX_SEARCHES/LIMIT_DOWN)  # upper bound on pages fetched, LIMIT_DOWN entries each
    m_data      = {}                                  # gbifID -> metadata of every downloaded image

    write_loc   = WRITE_DIR + gbif_taxon_name[i]
    os.makedirs(write_loc, exist_ok=True)             # creating hierarchical structure for image storage

    for j in range(total_pag):
        data    = occ.search(taxonKey = taxon_key[i], mediatype='StillImage',
                             limit=LIMIT_DOWN, offset=j * LIMIT_DOWN)
        results = data['results']

        if not results:           # ran past the last page; further requests would be empty too
            break

        for record in results:
            media = record.get('media')
            if media:
                gbifid = record['gbifID']

                if 'identifier' in media[0].keys():
                    image_url = media[0]['identifier']
                    try:
                        urllib.request.urlretrieve(image_url, write_loc + '/' + gbifid + '.jpg')
                    except Exception as err:
                        # best effort: skip this image but say why; unlike a bare
                        # `except`, this lets Ctrl-C stop a long download
                        print('Could not download ', image_url, ' - ', err)
                    else:
                        image_count   += 1
                        m_data[gbifid] = inat_metadata_gbif(record)   # fetching metadata

            if image_count >= max_count:
                break

        if image_count >= max_count:
            break

    with open(write_loc + '/' + 'metadata.txt', 'w') as outfile:
        json.dump(m_data, outfile)

    end = time.time()
    print('Time taken to download data for ', gbif_taxon_name[i], ' is - ',
          round(end-begin), 'sec for ', image_count, ' images')

Downloading for:  Trichoptera
Time taken to download data for  Trichoptera  is -  3 sec for  5  images
Downloading for:  Formicidae
Time taken to download data for  Formicidae  is -  3 sec for  5  images
Downloading for:  Ichneumonidae
Time taken to download data for  Ichneumonidae  is -  5 sec for  5  images
Downloading for:  Diptera
Time taken to download data for  Diptera  is -  3 sec for  5  images
Downloading for:  Orthoptera
Time taken to download data for  Orthoptera  is -  3 sec for  5  images
Downloading for:  Hemiptera
Time taken to download data for  Hemiptera  is -  5 sec for  5  images
Downloading for:  Pholcidae
Time taken to download data for  Pholcidae  is -  3 sec for  5  images
