In [None]:
'''
Author.      : Aditya Jain
Date Started : 25th March, 2021
About        : This script scrapes data from GBIF for a given list of moth species
'''

!pip install pygbif



In [None]:
import json
import math
import os
import time
import urllib
import urllib.request

import pandas as pd
import plotly.express as px
import tqdm
from pygbif import occurrences as occ
from pygbif import species as species_api

# Notebook-wide configuration: storage locations, GBIF query settings and the
# input species list that every later cell reads from.
DATA_DIR       = "/content/drive/My Drive/Data/"
WRITE_DIR      = DATA_DIR + "iNat_adult/"                 # downloaded images, metadata and count files are written under here
INat_KEY       = "50c9509d-22c7-4a22-a47d-8c48425ef4a7"   # iNat key, passed as `datasetKey` when querying GBIF
LIMIT_DOWN     = 300                                      # GBIF API parameter for max results per page
MAX_DATA_SP    = 1200                                     # max. no of images to download for a species

# Species list that drives the download; it is read once into `moth_data`.
moth_species   = 'listX_GlobalMothList_14April2021.csv'
file           = DATA_DIR + moth_species
moth_data      = pd.read_csv(file)

## Downloading Data from GBIF

In [None]:
def inat_metadata_gbif(data):
  '''
  Pick the metadata fields of interest out of one GBIF occurrence record.

  data : mapping holding a single occurrence result (field name -> value)

  Returns a dict with one entry per wanted field, in a fixed order; a field
  that is absent from `data` is stored as an empty string.
  '''
  wanted = (
      'decimalLatitude', 'decimalLongitude',
      'order', 'family', 'genus', 'species', 'acceptedScientificName',
      'year', 'month', 'day',
      'datasetName', 'taxonID', 'acceptedTaxonKey', 'lifeStage',
  )

  return {name: (data[name] if name in data.keys() else '') for name in wanted}

In [None]:
# Download adult-moth images (and their metadata) from the iNaturalist dataset
# on GBIF for a slice of the species list, keeping a running per-species tally
# in iNat_datacount.csv so that the job can be resumed.
taxon_key          = list(moth_data['taxon key'])              # list of taxon keys
species_name       = list(moth_data['scientific name'])        # list of species names that are searched
gbif_species_name  = list(moth_data['gbif scientific name'])   # list of species names returned by gbif [can be different from above or -1]
columns            = ['taxon key', 'species name', 'gbif species name', 'count']
#count_list         = pd.DataFrame(columns = columns)         # uncomment if downloading data from scratch

### this snippet is run ONLY if downloading is resuming from some point ####
start              = 300
end                = 500
taxon_key          = taxon_key[start:end]
species_name       = species_name[start:end]
gbif_species_name  = gbif_species_name[start:end]
count_list         = pd.read_csv(WRITE_DIR + 'iNat_datacount.csv')         # keeps the count of data downloaded for each species: key, name, name, count
##########################################################################

for i in range(len(taxon_key)):
  print('Downloading for: ', species_name[i])
  begin = time.time()

  if taxon_key[i] == -1:          # taxa not there on GBIF
    count_row = [-1, species_name[i], -1, -1]

  else:
    # a one-record query is enough to learn the total count and the taxonomy
    data        = occ.search(taxonKey = taxon_key[i], datasetKey = INat_KEY, limit=1)
    total_count = data['count']

    if total_count == 0:          # no data for the species on iNat
      count_row = [taxon_key[i], species_name[i], gbif_species_name[i], 0]

    else:
      image_count = 0                                   # images downloaded for this species
      max_count   = min(total_count, MAX_DATA_SP)

      family  = data['results'][0]['family']
      genus   = data['results'][0]['genus']
      species = data['results'][0]['species']

      m_data    = {}                                    # gbifID -> metadata of every saved image
      write_loc = WRITE_DIR + family + "/" + genus + "/" + species

      # hierarchical family/genus/species folder; fine if it already exists
      os.makedirs(write_loc, exist_ok=True)

      # one request per page of at most LIMIT_DOWN records
      for offset in range(0, max_count, LIMIT_DOWN):
        data = occ.search(taxonKey = taxon_key[i], datasetKey = INat_KEY,
                          limit=LIMIT_DOWN, offset=offset)

        for record in data['results']:
          media = record.get('media')
          # keep only records that carry media and are tagged as adults
          if not media or record.get('lifeStage') != 'Adult':
            continue

          image_url = media[0].get('identifier')
          if not image_url:
            continue

          gbifid = str(record['gbifID'])

          try:
            urllib.request.urlretrieve(image_url, write_loc + '/' + gbifid + '.jpg')
          except Exception:
            # best effort: skip an image that cannot be fetched, but do not
            # swallow KeyboardInterrupt the way a bare `except:` would
            continue

          image_count   += 1
          m_data[gbifid] = inat_metadata_gbif(record)   # fetching metadata

      with open(write_loc + '/' + 'metadata.txt', 'w') as outfile:
        json.dump(m_data, outfile)

      count_row = [taxon_key[i], species_name[i], gbif_species_name[i], image_count]

      # named `finish` so that the slice bound `end` above is not overwritten
      finish = time.time()
      print('Time taken to download data for ', gbif_species_name[i], ' is - ',
            round(finish-begin), 'sec for ', image_count, ' images')

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
  count_list = pd.concat([count_list, pd.DataFrame([count_row], columns=columns)],
                         ignore_index=True)

  # written after every species so that an interrupted run can be resumed
  count_list.to_csv(WRITE_DIR + 'iNat_datacount.csv', index=False)

print(count_list)

    

Downloading for:  Parornix preciosella
Time taken to download data for  Parornix preciosella  is -  1 sec for  0  images
Downloading for:  Parornix quadripunctella
Time taken to download data for  Parornix quadripunctella  is -  1 sec for  0  images
Downloading for:  Parornix vicinella
Downloading for:  Micrurapteryx salicifoliella
Time taken to download data for  Micrurapteryx salicifoliella  is -  1 sec for  0  images
Downloading for:  Micrurapteryx occulta
Time taken to download data for  Micrurapteryx occulta  is -  2 sec for  1  images
Downloading for:  Parectopa lespedezaefoliella
Time taken to download data for  Parectopa lespedezaefoliella  is -  2 sec for  2  images
Downloading for:  Parectopa pennsylvaniella
Time taken to download data for  Parectopa pennsylvaniella  is -  1 sec for  0  images
Downloading for:  Parectopa plantaginisella
Time taken to download data for  Parectopa plantaginisella  is -  3 sec for  8  images
Downloading for:  Parectopa robiniella
Time taken to d

In [None]:
# Reload the per-species download tally from disk and show it.
count_list         = pd.read_csv(WRITE_DIR + 'iNat_datacount.csv')         # keeps the count of data downloaded for each species: key, name, name, count                            
print(count_list)
# Manual clean-up, left disabled: drops the last 8 rows of the tally and
# writes the file back.
# count_list         = count_list.iloc[:-8]
# count_list.to_csv(WRITE_DIR + 'iNat_datacount.csv', index=False)


      taxon key  ... count
0       1939759  ...    12
1       1731862  ...    44
2       1731826  ...    25
3       1731824  ...     0
4       1731847  ...     5
...         ...  ...   ...
3246    1798419  ...    10
3247    1765962  ...    54
3248    1789504  ...     0
3249    5111568  ...   284
3250    1738829  ...     2

[3251 rows x 4 columns]


## Adding the iNat numbers to global species list

In [None]:
# Attach the per-species download counts to the global species list as an
# 'iNaturalist count' column and write the list back to disk.
count_list    = pd.read_csv(WRITE_DIR + 'iNat_datacount.csv')         # keeps the count of data downloaded for each species: key, name, name, count
count         = count_list[['count']].rename(columns={'count': 'iNaturalist count'})

# Drop a count column left over from an earlier run first; otherwise running
# this cell again would append a second, identically named column.
base_moth_data = moth_data.drop(columns=['iNaturalist count'], errors='ignore')

# Rows are paired by index position, so the tally must be in the same order as
# the species list.
new_moth_data = pd.concat([base_moth_data, count], axis=1)
new_moth_data.to_csv(DATA_DIR + 'listX_GlobalMothList_14April2021.csv', index=False)

# Later cells read moth_data['iNaturalist count'], so keep the in-memory list
# in step with the file that was just written.
moth_data = new_moth_data

## Count statistics
The below code calculates the count statistics for the fetched iNaturalist data for various taxa ranks

In [None]:
def count_stat(moth_data):
  '''
  Sum the 'iNaturalist count' column per species, genus and family.

  moth_data : DataFrame with the columns 'scientific name', 'genus', 'family'
              and 'iNaturalist count'

  Rows whose count is 0 or -1 are left out. The totals are printed, and the
  function returns (species totals, genus totals, family totals, overall
  total), the first three being dictionaries keyed by name.
  '''
  def _accumulate(totals, key, amount):
    # the first occurrence of a key stores the amount, later ones add to it
    if key in totals:
      totals[key] += amount
    else:
      totals[key] = amount

  per_species = {}
  per_genus   = {}
  per_family  = {}
  image_total = 0     # number of images over all counted rows

  for row in moth_data.index:
    n_images = moth_data['iNaturalist count'][row]
    if n_images == 0 or n_images == -1:
      continue

    _accumulate(per_species, moth_data['scientific name'][row], n_images)
    _accumulate(per_genus, moth_data['genus'][row], n_images)
    _accumulate(per_family, moth_data['family'][row], n_images)
    image_total += n_images

  print('The total number of images in the set is: ', image_total)
  print('Total family, genus and species respectively: ',
        len(per_family), len(per_genus), len(per_species))

  return per_species, per_genus, per_family, image_total

In [None]:
species, genus, family, total_count = count_stat(moth_data)

The total number of images in the set is:  501731
Total family, genus and species respectively:  69 905 2537


In [None]:
# Write the species / genus / family totals to one CSV file each.
def _counts_to_frame(totals, name_column):
  # two-column frame: the name of the taxon and its image count
  frame = pd.DataFrame.from_dict(totals.items())
  frame.columns = [name_column, 'count']
  return frame

species_df = _counts_to_frame(species, 'species name')
species_df.to_csv(WRITE_DIR + 'species-count.csv', index=False)

genus_df = _counts_to_frame(genus, 'genus name')
genus_df.to_csv(WRITE_DIR + 'genus-count.csv', index=False)

family_df = _counts_to_frame(family, 'family name')
family_df.to_csv(WRITE_DIR + 'family-count.csv', index=False)

## Plotting

In [None]:
species_count = pd.read_csv(WRITE_DIR + 'species-count.csv')['count']
genus_count   = pd.read_csv(WRITE_DIR + 'genus-count.csv')['count']
family_count  = pd.read_csv(WRITE_DIR + 'family-count.csv')['count']

In [None]:
# box plot of the image count per species
fig = px.box(
    species_count, y='count', title='Data Distribution at Species Level'
).update_layout(yaxis_title='image count')
fig.show()

In [None]:
# box plot of the image count per genus
fig = px.box(
    genus_count, y='count', title='Data Distribution at Genus Level'
).update_layout(yaxis_title='image count')
fig.show()

In [None]:
# box plot of the image count per family
fig = px.box(
    family_count, y='count', title='Data Distribution at Family Level'
).update_layout(yaxis_title='image count')
fig.show()

Plotting the original data distribution at species level

In [None]:
# Column '2' of iNat_totalcount_onGBIF.csv is what gets plotted here.
gbif_totals   = pd.read_csv(WRITE_DIR + 'iNat_totalcount_onGBIF.csv')
species_count = gbif_totals['2']

# species plot
fig = px.box(
    species_count, y='2', title='Original Data Distribution on GBIF at Species Level'
).update_layout(yaxis_title='image count')
fig.show()

## Important Macromoth Families to Train
First, we fetch data for the macromoth families that are to be prioritised for training.

In [None]:
# Families that make up the priority macromoth subset.
family_macrom    = [
    'Apatelodidae', 'Bombycidae', 'Cossidae',
    'Drepanidae', 'Erebidae', 'Geometridae', 'Hepialidae',
    'Lasiocampidae', 'Limacodidae', 'Notodontidae',
    'Saturniidae', 'Sesiidae', 'Sphingidae', 'Uraniidae', 'Zygaenidae',
]

### the below code is used to check if the desired families/subfamilies exist in our dataset ###
# family_list      = set(moth_data['family'])
# subfamily_list   = set(moth_data['subfamily'])

# for item in family_macrom:
#   if item not in family_list:
#     print(item)

# for item in subfamily_macrom:
#   if item not in subfamily_list:
#     print(item)

# index labels of the rows whose family is not in the priority list
indx_list      = [indx for indx in moth_data.index
                  if moth_data['family'][indx] not in family_macrom]

# drop() hands back a new frame, so moth_data itself keeps all of its rows
macromoth_data = moth_data.drop(indx_list)
macromoth_data.to_csv(WRITE_DIR + 'MacroMothList_21April2021.csv', index=False)

In [None]:
# reading when already saved
macromoth_species   = 'MacroMothList_21April2021.csv'
file                = WRITE_DIR + macromoth_species
macromoth_data      = pd.read_csv(file)

species, genus, family, total_count = count_stat(macromoth_data)

The total number of images in the set is:  261109
Total family, genus and species respectively:  15 300 735


In [None]:
# Turn the three totals dictionaries into two-column frames ...
species_df, genus_df, family_df = (
    pd.DataFrame.from_dict(totals.items()) for totals in (species, genus, family)
)

species_df.columns = ['species name', 'count']
genus_df.columns   = ['genus name', 'count']
family_df.columns  = ['family name', 'count']

# ... and keep only the count column of each for plotting.
species_count, genus_count, family_count = (
    frame['count'] for frame in (species_df, genus_df, family_df)
)

In [None]:
# One box plot per taxonomic rank, drawn in the order species, genus, family.
rank_plots = (
    (species_count, 'Data Distribution at Species Level'),
    (genus_count, 'Data Distribution at Genus Level'),
    (family_count, 'Data Distribution at Family Level'),
)

for rank_counts, plot_title in rank_plots:
  fig = px.box(rank_counts, y='count', title=plot_title)
  fig.update_layout(yaxis_title='image count')
  fig.show()

## Miscellaneous - Not to Run



### Zipping files so that they can be downloaded 

In [None]:
!zip -r drive/MyDrive/Data/iNat_data.zip drive/MyDrive/Data/iNat/Apatelodidae/

In [None]:
# this updates the existing zip file
!zip -u -r drive/MyDrive/Data/iNat_data.zip drive/MyDrive/Data/iNat/Cossidae/ drive/MyDrive/Data/iNat/Drepanidae/ drive/MyDrive/Data/iNat/Erebidae/ 

In [None]:
# Load the JSON content of countdata.txt and display it. The context manager
# closes the file handle, which the plain open() call never did.
with open(WRITE_DIR + 'countdata.txt') as count_file:
  file = json.load(count_file)
file

In [None]:
# data = occ.search(taxonKey=1939759, datasetKey=INat_KEY)
# The query string has to come before the keyword arguments; written the other
# way round this line was a SyntaxError.
# NOTE(review): the cells below index data['results'], which matches the
# commented-out occ.search call rather than this download request - confirm
# which of the two is wanted here.
data = occ.download('taxonKey = 1939759', user = "adityajain07")
print(data)

SyntaxError: ignored

In [None]:
print(json.dumps(data['results'][0], indent=3))

In [None]:
print(data.keys())
print(data['count'])
print(data['endOfRecords'], data['limit'])
print(data['results'][2])
print(data['results'][0].keys())

dict_keys(['offset', 'limit', 'endOfRecords', 'count', 'results', 'facets'])
101193
False 300
{'key': 2005401077, 'datasetKey': '50c9509d-22c7-4a22-a47d-8c48425ef4a7', 'publishingOrgKey': '28eb1a3f-1c15-4a95-931a-4af90ecb574d', 'installationKey': '997448a8-f762-11e1-a439-00145eb45e9a', 'publishingCountry': 'US', 'protocol': 'DWC_ARCHIVE', 'lastCrawled': '2021-03-28T07:49:20.828+00:00', 'lastParsed': '2021-03-28T09:45:55.128+00:00', 'crawlId': 257, 'hostingOrganizationKey': '28eb1a3f-1c15-4a95-931a-4af90ecb574d', 'extensions': {}, 'basisOfRecord': 'HUMAN_OBSERVATION', 'occurrenceStatus': 'PRESENT', 'taxonKey': 7207515, 'kingdomKey': 1, 'phylumKey': 54, 'classKey': 216, 'orderKey': 797, 'familyKey': 7017, 'genusKey': 5133087, 'speciesKey': 5133088, 'acceptedTaxonKey': 7207515, 'scientificName': 'Danaus plexippus plexippus', 'acceptedScientificName': 'Danaus plexippus plexippus', 'kingdom': 'Animalia', 'phylum': 'Arthropoda', 'order': 'Lepidoptera', 'family': 'Nymphalidae', 'genus': 'Dana

In [None]:
from pygbif import occurrences as occ
from pygbif import species

iNat_KEY = "50c9509d-22c7-4a22-a47d-8c48425ef4a7"
test = occ.search(q='Danaus plexippus', datasetKey=iNat_KEY)
# test = occ.download_list(user = "adityajain07", taxonKey = 5133088)
# test['count']
# result = species.name_suggest(q='Danaus plexippus')

In [None]:
import urllib
print(test.keys())
print(test['count'])
# test['endOfRecords']
image_link = test['results'][299]['media'][0]['identifier']

urllib.request.urlretrieve(image_link, 'test.jpg')

dict_keys(['offset', 'limit', 'endOfRecords', 'count', 'results', 'facets'])
101193


('test.jpg', <http.client.HTTPMessage at 0x7f681d8f7750>)