In [1]:
"""
Author        : Aditya Jain
Date Started  : January 5, 2023

About         : Testing new method for downloading data
"""

import json
import os
import shutil
import urllib
import urllib.request

import pandas as pd
from dwca.read import DwCAReader

# Path to the zipped archive that is opened with DwCAReader in the next cell
dwca_file = '/home/mila/a/aditya.jain/scratch/GBIF_Data/leps_images_adult-imago.zip'

In [2]:
# Read the multimedia and occurrence tables out of the archive as DataFrames.
# Both reads use the same options; on_bad_lines='skip' drops malformed rows
# instead of aborting the load.
with DwCAReader(dwca_file) as dwca:
    media_df = dwca.pd_read(
        'multimedia.txt',
        parse_dates=True,
        on_bad_lines='skip',
    )
    occ_df = dwca.pd_read(
        'occurrence.txt',
        parse_dates=True,
        on_bad_lines='skip',
    )

  df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)
  df = read_csv(self.absolute_temporary_path(relative_path), **kwargs)


In [4]:
# def fetch_meta_data(data):
# 	""" returns the relevant metadata for a GBIF observation"""

# 	fields	= ['decimalLatitude', 'decimalLongitude',
# 				'order', 'family', 'genus', 'species', 'acceptedScientificName',
# 				'year', 'month', 'day',
# 				'datasetName', 'taxonID', 'acceptedTaxonKey', 'lifeStage', 'basisOfRecord']

# 	meta_data = {}

# 	for field in fields:
# 		try:
# 			meta_data[field] = data[field]
# 		except:
# 			meta_data[field] = 'NA'
# 	return meta_data
def fetch_meta_data(data):
	"""Return the relevant metadata for a GBIF observation.

	Args:
		data: record of one occurrence that supports ``.get(field)``,
			e.g. a pandas Series row or a plain dict.

	Returns:
		dict with one entry per metadata field. The value is taken from
		``data``; it is the string 'NA' when the field is absent from the
		record or its value is missing (None/NaN).
	"""

	fields	= ['decimalLatitude', 'decimalLongitude',
				'order', 'family', 'genus', 'species', 'acceptedScientificName',
				'year', 'month', 'day',
				'datasetName', 'taxonID', 'acceptedTaxonKey', 'lifeStage', 'basisOfRecord']

	meta_data = {}

	for field in fields:
		# .get() returns None for a field the record does not carry, so an
		# incomplete record yields 'NA' instead of raising KeyError
		value = data.get(field)
		meta_data[field] = 'NA' if pd.isna(value) else value

	return meta_data

In [5]:
def fetch_image_data(
	taxon_key: int,
	family: str,
	genus: str,
	species: str,
	occurrence_df,
	media_df,
	write_dir: str,
	max_data: int,
):
	"""Download up to ``max_data`` images for one taxon and save their metadata.

	Images are written to ``<write_dir>/<family>/<genus>/<species>/`` as
	``<occurrence id>.jpg`` together with a ``meta_data.json`` file that maps
	each image file name to the output of ``fetch_meta_data`` for its
	occurrence row. Any existing folder for the species is deleted first.

	Args:
		taxon_key: value matched against the 'acceptedTaxonKey' column of
			``occurrence_df``.
		family, genus, species: names used to build the output folder.
		occurrence_df: DataFrame with at least 'id' and 'acceptedTaxonKey'
			columns, plus the fields read by ``fetch_meta_data``.
		media_df: DataFrame with 'coreid' (occurrence id) and 'identifier'
			(image URL) columns.
		write_dir: root folder of the image tree (trailing slash optional).
		max_data: maximum number of images to download.

	Returns:
		``[image_count, total_occ]``: number of images downloaded and number
		of occurrences found for the taxon. ``[0, 0]`` when the taxon has no
		occurrence (no ``meta_data.json`` is written in that case).
	"""
	write_location = os.path.join(write_dir, family, genus, species)

	# start from a clean folder: drop a previous download if there is one,
	# then (re)create the hierarchical folder structure
	shutil.rmtree(write_location, ignore_errors=True)
	os.makedirs(write_location, exist_ok=True)

	occurrence_data = occurrence_df.loc[occurrence_df['acceptedTaxonKey'] == taxon_key]
	total_occ = len(occurrence_data)
	print(f'Total occurence available is {total_occ}')
	if total_occ == 0:
		return [0, 0]

	# shuffle so that a capped download is a random subset of the occurrences
	occurrence_data = occurrence_data.sample(frac = 1)

	# Build the occurrence-id -> first image URL lookup once, instead of
	# scanning the whole media table for every occurrence.
	taxon_media = media_df.loc[media_df['coreid'].isin(occurrence_data['id'])]
	first_media = taxon_media.drop_duplicates(subset='coreid', keep='first')
	url_by_obs = dict(zip(first_media['coreid'], first_media['identifier']))

	image_count = 0
	meta_data   = {}
	for _, row in occurrence_data.iterrows():
		# checked before downloading so that max_data <= 0 fetches nothing
		if image_count >= max_data:
			break

		obs_id = row['id']
		image_url = url_by_obs.get(obs_id)
		if image_url is None:   # observation has no media entry
			continue

		image_name = str(obs_id) + '.jpg'
		try:
			# metadata is built first so a failure cannot leave a counted
			# image without a metadata entry
			m_data = fetch_meta_data(row)
			urllib.request.urlretrieve(image_url, os.path.join(write_location, image_name))
		except Exception as e:
			# best effort: report the failure and move on to the next one
			print(f'An exception has occurred {e}')
			continue

		meta_data[image_name] = m_data
		image_count += 1

	with open(os.path.join(write_location, 'meta_data.json'), 'w') as outfile:
		json.dump(meta_data, outfile)

	return [image_count, total_occ]



In [7]:
# Root of the image tree and the per-species download cap for this trial run
write_dir    = '/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/'
max_img_data = 1000

# Trial download for a single species. Note that fetch_image_data first
# deletes any existing folder for that species under write_dir.
image_count, total_occ = fetch_image_data(
    taxon_key=8223165,
    family='Crambidae',
    genus='Ostrinia',
    species='Ostrinia nubilalis',
    occurrence_df=occ_df,
    media_df=media_df,
    write_dir=write_dir,
    max_data=max_img_data,
)

Total occurence available is 1799


In [24]:
# Show which columns each of the two archive tables exposes
print('Media columns: ', media_df.columns)
print('Occurrence columns: ', occ_df.columns)

Media columns:  Index(['coreid', 'type', 'format', 'identifier', 'references', 'title',
       'description', 'source', 'audience', 'created', 'creator',
       'contributor', 'publisher', 'license', 'rightsHolder'],
      dtype='object')
Occurrence columns:  Index(['id', 'abstract', 'accessRights', 'accrualMethod', 'accrualPeriodicity',
       'accrualPolicy', 'alternative', 'audience', 'available',
       'bibliographicCitation',
       ...
       'level0Name', 'level1Gid', 'level1Name', 'level2Gid', 'level2Name',
       'level3Gid', 'level3Name', 'iucnRedListCategory', 'eventType',
       'geodeticDatum'],
      dtype='object', length=260)


In [5]:
# Print every occurrence column name on its own line (the Index repr
# abbreviates long column lists with "...")
for col in occ_df.columns:
    print(col)

id
abstract
accessRights
accrualMethod
accrualPeriodicity
accrualPolicy
alternative
audience
available
bibliographicCitation
conformsTo
contributor
coverage
created
creator
date
dateAccepted
dateCopyrighted
dateSubmitted
description
educationLevel
extent
format
hasFormat
hasPart
hasVersion
identifier
instructionalMethod
isFormatOf
isPartOf
isReferencedBy
isReplacedBy
isRequiredBy
isVersionOf
issued
language
license
mediator
medium
modified
provenance
publisher
references
relation
replaces
requires
rights
rightsHolder
source
spatial
subject
tableOfContents
temporal
title
type
valid
institutionID
collectionID
datasetID
institutionCode
collectionCode
datasetName
ownerInstitutionCode
basisOfRecord
informationWithheld
dataGeneralizations
dynamicProperties
occurrenceID
catalogNumber
recordNumber
recordedBy
recordedByID
individualCount
organismQuantity
organismQuantityType
sex
lifeStage
reproductiveCondition
behavior
establishmentMeans
degreeOfEstablishment
pathway
georeferenceVerificationSta

In [39]:
# Look up the media rows of one observation id and show their image URL(s)
test_data = media_df.loc[media_df['coreid'] == 2873773745]
print(test_data['identifier'])


4162592    https://inaturalist-open-data.s3.amazonaws.com...
Name: identifier, dtype: object


Testing image count issue

In [4]:
# taxon key for Monochroa elongella
taxon_key = 1846645

# every occurrence row recorded under that taxon key
occurrence_data = occ_df.loc[occ_df['acceptedTaxonKey'].eq(taxon_key)]

In [5]:
# display the occurrence rows selected for the taxon
occurrence_data

Unnamed: 0,id,abstract,accessRights,accrualMethod,accrualPeriodicity,accrualPolicy,alternative,audience,available,bibliographicCitation,...,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory,eventType,geodeticDatum
473906,1046478556,,Free usage,,,,,,,,...,Sweden,SWE.15_1,Stockholm,SWE.15.12_1,Österåker,,,NE,,WGS84
1044215,932343715,,Free usage,,,,,,,,...,Sweden,SWE.5_1,Halland,SWE.5.4_1,Kungsbacka,,,NE,,WGS84
1588605,3907598320,,,,,,,,,,...,Netherlands,NLD.3_1,Fryslân,NLD.3.2_1,Ameland,,,NE,,WGS84
2073983,3905589284,,,,,,,,,,...,Netherlands,NLD.3_1,Fryslân,NLD.3.2_1,Ameland,,,NE,,WGS84
3241961,3433600837,,Free usage,,,,,,,,...,Sweden,SWE.5_1,Halland,SWE.5.6_1,Varberg,,,NE,,WGS84


In [32]:
# For every selected occurrence: show its media rows, pick one image URL and
# try to download it to test.jpg; any failure is printed and the loop moves on.
for idx, row in occurrence_data.iterrows():
    obs_id = row['id']

    try:
        media_entry = media_df.loc[media_df['coreid'] == obs_id]
        print(media_entry['coreid'])
        if len(media_entry) <= 1:
            # .item() raises when there is no media row; handled below
            image_url = media_entry['identifier'].item()
            print(image_url)
        else:
            # multiple images for an observation: keep the first one
            media_entry = media_entry.iloc[0, :]
            image_url = media_entry['identifier']
            print(image_url, flush=True)
        urllib.request.urlretrieve(image_url, 'test' + '.jpg')
    except Exception as e:
        print(e)
        continue

5052157    1046478556
Name: coreid, dtype: int64
https://www.artportalen.se/MediaLibrary/2014/11/9fa43858-9b02-4c7f-afa0-35bd91243bae_image.jpg
6725502    932343715
Name: coreid, dtype: int64
https://www.artportalen.se/MediaLibrary/2016/1/9dfbd962-1699-48eb-b296-42d87cecd21d_image.jpg
4904315    3907598320
4904316    3907598320
Name: coreid, dtype: int64
https://observation.org/photos/54646790.jpg
4859489    3905589284
4859490    3905589284
Name: coreid, dtype: int64
https://observation.org/photos/54224909.jpg
1049125    3433600837
Name: coreid, dtype: int64
https://www.artportalen.se/MediaLibrary/2021/7/1d2509de-fd9d-4d09-834e-74c6585e5390_image.jpg


In [26]:
# NOTE(review): media_entry is not defined in this cell; it is whatever an
# earlier cell left in the kernel (presumably the download loop) - hidden state.
print(media_entry['identifier'])
# .item() requires exactly one row in media_entry
image_url = media_entry['identifier'].item()
urllib.request.urlretrieve(image_url, 'test' + '.jpg')

1049125    https://www.artportalen.se/MediaLibrary/2021/7...
Name: identifier, dtype: object


('test.jpg', <http.client.HTTPMessage at 0x7fa799587f10>)

Testing an issue with taxa that are not found on GBIF (taxon key of -1)

In [2]:
# Species checklist with a search name and a GBIF taxon key per row
species_list = '/home/mila/a/aditya.jain/mothAI/species_lists/Quebec-Vermont_Moth-List_22July2022.csv'
moth_data = pd.read_csv(species_list)

# parallel lists: search name and integer taxon key of each checklist row
search_species_names = moth_data["search_species_name"].tolist()
taxon_keys = [int(taxon) for taxon in moth_data["taxon_key_gbif_id"]]

In [3]:
write_dir = '/home/mila/a/aditya.jain/scratch/GBIF_Data/moths_world/'

# dtype enforced for each column of the statistics table on load
data_type = dict(
    taxon_key_gbif_id=int,
    family_name=str,
    genus_name=str,
    search_species_name=str,
    gbif_species_name=str,
    image_count=int,
    total_occ_count=int,
)
count_list = pd.read_csv(write_dir + "data_statistics.csv", dtype=data_type)

In [None]:
# Names already present in the statistics table. Membership has to be tested
# against the column VALUES: `name in series` checks the index labels, so the
# original condition could never match a species name.
recorded_names = set(count_list["search_species_name"])

for name, key in zip(search_species_names, taxon_keys):
    print(name)
    # taxa not there on GBIF
    if key == -1 and name in recorded_names:
        print(f"{name} is not found on GBIF", flush=True)
        # stop at the first such taxon; the `continue` that used to follow
        # was unreachable and its indentation made the cell a syntax error
        break

In [12]:
# Checklist row(s) for one taxon key; .item() requires exactly one match
taxon_data = moth_data[moth_data['taxon_key_gbif_id']==1731862]
taxon_data['family_name'].item()

'Eriocraniidae'