# GBIF Testing


# Kingdom: 	Fungi
### Division: 	    Basidiomycota
### Subdivision: 	Agaricomycotina
### Class: 	        Agaricomycetes

# Setup

In [1]:
!pip install pygbif

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pygbif.occurrences as occ
import pygbif as pg
from json import loads as parse_json
import json
import pandas as pd
import numpy as np
import requests
import io
import os

# Changing the option to show a dataframe not 'in-line'
pd.set_option('display.expand_frame_repr', False)

# Connecting to my drive
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)
%cd gdrive/MyDrive/Mushroom_Data

# Checking that my drive is connected
os.listdir()

Mounted at /content/gdrive/
/content/gdrive/MyDrive/Mushroom_Data


['region_query.csv',
 'occurrence.tsv',
 'multimedia.tsv',
 'GBIF_Playground.ipynb',
 'multimedia (1).gsheet',
 'region_query.gsheet',
 'links.csv',
 'links.gsheet']

# Trying to use pygbif

This was our initial attempt to use pygbif to gather the images from the GBIF database. We realized very quickly, however, that this wouldnt work out since they have a capped number of results for any query using the automated APIs as well as being excruciatingly slow to even collect and filter using this method. Instead we had to make an account and then a query via their website and download the results from there.

API: https://pygbif.readthedocs.io/en/latest/intro/install.html

In [3]:
# region = "POLYGON((-80.84917 24.20293,-80.01938 24.59512,-79.63128 26.09898,-79.42511 27.17836,-80.63179 31.09126,-75.41786 34.72027,-74.41965 36.59897,-80.11633 36.64279,-83.25188 36.64584,-87.28004 36.67774,-88.10162 36.70167,-88.08966 36.51821,-89.6346 36.59016,-90.04039 35.68417,-90.40183 35.04352,-90.82846 34.45193,-91.29241 33.66984,-91.11607 32.38428,-91.61095 31.46846,-91.71903 30.97357,-89.80207 30.95082,-89.87033 30.72329,-89.68261 30.19996,-88.82936 30.16014,-86.44589 29.87155,-84.32692 29.3511,-83.65777 29.16523,-82.65404 26.11688,-83.29368 24.74936,-83.01713 23.99761,-80.84917 24.20293))"

# key = pg.species.name_backbone(name = "agaricomycetes", rank="class")['usageKey']
# key = pg.species.name_backbone(name = "Phallus", rank="genus")['usageKey']
# print(key)
# region_results = occ.search(taxonKey = key, geometry= region, mediaType = 'StillImage')['results']

# species = []
# len_iter = len(region_results)
# page = 1


# while len_iter > 0: # per page
#     print(f'pages: {page}')

#     for i in range(len_iter): # per page item
#         if region_results[i]['scientificName'] not in species:
#             species.append(region_results[i]['scientificName'])

#     region_results = occ.search(geometry= region, mediaType = 'StillImage', offset = 300 * page)['results']
#     len_iter = len(region_results)
#     page += 1

#     if page > 10:
#         break

# print(f'pages: {page}')
# print(f'species: {species}')

# key_1 = pg.species.name_backbone(name = "Amanita muscaria", rank="species")['usageKey']
# key_2 = pg.species.name_backbone(name = "Schizophyllum commune", rank="species")['usageKey']
# payload = occ.search(geometry= region, taxonKey = key_1, limit = 302, mediaType = 'StillImage')

# print(len(payload['results']))

# json_object = json.dumps(payload, indent=4)

# with open("example_query.json", "w") as outfile:
#     outfile.write(json_object)

# print(f'species: {len(species)}')
# print(f'pages: {page}')

# Wow! jk

2524102


# GBIF Download Citation:
GBIF.org (21 November 2022) GBIF Occurrence Download  https://doi.org/10.15468/dl.pjkxtn

# Loading the CSVs from the Occurence Download

I downloaded a CSV containing about ~56,000 occurences of images within The Agaricomycetes taxon that are also withing our bounding polygon. I specified human observation, but 'preserved specimens' may also petentially be viable, however many of those are using plain backgrounds and wouldn't really match 'real data' scenarios.

In [6]:
# Loading CSVs from google drive
query = pd.read_csv('region_query.csv', sep=',', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses taxonKey for 'class'
media = pd.read_csv('multimedia.tsv', sep='\t', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses gbifID for 'occurrence'
occur = pd.read_csv('occurrence.tsv', sep='\t', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses taxonKey for 'class' and gbifIF for 'occurrence'

# checking their sizes
print("Shape of query: ", query.shape)
print("Shape of media: ", media.shape)
print("Shape of occur: ", occur.shape)

# unique taxons
occur_count = occur.nunique()
media_count = media.nunique()
print('unique ids in occur: ', occur_count["gbifID"])
print('unique ids in media: ', media_count["gbifID"])

Shape of query:  (1307, 22)
Shape of media:  (121071, 10)
Shape of occur:  (56194, 95)
unique ids in occur:  56194
unique ids in media:  56194


# Data Transformations

In [7]:
# combine the dataframes based on matching id
occur_media = pd.merge(occur, media, on="gbifID", how="inner")

print("Shape of occur_media: ", occur_media.shape)
print(occur_media['identifier_y'].head)

Shape of occur_media:  (121071, 104)
<bound method NDFrame.head of 0         https://inaturalist-open-data.s3.amazonaws.com...
1         https://inaturalist-open-data.s3.amazonaws.com...
2         https://inaturalist-open-data.s3.amazonaws.com...
3         https://inaturalist-open-data.s3.amazonaws.com...
4         https://inaturalist-open-data.s3.amazonaws.com...
                                ...                        
121066    https://inaturalist-open-data.s3.amazonaws.com...
121067    https://inaturalist-open-data.s3.amazonaws.com...
121068    https://inaturalist-open-data.s3.amazonaws.com...
121069    https://inaturalist-open-data.s3.amazonaws.com...
121070    https://inaturalist-open-data.s3.amazonaws.com...
Name: identifier_y, Length: 121071, dtype: object>


In [8]:
# sort using the values as numbers, not objects/strings
sort_order = query['numberOfOccurrences'].astype('int32').argsort() 

# sort the queries by occurence number and then reverse it because argsort is always ascending order
taxons = query.iloc[sort_order][query['numberOfOccurrences'].astype('int32') > 99][::-1].dropna(subset=['species']) 

print(taxons[['taxonKey','genus', 'species', 'numberOfOccurrences']])

    taxonKey          genus                    species numberOfOccurrences
0    2548311       Trametes        Trametes versicolor                1879
2    5243168  Chlorophyllum   Chlorophyllum molybdites                1257
3    2553087        Stereum        Stereum complicatum                1183
4    5249599   Cantharellus  Cantharellus cinnabarinus                1164
5    5248508       Hericium         Hericium erinaceus                1139
..       ...            ...                        ...                 ...
111  2524071    Pseudocolus     Pseudocolus fusiformis                 104
112  2554333       Geastrum          Geastrum saccatum                 102
113  9655343        Amanita          Amanita lavendula                 101
114  2543487    Ischnoderma      Ischnoderma resinosum                 101
115  5241399    Volvariella      Volvariella bombycina                 101

[109 rows x 4 columns]


  """


## Filtering links to the selected species

In [24]:
# initialize the empty dataframe
links = pd.DataFrame(columns=['key',  'species', 'link'])

# loop through and grab 100 rows per taxon
for i in range(len(taxons)): 

    # subset occurences to get just the first 100 matching rows
    current_taxon = taxons['taxonKey'].iloc[i]
    subset = occur_media.loc[occur_media['taxonKey'] == current_taxon].dropna(subset=['identifier_y'])

    # initialize a (100, 3) array to fill
    arr = np.ndarray((100, 3), dtype='object')
    count = 0

    for j in range(len(subset)):

        # get the file extension 
        url = subset.iloc[j]['identifier_y']
        root, ext = os.path.splitext(url)

        # filter out any files that dont have .jpg or .jpeg extensions for uniformity
        if ext == '.jpg' or ext == '.jpeg':
            
            # take the useful values from the current row and insert them into the ndarray
            row = subset.iloc[j][['taxonKey', 'species', 'identifier_y']]
            arr[count] = [row['taxonKey'], row['species'], row['identifier_y']]
            count += 1

        # filled array
        if count >= 100:
            count = 0
            break

    # convert the filled ndarray into a dataframe 
    sub_100 = pd.DataFrame(arr, columns=['key',  'species', 'link'])

    # concatenate the dataframe
    links = pd.concat([links, sub_100])

# reset and drop the indices        
links = links.reset_index()[['key',  'species', 'link']]

print('\n\nNumber of rows:\t', links.shape[0], '\n')
print(links.tail())



Number of rows:	 10900 

           key                species                                               link
10895  5241399  Volvariella bombycina  https://inaturalist-open-data.s3.amazonaws.com...
10896  5241399  Volvariella bombycina  https://inaturalist-open-data.s3.amazonaws.com...
10897  5241399  Volvariella bombycina  https://inaturalist-open-data.s3.amazonaws.com...
10898  5241399  Volvariella bombycina  https://inaturalist-open-data.s3.amazonaws.com...
10899  5241399  Volvariella bombycina  https://inaturalist-open-data.s3.amazonaws.com...


# The CSV Download

In [25]:
# run this to redownload the aggregated links file
links.to_csv('links.csv', index=False)