# Download Pipeline

## Imports

In [34]:
from json import loads as parse_json
import json
import pandas as pd
import numpy as np
import requests
import io
import os

# Changing the option to show a dataframe not 'in-line'
pd.set_option('display.expand_frame_repr', False)

In [35]:


# Checking that my drive is connected
dir = os.listdir()
print(dir)

csv_file = [i for i in dir if '.csv' in i][0]
print(csv_file)

['florida_region_query.csv', 'multimedia.txt', 'occurrence.txt', 'Region_Downloading_Pipeline.ipynb']
florida_region_query.csv


## Reading Query Files

In [36]:
# Loading CSVs from google drive
try:
    query = pd.read_csv(csv_file, sep=',', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses taxonKey for 'class'
except:
    query = pd.read_csv(csv_file, sep='\t', encoding='latin-1', dtype=object).dropna(axis=1, how='all') # uses taxonKey for 'class'


print("Shape of query: ", query.shape)
# print(query.sort_values('numberOfOccurrences'))

media = pd.read_csv('multimedia.txt', sep='\t', encoding='latin-1', dtype=object, on_bad_lines='skip').dropna(axis=1, how='all') # uses gbifID for 'occurrence'
print("Shape of media: ", media.shape)

occur = pd.read_csv('occurrence.txt', sep='\t', encoding='latin-1', dtype=object, on_bad_lines='skip').dropna(axis=1, how='all') # uses taxonKey for 'class' and gbifIF for 'occurrence'
print("Shape of occur: ", occur.shape)

# unique taxons
occur_count = occur.nunique()
media_count = media.nunique()
print('unique ids in occur: ', occur_count["gbifID"])
print('unique ids in media: ', media_count["gbifID"])

Shape of query:  (1307, 22)
Shape of media:  (1468200, 12)
Shape of occur:  (1036564, 202)
unique ids in occur:  1036564
unique ids in media:  709815


## Managing the Loaded Data 

In [37]:
# combine the dataframes based on matching id
occur_media = pd.merge(occur, media, on="gbifID", how="inner")

print("Shape of occur_media: ", occur_media.shape)
print(occur_media['identifier_y'].head)

Shape of occur_media:  (1468010, 213)
<bound method NDFrame.head of 0          https://inaturalist-open-data.s3.amazonaws.com...
1          https://inaturalist-open-data.s3.amazonaws.com...
2          https://inaturalist-open-data.s3.amazonaws.com...
3          https://inaturalist-open-data.s3.amazonaws.com...
4          https://inaturalist-open-data.s3.amazonaws.com...
                                 ...                        
1468005    https://svampe.databasen.org/uploads/2022-1027...
1468006    https://svampe.databasen.org/uploads/2022-1027...
1468007    https://svampe.databasen.org/uploads/2022-1027...
1468008    https://svampe.databasen.org/uploads/2022-1027...
1468009    http://specify.ugrasu.ru:8080/fileget?coll=Fun...
Name: identifier_y, Length: 1468010, dtype: object>


In [38]:
# sort using the values as numbers, not objects/strings
sort_order = query['numberOfOccurrences'].astype('int32').argsort() 

# sort the queries by occurence number and then reverse it because argsort is always ascending order
taxons = query.iloc[sort_order][query['numberOfOccurrences'].astype('int32') > 0][::-1].dropna(subset=['species']) 

print(taxons[['taxonKey','genus', 'species', 'numberOfOccurrences']])

      taxonKey           genus                      species numberOfOccurrences
0      2548311        Trametes          Trametes versicolor                1879
2      5243168   Chlorophyllum     Chlorophyllum molybdites                1257
3      2553087         Stereum          Stereum complicatum                1183
4      5249599    Cantharellus    Cantharellus cinnabarinus                1164
5      5248508        Hericium           Hericium erinaceus                1139
...        ...             ...                          ...                 ...
1035   3363661           Fomes            Fomes fomentarius                   1
1033   2549043         Erastia         Erastia salmonicolor                   1
1032   8252570        Entoloma            Entoloma watsonii                   1
1031   8095923        Entoloma           Entoloma strictius                   1
1306  10792674  Zhuliangomyces  Zhuliangomyces subillinitus                   1

[1214 rows x 4 columns]


  taxons = query.iloc[sort_order][query['numberOfOccurrences'].astype('int32') > 0][::-1].dropna(subset=['species'])


## Generating Links

In [39]:
number_of_images = 300
fail_count = 0 # number of species without enough images

# initialize the empty dataframe
links = pd.DataFrame(columns=['key',  'species', 'link'])

# loop through and grab n rows per taxon
for i in range(len(taxons)): 

    # subset occurences to get just the first n matching rows
    current_taxon = taxons['taxonKey'].iloc[i]
    subset = occur_media.loc[occur_media['taxonKey'] == current_taxon].dropna(subset=['identifier_y'])

    # initialize a (n, 3) array to fill
    arr = np.ndarray((number_of_images, 3), dtype='object')
    count = 0

    if len(subset) > number_of_images:
        for j in range(len(subset)):

            # get the file extension 
            url = subset.iloc[j]['identifier_y']
            root, ext = os.path.splitext(url)

            # filter out any files that dont have .jpg or .jpeg extensions for uniformity
            if ext == '.jpg' or ext == '.jpeg':
                
                # take the useful values from the current row and insert them into the ndarray
                row = subset.iloc[j][['taxonKey', 'species', 'identifier_y']]
                arr[count] = [row['taxonKey'], row['species'], row['identifier_y']]
                count += 1

            # filled array
            if count >= number_of_images:
                count = 0
                break

        # convert the filled ndarray into a dataframe 
        sub = pd.DataFrame(arr, columns=['key',  'species', 'link'])

        # concatenate the dataframe
        links = pd.concat([links, sub])
        
    # else:
    #     fail_count += 1

# reset and drop the indices        
links = links.reset_index()[['key',  'species', 'link']]

print('Number of Species\t', links.shape[0] / number_of_images)
print('Number of rows:\t', links.shape[0], '\n')

Number of Species	 524.0
Number of rows:	 157200 



In [40]:
# run this to redownload the aggregated links file
links.to_csv('links.csv', index=False)

## Reading from Links

In [None]:
nkeys = links['key'].value_counts()
print('Unique keys:\t', len(nkeys))

nspecies = links['species'].value_counts()
print('Unique species:\t', len(nspecies), '\n')

keys_species = links.groupby(['key','species']).size().reset_index().rename(columns={0:'count'})
print(keys_species[['species']].value_counts().sort_values().tail())

print('\nIDs for duplicate species ("Laetiporus sulphureus"): ', links[links['species'] == 'Laetiporus sulphureus']['key'].unique())

links = links[links['key'] != '2542235']

print('\n\nAfter Removal:')
print('\nIDs for duplicate species ("Laetiporus sulphureus"): ', links[links['species'] == 'Laetiporus sulphureus']['key'].unique())

nkeys = links['key'].value_counts()
print('\nUnique keys:\t', len(nkeys))

nspecies = links['species'].value_counts()
print('Unique species:\t', len(nspecies))

## Downloading the Images

In [None]:
dir = os.listdir()
print(dir)
if 'images' not in dir:
    os.mkdir('images/')

In [None]:
for key in links['key'].unique():
    subset = links[links['key'] == key]
    output_dir = 'images/'
    count = 0
    species_name = subset['species'].unique()[0].replace(' ', '_')

    print(species_name)

    # print(subset)

    for link in links[links['key'] == key]['link']:
        try:
            image = requests.get(link).content
            output_file = os.path.join(output_dir, species_name + '_' + str(count) + '.jpg')
            # print(output_file)
            with open(output_file, 'wb') as writer:
                writer.write(image)

            # FILE.download(output_file)

        except Exception as e: 
            print(e)
        
        finally:
            count += 1