In [1]:
import csv
import os
import re
import multiprocessing
from urllib.request import urlretrieve

In [2]:
# CSV file of observation records read by the next cell; the path is
# relative to the notebook's working directory.
path = 'data/observations-182134.csv'

## Read entry data from CSV

In [3]:
# Tallies built in a single pass over the CSV:
#   licences         -> number of rows per value of the 'license' column
#   scientific_names -> number of rows per value of the 'scientific_name' column
#   image_urls       -> one dict per row with the fields needed for downloading
licences = {}
scientific_names = {}
image_urls = []

# newline='' hands newline handling to the csv module, as its documentation
# requires; without it, quoted fields containing line breaks can be split
# into separate (corrupt) rows.
with open(path, newline='') as csvfile:
    # DictReader keys every row by the column names in the header line.
    for row in csv.DictReader(csvfile):
        image_urls.append({
            'id': row['id'],
            'scientific_name': row['scientific_name'],
            'image_url': row['image_url'],
        })
        licence = row['license']
        licences[licence] = licences.get(licence, 0) + 1
        name = row['scientific_name']
        scientific_names[name] = scientific_names.get(name, 0) + 1

### Licensing summary

In [4]:
# Number of rows per licence string; the '' key counts rows whose
# 'license' column is empty.
licences

{'': 6443,
 'CC-BY-NC': 22707,
 'CC-BY': 22122,
 'CC-BY-SA': 1100,
 'CC-BY-NC-SA': 828,
 'CC-BY-NC-ND': 486,
 'CC0': 1116,
 'CC-BY-ND': 16}

### Scientific name summary

In [5]:
# Number of rows per scientific name, as tallied while reading the CSV.
scientific_names

{'Entoloma virescens': 77,
 'Trametes coccinea': 1368,
 'Favolaschia calocera': 1112,
 'Auricularia cornea': 139,
 'Omphalotus nidiformis': 876,
 'Pulchrocladia retipora': 135,
 'Stereum versicolor': 756,
 'Microporus xanthopus': 248,
 'Lysurus mokusin': 152,
 'Clavulinopsis sulcata': 228,
 'Cladonia floerkeana': 61,
 'Podoscypha petalodes petalodes': 4,
 'Chlorophyllum brunneum': 136,
 'Coltricia australica': 170,
 'Nothojafnea cryptotricha': 3,
 'Lichenomphalia chromacea': 798,
 'Coprinellus micaceus': 61,
 'Cladonia confusa': 11,
 'Hygrocybe schistophila': 6,
 'Hypholoma brunneum': 129,
 'Aurantiporus pulcherrimus': 84,
 'Teloschistes chrysophthalmus': 192,
 'Laetiporus portentosus': 409,
 'Scutellinia scutellata': 8,
 'Rhizopus': 1,
 'Russula persanguinea': 385,
 'Aseroe rubra': 525,
 'Schizophyllum commune': 774,
 'Cantharellus concinnus': 253,
 'Amanita muscaria': 1307,
 'Sticta filix': 1,
 'Hericium coralloides': 135,
 'Macrolepiota clelandii': 453,
 'Leratiomyces ceres': 635,
 

In [6]:
# Number of distinct scientific names found in the CSV.
len(scientific_names)

1156

In [7]:
# How many scientific names have at least 10 rows.
sum(1 for count in scientific_names.values() if count >= 10)

481

In [8]:
# How many scientific names have at least 100 rows.
sum(1 for count in scientific_names.values() if count >= 100)

138

In [9]:
# Total number of rows belonging to names that have at least 10 rows.
sum(count for count in scientific_names.values() if count >= 10)

52986

We have 481 species with 10 or more images.

Let's fetch only those species with 10 or more images.
That's 481 species and about 53k images.
Looks like we're seeing a long tail here - about half the species have only a few images.

## Filter entries to populated scientific names

In [10]:
# Set of scientific names that occur on at least 10 rows.
popular_scientific_names = set(
    name for name, count in scientific_names.items() if count >= 10
)

In [11]:
# Keep only the entries whose scientific name is in popular_scientific_names,
# preserving their original order.
populated_image_urls = [
    entry
    for entry in image_urls
    if entry['scientific_name'] in popular_scientific_names
]

In [12]:
# Entry counts as a (kept after filtering, total before filtering) pair.
len(populated_image_urls), len(image_urls)

(52986, 54818)

## Fetch images

In [None]:
# Directory scheme:
# data/scientific_name/inat_id.jpg

In [None]:
from urllib.parse import urlparse

In [None]:
def save_image(image_data):
    """Download one entry's image to ``data/<scientific_name>/<id><ext>``.

    The directory name is the scientific name lower-cased with spaces
    replaced by underscores; the file name is the entry id followed by the
    lower-cased extension taken from the URL path (empty if the URL has none).

    Args:
        image_data: mapping with the keys ``'id'``, ``'scientific_name'`` and
            ``'image_url'``.

    Returns:
        None. Entries with an empty ``'image_url'`` are skipped without
        creating any file or directory.

    Raises:
        Errors from ``os.makedirs`` and ``urlretrieve`` (e.g. a failed
        download) are not caught and propagate to the caller.
    """
    image_url = image_data['image_url']
    if not image_url:
        # Nothing to download; returning here keeps one URL-less entry from
        # raising and aborting the whole batch it is part of.
        return

    # splitext only looks at the last path component, so a dot in a directory
    # name cannot leak a '/' into the extension, and a path without any dot
    # yields '' instead of an error.
    ext = os.path.splitext(urlparse(image_url).path)[1].lower()
    filename = f'{image_data["id"]}{ext}'
    scientific_name = image_data['scientific_name'].lower().replace(' ', '_')
    outpath = os.path.join('data', scientific_name)
    outfile = os.path.join(outpath, filename)
    print(outfile)
    os.makedirs(outpath, exist_ok=True)
    urlretrieve(image_url, outfile)

In [None]:
# Pool of 80 worker processes shared by the download cells below.
pool = multiprocessing.Pool(processes=80)

In [None]:
# Total number of entries to download; the download cells below work
# through this list in slices.
len(populated_image_urls)

In [None]:
%time pool.map(save_image, populated_image_urls[:5000])

In [None]:
%time pool.map(save_image, populated_image_urls[5000:10000])

In [None]:
%time pool.map(save_image, populated_image_urls[10000:15000])

In [None]:
%time pool.map(save_image, populated_image_urls[15000:20000])

In [None]:
%time pool.map(save_image, populated_image_urls[20000:25000])

In [None]:
%time pool.map(save_image, populated_image_urls[25000:30000])

In [None]:
%time pool.map(save_image, populated_image_urls[30000:35000])

In [None]:
%time pool.map(save_image, populated_image_urls[35000:40000])

In [None]:
%time pool.map(save_image, populated_image_urls[40000:45000])

In [None]:
%time pool.map(save_image, populated_image_urls[45000:50000])

In [None]:
%time pool.map(save_image, populated_image_urls[50000:])