<a href="https://colab.research.google.com/github/Oleonn/DataMining/blob/main/Data_mining_INaturalist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data mining

## Setup and connection to Google Drive

In [None]:
!pip install pyinaturalist

In [2]:
from pyinaturalist.v1.observations import get_observation_species_counts
import json
from pyinaturalist import (
    Observation,
    pprint,
    get_observations,
    get_observation_species_counts
)

from PIL import Image

import pandas as pd
pd.options.display.max_colwidth = 100  #Sert a augmenter la qte de caracteres affiches pour chaque string
import random

import csv

import requests

import os

In [3]:
# Mount Google Drive under /content/drive so that the project folders used below are reachable
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Directory and parameters

In [4]:
# Root data folder, the species (one class per species) and the name of the raw-picture subfolder
directory = "/content/drive/MyDrive/Projet_mellifere/Donnees/"
sp_classes = [
    "Asclepias_syriaca",
    "Daucus_carota",
    "Eutrochium_maculatum",
    "Leucanthemum_vulgare",
    "Solidago_canadensis",
]
originals = "originals"

# Make sure every species has its own folder and, inside it, an "originals" subfolder.
# Folders that already exist are left untouched; each newly created one is reported.
for species in sp_classes:
  species_dir = os.path.join(directory, species)
  for folder in (species_dir, os.path.join(species_dir, originals)):
    if os.path.exists(folder):
      continue
    os.makedirs(folder)
    print(f"folder {folder} created.")


Before executing the next cell, export the occurrences from the GBIF website for the targeted species. Make sure to select "INaturalist (research)" in the search filters. Download the occurrences in Darwin Core Archive format. Open the zip file(s) and extract the "multimedia.txt" file of every target species into its matching folder (that is, the species folder itself, not the "originals" subfolder).

## Data mining

In [5]:
# Picture size to download: either "thumb", "small", "medium", "large" or "original".
# This value replaces "original" in every picture link before the download.
wished_format = "small"
# Total number of pictures wished for every species
wished_nb = 50

In [59]:
# For every species: draw a random sample of picture links from its "multimedia.txt" file,
# download the pictures into the "originals" subfolder and record the sampled links in a CSV file.
download_timeout = 30  # seconds to wait for the server before giving up on a picture

for name in sp_classes:
  sp_classes_path = os.path.join(directory, name)
  sp_classes_originals = os.path.join(sp_classes_path, originals)

  #Creation of a random sample (n = wished_nb) from the list of pictures in the "multimedia.txt" file
  img_list_complete = (
      pd.read_csv(sp_classes_path+"/multimedia.txt", dtype=str, sep="\t")["identifier"]
      .dropna()  # a row without a link cannot be downloaded (and would break .replace below)
      .tolist()
  )
  # random.sample raises ValueError when asked for more items than the list holds
  sample_size = min(wished_nb, len(img_list_complete))
  if sample_size < wished_nb:
    print(f"Only {sample_size} pictures are listed for {name} ({wished_nb} wished)")
  # The links are expected to name the "original" size; swap in the wished size
  img_list_used = [
      link.replace("original", wished_format)
      for link in random.sample(img_list_complete, sample_size)
  ]

  #Import of all sampled pictures
  broken_images = []
  for count, img in enumerate(img_list_used, start=1):
    if count == 1:
      print(f"Download of {sample_size} files for {name} has started in {sp_classes_originals}")
    # The second-to-last segment of the link is used as the file name
    file_name = img.split('/')[-2]
    file_name = f"{sp_classes_originals}/{file_name}.jpeg"  # Update file extension to .jpeg

    # Fetch the whole picture first so that a failed transfer never leaves a partial file behind
    try:
      with requests.get(img, stream=True, timeout=download_timeout) as r:
        data = r.content if r.status_code == 200 else None
    except requests.RequestException:
      data = None

    if data is None:
      broken_images.append(img)
    else:
      with open(file_name, 'wb') as f:
        f.write(data)

    # Progress is reported once the file has actually been handled
    if count % 100 == 0 and count < sample_size:
      print(f"File {count} out of {sample_size} for {name} has been downloaded in {sp_classes_originals}")

  if img_list_used:
    if broken_images:
      print(f"{len(broken_images)} out of {sample_size} files for {name} could not be downloaded:")
      for img in broken_images:
        print(f"  {img}")
    else:
      print(f"All {sample_size} files for {name} have been downloaded in {sp_classes_originals}")

  # Every sampled link is recorded, including the broken ones, so that none is sampled again later
  with open(f"{sp_classes_originals}/img_list_used_{name}.csv", "w", newline="") as f:
    for item in img_list_used:
      f.write(item + ",\n")
  print(f"Mission {name} complete.")

print("Mission complete. Good work, 007")

Download of 50 files for Asclepias_syriaca has started in /content/drive/MyDrive/Projet_mellifere/Donnees/Asclepias_syriaca/originals
All 50 files for Asclepias_syriaca have been downloaded in /content/drive/MyDrive/Projet_mellifere/Donnees/Asclepias_syriaca/originals
Mission Asclepias_syriaca complete.
Download of 50 files for Daucus_carota has started in /content/drive/MyDrive/Projet_mellifere/Donnees/Daucus_carota/originals
All 50 files for Daucus_carota have been downloaded in /content/drive/MyDrive/Projet_mellifere/Donnees/Daucus_carota/originals
Mission Daucus_carota complete.
Download of 50 files for Eutrochium_maculatum has started in /content/drive/MyDrive/Projet_mellifere/Donnees/Eutrochium_maculatum/originals
All 50 files for Eutrochium_maculatum have been downloaded in /content/drive/MyDrive/Projet_mellifere/Donnees/Eutrochium_maculatum/originals
Mission Eutrochium_maculatum complete.
Download of 50 files for Leucanthemum_vulgare has started in /content/drive/MyDrive/Projet

## Image size validation

In [11]:
def size_check(min_size=128):
  """Delete every downloaded image that is smaller than ``min_size`` pixels.

  For each species in ``sp_classes``, every image file found in the species'
  "originals" folder is opened and removed from disk when its width or its
  height is below ``min_size``. One summary line is printed per species.

  Args:
    min_size: Minimum accepted width and height, in pixels (default 128).
  """
  for name in sp_classes:
    sp_classes_originals = os.path.join(directory, name, originals)
    deleted = 0

    for filename in os.listdir(sp_classes_originals):
      if not filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        continue
      img_path = os.path.join(sp_classes_originals, filename)
      # Only the dimensions are needed; close the file before it may be deleted
      with Image.open(img_path) as img:
        width, height = img.size

      if width < min_size or height < min_size:
        os.remove(img_path)
        print(f"{img_path} is too small (w {width} x h {height}) and has been deleted")
        deleted += 1

    # Report what actually happened instead of always claiming that nothing was deleted
    if deleted:
      print(f"{name} checked: {deleted} undersized image(s) deleted")
    else:
      print(f"{name} checked with no undersized images to declare")
  print("Mission complete! 10-4")


def redownload(timeout=30):
  """Top every species folder back up to ``wished_nb`` images.

  For each species in ``sp_classes``, the image files in its "originals"
  folder are counted. When fewer than ``wished_nb`` are present, the missing
  number of pictures is sampled among the links of "multimedia.txt" that are
  not yet recorded in the species' "img_list_used_<species>.csv" file. The
  new links are appended to that CSV file and the pictures are downloaded.

  Args:
    timeout: Seconds to wait for the server before giving up on a picture.
  """
  image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

  for name in sp_classes:
    sp_classes_path = os.path.join(directory, name)
    sp_classes_originals = os.path.join(sp_classes_path, originals)

    img_list = [filename for filename in os.listdir(sp_classes_originals)
                if filename.endswith(image_extensions)]
    print(f"Number of images for {name} : {len(img_list)} out of {wished_nb} expected")

    compensation = wished_nb - len(img_list)
    if compensation <= 0: #No redownload necessary
      continue

    #Reading the CSV file of the links already used. The wished format is put back to
    #"original" in each link, which makes the comparison with img_list_complete possible.
    img_list_used_csv = f"{sp_classes_originals}/img_list_used_{name}.csv"
    used_items = set()  # a set keeps the membership test below cheap
    if os.path.exists(img_list_used_csv):
      with open(img_list_used_csv, "r", newline="") as f:
        used_items = {row[0].replace(wished_format, "original")
                      for row in csv.reader(f) if row}  # blank lines yield empty rows

    #Creation of a random sample (n = compensation) from the list of pictures (only the unused) in the "multimedia.txt" file
    img_list_complete = (
        pd.read_csv(sp_classes_path+"/multimedia.txt", dtype=str, sep="\t")["identifier"]
        .dropna()  # a row without a link cannot be downloaded
        .tolist()
    )
    unused_items = [item for item in img_list_complete if item not in used_items]
    # random.sample raises ValueError when asked for more items than the list holds
    if len(unused_items) < compensation:
      print(f"Only {len(unused_items)} unused pictures are left for {name} ({compensation} needed)")
      compensation = len(unused_items)
    if compensation == 0:
      continue
    #Additional selection, with "original" replaced by the wished format in each new link
    new_items = [item.replace("original", wished_format)
                 for item in random.sample(unused_items, compensation)]

    #Appending to CSV (updating the .csv file to include the newly selected items)
    with open(img_list_used_csv, "a", newline="") as f:
      for item in new_items:
        f.write(item + ",\n")

    #Downloading the newly sampled images
    broken_images = []
    print(f"Download of {compensation} files for {name} has started in {sp_classes_originals}")
    for count, img in enumerate(new_items, start=1):
      # The second-to-last segment of the link is used as the file name
      file_name = img.split('/')[-2]
      file_name = f"{sp_classes_originals}/{file_name}.jpeg"  # Update file extension to .jpeg

      # Fetch the whole picture first so that a failed transfer never leaves a partial file behind
      try:
        with requests.get(img, stream=True, timeout=timeout) as r:
          data = r.content if r.status_code == 200 else None
      except requests.RequestException:
        data = None

      if data is None:
        broken_images.append(img)
      else:
        with open(file_name, 'wb') as f:
          f.write(data)

      # Progress is reported once the file has actually been handled
      if count % 10 == 0 and count < compensation:
        print(f"File {count} out of {compensation} for {name} has been downloaded in {sp_classes_originals}")

    if broken_images:
      print(f"{len(broken_images)} out of {compensation} additional files for {name} could not be downloaded:")
      for img in broken_images:
        print(f"  {img}")
    else:
      print(f"All {compensation} additional files for {name} have been downloaded in {sp_classes_originals}")

Alternate between the two following cells to:

1. check the size of every picture and delete those that are under 128 pixels in width or height;
2. redownload pictures to replace the deleted ones with newly sampled, so far unused pictures.

Make sure Google Drive has had time to sync (it may take up to 10 seconds) after each redownload.

In [12]:
# Step 1: delete every downloaded image whose width or height is under 128 px
size_check()

Asclepias_syriaca checked with no undersized images to declare
Daucus_carota checked with no undersized images to declare
Eutrochium_maculatum checked with no undersized images to declare
Leucanthemum_vulgare checked with no undersized images to declare
Solidago_canadensis checked with no undersized images to declare
Mission complete! 10-4


In [67]:
# Step 2: bring each species back up to wished_nb images using newly sampled, unused pictures
redownload()

Number of images for Asclepias_syriaca : 50 out of 50 expected
Number of images for Daucus_carota : 50 out of 50 expected
Number of images for Eutrochium_maculatum : 50 out of 50 expected
Number of images for Leucanthemum_vulgare : 50 out of 50 expected
Number of images for Solidago_canadensis : 50 out of 50 expected


## Cropping

### Cropping as a centered 128px square

In [None]:
# Crop every picture of the "originals" folders to a centered square and save the result,
# under the same file name, in a "center_cropped" subfolder of each species folder.
center_cropped = "center_cropped"
crop_size = 128  #Side of the centered square crop, in pixels

for name in sp_classes:
  #Create the center cropped subfolder
  sp_classes_path = os.path.join(directory, name)
  sp_classes_originals = os.path.join(sp_classes_path, originals)
  sp_classes_cropped = os.path.join(sp_classes_path, center_cropped)
  if not os.path.exists(sp_classes_cropped):
      os.makedirs(sp_classes_cropped)
      print(f"folder {sp_classes_cropped} created.")

  #Center crop
  print(f"Cropping of {name} images has started. Data will be saved in {sp_classes_cropped}")
  for filename in os.listdir(sp_classes_originals):
    if not filename.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
      continue
    img_path = os.path.join(sp_classes_originals, filename)
    # The context manager closes the source file once the crop has been saved
    with Image.open(img_path) as img:
      width, height = img.size
      #Setting where to crop: half of the margin, rounded up so the offsets are whole numbers
      left = (width - crop_size + 1) // 2
      top = (height - crop_size + 1) // 2
      # NOTE(review): pictures smaller than crop_size are not rejected here;
      # presumably size_check() has been run beforehand - confirm
      #Crop and save in center_cropped folder
      img_cropped = img.crop((left, top, left + crop_size, top + crop_size))
      img_cropped.save(f"{sp_classes_cropped}/{filename}")
  print(f"{name} images have been cropped to {crop_size}x{crop_size}px and saved in {sp_classes_cropped}")