In [1]:
import pandas as pd
import requests
import io
import random
import os
from utils import download_image, download_data_export, save_image, pad_image
from PIL import Image
from tqdm import tqdm
from threading import Thread, Event
from queue import Queue


In [2]:
# Root folder for everything this notebook writes to disk.
IMAGES_DIR = "./images"
# Cropped images as downloaded, before any padding is applied.
BEFORE_PADDING_DIR = IMAGES_DIR + "/before_padding_cropped/"
# Cropped images after padding.
AFTER_PADDING_DIR = IMAGES_DIR + "/after_padding_cropped/"
# Un-cropped ("full") images.
FULL_IMAGES_DIR = IMAGES_DIR + "/full_images/"

# Make sure every folder is present up front; existing folders are left alone.
for directory in (IMAGES_DIR, BEFORE_PADDING_DIR, AFTER_PADDING_DIR, FULL_IMAGES_DIR):
    os.makedirs(directory, exist_ok=True)

In [3]:
def pad_worker(pad_queue, download_done_event, read_path, save_path, poll_interval=0.1):
    """Pad queued images until the producer is finished and the queue is drained.

    Each queue item is a ``(count, row)`` tuple. For every item the worker
    reads ``<read_path>/<row['specimenId_unique']>.jpg``, hands that path to
    ``pad_image`` and calls ``.save`` on the result with
    ``<save_path>/<row['specimenId_unique']>.jpg``.

    Args:
        pad_queue: ``queue.Queue`` holding ``(count, row)`` tuples.
        download_done_event: ``threading.Event`` the producer sets after its
            final ``put``; once it is set and the queue is empty the worker
            returns.
        read_path: Directory containing the images to pad.
        save_path: Directory the padded images are written to.
        poll_interval: Seconds to block waiting for a new item before the
            done flag is checked again. Blocking here replaces the previous
            busy-wait loop, which kept a CPU core spinning while idle.

    Any exception raised while handling a single item (including a row
    without a ``specimenId_unique`` entry) is printed and that item is
    skipped, so one bad row no longer stops the remaining images from
    being padded.
    """
    # Local import: the notebook's import cell only brings in ``Queue``.
    from queue import Empty

    while True:
        # Read the flag *before* polling. If it was already set, every put()
        # has happened, so an empty queue really means there is nothing left.
        finished = download_done_event.is_set()
        try:
            # Once the producer is done there is no point in waiting:
            # block=False makes get() raise Empty immediately.
            count, row = pad_queue.get(block=not finished, timeout=poll_interval)
        except Empty:
            if finished:
                return
            continue

        try:
            name = row['specimenId_unique']
            ind_image_path = os.path.join(read_path, f"{name}.jpg")
            save_image_path = os.path.join(save_path, f"{name}.jpg")
            padded_image = pad_image(ind_image_path)
            padded_image.save(save_image_path)
        except Exception as e:
            print(f"Error padding image {count}: {e}")


def download_images_from_df(df, zoomed = True):
    """Download one image per row of ``df`` and, for zoomed images, pad them.

    For each row the URL is read from the ``image`` column (``zoomed=True``)
    or the ``full_image`` column (``zoomed=False``), fetched with
    ``download_image`` and written with ``save_image`` to
    ``<download_dir>/<specimenId_unique>.jpg``, where ``download_dir`` is
    ``BEFORE_PADDING_DIR`` for zoomed images and ``FULL_IMAGES_DIR`` otherwise.
    Files that already exist are not downloaded again.

    When ``zoomed`` is true a background thread running ``pad_worker`` pads
    each image into ``AFTER_PADDING_DIR`` while the downloads continue. An
    image that was downloaded on an earlier run but has no padded counterpart
    yet is queued for padding as well, so an interrupted run can be resumed.

    Args:
        df: DataFrame with a ``specimenId_unique`` column and the image URL
            column selected by ``zoomed``.
        zoomed: Selects the URL column and target folder, and enables padding.

    Raises:
        Whatever ``download_image`` / ``save_image`` raise. The padding thread
        is always told to stop and joined before the error propagates, so a
        failed download no longer leaves a worker thread running forever.
    """
    img_column = "image" if zoomed else "full_image"
    download_dir = BEFORE_PADDING_DIR if zoomed else FULL_IMAGES_DIR

    download_done_event = Event()
    pad_queue = Queue()
    pad_thread = None
    if zoomed:
        # Padding only applies to zoomed images, so the worker is only
        # started when there can be something for it to do.
        pad_thread = Thread(
            target=pad_worker,
            args=(pad_queue, download_done_event, download_dir, AFTER_PADDING_DIR),
        )
        pad_thread.start()

    try:
        for idx, row in tqdm(df.iterrows(), total=len(df), desc="Downloading images..."):
            image_url = row[img_column]
            specimen_id = row['specimenId_unique']
            # os.path.join avoids the doubled slash that came from the
            # trailing "/" on the directory constants and matches how
            # pad_worker builds the same path.
            file_name = os.path.join(download_dir, f"{specimen_id}.jpg")

            already_downloaded = os.path.exists(file_name)
            if not already_downloaded:
                content = download_image(image_url)
                save_image(content, file_name)

            if not zoomed:
                continue

            padded_name = os.path.join(AFTER_PADDING_DIR, f"{specimen_id}.jpg")
            if already_downloaded and os.path.exists(padded_name):
                # Both the download and its padded version exist: nothing to do.
                continue
            pad_queue.put((idx, row))
    finally:
        # Always release the worker, even if a download raised; otherwise the
        # (non-daemon) padding thread would never terminate.
        download_done_event.set()
        if pad_thread is not None:
            pad_thread.join()


### Download the database export as a CSV file

In [5]:
# Fetch the database export; the helper returns a (success flag, data) pair.
# NOTE(review): judging by the cell output it also saves the rows to the
# given CSV file name — confirm in utils.download_data_export.
success, df = download_data_export('cleaned_export.csv')
if not success:
    # RuntimeError is more specific than a bare Exception and is still
    # caught by any existing `except Exception` handler.
    raise RuntimeError("No data downloaded, check your credentials in .env file")

Saved 10522 rows to cleaned_export.csv


### Filter for rows of interest:

In [8]:
# Keep only the rows that have a value in the 'image' column, then take the
# last 20 of those as a small sample for testing.
df = df.loc[df['image'].notna()].tail(20)

### Download the images and save them in the images folder

In [9]:
# Fetch the zoomed images (URLs from the 'image' column) for the selected rows.
download_images_from_df(df=df, zoomed=True)


Downloading images...: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
