Nuotraukų parsisiuntėjas iš OpenImages v6.

In [5]:
import pandas as pd
import requests
import os
from concurrent.futures import ThreadPoolExecutor
import time
from tqdm import tqdm

Galima nurodyti norimą klasę, nuotraukų kiekį ir direktoriją (nebūtina).

In [20]:
# --- User-tunable settings ---
TARGET_CLASS = "Lion"   # class display name to look up in the class descriptions
NUM_IMAGES = 300        # upper bound on how many image URLs are kept for download
OUTPUT_DIR = None       # None -> images go into a folder named after the lower-cased class

# --- OpenImages v6 CSV locations ---
ANNOTATION_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv"
CLASS_DESCRIPTIONS_URL = "https://storage.googleapis.com/openimages/v6/oidv6-class-descriptions.csv"
IMAGES_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv"

# Fall back to a class-named folder when no explicit directory was configured,
# then make sure the destination exists before any download starts.
OUTPUT_DIR = TARGET_CLASS.lower() if OUTPUT_DIR is None else OUTPUT_DIR
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Images will be saved to: {OUTPUT_DIR}")

Images will be saved to: lion


CSV parsisiuntimai (galima praleisti, jei tai jau padaryta anksčiau)

In [21]:
# Load the class description table; its 'DisplayName' and 'LabelName' columns
# are what the class lookup later in the notebook relies on.
print("Downloading class descriptions...")
class_descriptions = pd.read_csv(CLASS_DESCRIPTIONS_URL)

Downloading class descriptions...


In [22]:
print("Downloading annotations (this may take a while)...")
# Only 'ImageID' and 'LabelName' are used by this notebook, so skip every other
# column of the annotation CSV to cut memory use and parse time.
# Extend the list if more annotation columns are ever needed.
annotations = pd.read_csv(ANNOTATION_URL, usecols=['ImageID', 'LabelName'])

Downloading annotations (this may take a while)...


In [23]:
print("Downloading image metadata...")
# Only 'ImageID' and 'OriginalURL' are used by this notebook, so the remaining
# metadata columns are not loaded. Extend the list if more are ever needed.
images = pd.read_csv(IMAGES_URL, usecols=['ImageID', 'OriginalURL'])

Downloading image metadata...


Paruošiam viską parsisiuntimui, randam norimų nuotraukų URL adresus

In [None]:
# Translate the configured display name into its label ID.
matching_labels = class_descriptions.loc[
    class_descriptions['DisplayName'] == TARGET_CLASS, 'LabelName'
]
try:
    class_id = matching_labels.iloc[0]
    print(f"Found class ID for '{TARGET_CLASS}': {class_id}")
except IndexError:
    # No row matched the display name, so there is nothing to download.
    raise ValueError(f"Class '{TARGET_CLASS}' not found in OpenImages dataset")

# Collect the distinct images that carry at least one annotation of that class.
is_target_label = annotations['LabelName'] == class_id
class_annotations = annotations[is_target_label]
class_image_ids = class_annotations['ImageID'].unique()
print(f"Found {len(class_image_ids)} images containing {TARGET_CLASS}")

# Look up the matching metadata rows and keep at most NUM_IMAGES URLs.
has_target_class = images['ImageID'].isin(class_image_ids)
class_images = images[has_target_class]
all_class_urls = class_images['OriginalURL'].tolist()
image_urls = all_class_urls[:NUM_IMAGES]
print(f"Prepared {len(image_urls)} URLs for download")

In [None]:
def download_image(url, timeout=10):
    """Download a single image into OUTPUT_DIR.

    The file is named "<target class, lower-cased>_<last URL path segment>".
    Data is streamed into a temporary "<name>.part" file and only renamed to
    the final name once the whole body has been written, so an interrupted
    transfer never leaves a truncated image that a later run would mistake
    for a finished download.

    Args:
        url: Address of the image to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the image already existed or was downloaded successfully,
        False if the server did not answer with HTTP 200 or any error occurred.
    """
    part_path = None
    try:
        # Generate filename from URL
        image_id = url.split('/')[-1]
        filename = f"{TARGET_CLASS.lower()}_{image_id}"
        filepath = os.path.join(OUTPUT_DIR, filename)

        # Skip if already downloaded
        if os.path.exists(filepath):
            return True

        part_path = filepath + ".part"

        # The context manager releases the streamed connection on every path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                return False
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the finished file under its final name in one step.
        os.replace(part_path, filepath)
        return True

    except Exception as e:
        print(f"Error downloading {url}: {str(e)}")
        # Best-effort cleanup of a partially written temporary file.
        if part_path is not None:
            try:
                os.remove(part_path)
            except OSError:
                pass  # nothing was written, or the file is already gone
        return False

Siunčiamės

In [None]:
# Fan the downloads out over a small thread pool, then walk the futures in
# submission order so the progress bar advances as each result is collected.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(download_image, url) for url in image_urls]

    # Count every download that reported success.
    successful_downloads = sum(
        1
        for future in tqdm(futures, total=len(futures), desc="Downloading images")
        if future.result()
    )

print(f"\nSuccessfully downloaded {successful_downloads} {TARGET_CLASS} images to {OUTPUT_DIR}")