<a href="https://colab.research.google.com/github/PaulinaIvan/GMM/blob/master/image-downloader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Nuotraukų parsisiuntėjas iš Open Images V6 į Google Drive

In [2]:
# Mount Google Drive at /content/drive; the notebook later saves images under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import requests
import os
from concurrent.futures import ThreadPoolExecutor
import time
from tqdm import tqdm

Galima nurodyti norimą klasę, nuotraukų kiekį ir direktoriją (nebūtina).

In [4]:
# --- User-tunable settings -------------------------------------------------
TARGET_CLASS = "Car"   # OpenImages display name of the class to download
NUM_IMAGES = 2         # Upper bound on the number of image URLs collected
OUTPUT_DIR = None      # Folder name under ColabNotebooks; if None, the lowercased TARGET_CLASS is used

# --- OpenImages V6 CSV locations (train split) -----------------------------
ANNOTATION_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv"
CLASS_DESCRIPTIONS_URL = "https://storage.googleapis.com/openimages/v6/oidv6-class-descriptions.csv"
IMAGES_URL = "https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv"

# Resolve the destination folder once, then make sure it exists on the mounted Drive.
folder_name = TARGET_CLASS.lower() if OUTPUT_DIR is None else OUTPUT_DIR
FULL_DIR = f"/content/drive/MyDrive/ColabNotebooks/{folder_name}"
os.makedirs(FULL_DIR, exist_ok=True)
print(f"Images will be saved to: {FULL_DIR}")

Images will be saved to: /content/drive/MyDrive/ColabNotebooks/car


CSV parsisiuntimai (galima praleisti, jei šie langeliai jau buvo įvykdyti dabartinėje sesijoje – rezultatai laikomi tik atmintyje)

In [5]:
print("Loading class descriptions...")
class_desc_dtypes = {
    'LabelName': 'string',
    'DisplayName': 'string'
}
class_descriptions = pd.read_csv(CLASS_DESCRIPTIONS_URL, dtype=class_desc_dtypes)

# Select the label(s) whose display name matches exactly, then check for an
# empty result explicitly instead of relying on an IndexError from .iloc[0];
# this keeps the user-facing error free of an unrelated chained traceback.
matching_labels = class_descriptions.loc[
    class_descriptions['DisplayName'] == TARGET_CLASS, 'LabelName'
]
if matching_labels.empty:
    raise ValueError(f"Class '{TARGET_CLASS}' not found in OpenImages dataset")

# If several rows share the display name, the first one wins (same as before).
class_id = matching_labels.iloc[0]
print(f"Found class ID for '{TARGET_CLASS}': {class_id}")

Loading class descriptions...
Found class ID for 'Car': /m/0k4j


In [6]:
import gc

print("Loading annotations (chunked)...")

# Rows read per chunk. The original note here said "12GB" - presumably the
# value was sized for a ~12 GB RAM runtime (TODO confirm).
chunk_size = 250000
annotation_dtypes = {'ImageID': 'string', 'LabelName': 'string'}
relevant_image_ids = set()

for chunk in pd.read_csv(
    ANNOTATION_URL,
    usecols=['ImageID', 'LabelName'],
    dtype=annotation_dtypes,
    chunksize=chunk_size,
):
    # Keep only the rows labelled with the target class and record their image IDs.
    is_target_label = chunk['LabelName'] == class_id
    matching_rows = chunk.loc[is_target_label]
    relevant_image_ids |= set(matching_rows['ImageID'].unique())

    # Drop the per-chunk intermediates before the next chunk is read.
    del is_target_label, matching_rows
    gc.collect()

print(f"Found {len(relevant_image_ids)} images containing {TARGET_CLASS}")

Loading annotations (chunked)...
Found 89465 images containing Car


In [7]:
print("Loading image metadata (chunked)...")
image_dtypes = {'ImageID': 'string', 'OriginalURL': 'string'}
image_urls = []

for chunk in pd.read_csv(
    IMAGES_URL,
    usecols=['ImageID', 'OriginalURL'],
    dtype=image_dtypes,
    chunksize=chunk_size,
):
    # Collect the original URLs of the images that contain the target class.
    matching_rows = chunk.loc[chunk['ImageID'].isin(relevant_image_ids)]
    image_urls += matching_rows['OriginalURL'].tolist()

    if len(image_urls) < NUM_IMAGES:
        continue

    # Enough URLs gathered: trim any surplus and stop reading further chunks.
    del image_urls[NUM_IMAGES:]
    break

print(f"Prepared {len(image_urls)} URLs for download")

Loading image metadata (chunked)...
Prepared 2 URLs for download


In [8]:
def download_image(url):
    """Download one image into FULL_DIR, skipping files that already exist.

    The target file name is "<TARGET_CLASS lowercased>_<last path segment of
    url>". The body is streamed into a temporary "<name>.part" file that is
    renamed to the final name only after the whole download succeeded, so an
    interrupted transfer never leaves a truncated image that a later run
    would mistake for a finished download.

    Args:
        url: Direct URL of the image to fetch.

    Returns:
        True if the file was already present or was downloaded completely;
        False on a non-200 HTTP status (silently) or on any error (the error
        is printed).
    """
    temp_path = None
    try:
        image_id = url.split('/')[-1]
        filename = f"{TARGET_CLASS.lower()}_{image_id}"
        filepath = os.path.join(FULL_DIR, filename)

        if os.path.exists(filepath):
            return True

        response = requests.get(url, stream=True, timeout=10)
        try:
            if response.status_code != 200:
                return False

            temp_path = f"{filepath}.part"
            with open(temp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        finally:
            # With stream=True the connection is only released once the
            # response is closed (or fully consumed).
            response.close()

        # Publish the finished file under its final name in one step.
        os.replace(temp_path, filepath)
        temp_path = None
        return True

    except Exception as e:
        # Best-effort by design: one failed image must not abort the batch.
        print(f"Error downloading {url}: {str(e)}")
        return False

    finally:
        # Remove a leftover partial file after a failed download.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # The partial file may never have been created; nothing to clean up.
                pass

Siunčiamės

In [10]:
successful_downloads = 0

# Fan the downloads out over a small thread pool, then tally the results in
# submission order while tqdm shows progress.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(download_image, url) for url in image_urls]

    for future in tqdm(futures, total=len(futures), desc="Downloading images"):
        # download_image reports success as a truthy return value.
        successful_downloads += 1 if future.result() else 0

print(f"\nSuccessfully downloaded {successful_downloads} {TARGET_CLASS} images to {FULL_DIR}")

Downloading images: 100%|██████████| 2/2 [00:00<00:00, 20712.61it/s]


Successfully downloaded 2 Car images to /content/drive/MyDrive/ColabNotebooks/car



