Image Crawling

In [None]:
import json
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote_plus

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Configuration
search_term = "shark"
min_images = 500
max_images = 1000
# NOTE(review): the folder is named "starfish" while search_term is "shark";
# confirm this is the intended destination.
download_dir = r"C:\Users\aaron\Downloads\starfish"

os.makedirs(download_dir, exist_ok=True)


def _extract_image_url(metadata):
    """Return the image URL stored under "murl" in a thumbnail's metadata.

    Args:
        metadata: Value of the thumbnail anchor's ``m`` attribute
            (presumably a JSON object; may be None when the attribute
            is absent).

    Returns:
        The URL string when one is found and it starts with "http",
        otherwise None.
    """
    if not metadata:
        return None
    try:
        # Proper JSON decoding resolves escapes such as "\/" and "\u0026",
        # which simply deleting backslashes would corrupt.
        url = json.loads(metadata)["murl"]
    except (ValueError, KeyError, TypeError):
        # Not a JSON object with a "murl" key: fall back to a substring scan.
        _, marker, tail = metadata.partition('"murl":"')
        if not marker:
            return None
        url = tail.split('"', 1)[0].replace("\\", "")
    if isinstance(url, str) and url.startswith("http"):
        return url
    return None


# Setup Chrome options
options = Options()
options.add_argument("--headless")  # comment this out to see browser
options.add_argument("--disable-gpu")
options.add_argument("log-level=3")
options.add_argument("user-agent=Mozilla/5.0")

# Encode the term so spaces and reserved characters yield a valid query string.
search_url = f"https://www.bing.com/images/search?q={quote_plus(search_term)}&form=HDRSC2"

image_urls = set()
driver = webdriver.Chrome(options=options)
try:
    print("[*] Loading Bing Images page...")
    driver.get(search_url)

    # Scroll and collect URLs
    print("[*] Scrolling to load more images...")
    last_height = driver.execute_script("return document.body.scrollHeight")

    while len(image_urls) < max_images:
        thumbnails = driver.find_elements(By.CSS_SELECTOR, "a.iusc")

        # Skip as many leading thumbnails as URLs collected so far. Duplicates
        # and unreadable entries keep the set smaller than the number of
        # thumbnails visited, so a few are re-read; adding to a set makes
        # that harmless.
        for thumb in thumbnails[len(image_urls):]:
            try:
                m_url = _extract_image_url(thumb.get_attribute("m"))
            except WebDriverException:
                # Selenium could not read this element; move on to the next.
                continue
            if m_url is not None:
                image_urls.add(m_url)
            if len(image_urls) >= max_images:
                break

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Pause before re-measuring so newly requested results can load.
        time.sleep(1)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # The page did not grow after scrolling: stop collecting.
            break
        last_height = new_height
finally:
    # Always shut the browser down, even when loading or scraping raised.
    driver.quit()

# Enforce minimum image requirement
if len(image_urls) < min_images:
    print(f"[✘] Only collected {len(image_urls)} images. Minimum required is {min_images}.")
    # SystemExit works without the site-provided exit() helper.
    raise SystemExit(1)

print(f"[✔] Collected {len(image_urls)} image URLs.")

# Download images in parallel
def download_image(url, idx):
    """Download one image into ``download_dir`` as ``<idx>.<ext>``.

    The extension is the text after the last "." in the URL (query string
    removed, lower-cased) when it is jpg, jpeg or png; otherwise "jpg" is
    used. Failures are reported and skipped so one bad URL does not stop
    the rest of the batch.

    Args:
        url: URL of the image to fetch.
        idx: Value used as the base name of the saved file.

    Returns:
        None. Nothing is written when the request fails or the server
        answers with an error status.
    """
    try:
        r = requests.get(url, timeout=10)
        # Without this check a 4xx/5xx error page would be saved as an image.
        r.raise_for_status()
    except requests.RequestException as err:
        print(f"[!] Skipped {url}: {err}")
        return

    # Lower-case so ".JPG" / ".PNG" keep their real extension.
    ext = url.rsplit(".", 1)[-1].split("?", 1)[0].lower()
    if ext not in ("jpg", "jpeg", "png"):
        ext = "jpg"

    path = os.path.join(download_dir, f"{idx}.{ext}")
    try:
        with open(path, "wb") as f:
            f.write(r.content)
    except OSError as err:
        print(f"[!] Could not write {path}: {err}")

print("[*] Downloading images...")
# A bounded pool caps the number of simultaneous threads and connections
# instead of starting one thread per URL (up to max_images at once).
download_workers = 32
with ThreadPoolExecutor(max_workers=download_workers) as pool:
    futures = [
        pool.submit(download_image, url, i)
        for i, url in enumerate(list(image_urls)[:max_images])
    ]
# Leaving the with-block waits for every submitted download to finish.

# Surface anything download_image did not handle itself; an executor keeps
# such exceptions on the future rather than printing them.
for future in futures:
    error = future.exception()
    if error is not None:
        print(f"[!] Download task failed: {error!r}")

# Counts every entry in download_dir, including files from earlier runs.
print(f"[✔] Done! {len(os.listdir(download_dir))} images downloaded.")
