Image Crawling

In [1]:
%pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [5]:
import hashlib
import json
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote_plus, urlsplit

import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Configuration
search_term = "Malayan Tiger"
min_images = 500   # lower bound on collected URLs before downloading proceeds
max_images = 1000  # upper bound on URLs to collect / download
download_dir = r"C:\Users\333\Documents\Haziq\Class\Sem 6\Principles of Artificial Intelligent\Assignments\Project\dataset\train\malayan-tiger"
os.makedirs(download_dir, exist_ok=True)

# Setup Chrome options
options = Options()
# options.add_argument("--headless")  # Enable when needed
options.add_argument("--disable-gpu")
options.add_argument("log-level=3")
options.add_argument("user-agent=Mozilla/5.0")

driver = webdriver.Chrome(options=options)
# Percent-encode the query: a raw search term containing spaces, '&', '#'
# or '+' would otherwise be truncated or split into extra parameters.
search_url = (
    "https://www.bing.com/images/search"
    f"?q={quote_plus(search_term)}&form=HDRSC2p"
)

print("[*] Loading Bing Images page...")
driver.get(search_url)
# Set of full-size image URLs; a set so repeated thumbnails collapse.
image_urls = set()

def scroll_down():
    """Scroll the browser window to the bottom of the page, then wait 2 s.

    The pause is presumably there to give lazily loaded results time to
    render before the caller inspects the page again.
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    settle_seconds = 2
    driver.execute_script(jump_to_bottom)
    time.sleep(settle_seconds)

print("[*] Collecting image URLs by scrolling and clicking 'See more' if available...")

scroll_attempts = 0
max_scroll_attempts = 50  # prevent infinite loop

# try/finally guarantees the browser is closed even if the loop is
# interrupted (Ctrl+C) or an unexpected error escapes.
try:
    while len(image_urls) < max_images and scroll_attempts < max_scroll_attempts:
        scroll_down()
        scroll_attempts += 1

        thumbnails = driver.find_elements(By.CSS_SELECTOR, "a.iusc")
        print(f"[*] Scroll #{scroll_attempts}: {len(thumbnails)} thumbnails found.")

        # Scan every thumbnail on each pass. The size of the de-duplicated
        # URL set is not a valid offset into this list: when the page
        # re-renders with fewer anchors than URLs already collected, an
        # offset-based slice is empty and nothing new is ever picked up.
        # The set makes re-visiting a thumbnail harmless.
        for thumb in thumbnails:
            if len(image_urls) >= max_images:
                break
            try:
                m = thumb.get_attribute("m")
            except WebDriverException:
                # Element went stale / detached while we were iterating.
                continue
            if not m:
                continue
            try:
                # The "m" attribute is expected to be a JSON object whose
                # "murl" key holds the full-size image URL; json.loads also
                # decodes escapes such as "\/" and "\u0026" correctly.
                m_url = json.loads(m).get("murl")
            except (ValueError, AttributeError):
                # Not valid JSON, or not a JSON object: skip this thumbnail.
                continue
            if isinstance(m_url, str) and m_url.startswith("http"):
                image_urls.add(m_url)

        # Try clicking "See more images"
        try:
            see_more = driver.find_element(By.CSS_SELECTOR, "a.btn_seemore")
            if see_more.is_displayed():
                print("[*] Clicking 'See more images'...")
                # Click through JS so an overlapping element cannot
                # intercept the click.
                driver.execute_script("arguments[0].click();", see_more)
                time.sleep(3)
        except WebDriverException:
            # Best effort: the button is simply absent on most passes.
            pass
finally:
    driver.quit()

# Check if enough images were found
if len(image_urls) < min_images:
    print(f"[✘] Only collected {len(image_urls)} images. Minimum required is {min_images}.")
    # `exit()` is a helper injected by the `site` module for interactive
    # sessions and is not guaranteed to exist (or to accept a status code)
    # in every runtime; raising SystemExit is the portable equivalent.
    raise SystemExit(1)

print(f"[✔] Collected {len(image_urls)} image URLs.")

# Download images
downloaded_hashes = set()     # digests of image payloads already accepted
hash_lock = threading.Lock()  # serialises access to downloaded_hashes

def download_image(url, idx):
    """Fetch one image and store it as ``<idx>.<ext>`` inside ``download_dir``.

    The payload's MD5 digest is checked against ``downloaded_hashes`` (under
    ``hash_lock``) so byte-identical images served from different URLs are
    saved only once. Any ``Exception`` raised along the way is caught and
    reported on stdout, so the function is safe to run as a thread target.

    Args:
        url: Direct URL of the image to download.
        idx: Value used as the base name of the output file.

    Returns:
        None. The result is the file written to disk, if any.
    """
    try:
        r = requests.get(url, timeout=10)
        if r.status_code != 200:
            print(f"[!] Skipped {url}: HTTP {r.status_code}")
            return

        image_bytes = r.content
        if not image_bytes:
            # Avoid writing a zero-byte "image" into the dataset.
            print(f"[!] Skipped {url}: empty response body")
            return

        image_hash = hashlib.md5(image_bytes).hexdigest()

        # Check-and-insert must be atomic across download threads.
        with hash_lock:
            if image_hash in downloaded_hashes:
                return
            downloaded_hashes.add(image_hash)

        # Read the extension from the URL *path* only, so dots in the query
        # string or fragment (e.g. "img.png?v=1.0") cannot be mistaken for it.
        ext = os.path.splitext(urlsplit(url).path)[1].lstrip(".").lower()
        if ext not in ("jpg", "jpeg", "png"):
            ext = "jpg"  # fallback for missing or unrecognised extensions
        path = os.path.join(download_dir, f"{idx}.{ext}")
        with open(path, "wb") as f:
            f.write(image_bytes)
        print(f"[✓] Saved {path}")
    except Exception as e:
        print(f"[!] Failed to download {url}: {e}")

print("[*] Downloading images...")
# Use a bounded worker pool instead of one thread per URL: starting up to
# `max_images` threads at once wastes memory and opens that many sockets
# simultaneously. The downloads are I/O-bound, so a small pool is enough.
max_workers = 16
with ThreadPoolExecutor(max_workers=max_workers) as pool:
    for i, url in enumerate(list(image_urls)[:max_images]):
        # download_image reports its own failures, so the returned future
        # does not need to be inspected.
        pool.submit(download_image, url, i)
# Leaving the `with` block waits for every submitted download to finish.

# NOTE: this counts every entry in download_dir, including files that were
# already there before this run.
print(f"[✔] Done! {len(os.listdir(download_dir))} images downloaded.")


[*] Loading Bing Images page...
[*] Collecting image URLs by scrolling and clicking 'See more' if available...
[*] Scroll #1: 91 thumbnails found.
[*] Scroll #2: 114 thumbnails found.
[*] Scroll #3: 114 thumbnails found.
[*] Scroll #4: 114 thumbnails found.
[*] Scroll #5: 114 thumbnails found.
[*] Scroll #6: 114 thumbnails found.
[*] Scroll #7: 114 thumbnails found.
[*] Scroll #8: 114 thumbnails found.
[*] Scroll #9: 114 thumbnails found.
[*] Scroll #10: 114 thumbnails found.
[*] Scroll #11: 114 thumbnails found.
[*] Scroll #12: 114 thumbnails found.
[*] Scroll #13: 114 thumbnails found.
[*] Scroll #14: 114 thumbnails found.
[*] Scroll #15: 35 thumbnails found.
[*] Scroll #16: 70 thumbnails found.
[*] Scroll #17: 105 thumbnails found.
[*] Scroll #18: 105 thumbnails found.
[*] Scroll #19: 105 thumbnails found.
[*] Scroll #20: 105 thumbnails found.
[*] Scroll #21: 105 thumbnails found.
[*] Scroll #22: 105 thumbnails found.
[*] Scroll #23: 105 thumbnails found.
[*] Scroll #24: 105 thumbna