Image Crawling

In [None]:
pip install pillow


In [2]:
import os
import time
import requests
from io import BytesIO
from PIL import Image
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Settings
search_url = "https://unsplash.com/s/photos/Indochinese-tiger"  # Unsplash Link
download_folder = r"C:\Users\aaron\Downloads\Indochinese-tiger"  # Your download Path
os.makedirs(download_folder, exist_ok=True)
file_name = "Indochinese_tiger"  # Set downloaded file name prefix
max_images = 10
scroll_pause_time = 2

# Setup Chrome (visible for debugging)
print("Launching Chrome...")
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # <-- COMMENTED FOR DEBUGGING
options.add_argument("--log-level=3")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

print("Opening Unsplash search page...")
driver.get(search_url)

# Scroll and load images
print("Scrolling to load images...")
image_urls = set()
scroll_count = 0

while len(image_urls) < max_images and scroll_count < 20:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
    images = driver.find_elements(By.TAG_NAME, "img")
    
    for img in images:
        src = img.get_attribute("src")
        if src and "images.unsplash.com" in src:
            image_urls.add(src)
        if len(image_urls) >= max_images:
            break

    scroll_count += 1
    print(f"Scrolled {scroll_count} times - found {len(image_urls)} images.")

# Download images only if dimension check passes
print(f"✅ Found {len(image_urls)} images. Starting download...")

for idx, url in enumerate(list(image_urls)[:max_images]):
    try:
        response = requests.get(url)
        img_data = response.content
        
        # Open image with PIL to check dimensions
        img = Image.open(BytesIO(img_data))
        width, height = img.size
        
        if width >= 3000 and height >= 100:
            file_path = os.path.join(download_folder, f"{file_name}_{idx}.jpg")
            with open(file_path, "wb") as f:
                f.write(img_data)
            print(f"Downloaded: {file_name}_{idx}.jpg (Size: {width}x{height})")
        else:
            print(f"Skipped: {url} (Size: {width}x{height}) - too small")

    except Exception as e:
        print(f"Failed to download image {idx}: {e}")

driver.quit()
print("\n✅ All done. Check your folder at:")
print(download_folder)


Launching Chrome...
Opening Unsplash search page...
Scrolling to load images...
Scrolled 1 times - found 10 images.
✅ Found 10 images. Starting download...
Skipped: https://images.unsplash.com/profile-1548288866221-bcd300c7fe19?fm=jpg&q=60&w=3000&ixlib=rb-4.0.3&crop=faces&fit=crop&h=32 (Size: 3000x32) - too small
Skipped: https://images.unsplash.com/profile-1601652880610-dadfc0b29b37image?fm=jpg&q=60&w=3000&ixlib=rb-4.0.3&crop=faces&fit=crop&h=32 (Size: 3000x32) - too small
Downloaded: Indochinese_tiger_2.jpg (Size: 3000x3750)
Downloaded: Indochinese_tiger_3.jpg (Size: 3000x2412)
Downloaded: Indochinese_tiger_4.jpg (Size: 3000x3750)
Skipped: https://images.unsplash.com/profile-1716751026996-8998a58d153aimage?fm=jpg&q=60&w=3000&ixlib=rb-4.1.0&crop=faces&fit=crop&h=32 (Size: 3000x32) - too small
Skipped: https://images.unsplash.com/profile-1723927990565-70176ce68c3b?fm=jpg&q=60&w=3000&ixlib=rb-4.1.0&crop=faces&fit=crop&h=32 (Size: 3000x32) - too small
Skipped: https://images.unsplash.com