## Section 9: Project 2 - Image Scraper

### Using HTML Scraping

In [1]:
import requests
from selectolax.lexbor import LexborHTMLParser
import os
from tqdm.auto import tqdm

In [2]:
def get_image_links(keyword, timeout=30):
    """Scrape the Unsplash search page for ``keyword`` and return its download links.

    Args:
        keyword: Search term. It is percent-encoded before being placed in the
            URL path, so spaces and reserved characters (``/``, ``?``, ``#``)
            cannot alter the requested URL.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        list[str]: The ``href`` of every anchor matching
        ``a[data-test="non-sponsored-photo-download-button"]``, in document
        order. Anchors with a missing or empty ``href`` are skipped.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in urllib.
    from urllib.parse import quote

    # Browser-like headers copied from a real Chrome session.
    headers = {
        "authority": "unsplash.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://www.google.com/",
        "sec-ch-ua": '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    }
    # safe="" also encodes "/" so the keyword always stays a single path segment.
    search_url = f"https://unsplash.com/s/photos/{quote(keyword, safe='')}"
    response = requests.get(search_url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page and
    # reporting zero images.
    response.raise_for_status()
    tree = LexborHTMLParser(response.text)
    hrefs = (
        node.attributes.get("href")
        for node in tree.css('a[data-test="non-sponsored-photo-download-button"]')
    )
    # Drop anchors whose href is absent or empty rather than raising KeyError
    # or handing None to the downloader.
    all_image_links = [href for href in hrefs if href]
    print(f"[x] Found {len(all_image_links)} '{keyword}' images")
    return all_image_links

In [3]:
def download_images(keyword, all_image_links, image_count, timeout=30):
    """Download up to ``image_count`` images into ``images/<keyword>/``.

    The folder name is ``keyword`` lower-cased with spaces replaced by
    underscores; files are named ``<folder>_<index>.jpg`` with ``index``
    starting at 1 and following the order of ``all_image_links``.

    Args:
        keyword: Search term the links belong to; used for folder and file names.
        all_image_links: Sequence of image URLs to download from.
        image_count: Maximum number of links to download (the first
            ``image_count`` entries of ``all_image_links`` are used).
        timeout: Seconds to wait for each image request. Passed straight to
            ``requests.get``.

    Returns:
        None. Images are written to disk as a side effect. A link that fails
        (connection error, timeout, or 4xx/5xx status) is reported and skipped,
        so its index is simply missing from the output folder.
    """
    BASE_FOLDER = "images"
    keyword = keyword.lower().replace(" ", "_")
    target_folder = os.path.join(BASE_FOLDER, keyword)
    os.makedirs(target_folder, exist_ok=True)
    print(f"[x] Downloading {image_count} images")
    image_links = all_image_links[:image_count]
    for index, image_link in tqdm(
        enumerate(image_links, start=1),
        desc="Downloading images",
        total=len(image_links),
    ):
        try:
            response = requests.get(image_link, timeout=timeout)
            # Without this check an HTTP error page would be saved as a .jpg.
            response.raise_for_status()
        except requests.RequestException as exc:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"[!] Skipping image {index} ({image_link}): {exc}")
            continue
        image_path = os.path.join(target_folder, f"{keyword}_{index}.jpg")
        with open(image_path, "wb") as f:
            f.write(response.content)

In [4]:
# Search term -> number of images to download for that term.
# Insertion order is the order in which the terms are processed.
keywords = dict(
    [
        ("Outer Space", 2),
        ("redpanda", 3),
        ("Mountain Range", 2),
        ("fractals", 3),
        ("Animals and birds", 3),
    ]
)

In [5]:
# For each search term: scrape its result page for download links, then save
# the requested number of images. The bare print() emits a blank line so each
# term's output is visually separated from the next.
for keyword, image_count in keywords.items():
    all_image_links = get_image_links(keyword)
    download_images(keyword, all_image_links, image_count)
    print()

[x] Found 16 'Outer Space' images
[x] Downloading 2 images


Downloading images:   0%|          | 0/2 [00:00<?, ?it/s]


[x] Found 10 'redpanda' images
[x] Downloading 3 images


Downloading images:   0%|          | 0/3 [00:00<?, ?it/s]


[x] Found 16 'Mountain Range' images
[x] Downloading 2 images


Downloading images:   0%|          | 0/2 [00:00<?, ?it/s]


[x] Found 16 'fractals' images
[x] Downloading 3 images


Downloading images:   0%|          | 0/3 [00:00<?, ?it/s]


[x] Found 16 'Animals and birds' images
[x] Downloading 3 images


Downloading images:   0%|          | 0/3 [00:00<?, ?it/s]


